# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import random
from random import sample
import datetime
import sys
import ast
random.seed(100)

# Dataset Size
Specify number of employees, jobs, and maximum number of previous/applied/searched for jobs.

In [2]:
E = 500              # Number of employees
J = 2000             # Number of jobs
maxPrevJobs = 5      # Maximum number of previous jobs employee had
maxJobsApplied = 10  # Maximum number of jobs employee applied to
maxJobsSearched = 10 # Maximum number of jobs employee searched for
maxNetwork = 10      # Maximum number of employees an employee is connected to

# Generate Initial Employee Data

Create E rows of the following data:


- Employee ID
- Degree
- Location
- Technical Skills

### Employee ID
0-indexed (0-499)

In [3]:
employeeID = np.array(range(E))

### Degree
Randomly assign one of the following degrees to each employee:
- Computer Science
- Data Science
- Information Science
- Cybersecurity

In [4]:
degrees = ["Computer Science", "Data Science", "Information Science", "Cyber Security"]

degreeList = []
for i in range(E):
    degreeList.append(degrees[random.randrange(len(degrees))])

### Location
Randomly assign one of the following locations to each employee:
- NYC
- Boston
- San Francisco
- Seattle
- Houston

In [5]:
locations = ["NYC", "Boston", "San Francisco", "Seattle", "Houston"]

locationList = []
for i in range(E):
    locationList.append(locations[random.randrange(len(locations))])

### Technical Skills

In [6]:
## Technical skills [str list]
languages = ['Java', 'Python', 'C', 'C++', 'R', 'Scala', 'Ruby', 'Javascript', 'React', 
             'Node.js', 'Angular', 'Swift', 'Bash', 'SQL', 'HTML/CSS']
operatingSystems = ['Linux', 'MacOS', 'Windows']
otherSoftwares = ['NumPy', 'Scikit-learn', 'Spring', 'MySQLWorkbench', 'PostGres', 'Oracle', 
                'Azure', 'AWS']

allTechnicalSkills = []

# Add randomly selected langauges, operating systems, and frameworks/libraries/tools to each employee
for i in range(E):
    technicalSkills = []  
    
    ### LANGUAGE
    # Generate a number indicating how many programming languages employee knows
    numLanguages = random.randrange(len(languages))
    # Randomly choose languages from list 'numLanguages' number of times
    languages_known = random.sample(range(len(languages)), numLanguages+1)
    for i in languages_known:
        technicalSkills.append(languages[i])
        
    ### OS
    # Number of operating systems employee knows
    numOS = random.randrange(len(operatingSystems))
    # Randomly choose the operating systems the employee knows (at least 1)
    systems = random.sample(range(len(operatingSystems)), numOS+1)
    for i in systems:
        technicalSkills.append(operatingSystems[i])

    ### Other Software (libraries, frameworks, database)
    # Number of softwares employee knows
    numSoftwares = random.randrange(len(otherSoftwares))
    # Randomly choose softwares from list 'numSoftwares' number of times
    softwares_known = random.sample(range(len(otherSoftwares)), numSoftwares)
    for i in softwares_known:
        technicalSkills.append(otherSoftwares[i])

    # Perform validations to ensure the languages/software added make sense
    
    # If employee knows React or Node.js, they must also know Javascript
    if 'React' in technicalSkills or 'Node.js' in technicalSkills:
        if 'Javascript' not in technicalSkills:
            technicalSkills.append('Javascript')
    
    # If employee knows NumPy, Scikit-learn, or AWS, they must also know Python
    if 'NumPy' in technicalSkills or 'Scikit-learn' in technicalSkills or 'AWS' in technicalSkills:
        if 'Python' not in technicalSkills:
            technicalSkills.append('Python')
    
    # If employee knows Spring, they must also know Java
    if 'Spring' in technicalSkills and 'Java' not in technicalSkills:
        technicalSkills.append('Java')
    
    # If employee knows PostGres, Oracle, or MySQLWorkbench, they must also know SQL
    if 'PostGres' in technicalSkills or 'Oracle' in technicalSkills or 'MySQLWorkbench' in technicalSkills:
        if 'SQL' not in technicalSkills:
            technicalSkills.append('SQL')
            
    allTechnicalSkills.append(technicalSkills)

allTechnicalSkills = np.array(allTechnicalSkills)

  allTechnicalSkills = np.array(allTechnicalSkills)


### Create Initial Employee Dataframe
Create an employee dataframe using columns generated so far so that additional columns may be generated based on these.

In [7]:
employeeData = pd.DataFrame({'employeeID': employeeID, 'degree':degreeList, 'location':locationList,
                             'technicalSkills': allTechnicalSkills})
employeeData

Unnamed: 0,employeeID,degree,location,technicalSkills
0,0,Data Science,Houston,"[Ruby, C, Javascript, Scala, R, Angular, React..."
1,1,Cyber Security,Seattle,"[R, C, Javascript, Node.js, Python, SQL, Ruby,..."
2,2,Cyber Security,Boston,"[R, Swift, Python, Ruby, SQL, C++, Angular, No..."
3,3,Data Science,San Francisco,"[Swift, React, R, Python, Bash, HTML/CSS, C++,..."
4,4,Cyber Security,Houston,"[Bash, Linux, MacOS, Windows, Spring, Azure, O..."
...,...,...,...,...
495,495,Cyber Security,San Francisco,"[Node.js, Swift, Scala, Python, R, React, SQL,..."
496,496,Cyber Security,San Francisco,"[SQL, Angular, Java, Python, HTML/CSS, React, ..."
497,497,Information Science,San Francisco,"[C++, Java, Javascript, C, React, Angular, Bas..."
498,498,Cyber Security,NYC,"[C, HTML/CSS, Angular, React, Java, SQL, Node...."


# Generate Job Data
Create J rows of the following data:
- jobID
- degree
- location
- technicalSkills
- preferredSkills
- jobTitle

### Job ID
0-indexed (0-1999)

In [8]:
jobID = np.array(range(J))

### Degree
Randomly assign one of the following degrees to each job:
- Computer Science
- Data Science
- Information Science
- Cybersecurity

In [9]:
degrees = ["Computer Science", "Data Science", "Information Science", "Cyber Security"]

jobDegreeList = []
for i in range(J):
    jobDegreeList.append(degrees[random.randrange(len(degrees))])

### Location
Randomly assign one of the following locations to each job:
- NYC
- Boston
- San Francisco
- Seattle
- Houston

In [10]:
locations = ["NYC", "Boston", "San Francisco", "Seattle", "Houston"]

jobLocationList = []
for i in range(J):
    jobLocationList.append(locations[random.randrange(len(locations))])

### Technical Skills

Similar to employee data generation but limited to half the total length of language/otherSoftware arrays for max possible languages/softwares

In [11]:
## Technical skills [str list]
languages = ['Java', 'Python', 'C', 'C++', 'R', 'Scala', 'Ruby', 'Javascript', 'React', 
             'Node.js', 'Angular', 'Swift', 'Bash', 'SQL', 'HTML/CSS']
otherSoftwares = ['NumPy', 'Scikit-learn', 'Spring', 'MySQLWorkbench', 'PostGres', 'Oracle', 
                'Azure', 'AWS']

allTechnicalSkills = []

# Add randomly selected languages, operating systems, and frameworks/libraries/tools to each job posting
for i in range(J):
    technicalSkills = []  
    
    ### LANGUAGE
    # Generate a number indicating how many programming languages employee knows
    numLanguages = random.randint(1, int(len(languages)/2))
    # Randomly choose languages from list 'numLanguages' number of times
    languages_known = random.sample(range(len(languages)), numLanguages+1)
    for i in languages_known:
        technicalSkills.append(languages[i])
        
    ### Other Software (libraries, frameworks, database)
    # Number of softwares employee knows
    numSoftwares = random.randint(1, int(len(otherSoftwares)/2))
    # Randomly choose softwares from list 'numSoftwares' number of times
    softwares_known = random.sample(range(len(otherSoftwares)), numSoftwares)
    for i in softwares_known:
        technicalSkills.append(otherSoftwares[i])

    # Perform validations to ensure the languages/software added make sense
    
    # If job posting requires React or Node.js, it must also require Javascript
    if 'React' in technicalSkills or 'Node.js' in technicalSkills:
        if 'Javascript' not in technicalSkills:
            technicalSkills.append('Javascript')
    
    # If job posting requires NumPy, Scikit-learn, or AWS, it must also require Python
    if 'NumPy' in technicalSkills or 'Scikit-learn' in technicalSkills or 'AWS' in technicalSkills:
        if 'Python' not in technicalSkills:
            technicalSkills.append('Python')
    
    # If job posting requires Spring, it must also require Java
    if 'Spring' in technicalSkills and 'Java' not in technicalSkills:
        technicalSkills.append('Java')
    
    # If job posting requires PostGres, Oracle, or MySQLWorkbench, it must also require SQL
    if 'PostGres' in technicalSkills or 'Oracle' in technicalSkills or 'MySQLWorkbench' in technicalSkills:
        if 'SQL' not in technicalSkills:
            technicalSkills.append('SQL')
            
    allTechnicalSkills.append(technicalSkills)

allTechnicalSkills = np.array(allTechnicalSkills)

  allTechnicalSkills = np.array(allTechnicalSkills)


### Preferred skills

For job postings with at least 2 technical skills required, remove some skills and make them preferred (up to half can become preferred skills; does not make sense to have all preferred and no required).
Ensure the langauge extracted from required to preferred status makes sense (e.g. If React and Javascript are required, don't make Javascript preferred).

In [12]:
count = 0
allPreferredSkills = []
for i in allTechnicalSkills:
    skillset = i
    preferredSkills = []
    # Check if the technical skill list has at least 2 skills excluding an OS
    skills_filtered = filter(lambda a: a not in ['MacOS', 'Linux', 'Windows'], skillset)
    # Add to array to perform operations on filtered list such as len()
    skills_NoOS = []
    for x in skills_filtered:
        skills_NoOS.append(x)
    
    # Create preferred skills only if at least 2 skills that are not OS-related
    if(len(skills_NoOS) > 1):
        
        # Then make *up to* half of the skills (minimum 1) (excluding OS) preferred skills (randomly selected)
        half_skill_ct = int(len(skills_NoOS)/2)
        if(half_skill_ct > 1):
            num_skills_preferred = random.randrange(1,int(len(skills_NoOS)/2))
        else:
            num_skills_preferred = 1
        preferred_skills = random.sample(range(len(skills_NoOS)), num_skills_preferred)

        # Remove from technical skills (required) and add to preferred skills
        for j in preferred_skills:
            skill = skills_NoOS[j]
            # Perform validations
            # If the skills are the following and related skills are not required, proceed with 
            # making them required -> preferred
            condition1 = True
            condition2 = True
            condition3 = True
            condition4 = True 
            condition1 = skill == 'Javascript' and 'Node.js' not in skillset and 'React' not in skillset
            condition2 = skill == 'Python' and 'NumPy' not in skillset and 'Scikit-learn' not in skillset
            condition3 = skill == 'Java' and 'Spring' not in skillset
            condition4 = skill == 'SQL' and 'PostGres' not in skillset and 'Oracle' not in skillset and 'MySQLWorkbench' not in skillset
            
            # If the skill is JS, py, java, or sql and meets the conditions, proceed
            if(skill not in allTechnicalSkills[count]):
                pass
            elif(condition1 or condition2 or condition3 or condition4):
                allTechnicalSkills[count].remove(skill)
                preferredSkills.append(skill)
            
            # Proceed if the langauge is anything else besides the following as there will be no issues:
            elif(skill not in ['Javascript', 'Python', 'Java', 'SQL']):
                allTechnicalSkills[count].remove(skill)
                preferredSkills.append(skill)
            else:
                pass
            
    else:
        preferredSkills.append([])
    allPreferredSkills.append(preferredSkills)
    count+=1
allPreferredSkills = np.array(allPreferredSkills)

  allPreferredSkills = np.array(allPreferredSkills)


### Create Initial Job Dataframe
Create a job dataframe using columns generated so far so that job title may be added.

In [13]:
jobs = pd.DataFrame({'jobID': jobID, 'degree': jobDegreeList, 'location': jobLocationList,
                     'technicalSkills':allTechnicalSkills, 'preferredSkills': allPreferredSkills})

### Job Title
Add job title for each job, making sure it aligns with the required/preferred languages (e.g. "Frontend Developer" should have Javascript) and degree (e.g. data science vs computer science).

In [14]:
# Degrees: Computer Science, Data Science, Cybersecurity, Information Science

ds_jobs = ["Data Engineer","Data Analyst","Data Scientist","Database Application Specialist", "Data Architect", "Machine Learning Engineer"]
is_jobs = ["Data Engineer", "Data Analyst","Intelligence Specialist","Database Applications Specialist","Systems analyst", "Systems administrator"]
cyber_jobs = ["Information Security Analyst","Penetration Tester","Data Recovery Professional","Network Security Engineer","Cryptographer"]

allJobTitles = []

for i in range(J):
    jobTitle = ""
    degree = jobs["degree"][i]
    technicalSkills = jobs["technicalSkills"][i]

    
    if(degree == "Computer Science"):       
        if('Javascript' in technicalSkills or 'HTML/CSS' in technicalSkills or 'React' in technicalSkills or 'Node.js' in technicalSkills):
            if(('Python' in technicalSkills or 'Java' in technicalSkills) and 'SQL' in technicalSkills):
                jobTitle = "Full Stack Developer"
            else:    
                if('React' in technicalSkills):
                    sweJobs = ["Web Developer", "Front End Engineer", "React Developer", "UI Developer"]
                    jobTitle = sweJobs[random.randrange(len(sweJobs))]
                else:
                    sweJobs = ["Web Developer", "Front End Engineer", "Javascript Developer", "UI Developer"]
                    jobTitle = sweJobs[random.randrange(len(sweJobs))]
        else:
            
            if('Swift' in technicalSkills):
                sweJobs = ["Software Engineer", "Software Developer", 
                        "IOS Developer", "Programmer",
                        "Machine Learning Engineer", 
                        "Back-end developer", "DevOps Engineer"] 
                jobTitle = sweJobs[random.randrange(len(sweJobs))]
            elif('Java' in technicalSkills):
                sweJobs = ["Software Engineer", "Software Developer", 
                        "Java Developer", "Programmer",
                        "Machine Learning Engineer", 
                        "Back-end developer", "DevOps Engineer"] 
                jobTitle = sweJobs[random.randrange(len(sweJobs))]
            elif('SQL' in technicalSkills or 'Oracle' in technicalSkills):
                sweJobs = ["Software Engineer", "Software Developer", 
                        "SQL Developer", "Programmer",
                        "Machine Learning Engineer", 
                        "Back-end developer", "DevOps Engineer", "Oracle SQL Developer"] 
                jobTitle = sweJobs[random.randrange(len(sweJobs))]
            else:
                sweJobs = ["Software Engineer", "Software Developer", "Programmer"]
                jobTitle = sweJobs[random.randrange(len(sweJobs))]
    elif(degree == "Information Science"):
        jobTitle = is_jobs[random.randrange(len(is_jobs))]
    elif(degree == "Data Science"):
        jobTitle = ds_jobs[random.randrange(len(ds_jobs))]
    else: # Cybersecurity
        jobTitle = cyber_jobs[random.randrange(len(cyber_jobs))]
    
    allJobTitles.append(jobTitle)
    
allJobTitles = np.array(allJobTitles)

### Save Job Dataframe

In [15]:
jobs['jobTitle'] = allJobTitles
jobs.to_csv('data/job_data.csv')

# Generate Remaining Employee Data
- Previous Jobs
- Jobs Applied
- Jobs Searched
- Network

### Previous Jobs
Create a join table (for SQL) connecting each employee to each previous job they held.
Additionally, create previousJobs column for employee dataset. 

In [16]:
# make empty lists for storing values of joint table
prevJobs_UserId = []
prevJobs_JobId = []

allPrevJobs = []

# iterate over every employee
for i in range(E):
    
    # assign random number for how many jobs this employee held before
    numPrevJobs = random.randint(0, maxPrevJobs) 
    
    # get employee technical skills
    technicalSkills = employeeData["technicalSkills"][i]
    
    # initialize empty list for storing previous job IDs
    prevJobs = []
    j = 0
    
    # run while loop until number of job IDs added equals random number of jobs employee held
    while j < numPrevJobs:
        
        # choose random jobID to consider
        jobId = random.randrange(J)
        jobTechnicalSkills = jobs["technicalSkills"][jobId]
        
        # check what technical skills are overlapping between random chosen job and employee
        intersectingSkills = list(set(technicalSkills) & set(jobTechnicalSkills))
        
        # add random job ID to list of previous jobs if the number of overlapping skills is at
        # least 4 or the job and the employee both hold the same degree
        if len(intersectingSkills) >= 4 or jobs["degree"][jobId] == employeeData["degree"][i]:
            prevJobs_UserId.append(i)
            prevJobs_JobId.append(jobId)
            prevJobs.append(jobId)
            j +=1
            
    allPrevJobs.append(prevJobs)

allPrevJobs = np.array(allPrevJobs)
allPrevJobs

employeeData['previousJobs'] = allPrevJobs.tolist()

len(prevJobs_UserId)
len(prevJobs_JobId)

df_prev_jobs = pd.DataFrame({'employeeID': prevJobs_UserId, 'jobID':prevJobs_JobId})
df_prev_jobs.to_csv('data/previousJobs.csv')
df_prev_jobs

  allPrevJobs = np.array(allPrevJobs)


Unnamed: 0,employeeID,jobID
0,0,1317
1,0,724
2,0,717
3,0,1378
4,0,62
...,...,...
1210,498,1159
1211,499,733
1212,499,392
1213,499,1163


### Jobs Applied
Create a join table (for SQL) connecting each employee to each job they applied to.
Additionally, create jobsApplied column for employee dataset.  

In [17]:
# make empty lists for storing values of joint table
appliedJobs_UserId = []
appliedJobs_JobId = []

allAppliedJobs = []

# iterate over every employee
for i in range(E):
    
    # assign random number for how many jobs this employee applied to
    numAppliedJobs = random.randint(0, maxJobsApplied)
    
    # get employee technical skills
    technicalSkills = employeeData["technicalSkills"][i]
    
    # initialize empty list for storing applied-to job IDs
    appliedJobs = []
    j = 0
    
    # run while loop until number of job IDs added equals random number of jobs employee held
    while j < numAppliedJobs:
        
        # choose random jobID to consider
        jobId = random.randrange(J)
        
        # If the randomly selected job is not in the employee's previous job list
        if(jobId not in employeeData["previousJobs"][i]):
            jobTechnicalSkills = jobs["technicalSkills"][jobId]

            # check what technical skills are overlapping between random chosen job and employee
            intersectingSkills = list(set(technicalSkills) & set(jobTechnicalSkills))

            # add random job ID to list of previous jobs if the number of overlapping skills is at
            # least 3 or the job or the employee both hold the same degree. This number is lower than
            # that for previous jobs (4) since employees are more likey to apply to a job with less 
            # matching technical skills than actually have had one.
            if len(intersectingSkills) >= 3 or jobs["degree"][jobId] == employeeData["degree"][i]:
                appliedJobs_UserId.append(i)
                appliedJobs_JobId.append(jobId)
                appliedJobs.append(jobId)
                j +=1
            
    allAppliedJobs.append(appliedJobs)

allAppliedJobs = np.array(allAppliedJobs)
allAppliedJobs

employeeData['appliedJobs'] = allAppliedJobs.tolist()

len(prevJobs_UserId)
len(prevJobs_JobId)

df_applied_jobs = pd.DataFrame({'employeeID': appliedJobs_UserId, 'jobID':appliedJobs_JobId})
df_applied_jobs.to_csv('data/appliedJobs.csv')
df_applied_jobs

  allAppliedJobs = np.array(allAppliedJobs)


Unnamed: 0,employeeID,jobID
0,0,364
1,0,1225
2,0,1041
3,0,1958
4,0,229
...,...,...
2455,497,587
2456,497,515
2457,497,1858
2458,497,62


### Jobs Searched
Create a join table (for SQL) connecting each employee to each job they searched for.
Additionally, create jobsSearched column for employee dataset. Each job they applied to is a job they searched for, but may have searched for more jobs.

In [18]:
# make empty lists for storing values of joint table
searchedJobs_UserId = []
searchedJobs_JobId = []

allSearchedJobs = []

# iterate over every employee
for i in range(E):
    
    # Jobs employee applied to
    appliedJobs = employeeData["appliedJobs"][i]
    
    # initialize list for storing searched-for job IDs, by default populated by 
    # jobs the employee applied to 
    searchedJobs = appliedJobs
    

    # assign random number for how many jobs this employee searched for
    # calculated as the maximum number of searches allowed minus the number of jobs
    # already searched for (i.e. applied to)
    maxSearched = maxJobsSearched - len(searchedJobs)
    if (maxSearched > 0):
        numsearchedJobs = random.randint(0, maxSearched)

        # get employee technical skills
        technicalSkills = employeeData["technicalSkills"][i]


        j = 0

        # run while loop until number of job IDs added equals random number of jobs employee searched for
        while j < numsearchedJobs:

            # choose random jobID to consider
            jobId = random.randrange(J)

            # If the randomly selected job is not in the employee's previous job list
            if(jobId not in employeeData["previousJobs"][i]):
                jobTechnicalSkills = jobs["technicalSkills"][jobId]

                # check what technical skills are overlapping between random chosen job and employee
                intersectingSkills = list(set(technicalSkills) & set(jobTechnicalSkills))

                # add random job ID to list of previous jobs if the number of overlapping skills is at
                # least 2 or the job or the employee both hold the same degree. This number is lower than
                # that for previous jobs (4) since employees are more likey to apply to a job with less 
                # matching technical skills than actually have had one.
                if len(intersectingSkills) >= 2 or jobs["degree"][jobId] == employeeData["degree"][i]:
                    searchedJobs_UserId.append(i)
                    searchedJobs_JobId.append(jobId)
                    searchedJobs.append(jobId)
                    j +=1

    allSearchedJobs.append(searchedJobs)

allSearchedJobs = np.array(allSearchedJobs)
allSearchedJobs

employeeData['searchedJobs'] = allSearchedJobs.tolist()

len(prevJobs_UserId)
len(prevJobs_JobId)

df_searched_jobs = pd.DataFrame({'employeeID': searchedJobs_UserId, 'jobID':searchedJobs_JobId})
df_searched_jobs.to_csv('data/searchedJobs.csv')
df_searched_jobs

  allSearchedJobs = np.array(allSearchedJobs)


Unnamed: 0,employeeID,jobID
0,0,1932
1,1,347
2,1,542
3,1,1341
4,2,242
...,...,...
1272,498,1375
1273,499,1296
1274,499,31
1275,499,906


### Network

In [24]:
net={}

employeeID = []
connectedEmployeeID = []

df = employeeData
for index, row in df.iterrows():
    people = []
    for index2, row2 in df.iterrows():
        previousJob1 = row['previousJobs']
        previousJob2 = row2['previousJobs']
        
        if row['employeeID'] != row2['employeeID']:
            for itm in previousJob1:
                if itm in previousJob2:
                    people.append(row2['employeeID'])
    net[row['employeeID']] = set(people)
        

net

for index, row in df.iterrows():
    people_samleLoc = []
    for index2, row2 in df.iterrows():
        loc1 = row['location']
        loc2 = row2['location']
        if row['employeeID'] != row2['employeeID']:
            if itm == loc2:
                people_samleLoc.append(row2['employeeID'])
    for ele in people_samleLoc:
        net[row['employeeID']].add(ele)
    net[row['employeeID']]=list(net[row['employeeID']])
         
network = []
for v in net.values():
    network.append(v)
    

    
employeeData['network']=network
network

employeeID = []
connectedEmployeeID = []
scores = []
scoresIndividually = []
        
# gets the score for how similar B is to A out of 100:
# 100 = 80 (fraction out of 80 for similar past jobs) + 20 (0 if not same location)
def get_score(jobsA, jobsB, locA, locB):
    setA = set(jobsA)
    simSet = setA.intersection(jobsB)
    simList = list(simSet)
    # proprtion of similarity out of 80 for previousJobs
    if len(jobsA) == 0:
        job_score = 0
    else:
        job_score = len(simList) / len(jobsA) * 80
    
    # if location is the same, 20/20 points, otherwise 0/20
    if locA == locB:
        loc_score = 20;
    else:
        loc_score = 0;
        
    return job_score + loc_score


# returns list of connectedness for a network given a person
def get_connectedness(person, network):
    
    if len(network) == 0:
        return []

    similarity_degs = []
    myJobs = df.iloc[person]['previousJobs']
    myLoc = df.iloc[person]['location']
    
    for netId in network: # for each person in the network:
        netJobs = df.iloc[netId]['previousJobs']
        netLoc = df.iloc[netId]['location']
        net_score = get_score(myJobs, netJobs, myLoc, netLoc)
        similarity_degs.append(net_score)
        scoresIndividually.append(net_score)
    return similarity_degs

for index, row in df.iterrows():
    score = get_connectedness(index, row['network'])
    scores.append(score)

for key, val in net.items():
    for e in val:
        employeeID.append(key)
        connectedEmployeeID.append(e)
        
df_network = pd.DataFrame({'employeeID': employeeID, 'connectedEmployeeID':connectedEmployeeID, 'score': scoresIndividually })
df_network.to_csv('data/network.csv')
df_network.head()

Unnamed: 0,employeeID,connectedEmployeeID,score
0,0,125,36.0
1,0,451,16.0
2,0,29,16.0
3,0,94,16.0
4,1,397,20.0


### Save Final Employee Dataset

In [20]:
employeeData.to_csv('data/employee_data.csv')