In [47]:
import csv
import pandas as pd

from ast import literal_eval

In [48]:
job_data = []
with open ('../bert-model/data/jobs_data.csv') as j_file:
    csv_reader = csv.reader(j_file)
    next(csv_reader)
    for row in csv_reader:
        row[2] = [skill.strip(" '\"") for skill in row[2].strip('[]').split(',')]
        job_data.append(row)

jobs_df = pd.DataFrame(job_data, columns=['title', 'description', 'skills'])

course_data = []
with open ('../bert-model/data/courses_data.csv') as c_data:
    csv_reader = csv.reader(c_data)
    next(csv_reader)
    for row in csv_reader:
        row[1] = [skill.strip(" '\"") for skill in row[1].strip('[]').split(',')]
        course_data.append(row)

courses_df = pd.DataFrame(course_data, columns=['course', 'skills'])

In [49]:
numskills_j_data = []

for i in range(0, len(job_data)):
    numskills_j_data.append(len(job_data[i][2]))

In [50]:
def get_all_acquired_skills(courses_df):
    all_acquired_skills = set() 
    
    for skills in courses_df['skills']:
        all_acquired_skills.update(skills)

    return all_acquired_skills

In [51]:
numskills_j_df = []
for _, row in jobs_df.iterrows():
    numskills_j_df.append(len(row['skills']))

In [52]:
def all_class_comparison(jobs_df, courses_df, all_course_skills):
    gaps_df = pd.DataFrame(columns=['job title', 'covered skills', 'missing skills', 'intersecting course skills', 'gt label']) #'nonrequired skills taught', 'gt label'])

    for _, job in jobs_df.iterrows():
        job_title = job['title']
        job_skills = set(job['skills'])
        courses = zip(courses_df['course'], courses_df['skills'])

        missing_skills = job_skills - all_course_skills
        covered_skills = job_skills.intersection(all_course_skills)
        num_missing = float(len(missing_skills))
        num_job_skills = float(len(job_skills))

        #non_required = []
        intersecting = []

        for course, skills in courses:
            course_inter = list(job_skills.intersection(set(skills)))
            if len(course_inter) > 0:
                intersecting.append((course, course_inter))
            #non_required.append((course, list(set(skills) - job_skills)))

        gaps_entry = pd.DataFrame([{
                                    'job title': job_title,
                                    'covered skills': covered_skills,
                                    'missing skills': missing_skills, 
                                    'intersecting course skills': intersecting, 
                                    #'nonrequired skills taught': non_required,
                                    'gt label': (1.0 - (num_missing/num_job_skills))      
                                }])
        
        gaps_df = pd.concat([gaps_df, gaps_entry], ignore_index=True)

        
    return gaps_df

In [53]:
all_course_skills = get_all_acquired_skills(courses_df)

gaps_df = all_class_comparison(jobs_df, courses_df, all_course_skills)

gaps_df

  gaps_df = pd.concat([gaps_df, gaps_entry], ignore_index=True)


Unnamed: 0,job title,covered skills,missing skills,intersecting course skills,gt label
0,Adobe_AI_ML_Engineer,"{Python, Testing, Quality Assurance, Debugging...","{Knowledge sharing, Bug fixing, Code/API docum...","[(CS314, [Teamwork, Communication]), (CS150, [...",0.193548
1,Adobe_Junior_SDE,"{Security, Teamwork, Java}","{Innovation, Knowledge sharing, Cloud Technolo...","[(CS314, [Teamwork, Java]), (CS165, [Java]), (...",0.071429
2,Adobe_Software_Engineering_Intern,"{Python, Java, C++, Algorithms, Networking, Te...","{Interpersonal Skills, OOP/Func Paradigms, Mac...","[(CS314, [Networking, Teamwork, Java, Communic...",0.333333
3,Adobe_Software_Quality_Engineer,"{Testing, Quality Assurance, Communication}","{HTML, React.js, Time management, Implementati...","[(CS314, [Communication]), (CS164, [Testing, Q...",0.150000
4,Adobe_Software_Solutions_Architect,"{Teamwork, Optimization, Security, Research, C...","{CRM systems, Innovation, Data management, Imp...","[(CS314, [Optimization, Teamwork, Communicatio...",0.094340
...,...,...,...,...,...
74,Tesla_Software_Compliance_Engineer,"{Python, C, Testing, C++, Debugging, Java, Com...","{Innovation, Software architecture, SIL/HIL/In...","[(CS314, [Java, Communication]), (CS165, [Java...",0.184211
75,Tesla_Software_Engineer_Firmware,"{C, Software Design, Static Analysis, Debuggin...","{Innovation, System debugging, Knowledge shari...","[(CS314, [Communication]), (CS270, [C]), (CS41...",0.151515
76,Tesla_Software_Engineer_Recruiting,"{Web Applications, Optimization, Design Patterns}","{Web development, Performance, Leadership, SQL...","[(CS314, [Optimization]), (CS414, [Design Patt...",0.157895
77,Yahoo_Frontend_Software_Dev_Engineer,"{Python, Java, Web Applications, Regression Te...","{Innovation, Knowledge sharing, Large-scale sy...","[(CS314, [Teamwork, Java]), (CS165, [Java]), (...",0.131579


In [54]:
numskills_gaps_df = []
for _, row in gaps_df.iterrows():
    numskills_gaps_df.append(len(row['missing skills']) + len(row['covered skills']))

print(numskills_j_data == numskills_j_df)
print(numskills_j_data == numskills_gaps_df)

True
True


In [55]:
gaps_df.to_csv('skill_comparison.csv', index=False)

In [56]:
gaps_df.to_json('skill_comparison.json', orient='records', indent=2)

In [57]:
from collections import defaultdict

def skills_by_course(jobs_df, courses_df, all_course_skills):
    skills_taught = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(list)
        )
    )
    course_entries = list(zip(courses_df["course"], courses_df["skills"]))

    for _, job in jobs_df.iterrows():
        job_title = job['title']
        job_skills = set(job['skills'])
        intersecting_skills = job_skills.intersection(all_course_skills)

        if len(intersecting_skills) == 0:
            skills_taught[job_title]['status'] = 'None of the required skills are taught in any course'
        else:
            for rs in job_skills:

                courses_taught = []
                for course, skills in course_entries:
                    if rs in skills:
                        courses_taught.append(course)

                num_taught = len(courses_taught)
                if num_taught == 0:
                    skills_taught[job_title]['required_skills'][rs] = 'None'
                elif num_taught == 1:
                    skills_taught[job_title]['required_skills'][rs] = courses_taught[0]
                else:
                    skills_taught[job_title]['required_skills'][rs] = courses_taught
        
    return skills_taught



In [58]:
skills_taught = skills_by_course(jobs_df, courses_df, all_course_skills)

In [59]:
import json

def default_to_regular_dict(d):
    if isinstance(d, defaultdict):
        d = {k: default_to_regular_dict(v) for k, v in d.items()}
    return d

skills_taught_regular = default_to_regular_dict(skills_taught)
with open("skills_by_course.json", "w") as f:
    json.dump(skills_taught_regular, f, indent=4)