In [82]:
import csv
import pandas as pd

from ast import literal_eval

In [83]:
job_data = []
with open ('../bert-model/data/jobs_data.csv') as j_file:
    csv_reader = csv.reader(j_file)
    next(csv_reader)
    for row in csv_reader:
        row[2] = [skill.strip(" '\"") for skill in row[2].strip('[]').split(',')]
        job_data.append(row)

jobs_df = pd.DataFrame(job_data, columns=['title', 'description', 'skills'])

course_data = []
with open ('../bert-model/data/courses_data.csv') as c_data:
    csv_reader = csv.reader(c_data)
    next(csv_reader)
    for row in csv_reader:
        row[1] = [skill.strip(" '\"") for skill in row[1].strip('[]').split(',')]
        course_data.append(row)

courses_df = pd.DataFrame(course_data, columns=['course', 'skills'])

In [84]:
def get_all_acquired_skills(courses_df):
    all_acquired_skills = set() 
    
    for skills in courses_df['skills']:
        all_acquired_skills.update(skills)

    return all_acquired_skills

In [85]:
def all_class_comparison(jobs_df, courses_df, all_course_skills):
    gaps_df = pd.DataFrame(columns=['job title', 'courses', 'missing skills', 'intersecting skills', 'gt label'])

    for _, job in jobs_df.iterrows():
        job_title = job['title']
        job_skills = set(job['skills'])
        missing_skills = job_skills - all_course_skills
        intersecting_skills = job_skills.intersection(all_course_skills)
        num_missing = float(len(missing_skills))
        num_job_skills = float(len(job_skills))

        courses = list(courses_df["course"])


        gaps_entry = pd.DataFrame([{
                                    'job title': job_title, 
                                    'courses': courses, 
                                    'missing skills': list(missing_skills), 
                                    'intersecting skills': list(intersecting_skills), 
                                    'gt label': (1.0 - (num_missing/num_job_skills))      
                                }])
        
        gaps_df = pd.concat([gaps_df, gaps_entry], ignore_index=True)

        
    return gaps_df

In [86]:
all_course_skills = get_all_acquired_skills(courses_df)

gaps_df = all_class_comparison(jobs_df, courses_df, all_course_skills)

gaps_df

  gaps_df = pd.concat([gaps_df, gaps_entry], ignore_index=True)


Unnamed: 0,job title,courses,missing skills,intersecting skills,gt label
0,Adobe_AI_ML_Engineer,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[Code/API documentation, Code Review, React.js...","[Communication, Debugging, Python, Quality Ass...",0.193548
1,Adobe_Junior_SDE,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[Monitoring, Cloud computing, Agile developmen...","[Teamwork, Java, Security]",0.071429
2,Adobe_Software_Engineering_Intern,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[Design, Data structures, Cloud Engineering, O...","[Communication, Java, Algorithms, C++, Python,...",0.333333
3,Adobe_Software_Quality_Engineer,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[TypeScript, Time management, React.js, Analyt...","[Quality Assurance, Testing, Communication]",0.150000
4,Adobe_Software_Solutions_Architect,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[Automation, HTML, Team collaboration, JavaScr...","[Communication, Optimization, Security, Teamwo...",0.094340
...,...,...,...,...,...
74,Tesla_Software_Compliance_Engineer,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[Standards Implementation, Collaboration, Embe...","[Communication, C, Java, C++, Debugging, Pytho...",0.184211
75,Tesla_Software_Engineer_Firmware,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[Embedded microprocessor, Embedded systems, Te...","[Communication, C, Software Design, Static Ana...",0.151515
76,Tesla_Software_Engineer_Recruiting,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[Design, TypeScript, Golang, SQL, Creative Sol...","[Design Patterns, Web Applications, Optimization]",0.157895
77,Yahoo_Frontend_Software_Dev_Engineer,"[CS462, CS314, CS165, CS201, CS370, CS150, CS1...","[TypeScript, Code Review, React.js, HTML, Syst...","[Java, Web Applications, Python, Teamwork, Reg...",0.131579


In [87]:
gaps_df.to_csv('skill_comparison.csv', index=False)

In [88]:
gaps_df.to_json('skill_comparison.json', orient='records', indent=2)

In [95]:
from collections import defaultdict

def skills_by_course(jobs_df, courses_df, all_course_skills):
    skills_taught = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(list)
        )
    )
    course_entries = list(zip(courses_df["course"], courses_df["skills"]))

    for _, job in jobs_df.iterrows():
        job_title = job['title']
        job_skills = set(job['skills'])
        missing_skills = job_skills - all_course_skills
        intersecting_skills = job_skills.intersection(all_course_skills)

        if len(intersecting_skills) == 0:
            skills_taught[job_title]['status'] = 'None of the required skills are taught in any course'
        else:
            for rs in job_skills:
                
                courses_taught = []
                for course, skills in course_entries:
                    if rs in skills:
                        courses_taught.append(course)

                num_taught = len(courses_taught)
                if num_taught == 0:
                    skills_taught[job_title]['required_skills'][rs] = 'None'
                elif num_taught == 1:
                    skills_taught[job_title]['required_skills'][rs] = courses_taught[0]
                else:
                    skills_taught[job_title]['required_skills'][rs] = courses_taught
        
        skills_taught[job_title]['missing_skills'] = list(missing_skills)
        
    return skills_taught



In [96]:
skills_taught = skills_by_course(jobs_df, courses_df, all_course_skills)

In [97]:
import json

def default_to_regular_dict(d):
    if isinstance(d, defaultdict):
        d = {k: default_to_regular_dict(v) for k, v in d.items()}
    return d

skills_taught_regular = default_to_regular_dict(skills_taught)
with open("skills_by_course.json", "w") as f:
    json.dump(skills_taught_regular, f, indent=4)