In [1]:
import csv
import pandas as pd

from ast import literal_eval

In [2]:
job_data = []
with open ('../bert-model/data/jobs_data.csv') as j_file:
    csv_reader = csv.reader(j_file)
    next(csv_reader)
    for row in csv_reader:
        row[2] = [skill.strip(" '\"") for skill in row[2].strip('[]').split(',')]
        job_data.append(row)

jobs_df = pd.DataFrame(job_data, columns=['title', 'description', 'skills'])

course_data = []
with open ('../bert-model/data/courses_data.csv') as c_data:
    csv_reader = csv.reader(c_data)
    next(csv_reader)
    for row in csv_reader:
        row[1] = [skill.strip(" '\"") for skill in row[1].strip('[]').split(',')]
        course_data.append(row)

courses_df = pd.DataFrame(course_data, columns=['course', 'skills'])

In [3]:
def get_all_acquired_skills(courses_df):
    all_acquired_skills = set() 
    
    for skills in courses_df['skills']:
        all_acquired_skills.update(skills)

    return all_acquired_skills

In [26]:
def all_class_comparison(jobs_df, courses_df, all_course_skills):
    gaps_df = pd.DataFrame(columns=['job title', 'covered skills', 'missing skills', 'intersecting course skills', 'nonrequired skills taught', 'gt label'])

    for _, job in jobs_df.iterrows():
        job_title = job['title']
        job_skills = set(job['skills'])
        courses = zip(courses_df['course'], courses_df['skills'])

        missing_skills = job_skills - all_course_skills
        covered_skills = job_skills.intersection(all_course_skills)
        num_missing = float(len(missing_skills))
        num_job_skills = float(len(job_skills))

        non_required = []
        intersecting = []

        for course, skills in courses:
            course_inter = list(job_skills.intersection(set(skills)))
            if len(course_inter) > 0:
                intersecting.append((course, course_inter))
            non_required.append((course, list(set(skills) - job_skills)))

        gaps_entry = pd.DataFrame([{
                                    'job title': job_title,
                                    'covered skills': covered_skills,
                                    'missing skills': missing_skills, 
                                    'intersecting course skills': intersecting, 
                                    'nonrequired skills taught': non_required,
                                    'gt label': (1.0 - (num_missing/num_job_skills))      
                                }])
        
        gaps_df = pd.concat([gaps_df, gaps_entry], ignore_index=True)

        
    return gaps_df

In [27]:
all_course_skills = get_all_acquired_skills(courses_df)

gaps_df = all_class_comparison(jobs_df, courses_df, all_course_skills)

gaps_df

  gaps_df = pd.concat([gaps_df, gaps_entry], ignore_index=True)


Unnamed: 0,job title,covered skills,missing skills,intersecting course skills,nonrequired skills taught,gt label
0,Adobe_AI_ML_Engineer,"{Quality Assurance, Debugging, Testing, Teamwo...","{Front-end frameworks, Reliability, Analytical...","[(CS314, [Communication, Teamwork]), (CS150, [...","[(CS462, [Collisions, Camera Rendering, Implem...",0.193548
1,Adobe_Junior_SDE,"{Java, Teamwork, Security}","{Observability, Innovation, Adobe Commerce, Co...","[(CS314, [Java, Teamwork]), (CS165, [Java]), (...","[(CS462, [Collisions, Camera Rendering, Implem...",0.071429
2,Adobe_Software_Engineering_Intern,"{C++, Algorithms, Java, Teamwork, Communicatio...","{Cloud Technology, Node.js, Machine learning, ...","[(CS314, [Java, Teamwork, Communication, Netwo...","[(CS462, [Collisions, Camera Rendering, Implem...",0.333333
3,Adobe_Software_Quality_Engineer,"{Quality Assurance, Communication, Testing}","{TypeScript, Deployment, Open source tools, Pr...","[(CS314, [Communication]), (CS164, [Quality As...","[(CS462, [Collisions, Camera Rendering, Implem...",0.150000
4,Adobe_Software_Solutions_Architect,"{Security, Research, Teamwork, Communication, ...","{Solution architecture, Innovation, Cross-Devi...","[(CS314, [Communication, Teamwork, Optimizatio...","[(CS462, [Collisions, Camera Rendering, Implem...",0.094340
...,...,...,...,...,...,...
74,Tesla_Software_Compliance_Engineer,"{C++, Debugging, C, Testing, Java, Communicati...","{Embedded environments/protocols, C#, Innovati...","[(CS314, [Communication, Java]), (CS165, [Java...","[(CS462, [Collisions, Camera Rendering, Implem...",0.184211
75,Tesla_Software_Engineer_Firmware,"{Static Analysis, Debugging, C, Communication,...","{Innovation, Algorithm Development, Analytical...","[(CS314, [Communication]), (CS270, [C]), (CS41...","[(CS462, [Collisions, Camera Rendering, Implem...",0.151515
76,Tesla_Software_Engineer_Recruiting,"{Web Applications, Optimization, Design Patterns}","{TypeScript, Maintaining software applications...","[(CS314, [Optimization]), (CS414, [Design Patt...","[(CS462, [Collisions, Camera Rendering, Implem...",0.157895
77,Yahoo_Frontend_Software_Dev_Engineer,"{Web Applications, Regression Testing, Java, T...","{Innovation, Quality focus, Continuous integra...","[(CS314, [Java, Teamwork]), (CS165, [Java]), (...","[(CS462, [Collisions, Camera Rendering, Implem...",0.131579


In [28]:
gaps_df.to_csv('skill_comparison.csv', index=False)

In [29]:
gaps_df.to_json('skill_comparison.json', orient='records', indent=2)

In [98]:
from collections import defaultdict

def skills_by_course(jobs_df, courses_df, all_course_skills):
    skills_taught = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(list)
        )
    )
    course_entries = list(zip(courses_df["course"], courses_df["skills"]))

    for _, job in jobs_df.iterrows():
        job_title = job['title']
        job_skills = set(job['skills'])
        intersecting_skills = job_skills.intersection(all_course_skills)

        if len(intersecting_skills) == 0:
            skills_taught[job_title]['status'] = 'None of the required skills are taught in any course'
        else:
            for rs in job_skills:

                courses_taught = []
                for course, skills in course_entries:
                    if rs in skills:
                        courses_taught.append(course)

                num_taught = len(courses_taught)
                if num_taught == 0:
                    skills_taught[job_title]['required_skills'][rs] = 'None'
                elif num_taught == 1:
                    skills_taught[job_title]['required_skills'][rs] = courses_taught[0]
                else:
                    skills_taught[job_title]['required_skills'][rs] = courses_taught
        
    return skills_taught



In [99]:
skills_taught = skills_by_course(jobs_df, courses_df, all_course_skills)

In [100]:
import json

def default_to_regular_dict(d):
    if isinstance(d, defaultdict):
        d = {k: default_to_regular_dict(v) for k, v in d.items()}
    return d

skills_taught_regular = default_to_regular_dict(skills_taught)
with open("skills_by_course.json", "w") as f:
    json.dump(skills_taught_regular, f, indent=4)