In [2]:
import re
import csv
import torch
import random
import pandas as pd

from tqdm import tqdm
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sklearn.model_selection import ParameterGrid, train_test_split
from sentence_transformers import SentenceTransformer, losses, util, InputExample


In [3]:
with open ('data/all-csu-codes.csv', 'r') as c_data:
    csv_reader = csv.reader(c_data) 
    courses_data = list(csv_reader)
    
courses_df = pd.DataFrame(columns=['Courses', 'Skills'])

for idx in range(0, len(courses_data)):
    skill_list = courses_data[idx][1:-1]
    skill_list = [skill.title() for skill in skill_list]
    skill_list = [re.sub(r'\b(vs|Vs)\b', 'VS', skill) for skill in skill_list]

    row = pd.DataFrame({'Courses': courses_data[idx][0], 'Skills':[skill_list]})
    courses_df = pd.concat([courses_df, row], ignore_index=True)

courses_df.to_csv('data/dpr_courses_data.csv', index=False)

In [4]:
with open('data/descriptions.txt', 'r') as j_data:
    csv_reader = csv.reader(j_data, delimiter='|')
    jobs_data = list(csv_reader)

jobs_df = pd.DataFrame(columns=['Job_Title', 'Job_Description', 'Required_Skills'])
                    
for row in jobs_data:
    if len(row) == 3:
        job_title = row[0].strip().strip('"') 

        job_description = row[1].strip().strip('"')
        job_description = re.sub(r'\bDESCRIPTION\b', '', job_description)
    
        skills = row[2].strip().strip('"')
        skill_list = [skill.strip().strip('"') for skill in skills.split(',')]
        cap_skill_list = [skill.title() for skill in skill_list]
        cleaned_skills = [re.sub(r'\s?\(.*?\)', '', skill) for skill in cap_skill_list]

        row = pd.DataFrame({'Job_Title': job_title, 'Job_Description': job_description, 'Required_Skills': [skill_list]})
        jobs_df = pd.concat([jobs_df, row], ignore_index=True)
            
jobs_df.to_csv('data/dpr_jobs.csv', index=False)

In [5]:
class SkillExtractor:
    def __init__(self):
        # Using a reliable skill extraction model
        self.tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.skill_db = pd.read_csv("data/skill_database.csv")  # Your custom skill database
        
    def extract(self, job_description):
        """Extract skills using NER and map to standardized names"""
        inputs = self.tokenizer(job_description, return_tensors="pt", truncation=True)
        outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        
        # Extract skills (simple approach - customize as needed)
        skills = []
        current_skill = []
        for token, pred in zip(tokens, predictions[0]):
            if pred.item() in [1, 3]:  # B-ENT or I-ENT
                current_skill.append(token.replace("##", ""))
            elif current_skill:
                skills.append("".join(current_skill).title())
                current_skill = []
        
        return self._standardize_skills(list(set(skills)))
    
    def _standardize_skills(self, raw_skills):
        """Map extracted skills to standardized names"""
        standardized = []
        for skill in raw_skills:
            # Simple matching - enhance with fuzzy matching if needed
            match = self.skill_db[self.skill_db['aliases'].str.contains(skill, case=False, na=False)]
            if not match.empty:
                standardized.append(match.iloc[0]['canonical_name'])
            else:
                standardized.append(skill)
        return list(set(standardized))

In [6]:
class CurriculumGapAnalyzer:
    def __init__(self, courses_df):
        self.model = None
        self.courses_df = courses_df
        self.schedules = self._generate_schedules()
        
    def train(self, jobs_df, param_grid):
        """Train with grid search on all comparison types"""
        examples = self._create_training_examples(jobs_df)
        train_examples, val_examples = train_test_split(examples, test_size=0.2)
        
        best_f1 = -1
        for params in tqdm(ParameterGrid(param_grid), desc="Grid Search"):
            model = SentenceTransformer(params['model_name'])
            train_loader = DataLoader(
                [InputExample(texts=[x[0], x[1]], label=x[2]) 
                for x in train_examples],
                batch_size=params['batch_size'],
                shuffle=True
            )
            
            model.fit(
                train_objectives=[(train_loader, losses.CosineSimilarityLoss(model))],
                epochs=params['epochs'],
                warmup_steps=params['warmup_steps'],
                show_progress_bar=False
            )
            
            val_f1 = self._evaluate(model, val_examples, params['threshold'])
            if val_f1 > best_f1:
                best_f1 = val_f1
                self.model = model
                self.best_params = params
        
        return self.model
    
    def _create_training_examples(self, jobs_df):
        """Generate examples for all comparison types"""
        examples = []
        all_skills = set().union(*self.courses_df['Skills'].tolist())
        
        for _, job in jobs_df.iterrows():
            job_skills = set(job['Required_Skills'])
            
            # 1. Individual courses
            for _, course in self.courses_df.iterrows():
                context = f"COURSE:{course['Courses']}|SKILLS:{'|'.join(course['Skills'])}"
                examples.append(InputExample(
                    texts=[f"JOB:{job['Job_Title']}", context],
                    label=1.0 if job_skills & set(course['Skills']) else 0.0
                ))
            
            # 2. Generated schedules
            for i, schedule in enumerate(self.schedules):
                sched_skills = set().union(*schedule['Skills'].tolist())
                context = f"SCHEDULE_{i}:{'|'.join(schedule['Courses'].tolist())}"
                examples.append(InputExample(
                    texts=[f"JOB:{job['Job_Title']}", context],
                    label=1.0 if job_skills & sched_skills else 0.0
                ))
            
            # 3. Full curriculum
            examples.append(InputExample(
                texts=[f"JOB:{job['Job_Title']}", f"FULL_CURRICULUM:{'|'.join(all_skills)}"],
                label=1.0 if job_skills & all_skills else 0.0
            ))
        
        return examples
    
    def _evaluate(self, model, examples, threshold):
        """Calculate F1 score"""
        anchors = [x.texts[0] for x in examples]
        comparisons = [x.texts[1] for x in examples]
        labels = [x.label for x in examples]
        
        anchor_emb = model.encode(anchors, convert_to_tensor=True)
        compare_emb = model.encode(comparisons, convert_to_tensor=True)
        sims = torch.cosine_similarity(anchor_emb, compare_emb)
        preds = (sims > threshold).cpu().numpy()
        return f1_score(labels, preds)
    
    def _generate_schedules(self, n=5):
        """Create realistic course schedules"""
        core = ['CS150', 'CS164', 'CS152', 'CS162', 'CS201', 'CS165', 
                'CS220', 'CS270', 'CS250', 'CS314', 'CS370', 'CS320', 'CS214']
        electives = self.courses_df[~self.courses_df['Courses'].isin(core)]
        schedules = []
        
        for _ in range(n):
            l4 = random.sample(electives[electives['Courses'].str.startswith('CS4')]['Courses'].tolist(), 2)
            remaining = electives[~electives['Courses'].isin(l4)]
            l3 = random.sample(remaining[remaining['Courses'].str.startswith('CS3')]['Courses'].tolist(), 2)
            other = random.sample(remaining[~remaining['Courses'].isin(l3)]['Courses'].tolist(), 1)
            schedules.append(self.courses_df[self.courses_df['Courses'].isin(core + l4 + l3 + other)])
        
        return schedules
    
    def analyze_gaps(self, job_description, extractor):
        """Full analysis pipeline for new job descriptions"""
        # Extract skills
        job_skills = extractor.extract(job_description)
        
        # Encode job context
        job_embedding = self.model.encode(f"JOB_DESC:{job_description}", convert_to_tensor=True)
        
        results = {
            'missing_skills': [],
            'individual_courses': {},
            'schedules': {},
            'full_curriculum': None
        }
        
        # 1. Compare to individual courses
        for _, course in self.courses_df.iterrows():
            course_embedding = self.model.encode(
                f"COURSE:{course['Courses']}|SKILLS:{'|'.join(course['Skills'])}", 
                convert_to_tensor=True
            )
            sim = util.cos_sim(job_embedding, course_embedding).item()
            results['individual_courses'][course['Courses']] = {
                'similarity': sim,
                'missing': [s for s in job_skills if s not in course['Skills']]
            }
        
        # 2. Compare to schedules
        for i, schedule in enumerate(self.schedules):
            sched_embedding = self.model.encode(
                f"SCHEDULE_{i}:{'|'.join(schedule['Courses'].tolist())}",
                convert_to_tensor=True
            )
            sim = util.cos_sim(job_embedding, sched_embedding).item()
            sched_skills = set().union(*schedule['Skills'].tolist())
            results['schedules'][f"Schedule_{i}"] = {
                'similarity': sim,
                'missing': [s for s in job_skills if s not in sched_skills]
            }
        
        # 3. Compare to full curriculum
        all_skills = set().union(*self.courses_df['Skills'].tolist())
        curriculum_embedding = self.model.encode(
            f"FULL_CURRICULUM:{'|'.join(all_skills)}",
            convert_to_tensor=True
        )
        results['full_curriculum'] = {
            'similarity': util.cos_sim(job_embedding, curriculum_embedding).item(),
            'missing': [s for s in job_skills if s not in all_skills]
        }
        
        # Aggregate missing skills
        results['missing_skills'] = list(set(
            results['full_curriculum']['missing'] +
            [s for sched in results['schedules'].values() for s in sched['missing']] +
            [s for course in results['individual_courses'].values() for s in course['missing']]
        ))
        
        return results

In [7]:
skill_extractor = SkillExtractor()
gap_analyzer = CurriculumGapAnalyzer(courses_df)

# Train model
param_grid = {
    'model_name': ['all-mpnet-base-v2'],
    'batch_size': [32],
    'epochs': [5],
    'warmup_steps': [500],
    'threshold': [0.6]
}
gap_analyzer.train(jobs_df, param_grid)

# Example usage with new job description
sample_job = "We're hiring a Full Stack Developer with 3+ years experience in React, Node.js, and AWS."
analysis = gap_analyzer.analyze_gaps(sample_job, skill_extractor)

print("\nMissing Skills:", analysis['missing_skills'])
print("\nCourse Comparison Summary:")
for course, data in list(analysis['individual_courses'].items())[:3]:
    print(f"{course}: Similarity={data['similarity']:.2f}, Missing={data['missing']}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FileNotFoundError: [Errno 2] No such file or directory: 'data/skill_database.csv'