In [1]:
# Install the notebook's dependencies with the `%pip` magic, which installs
# into the environment of the running kernel (a `!pip` shell call can resolve
# to a different Python installation).
# `sentence-transformers` and `pandas` are listed because the training cell
# imports them.
%pip install "transformers[torch]" "accelerate>=0.26.0" sentence-transformers pandas




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# train_model.py

import pandas as pd
import random
import os
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

print("--- Starting Model Training Script ---")

# --- 1. Load and Prepare Data ---
print("Loading and preparing datasets...")
resume_file_path = 'UpdatedResumeDataSet.csv'
jd_file_path = 'job_descriptions.csv'

# Each frame is built in one chain (read -> rename -> drop incomplete rows)
# rather than being mutated with `inplace=True`, so re-running this section
# always starts from the file contents and never from a half-processed frame.

# Resumes: keep only rows that have both the text and its category label.
resume_df = (
    pd.read_csv(resume_file_path)
    .rename(columns={'Category': 'category', 'Resume': 'resume_text'})
    .dropna(subset=['resume_text', 'category'])
)

# Job descriptions: keep only rows that have both the title and the body text.
jd_df = (
    pd.read_csv(jd_file_path)
    .rename(columns={'Job Title': 'job_title', 'Job Description': 'jd_text'})
    .dropna(subset=['jd_text', 'job_title'])
)

# --- 2. THE OPTIMIZATION: Pre-process JDs into a Dictionary ---
# Build {resume category -> list of JD texts} once, so the example-building
# loop can look JDs up by category instead of filtering the JD frame per resume.
print("Pre-processing and grouping job descriptions by category...")
jd_titles = jd_df['job_title']  # hoisted: the same column is scanned for every category
jd_groups = {}
for category in resume_df['category'].unique():
    # A JD belongs to a category when its title contains the category name,
    # ignoring case. `regex=False` makes this a literal substring match, so a
    # category name containing regex metacharacters (e.g. "C++") cannot raise
    # or match the wrong titles.
    title_mask = jd_titles.str.contains(category, case=False, na=False, regex=False)
    matches = jd_df.loc[title_mask, 'jd_text'].tolist()
    # Categories with no matching JD are left out of the dictionary entirely.
    if matches:
        jd_groups[category] = matches

print(f"Found and grouped JDs for {len(jd_groups)} categories.")

# --- 3. Create Labeled Examples Efficiently ---
print("Creating positive and negative training examples...")

# Seed Python's RNG so the sampled resume/JD pairs and the final shuffle are
# the same on every run. (This does not seed the DataLoader's own shuffling.)
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

train_examples = []
all_jd_categories = list(jd_groups.keys())

# The pool of "wrong" categories depends only on the resume's category, so it
# is computed once per category instead of once per resume row.
wrong_categories_for = {
    category: [cat for cat in all_jd_categories if cat != category]
    for category in resume_df['category'].unique()
}

# NOTE(review): a negative pair is added for every resume, but a positive pair
# only for resumes whose category has grouped JDs, so the two labels are
# unbalanced whenever some categories have no JDs — confirm this is intended.
for resume_category, resume_text in zip(resume_df['category'], resume_df['resume_text']):
    # Create a POSITIVE example (label=1.0): a JD from the resume's own category.
    if resume_category in jd_groups:
        matching_jd_text = random.choice(jd_groups[resume_category])
        train_examples.append(InputExample(texts=[resume_text, matching_jd_text], label=1.0))

    # Create a NEGATIVE example (label=0.0): a JD from any other category.
    possible_wrong_categories = wrong_categories_for[resume_category]
    if possible_wrong_categories:
        wrong_category = random.choice(possible_wrong_categories)
        non_matching_jd_text = random.choice(jd_groups[wrong_category])
        train_examples.append(InputExample(texts=[resume_text, non_matching_jd_text], label=0.0))

random.shuffle(train_examples)
print(f"Created a total of {len(train_examples)} labeled examples.")

# --- 4. Fine-Tune the Model ---
print("Starting model fine-tuning...")

# Training hyperparameters, named here so they are easy to find and tune.
BASE_MODEL_NAME = 'all-MiniLM-L6-v2'
BATCH_SIZE = 16
NUM_EPOCHS = 10      # full passes over train_examples
WARMUP_STEPS = 100   # passed to fit() as the number of warmup steps

# Fail with a clear message instead of an obscure error from deeper in the
# training stack when no resume/JD pairs could be built.
if not train_examples:
    raise ValueError("No training examples were created; check that JD titles match the resume categories.")

# Load the base model we want to fine-tune
model = SentenceTransformer(BASE_MODEL_NAME)

# Create a DataLoader to batch our training examples (reshuffled by the loader)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# CosineSimilarityLoss is trained against the float label attached to each
# text pair (the examples built above use 1.0 for a match and 0.0 otherwise).
train_loss = losses.CosineSimilarityLoss(model)

# Tune the model for NUM_EPOCHS epochs
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
)

print("Model fine-tuning complete.")

# --- 5. Save the Model ---
# Directory that receives the fine-tuned model.
output_path = "./output/recruitment-model-v1"

# Create the target directory (and any missing parents) up front; an already
# existing directory is not an error.
os.makedirs(name=output_path, exist_ok=True)

# Write the fine-tuned model into that directory.
model.save(output_path)

print(f"✅ Model saved successfully to: {output_path}")

  from .autonotebook import tqdm as notebook_tqdm


--- Starting Model Training Script ---
Loading and preparing datasets...
Pre-processing and grouping job descriptions by category...
Found and grouped JDs for 8 categories.
Creating positive and negative training examples...
Created a total of 1295 labeled examples.
Starting model fine-tuning...


 62%|██████▏   | 500/810 [36:38<13:17,  2.57s/it]  

{'loss': 0.0279, 'grad_norm': 0.2207440584897995, 'learning_rate': 8.571428571428571e-06, 'epoch': 6.17}


100%|██████████| 810/810 [48:49<00:00,  3.62s/it]


{'train_runtime': 2929.9582, 'train_samples_per_second': 4.42, 'train_steps_per_second': 0.276, 'train_loss': 0.01812775245419255, 'epoch': 10.0}
Model fine-tuning complete.
✅ Model saved successfully to: ./output/recruitment-model-v1
