In [1]:
!pip install numpy==1.26 torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2




In [2]:
import numpy, torch, torchvision, torchaudio
print("numpy:", numpy.__version__)
print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)
print("torchaudio:", torchaudio.__version__)


numpy: 1.26.0
torch: 2.2.2+cpu
torchvision: 0.17.2+cpu
torchaudio: 2.2.2+cpu


In [3]:
# 🚨 Install dependencies (only once)
!pip install transformers spacy scikit-learn pandas tqdm --quiet
!python -m spacy download en_core_web_sm

# 📦 Imports
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from tqdm import tqdm

# 📥 Load Dataset
data_path = r"C:\Users\sagni\Downloads\Resume Selector\UpdatedResumeDataSet.csv"
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
print(df.head())

# 🧹 Clean and Preprocess with SpaCy
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    doc = nlp(str(text).lower())
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

df['clean_text'] = df['Resume'].apply(preprocess)
df['label'] = pd.factorize(df['Category'])[0]
label2id = dict(zip(pd.factorize(df['Category'])[1], range(len(pd.factorize(df['Category'])[1]))))
id2label = {v: k for k, v in label2id.items()}
print(f"Labels: {label2id}")

# 🔀 Train/Test Split
X_train, X_val, y_train, y_val = train_test_split(df['clean_text'], df['label'], test_size=0.2, stratify=df['label'], random_state=42)

# 📚 Tokenizer and Dataset
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
class ResumeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts, self.labels, self.tokenizer, self.max_len = texts, labels, tokenizer, max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        encodings = self.tokenizer(self.texts.iloc[idx], truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        return {key: val.squeeze(0) for key, val in encodings.items()}, torch.tensor(int(self.labels.iloc[idx]), dtype=torch.long)

train_dataset = ResumeDataset(X_train, y_train, tokenizer)
val_dataset = ResumeDataset(X_val, y_val, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 🏗️ Model Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label2id))
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# 🔥 Training
epochs = 2  # increase for better accuracy
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        inputs, labels_batch = batch
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels_batch = labels_batch.to(device)
        outputs = model(**inputs, labels=labels_batch)
        loss = outputs.loss
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

# 📊 Validation
model.eval()
y_pred, y_true = [], []
with torch.no_grad():
    for batch in val_loader:
        inputs, labels_batch = batch
        inputs = {k: v.to(device) for k, v in inputs.items()}
        logits = model(**inputs).logits
        preds = torch.argmax(logits, dim=1).cpu().detach().numpy()
        y_pred.extend(preds)
        y_true.extend(labels_batch.numpy())
print("\n📈 Classification Report:\n", classification_report(y_true, y_pred, target_names=list(label2id.keys())))

# 💾 Save Model
model.save_pretrained('./resume_model')
tokenizer.save_pretrained('./resume_model')
print("✅ Model and tokenizer saved to './resume_model'")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 12.7 MB/s eta 0:00:01
     ------ --------------------------------- 2.1/12.8 MB 5.6 MB/s eta 0:00:02
     --------- ------------------------------ 2.9/12.8 MB 5.4 MB/s eta 0:00:02
     ------------- -------------------------- 4.2/12.8 MB 5.6 MB/s eta 0:00:02
     --------------- ------------------------ 5.0/12.8 MB 5.1 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 4.9 MB/s eta 0:00:02
     ------------------- -------------------- 6.3/12.8 MB 4.6 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 4.2 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 4.1 MB/s eta 0:00:02
     ------------------------ ----------

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/2: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 97/97 [17:28<00:00, 10.81s/it]


Epoch 1 Loss: 2.9417


Epoch 2/2: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 97/97 [17:28<00:00, 10.81s/it]


Epoch 2 Loss: 1.9478


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



📈 Classification Report:
                            precision    recall  f1-score   support

             Data Science       1.00      1.00      1.00         8
                       HR       1.00      1.00      1.00         9
                 Advocate       1.00      1.00      1.00         4
                     Arts       1.00      1.00      1.00         7
            Web Designing       0.82      1.00      0.90         9
      Mechanical Engineer       1.00      1.00      1.00         8
                    Sales       1.00      1.00      1.00         8
       Health and fitness       1.00      1.00      1.00         6
           Civil Engineer       1.00      1.00      1.00         5
           Java Developer       0.89      1.00      0.94        17
         Business Analyst       1.00      1.00      1.00         6
            SAP Developer       1.00      1.00      1.00         5
       Automation Testing       0.00      0.00      0.00         5
   Electrical Engineering       0.