In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


In [3]:
# Upload resumes.csv to your Colab environment first (use the file upload button in Colab)
df = pd.read_csv('resumes.csv')
print(df.head())
print(df['label'].value_counts())


                                         resume_text         label
0  Data scientist proficient in Big Data, Pandas,...  Data Science
1  Data scientist proficient in Python, Statistic...  Data Science
2  Experienced software engineer with expertise i...           SWE
3  Machine learning engineer skilled in Computer ...           MLE
4  Machine learning engineer skilled in Spark, Da...           MLE
label
Data Science    179
SWE             162
MLE             159
Name: count, dtype: int64


In [4]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
print(list(le.classes_))


['Data Science', 'MLE', 'SWE']


In [47]:
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['resume_text'].tolist(),
    df['label_encoded'].tolist(),
    test_size=0.2,  # 20% for testing
    random_state=42,
    stratify=df['label_encoded']
)
print(f"Train size: {len(train_texts)}, Test size: {len(test_texts)}")


Train size: 400, Test size: 100


In [48]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


In [49]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

train_dataset = ResumeDataset(train_encodings, train_labels)
test_dataset = ResumeDataset(test_encodings, test_labels)


In [50]:
class ResumeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = ResumeDataset(train_encodings, train_labels)
val_dataset = ResumeDataset(val_encodings, val_labels)


In [51]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(le.classes_)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy='epoch',
    save_strategy='epoch'
)

In [63]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [64]:
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)
print("Test Accuracy:", accuracy_score(test_labels, y_pred))
print(classification_report(test_labels, y_pred, target_names=le.classes_))

Test Accuracy: 0.32
              precision    recall  f1-score   support

Data Science       0.00      0.00      0.00        36
         MLE       0.32      1.00      0.48        32
         SWE       0.00      0.00      0.00        32

    accuracy                           0.32       100
   macro avg       0.11      0.33      0.16       100
weighted avg       0.10      0.32      0.16       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [55]:
model.save_pretrained('./bert_resume_classifier')
tokenizer.save_pretrained('./bert_resume_classifier')

('./bert_resume_classifier/tokenizer_config.json',
 './bert_resume_classifier/special_tokens_map.json',
 './bert_resume_classifier/vocab.txt',
 './bert_resume_classifier/added_tokens.json')

In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [57]:
def predict_resume(text):
    # Ensure model is on the correct device
    device = next(model.parameters()).device
    # Tokenize and move inputs to the model's device
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Get model outputs
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    pred_label = torch.argmax(probs).item()
    return le.inverse_transform([pred_label])[0], probs[0][pred_label].item()


In [58]:
test_text = "Data scientist proficient in Jupyter, SQL, Statistics, Pandas, Machine Learning. Conducted analysis on sales transactions to uncover trends and support business decisions. Holds a Ph.D. in Machine Learning from MIT."
predicted_class, confidence = predict_resume(test_text)
print(f"Predicted: {predicted_class} (confidence: {confidence:.2f})")


Predicted: MLE (confidence: 0.59)
