In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm

Data: https://media.githubusercontent.com/media/noran-mohamed/Resume-Classification-Dataset/refs/heads/main/Preprocessed_Data.csv

Replace this with your own data. Make sure it contains at least 100 data points per class.

In [2]:
import pandas as pd
from google.colab import drive

# Make Google Drive visible inside the Colab runtime so the CSV can be read.
drive.mount('/content/drive')

# Read the full resume dataset from Drive.
file_path = '/content/drive/MyDrive/dsa_pre/Preprocessed_Data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Hold out a random 10% of the rows for the inference checks later on;
# random_state pins the draw so the same rows are picked on every run.
test_df = df.sample(random_state=42, frac=0.1)

Mounted at /content/drive


In [3]:
# Remove the held-out rows from `df` so they are never used for training or
# validation. errors="ignore" keeps the cell re-runnable: on a second run the
# labels are already gone and a plain drop() would raise KeyError.
df = df.drop(index=test_df.index, errors="ignore")

In [4]:
# Preview the first rows of the held-out sample drawn from `df`.
test_df.head()

Unnamed: 0,Category,Text
4105,Digital Media,kishore mynapalli freelance digital marketer p...
538,Banking,maureen jones sometown id 55555 555 5555555 mj...
7335,Banking,jessica claire resumesampleexamplecom 555 4321...
1606,Health and Fitness,john sample 123 main street town ca t8n656 cel...
12831,SQL Developer,jessica claire montgjessicay street san franci...


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Learn the Category -> integer id mapping, then store the ids alongside the text.
label_encoder = LabelEncoder()
label_encoder.fit(df["Category"])
df["section_encoded"] = label_encoder.transform(df["Category"])

# 80/20 train/validation split over (text, encoded label) pairs;
# random_state pins the shuffle so the split is repeatable.
all_texts = df["Text"].values
all_labels = df["section_encoded"].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, random_state=42, test_size=0.2
)

In [6]:
# Load the tokenizer published under the "bert-base-uncased" checkpoint name,
# the same name the classification model is initialised from further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
class ClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts`` by position.
        tokenizer: Callable invoked on each text with the keyword arguments
            shown in ``__getitem__``; its result is indexed by
            ``"input_ids"`` and ``"attention_mask"``.
        max_len: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples, i.e. the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``"input_ids"``, ``"attention_mask"`` (each with the
            leading dimension squeezed away) and ``"label"`` (a ``torch.long``
            tensor built from ``labels[idx]``).
        """
        tokenized = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading dimension of the tokenizer output
        # (presumably a batch axis of size 1 - confirm against the tokenizer).
        item = {
            field: tokenized[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Tokenization length and batch size shared by both splits
max_len = 128
batch_size = 16

# Wrap the raw arrays so every sample is tokenized on access
train_dataset = ClassificationDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=max_len
)
val_dataset = ClassificationDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=max_len
)

# Batch the datasets: only the training batches are reshuffled each epoch
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [8]:
# Model and Device Setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Size the classification head to the number of categories seen by the encoder.
num_classes = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes,
)
model.to(device)

# Optimizer
optimizer = AdamW(params=model.parameters(), lr=5e-5)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Training Loop
# Runs `epochs` full passes over train_loader, updating the model after every
# batch, and prints the mean batch loss at the end of each epoch.
# NOTE(review): nothing is evaluated on val_loader during training, so
# over-fitting only becomes visible in the separate evaluation cell.
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    # Label the progress bar once per epoch instead of re-setting the same
    # description on every batch.
    loop = tqdm(train_loader, desc=f"Epoch {epoch}", leave=True)

    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Read the scalar once: each .item() call copies the value to the host,
        # so reuse it for both the running total and the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    print(f"Epoch {epoch} Loss: {total_loss / len(train_loader)}")

Epoch 0: 100%|██████████| 603/603 [05:13<00:00,  1.93it/s, loss=1.25]


Epoch 0 Loss: 1.4843249027655887


Epoch 1: 100%|██████████| 603/603 [05:13<00:00,  1.92it/s, loss=1.35]


Epoch 1 Loss: 0.8413682579499967


Epoch 2: 100%|██████████| 603/603 [05:15<00:00,  1.91it/s, loss=0.14]


Epoch 2 Loss: 0.6265893319303519


Epoch 3: 100%|██████████| 603/603 [05:14<00:00,  1.92it/s, loss=0.254]


Epoch 3 Loss: 0.49582764226788806


Epoch 4: 100%|██████████| 603/603 [05:14<00:00,  1.92it/s, loss=0.022]


Epoch 4 Loss: 0.369764585329056


Epoch 5: 100%|██████████| 603/603 [05:14<00:00,  1.92it/s, loss=0.366]


Epoch 5 Loss: 0.31082290564208087


Epoch 6: 100%|██████████| 603/603 [05:12<00:00,  1.93it/s, loss=0.0597]


Epoch 6 Loss: 0.24171733169524529


Epoch 7: 100%|██████████| 603/603 [05:14<00:00,  1.92it/s, loss=0.509]


Epoch 7 Loss: 0.18313360589542496


Epoch 8: 100%|██████████| 603/603 [05:15<00:00,  1.91it/s, loss=0.205]


Epoch 8 Loss: 0.15909349461276065


Epoch 9: 100%|██████████| 603/603 [05:14<00:00,  1.92it/s, loss=0.00917]


Epoch 9 Loss: 0.1607571928599729


Epoch 10: 100%|██████████| 603/603 [05:12<00:00,  1.93it/s, loss=0.0108]


Epoch 10 Loss: 0.14182166333710328


Epoch 11: 100%|██████████| 603/603 [05:11<00:00,  1.93it/s, loss=0.0522]


Epoch 11 Loss: 0.1263804557817412


Epoch 12: 100%|██████████| 603/603 [05:12<00:00,  1.93it/s, loss=0.113]


Epoch 12 Loss: 0.12262590323531596


Epoch 13: 100%|██████████| 603/603 [05:11<00:00,  1.93it/s, loss=0.00548]


Epoch 13 Loss: 0.1054347230694496


Epoch 14: 100%|██████████| 603/603 [05:12<00:00,  1.93it/s, loss=0.0226]


Epoch 14 Loss: 0.10635355306761478


Epoch 15: 100%|██████████| 603/603 [05:30<00:00,  1.82it/s, loss=0.0144]


Epoch 15 Loss: 0.10043072099890096


Epoch 16: 100%|██████████| 603/603 [05:11<00:00,  1.93it/s, loss=0.0116]


Epoch 16 Loss: 0.10136762166598111


Epoch 17: 100%|██████████| 603/603 [05:14<00:00,  1.92it/s, loss=0.199]


Epoch 17 Loss: 0.08097586659699135


Epoch 18: 100%|██████████| 603/603 [05:12<00:00,  1.93it/s, loss=0.0154]


Epoch 18 Loss: 0.09543527645571698


Epoch 19: 100%|██████████| 603/603 [05:11<00:00,  1.93it/s, loss=0.0101]

Epoch 19 Loss: 0.08834716799834227





In [11]:
# Evaluation
# Count how many validation samples get the right class, with dropout disabled
# (eval mode) and gradient tracking switched off.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # No labels passed here: only the logits are needed.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=1)

        correct += predictions.eq(labels).sum().item()
        total += labels.shape[0]

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.7992


In [12]:
# Save model
# The model weights/config and the tokenizer files go into one directory so
# both can be reloaded from the same path; the fitted label encoder is
# serialized separately next to it.
model_dir = "/content/drive/MyDrive/dsa_pre/bert_resume_classifier"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)
torch.save(label_encoder, "/content/drive/MyDrive/dsa_pre/label_encoder.pth")

Inference

In [13]:
from sklearn.preprocessing import LabelEncoder
import torch.serialization

# Register LabelEncoder with torch's unpickling allowlist.
# add_safe_globals takes a list of the classes/functions themselves, not a
# name -> class dict.
torch.serialization.add_safe_globals([LabelEncoder])

# Load model, tokenizer, and label encoder
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/dsa_pre/bert_resume_classifier")
tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/dsa_pre/bert_resume_classifier")
# SECURITY: weights_only=False performs a full unpickle, which can execute
# arbitrary code. Only load a label_encoder.pth that this notebook wrote itself.
label_encoder = torch.load("/content/drive/MyDrive/dsa_pre/label_encoder.pth", weights_only=False)

# Move Model to Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample Input
# .loc makes the lookup explicitly label-based (12831 is an index label of
# test_df, not a row position).
sample_text = test_df.loc[12831, "Text"]
# Same padding/truncation length as used for the training datasets.
inputs = tokenizer(
    sample_text,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=128
)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Prediction
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id, mapped back to its original Category string.
predicted_class = torch.argmax(outputs.logits, dim=1).item()
print(f"Predicted Section: {label_encoder.inverse_transform([predicted_class])[0]}")

Predicted Section: SQL Developer


In [14]:
# Ground-truth row for index label 12831, to compare its Category with the prediction above.
test_df.loc[12831]

Unnamed: 0,12831
Category,SQL Developer
Text,jessica claire montgjessicay street san franci...


In [15]:
# Sample Input: held-out resume with index label 4105
sample_text = test_df.loc[4105, "Text"]

# Tokenize to a fixed length of 128 and move every tensor to the model's device
encoded = tokenizer(
    sample_text,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Prediction (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id, translated back to its Category name
predicted_class = outputs.logits.argmax(dim=1).item()
predicted_label = label_encoder.inverse_transform([predicted_class])[0]
print(f"Predicted Section: {predicted_label}")

Predicted Section: Digital Media


In [16]:
# Ground-truth row for index label 4105, to compare its Category with the prediction above.
test_df.loc[4105]

Unnamed: 0,4105
Category,Digital Media
Text,kishore mynapalli freelance digital marketer p...


In [17]:
# Sample Input: held-out resume with index label 7335
sample_text = test_df.loc[7335, "Text"]

# Tokenize to a fixed length of 128 and move every tensor to the model's device
encoded = tokenizer(
    sample_text,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Prediction (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id, translated back to its Category name
predicted_class = outputs.logits.argmax(dim=1).item()
predicted_label = label_encoder.inverse_transform([predicted_class])[0]
print(f"Predicted Section: {predicted_label}")

Predicted Section: Banking


In [18]:
# Ground-truth row for index label 7335, to compare its Category with the prediction above.
test_df.loc[7335]

Unnamed: 0,7335
Category,Banking
Text,jessica claire resumesampleexamplecom 555 4321...


In [19]:
# Sample Input: held-out resume with index label 1606
sample_text = test_df.loc[1606, "Text"]

# Tokenize to a fixed length of 128 and move every tensor to the model's device
encoded = tokenizer(
    sample_text,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Prediction (eval mode, no gradient tracking)
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id, translated back to its Category name
predicted_class = outputs.logits.argmax(dim=1).item()
predicted_label = label_encoder.inverse_transform([predicted_class])[0]
print(f"Predicted Section: {predicted_label}")

Predicted Section: Health and Fitness


In [20]:
# Ground-truth row for index label 1606, to compare its Category with the prediction above.
test_df.loc[1606]

Unnamed: 0,1606
Category,Health and Fitness
Text,john sample 123 main street town ca t8n656 cel...
