## STEP 1: Environment Setup

In [6]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [7]:
pip install pandas numpy scikit-learn torch transformers datasets




## STEP 2: Import Libraries

In [8]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)


## STEP 3: Load the Dataset

In [12]:
df = pd.read_csv("/Resume.csv", sep=",")
df.head()


Unnamed: 0,"resume_id,category,resume_text,skills_list,experience_years"
0,"1,HR,hr administrator marketing associate hr a..."
1,"2,HR,hr specialist us hr operations summary ve..."
2,"3,HR,hr director summary over years experience..."
3,"4,HR,hr specialist summary dedicated driven an..."
4,"5,HR,hr manager skill highlights hr skills hr ..."


In [14]:
df = df[df.columns[0]].str.split(",", expand=True)


In [15]:
df.columns = [
    "resume_id",
    "category",
    "resume_text",
    "skills_list",
    "experience_years"
]


In [16]:
df.head()

Unnamed: 0,resume_id,category,resume_text,skills_list,experience_years
0,1,HR,hr administrator marketing associate hr admini...,hr administrator marketing associate hr admini...,0.0
1,2,HR,hr specialist us hr operations summary versati...,hr specialist us hr operations summary versati...,0.0
2,3,HR,hr director summary over years experience in r...,hr director summary over years experience in r...,20.0
3,4,HR,hr specialist summary dedicated driven and dyn...,hr specialist summary dedicated driven and dyn...,20.0
4,5,HR,hr manager skill highlights hr skills hr depar...,hr manager skill highlights hr skills hr depar...,0.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2856 entries, 0 to 2855
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   resume_id         2856 non-null   object
 1   category          2856 non-null   object
 2   resume_text       2836 non-null   object
 3   skills_list       2834 non-null   object
 4   experience_years  2818 non-null   object
dtypes: object(5)
memory usage: 111.7+ KB


STEP 4: Keep Only Required Columns

In [18]:
df = df[['resume_text', 'category']]


In [19]:
df.dropna(inplace=True)


In [20]:
df.head(2)

Unnamed: 0,resume_text,category
0,hr administrator marketing associate hr admini...,HR
1,hr specialist us hr operations summary versati...,HR


STEP 5: Explore Target Classes

In [21]:
df['category'].value_counts()


Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
HR,120
INFORMATION-TECHNOLOGY,120
BUSINESS-DEVELOPMENT,120
ADVOCATE,118
ENGINEERING,118
ACCOUNTANT,118
CHEF,118
FINANCE,117
FITNESS,117
SALES,116


STEP 6: Encode Target Labels (TEXT → NUMBERS)

In [22]:
# BERT works with numeric labels only.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])


In [23]:
# save class mapping
label_encoder.classes_


array(['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS',
       'AUTOMOBILE', 'AVIATION', 'Advocate', 'Arts', 'Automation Testing',
       'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'Backend Developer',
       'Blockchain', 'Business Analyst', 'CHEF', 'CONSTRUCTION',
       'CONSULTANT', 'Civil Engineer', 'Cloud Engineer', 'DESIGNER',
       'DIGITAL-MEDIA', 'Data Science', 'Data Scientist', 'Database',
       'DevOps Engineer', 'DotNet Developer', 'ENGINEERING',
       'ETL Developer', 'Electrical Engineering', 'FINANCE', 'FITNESS',
       'Frontend Developer', 'Full Stack Developer', 'HEALTHCARE', 'HR',
       'Hadoop', 'Health and fitness', 'INFORMATION-TECHNOLOGY',
       'Java Developer', 'Machine Learning Engineer',
       'Mechanical Engineer', 'Mobile App Developer (iOS/Android)',
       'Network Security Engineer', 'Operations Manager', 'PMO',
       'PUBLIC-RELATIONS', 'Python Developer', 'SALES', 'SAP Developer',
       'Sales', 'TEACHER', 'Testing', 'Web Designing'

STEP 7: Train–Validation Split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    df['resume_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)


STEP 8: Load BERT Tokenizer

In [25]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

STEP 9: Tokenize Resume Text

In [26]:
train_encodings = tokenizer(
    list(X_train),
    truncation=True,
    padding=True,
    max_length=512
)

test_encodings = tokenizer(
    list(X_test),
    truncation=True,
    padding=True,
    max_length=512
)


In [27]:
# What’s happening internally?
# Text → tokens
# Tokens → IDs
# All sequences padded to same length
# Long resumes truncated to 512 tokens

STEP 10: Create PyTorch Dataset Class

In [28]:
class ResumeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)


### Create datasets:

In [29]:
train_dataset = ResumeDataset(train_encodings, y_train)
test_dataset = ResumeDataset(test_encodings, y_test)


STEP 11: Load Pre-trained BERT Model

In [30]:
num_labels = df['label'].nunique()

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## “I fine-tuned a pre-trained BERT model by adding a classification head.”

STEP 12: Define Evaluation Metrics

In [31]:
def compute_metrics(etest_pred):
    logits, labels = etest_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }


STEP 13: Training Configuration

In [32]:
import transformers
print(transformers.__version__)


4.57.6


In [33]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"   # 🔥 disables wandb forever
)



STEP 14: Trainer Initialization

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


STEP 15: Train the Model

In [35]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [36]:
import torch
torch.cuda.is_available()


True

In [37]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,3.171623,0.086268,0.026783
2,3.275000,2.081717,0.556338,0.486348
3,3.275000,1.334222,0.751761,0.719336


TrainOutput(global_step=852, training_loss=2.666202598894146, metrics={'train_runtime': 742.2503, 'train_samples_per_second': 9.167, 'train_steps_per_second': 1.148, 'total_flos': 1791059518918656.0, 'train_loss': 2.666202598894146, 'epoch': 3.0})

STEP 16: Evaluate the Model

In [38]:
trainer.evaluate()


{'eval_loss': 1.3342223167419434,
 'eval_accuracy': 0.7517605633802817,
 'eval_f1': 0.7193357908376851,
 'eval_runtime': 16.7042,
 'eval_samples_per_second': 34.003,
 'eval_steps_per_second': 4.25,
 'epoch': 3.0}

STEP 17: Resume Category Prediction (Inference)

In [39]:
def predict_category(resume_text):
    inputs = tokenizer(
        resume_text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1)
    return label_encoder.inverse_transform(pred.numpy())[0]


In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [44]:
def predict_category(text):
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )

    # 🔥 Move inputs to GPU
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    return label_encoder.inverse_transform([predicted_label])[0]


In [None]:
### test

In [45]:
predict_category("Experienced in Python, Machine Learning, NLP, Pandas")


'Data Scientist'

In [47]:
predict_category("HR professional with recruitment, onboarding, payroll experience")


'HR'

In [49]:
predict_category("Sales executive with CRM, lead generation, client handling")


'SALES'

In [50]:
predict_category("Java developer with Spring Boot, microservices, AWS")


'Python Developer'

Save Model & Tokenizer (LOCALLY)

In [51]:
model.save_pretrained("resume_bert_model")
tokenizer.save_pretrained("resume_bert_model")


('resume_bert_model/tokenizer_config.json',
 'resume_bert_model/special_tokens_map.json',
 'resume_bert_model/vocab.txt',
 'resume_bert_model/added_tokens.json')

Save label encoder classes:

In [52]:
import pickle

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)


In [56]:
import os
os.listdir("/content")


['.config', 'label_encoder.pkl', 'resume_bert_model', 'results', 'sample_data']