# LoRA Fine-tuning : Bigbird-roberta-base

In [1]:
!pip install datasets
!pip install evaluate



## Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
from nltk.corpus import stopwords

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, DataCollatorWithPadding, BitsAndBytesConfig
from datasets import DatasetDict, Dataset, load_dataset
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import evaluate
import huggingface_hub
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter

import torch.nn.functional as F
import evaluate
import torch
import numpy as np

import warnings
warnings.filterwarnings("ignore")


In [None]:
path = "/Users/saideepbunny/Projects/Application_Ranking_System"
huggingface_hub.login(token="<access_token>")

## Reading the data

In [4]:
dataset = load_dataset("saideep-arikontham/jd_resume_dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
    test: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
})

In [5]:
train_df = dataset['train'].to_pandas()
train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1600
1,1600


In [6]:
val_df = dataset['validation'].to_pandas()
val_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,200
1,200


In [7]:
test_df = dataset['test'].to_pandas()
test_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,200
1,200


## Loading the model

In [8]:
# -------------------------------
# Load tokenizer and base model
# -------------------------------

model_checkpoint = "google/bigbird-roberta-base"

# Define label maps
label2id = {"Bad Fit": 0, "Good Fit": 1}
id2label = {0: "Bad Fit", 1: "Good Fit"}


# Load the model with explicit torch_dtype to avoid the 'normal_kernel_cpu' error
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

model

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BigBirdForSequenceClassification(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0-11): 12 x BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [9]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

## Define Preprocessing function

In [10]:
# -------------------------------
# Improved tokenization approach for job and resume matching
# -------------------------------

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.add("overqualified")
stop_words.add("underqualified")
stop_words.add("mismatch")
stop_words.add("good")

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove unwanted symbols except %, $, /, and .
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)

    # Preserve hyphens only when followed by a number (e.g., 2005-2010, 2010-present)
    text = re.sub(r"-(?!\d)", "", text)  # Remove hyphens not followed by a digit

    # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Remove '/' unless between numbers

    # Remove periods (".") if they are immediately after a word but not numbers (e.g., "good." → "good", but keep 3.8)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)

    # Remove newline characters
    text = text.replace("\n", " ").replace("\r", " ")

    # Remove any show less and show more texts
    text = text.replace("show less", "").replace("show more", "")
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text


def preprocess_function(examples):

    # Preprocess the text first
    examples["job_data"] = [preprocess_text(job) for job in examples["job_data"]]
    examples["resume_data"] = [preprocess_text(resume) for resume in examples["resume_data"]]

    # Process job descriptions and resumes separately with appropriate max lengths
    job_max_length = 2046
    resume_max_length = 2046
    max_model_length = 4096

    # Tokenize job descriptions
    job_inputs = tokenizer(
        examples["job_data"],
        truncation=True,
        max_length=job_max_length,
        padding="max_length"
    )

    # Tokenize resumes
    resume_inputs = tokenizer(
        examples["resume_data"],
        truncation=True,
        max_length=resume_max_length,
        padding="max_length"
    )

    # Combine the tokenized inputs
    combined_inputs = {
        "input_ids": [],
        "attention_mask": []
    }

    for job_ids, job_mask, resume_ids, resume_mask in zip(
        job_inputs["input_ids"], job_inputs["attention_mask"],
        resume_inputs["input_ids"], resume_inputs["attention_mask"]
    ):
        # Get separator token ID
        separator_id = tokenizer.sep_token_id
        if separator_id is None:
            separator_id = tokenizer.eos_token_id

        # Combine tokens with separator
        combined_ids = job_ids + [separator_id] + resume_ids
        combined_mask = job_mask + [1] + resume_mask

        # Ensure we don't exceed the model's max input length

        if len(combined_ids) > max_model_length:
            combined_ids = combined_ids[:max_model_length]
            combined_mask = combined_mask[:max_model_length]

        combined_inputs["input_ids"].append(combined_ids)
        combined_inputs["attention_mask"].append(combined_mask)

    # Add labels
    combined_inputs["labels"] = examples["label"]

    return combined_inputs

# Apply the tokenization to all splits
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# create data collator

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


## Testing Untrained model

### Inference Functions

In [12]:
# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.update(["overqualified", "underqualified", "mismatch", "good"])

def preprocess_text(text):
    """Preprocess text by removing unwanted symbols, normalizing, and removing stopwords."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)
    text = re.sub(r"-(?!\d)", "", text)  # Preserve hyphens only when followed by a number
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)  # Remove periods unless in numbers
    text = text.replace("\n", " ").replace("\r", " ")
    text = text.replace("show less", "").replace("show more", "")
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def tokenize_data(job_description, resume, tokenizer):
    # Preprocess input texts
    job_description = preprocess_text(job_description)
    resume = preprocess_text(resume)

    # Define the tokenizer settings
    # Define chunk sizes
    job_max_length = 2046
    resume_max_length = 2046
    max_model_length = 4096

    # Tokenize job description
    job_inputs = tokenizer(
        job_description,
        truncation=True,
        max_length=job_max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # Tokenize resume
    resume_inputs = tokenizer(
        resume,
        truncation=True,
        max_length=resume_max_length,
        padding="max_length",
        return_tensors="pt"
    )

    # Get separator token ID
    separator_id = tokenizer.sep_token_id
    if separator_id is None:
        separator_id = tokenizer.eos_token_id

    # Convert separator ID to correct dtype
    separator_tensor = torch.tensor([[separator_id]], dtype=job_inputs["input_ids"].dtype)

    # Combine tokens with separator
    combined_ids = torch.cat((job_inputs["input_ids"], separator_tensor, resume_inputs["input_ids"]), dim=1)
    combined_mask = torch.cat((job_inputs["attention_mask"], torch.tensor([[1]], dtype=job_inputs["attention_mask"].dtype), resume_inputs["attention_mask"]), dim=1)


    # Ensure we don't exceed the max length
    combined_ids = combined_ids[:, :max_model_length]
    combined_mask = combined_mask[:, :max_model_length]

    return {
        "input_ids": combined_ids,
        "attention_mask": combined_mask
    }


def predict_resume_fit(job_description, resume, model, tokenizer):
    # Tokenize input
    inputs = tokenize_data(job_description, resume, tokenizer)

    # Ensure model is in evaluation mode
    model.eval()

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get logits
    logits = outputs.logits

    # Compute softmax probabilities
    probs = F.softmax(logits, dim=-1)

    # Get predicted class
    predicted_class = torch.argmax(probs, dim=-1).item()

    # Get probability of class 1
    class_1_prob = probs[:, 1].item() if probs.shape[1] > 1 else probs.item()

    return predicted_class, class_1_prob



### Checking whether the tokenized_dataset and tokenize_data() function outputs match

In [14]:
sample_train_data = tokenized_dataset["train"][0]  # Get one training sample
print("Training Input IDs:", sample_train_data["input_ids"])
print("Training Attention Mask:", sample_train_data["attention_mask"])


Training Input IDs: [65, 15085, 4807, 386, 19572, 545, 401, 34532, 2146, 6025, 15085, 4807, 4755, 4058, 15085, 26303, 5112, 437, 10298, 597, 2708, 2698, 14843, 4641, 37973, 4856, 3916, 14691, 6594, 15085, 4988, 5843, 7635, 4721, 27438, 6023, 15085, 1175, 14655, 1863, 2551, 3552, 17588, 6376, 1205, 2715, 3552, 2579, 6544, 4755, 386, 19572, 545, 321, 2825, 2013, 21016, 9850, 3673, 2708, 7165, 401, 3492, 1070, 4465, 390, 797, 11033, 5467, 367, 2141, 1198, 26170, 4755, 1742, 5424, 4510, 2989, 1372, 2998, 9026, 23505, 2959, 7212, 10185, 2938, 7635, 2932, 16121, 5402, 4135, 7707, 771, 1305, 5337, 737, 13968, 965, 4069, 2076, 6023, 3186, 3791, 3552, 2095, 10842, 15272, 4643, 15085, 2234, 29670, 15421, 7635, 29953, 11925, 3277, 6400, 378, 3909, 15478, 712, 3909, 14959, 9385, 5557, 8393, 14440, 42074, 11027, 20966, 1727, 36874, 4643, 2579, 7966, 15085, 7502, 20966, 3148, 3186, 3277, 4643, 15421, 2364, 1731, 14864, 26398, 5158, 4643, 9851, 1698, 2579, 1491, 11042, 2095, 14062, 44310, 750, 7059, 

In [15]:
job_description = dataset["train"][0]['job_data']
resume = dataset["train"][0]['resume_data']
inputs = tokenize_data(job_description, resume, tokenizer)

print("Inference Input IDs:", inputs["input_ids"].tolist())
print("Inference Attention Mask:", inputs["attention_mask"].tolist())

Inference Input IDs: [[65, 15085, 4807, 386, 19572, 545, 401, 34532, 2146, 6025, 15085, 4807, 4755, 4058, 15085, 26303, 5112, 437, 10298, 597, 2708, 2698, 14843, 4641, 37973, 4856, 3916, 14691, 6594, 15085, 4988, 5843, 7635, 4721, 27438, 6023, 15085, 1175, 14655, 1863, 2551, 3552, 17588, 6376, 1205, 2715, 3552, 2579, 6544, 4755, 386, 19572, 545, 321, 2825, 2013, 21016, 9850, 3673, 2708, 7165, 401, 3492, 1070, 4465, 390, 797, 11033, 5467, 367, 2141, 1198, 26170, 4755, 1742, 5424, 4510, 2989, 1372, 2998, 9026, 23505, 2959, 7212, 10185, 2938, 7635, 2932, 16121, 5402, 4135, 7707, 771, 1305, 5337, 737, 13968, 965, 4069, 2076, 6023, 3186, 3791, 3552, 2095, 10842, 15272, 4643, 15085, 2234, 29670, 15421, 7635, 29953, 11925, 3277, 6400, 378, 3909, 15478, 712, 3909, 14959, 9385, 5557, 8393, 14440, 42074, 11027, 20966, 1727, 36874, 4643, 2579, 7966, 15085, 7502, 20966, 3148, 3186, 3277, 4643, 15421, 2364, 1731, 14864, 26398, 5158, 4643, 9851, 1698, 2579, 1491, 11042, 2095, 14062, 44310, 750, 7059

In [16]:
# Model and Tokenizer Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #changed to torch.device
print(device)

# Define chunk sizes
job_chunk_size = 2046
resume_chunk_size = 2046
max_length = 4096

# Load test data
y_test = test_df['label'].tolist()
y_pred = []

# Run Predictions
for i in range(test_df.shape[0]):
    jd = test_df.iloc[i]['job_data']
    rd = test_df.iloc[i]['resume_data']

    predicted_class, class_1_prob = predict_resume_fit(jd, rd, model, tokenizer)
    y_pred.append(predicted_class)

# Evaluate Model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))


cuda


Input ids are automatically padded from 4093 to 4096 to be a multiple of `config.block_size`: 64


Accuracy:  0.5
Precision:  0.0
Recall:  0.0
F1:  0.0
Confusion Matrix:  [[200   0]
 [200   0]]


## Configure LoRA

In [17]:
# -------------------------------
# Configure LoRA fine-tuning
# -------------------------------

# Define a LoRA configuration. Adjust parameters (r, lora_alpha, etc.) as needed.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # for sequence classification
    r=8,                        # low rank parameter; experiment with this value
    lora_alpha=32,              # scaling parameter
    lora_dropout=0.1,           # dropout probability for LoRA layers
    target_modules=["query", "value"],  # adjust based on your model architecture
)

# Wrap your model with LoRA. This freezes most of the model and inserts trainable LoRA layers.
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 887,042 || all params: 128,947,972 || trainable%: 0.6879


## Setting Training parameters

In [18]:

# -------------------------------
# Setup training parameters
# -------------------------------

training_args = TrainingArguments(
    output_dir="final",
    evaluation_strategy="epoch",
    save_strategy="epoch",             # Set save strategy to epoch to match evaluation_strategy
    num_train_epochs=7,                # Adjust number of epochs as desired
    per_device_train_batch_size=5,    # Adjust based on your GPU memory
    per_device_eval_batch_size=5,
    learning_rate=0.000025,
    load_best_model_at_end=True,       # Load the best model when finished training (if metric provided)
    metric_for_best_model="accuracy",  # Choose your metric
    weight_decay=0.1,                  # Strong L2 Regularization (Higher Regularization)
    max_grad_norm=0.5,                  # Aggressive Gradient Clipping
    adam_beta1=0.9,                      # Standard Momentum
    adam_beta2=0.98,                     # Reduces dependency on past gradients
    adam_epsilon=1e-08,                   # Prevents division by zero
    label_smoothing_factor=0.15,         # Helps prevent overconfidence
    warmup_ratio=0.1,                   # 10% of training steps as warm-up
    fp16=True,
    gradient_accumulation_steps = 2
)


## Defining Evaluation metrics

In [19]:
# -------------------------------
# Define a metric function for evaluation
# -------------------------------

# Load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }


## Training the model

In [20]:
# -------------------------------
# Create the Trainer and start training
# -------------------------------

# creater trainer object
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msaideepreddy99[0m ([33msaideepreddy99-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.627115,0.6925,0.740032,0.6925,0.676484
2,0.617300,0.590895,0.7725,0.836098,0.7725,0.761203
3,0.617300,0.400477,0.9125,0.918028,0.9125,0.91221
4,0.416500,0.429192,0.9075,0.920374,0.9075,0.906786
5,0.337500,0.464656,0.885,0.904582,0.885,0.883591
6,0.337500,0.43647,0.905,0.918561,0.905,0.904224
7,0.320900,0.437568,0.905,0.918561,0.905,0.904224


TrainOutput(global_step=2240, training_loss=0.4113397836685181, metrics={'train_runtime': 3973.133, 'train_samples_per_second': 5.638, 'train_steps_per_second': 0.564, 'total_flos': 4.79629421838336e+16, 'train_loss': 0.4113397836685181, 'epoch': 7.0})

## Test Results

In [21]:
# -------------------------------
# Evaluate the final model on the test set
# -------------------------------

test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
test_results

{'eval_loss': 0.5400862693786621,
 'eval_accuracy': 0.825,
 'eval_precision': 0.8270933977455717,
 'eval_recall': 0.825,
 'eval_f1': 0.8247195512820514,
 'eval_runtime': 17.2898,
 'eval_samples_per_second': 23.135,
 'eval_steps_per_second': 4.627,
 'epoch': 7.0}

## Testing model after fine-tuning

In [22]:
# Model and Tokenizer Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #changed to torch.device
print(device)

# Define chunk sizes
job_chunk_size = 2046
resume_chunk_size = 2046
max_length = 4096

# Load test data
y_test = test_df['label'].tolist()
y_pred = []

# Run Predictions
for i in range(test_df.shape[0]):
    jd = test_df.iloc[i]['job_data']
    rd = test_df.iloc[i]['resume_data']

    predicted_class, class_1_prob = predict_resume_fit(jd, rd, model, tokenizer)
    y_pred.append(predicted_class)

# Evaluate Model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))


cuda
Accuracy:  0.825
Precision:  0.8009259259259259
Recall:  0.865
F1:  0.8317307692307693
Confusion Matrix:  [[157  43]
 [ 27 173]]


In [23]:
# Model and Tokenizer Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #changed to torch.device
print(device)

# Define chunk sizes
job_chunk_size = 2046
resume_chunk_size = 2046
max_length = 4096

# Load test data
y_test = test_df['label'].tolist()
y_pred = []

# Run Predictions
for i in range(test_df.shape[0]):
    jd = test_df.iloc[i]['job_data']
    rd = test_df.iloc[i]['resume_data']

    predicted_class, class_1_prob = predict_resume_fit(jd, rd, lora_model, tokenizer)
    y_pred.append(predicted_class)

# Evaluate Model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))


cuda
Accuracy:  0.825
Precision:  0.8009259259259259
Recall:  0.865
F1:  0.8317307692307693
Confusion Matrix:  [[157  43]
 [ 27 173]]


In [24]:
# prompt: Push the fine-tuned model adapters to huggingface

# Push the merged model and tokenizer to Hugging Face Hub
lora_model.push_to_hub("saideep-arikontham/bigbird-resume-fit-predictor")

adapter_model.safetensors:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saideep-arikontham/bigbird-resume-fit-predictor/commit/eba410576746d2a6cd299543e34354b418314080', commit_message='Upload model', commit_description='', oid='eba410576746d2a6cd299543e34354b418314080', pr_url=None, repo_url=RepoUrl('https://huggingface.co/saideep-arikontham/bigbird-resume-fit-predictor', endpoint='https://huggingface.co', repo_type='model', repo_id='saideep-arikontham/bigbird-resume-fit-predictor'), pr_revision=None, pr_num=None)