# LoRA Fine-tuning : Roberta-base

In [1]:
!pip install datasets
!pip install evaluate



## Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
from nltk.corpus import stopwords

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, DataCollatorWithPadding
from datasets import DatasetDict, Dataset, load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import huggingface_hub
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter

import torch.nn.functional as F
import evaluate
import torch
import numpy as np


import warnings
warnings.filterwarnings("ignore")


In [3]:
path = "/Users/saideepbunny/Projects/Application_Ranking_System"
huggingface_hub.login(token="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")

## Reading the data

In [4]:
dataset = load_dataset("saideep-arikontham/jd_resume_dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
    test: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
})

In [5]:
train_df = dataset['train'].to_pandas()
train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1600
1,1600


In [6]:
val_df = dataset['validation'].to_pandas()
val_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,200
1,200


In [7]:
test_df = dataset['test'].to_pandas()
test_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,200
1,200


## Loading the model

In [8]:
# -------------------------------
# Load tokenizer and base model
# -------------------------------

model_checkpoint = 'roberta-base'

# define label maps
label2id = {'Bad Fit': 0, 'Good Fit': 1}
id2label = {0:'Bad Fit', 1:'Good Fit'}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [9]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

## Define Preprocessing function

In [10]:
# -------------------------------
# Improved tokenization approach for job and resume matching
# -------------------------------

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.add("overqualified")
stop_words.add("underqualified")
stop_words.add("mismatch")
stop_words.add("good")

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove unwanted symbols except %, $, /, and .
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)

    # Preserve hyphens only when followed by a number (e.g., 2005-2010, 2010-present)
    text = re.sub(r"-(?!\d)", "", text)  # Remove hyphens not followed by a digit

    # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Remove '/' unless between numbers

    # Remove periods (".") if they are immediately after a word but not numbers (e.g., "good." → "good", but keep 3.8)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)

    # Remove newline characters
    text = text.replace("\n", " ").replace("\r", " ")

    # Remove any show less and show more texts
    text = text.replace("show less", "").replace("show more", "")
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)



    return text



def chunk_dataset(dataset, tokenizer):
    """Tokenizes and chunks job and resume data into fixed-size chunks."""

    job_chunk_size = 256
    resume_chunk_size = 256
    max_length = job_chunk_size + resume_chunk_size


    def chunk_function(examples):
        all_input_ids = []  # Store flattened list of input_ids
        all_attention_masks = []  # Store flattened list of attention_masks
        all_labels = []  # Store flattened list of labels

        examples['job_data'] = [preprocess_text(jd) for jd in examples['job_data']]
        examples['resume_data'] = [preprocess_text(jd) for jd in examples['resume_data']]

        for job_text, resume_text, label in zip(examples['job_data'], examples['resume_data'], examples['label']):
            # Tokenize job data
            job_tokens = tokenizer.tokenize(job_text)
            # Tokenize resume data
            resume_tokens = tokenizer.tokenize(resume_text)

            num_job_chunks = (len(job_tokens) + job_chunk_size - 1) // job_chunk_size
            num_resume_chunks = (len(resume_tokens) + resume_chunk_size - 1) // resume_chunk_size

            # Chunk job data
            for i in range(0, len(job_tokens), job_chunk_size):
                job_chunk = job_tokens[i:i + job_chunk_size]

                # Chunk resume data
                for j in range(0, len(resume_tokens), resume_chunk_size):
                    resume_chunk = resume_tokens[j:j + resume_chunk_size]

                    # Combine the chunks and truncate
                    combined_tokens = ['[CLS]'] + job_chunk + ['[SEP]'] + resume_chunk
                    combined_tokens = combined_tokens[:max_length]

                    # Convert to input IDs and attention mask
                    input_ids = tokenizer.convert_tokens_to_ids(combined_tokens)
                    attention_mask = [1] * len(input_ids)  # 1 for real tokens, 0 for padding

                    # Pad to max_length
                    padding_length = max_length - len(input_ids)
                    input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
                    attention_mask = attention_mask + [0] * padding_length

                    all_input_ids.append(input_ids) # Append to the flattened list
                    all_attention_masks.append(attention_mask) # Append to the flattened list
                    all_labels.append(label) # Assign same label to all chunks, append to flattened list

        #print(all_input_ids)  # Now print the flattened input_ids
        return {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'labels': all_labels
        }

    # Apply chunking to your dataset
    dataset = dataset.map(chunk_function, batched=True, remove_columns=dataset.column_names) #remove original columns


    return dataset


tokenized_dataset = DatasetDict()
for split in dataset.keys():
    tokenized_dataset[split] = chunk_dataset(dataset[split], tokenizer)

tokenized_dataset

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (771 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 19421
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2375
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2366
    })
})

In [11]:
# create data collator

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


## Testing Untrained model

In [12]:
# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.update(["overqualified", "underqualified", "mismatch", "good"])

def preprocess_text(text):
    """Preprocess text by removing unwanted symbols, normalizing, and removing stopwords."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)
    text = re.sub(r"-(?!\d)", "", text)  # Preserve hyphens only when followed by a number
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)  # Remove periods unless in numbers
    text = text.replace("\n", " ").replace("\r", " ")
    text = text.replace("show less", "").replace("show more", "")
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:



def predict_with_voting(job_text, resume_text, model, tokenizer, device="cpu", return_probabilities=True):
    """
    Predicts the class using chunking and a voting mechanism.
    """
    model.to(device)
    model.eval()

    with torch.no_grad():
        job_text = preprocess_text(job_text)
        resume_text = preprocess_text(resume_text)

        # Tokenize job data
        job_tokens = tokenizer.tokenize(job_text)
        # Tokenize resume data
        resume_tokens = tokenizer.tokenize(resume_text)

        predictions = []
        probabilities = []

        # Chunk job data
        for i in range(0, len(job_tokens), job_chunk_size):
            job_chunk = job_tokens[i:i + job_chunk_size]

            # Chunk resume data
            for j in range(0, len(resume_tokens), resume_chunk_size):
                resume_chunk = resume_tokens[j:j + resume_chunk_size]

                # Combine the chunks and truncate
                combined_tokens = ['[CLS]'] + job_chunk + ['[SEP]'] + resume_chunk
                combined_tokens = combined_tokens[:max_length]

                # Convert to input IDs and attention mask
                input_ids = tokenizer.convert_tokens_to_ids(combined_tokens)
                attention_mask = [1] * len(input_ids)  # 1 for real tokens, 0 for padding

                # Pad to max_length
                padding_length = max_length - len(input_ids)
                input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
                attention_mask = attention_mask + [0] * padding_length

                # Convert to tensors and send to device
                input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
                attention_mask = torch.tensor([attention_mask], dtype=torch.long).to(device)

                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits
                probs = torch.softmax(logits, dim=-1)

                predicted_class = torch.argmax(logits, dim=-1).item()
                predictions.append(predicted_class)
                probabilities.append(probs[0, 1].item())  # Probability of class "1"

        # Hard Voting
        if predictions:
            counts = Counter(predictions)
            most_voted_class = counts.most_common(1)[0][0]
        else:
            most_voted_class = 0  # Default if no predictions

        # Average Probability for Class "1"
        average_probability = np.mean(probabilities) if probabilities else 0.0

    return most_voted_class, average_probability

In [14]:
!nvidia-smi

Sat Mar 15 15:43:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   68C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [15]:
# Model and Tokenizer Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #changed to torch.device
print(device)

# Define chunk sizes
job_chunk_size = 256
resume_chunk_size = 256
max_length = job_chunk_size + resume_chunk_size

# Load test data
y_test = test_df['label'].tolist()
y_pred = []

# Run Predictions
for i in range(test_df.shape[0]):
    jd = test_df.iloc[i]['job_data']
    rd = test_df.iloc[i]['resume_data']

    predicted_class, class_1_prob = predict_with_voting(jd, rd, model, tokenizer, device)
    y_pred.append(predicted_class)

# Evaluate Model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))


cuda
Accuracy:  0.5
Precision:  0.5
Recall:  1.0
F1:  0.6666666666666666
Confusion Matrix:  [[  0 200]
 [  0 200]]


## Configure LoRA

In [16]:
# -------------------------------
# Configure LoRA fine-tuning
# -------------------------------

# Define a LoRA configuration. Adjust parameters (r, lora_alpha, etc.) as needed.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # for sequence classification
    r=8,                        # low rank parameter; experiment with this value
    lora_alpha=32,              # scaling parameter
    lora_dropout=0.15,           # dropout probability for LoRA layers
    target_modules=["query", "value"]  # adjust based on your model architecture
)

# Wrap your model with LoRA. This freezes most of the model and inserts trainable LoRA layers.
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


## Setting Training parameters

In [17]:

# -------------------------------
# Setup training parameters
# -------------------------------

training_args = TrainingArguments(
    output_dir="test1",
    evaluation_strategy="epoch",
    save_strategy="epoch",             # Set save strategy to epoch to match evaluation_strategy
    num_train_epochs=1,                # Adjust number of epochs as desired
    per_device_train_batch_size=32,    # Adjust based on your GPU memory
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    load_best_model_at_end=True,       # Load the best model when finished training (if metric provided)
    metric_for_best_model="accuracy",  # Choose your metric
    weight_decay=0.1,                  # Strong L2 Regularization (Higher Regularization)
    max_grad_norm=0.5,                  # Aggressive Gradient Clipping
    adam_beta1=0.9,                      # Standard Momentum
    adam_beta2=0.98,                     # Reduces dependency on past gradients
    adam_epsilon=1e-08,                   # Prevents division by zero
    label_smoothing_factor=0.1,         # Helps prevent overconfidence
    warmup_ratio=0.1                   # 10% of training steps as warm-up
)


## Defining Evaluation metrics

In [18]:
# -------------------------------
# Define a metric function for evaluation
# -------------------------------

# Load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }


## Training the model

In [19]:
# -------------------------------
# Create the Trainer and start training
# -------------------------------

# creater trainer object
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaideepreddy99[0m ([33msaideepreddy99-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5475,0.379908,0.884632,0.885116,0.884632,0.884376


TrainOutput(global_step=607, training_loss=0.5239724311640078, metrics={'train_runtime': 1555.7338, 'train_samples_per_second': 12.483, 'train_steps_per_second': 0.39, 'total_flos': 5162801895665664.0, 'train_loss': 0.5239724311640078, 'epoch': 1.0})

## Test Results

In [20]:
# -------------------------------
# Evaluate the final model on the test set
# -------------------------------

test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
test_results

{'eval_loss': 0.4239107668399811,
 'eval_accuracy': 0.8655959425190194,
 'eval_precision': 0.868844829732984,
 'eval_recall': 0.8655959425190194,
 'eval_f1': 0.8656727869373435,
 'eval_runtime': 68.9357,
 'eval_samples_per_second': 34.322,
 'eval_steps_per_second': 1.073,
 'epoch': 1.0}

## Testing model after fine-tuning

In [21]:
# Model and Tokenizer Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #changed to torch.device
print(device)

# Define chunk sizes
job_chunk_size = 256
resume_chunk_size = 256
max_length = job_chunk_size + resume_chunk_size

# Load test data
y_test = test_df['label'].tolist()
y_pred = []

# Run Predictions
for i in range(test_df.shape[0]):
    jd = test_df.iloc[i]['job_data']
    rd = test_df.iloc[i]['resume_data']

    predicted_class, class_1_prob = predict_with_voting(jd, rd, lora_model, tokenizer, device)
    y_pred.append(predicted_class)

# Evaluate Model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

cuda
Accuracy:  0.95
Precision:  0.9639175257731959
Recall:  0.935
F1:  0.949238578680203
Confusion Matrix:  [[193   7]
 [ 13 187]]


In [22]:
# Model and Tokenizer Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #changed to torch.device
print(device)

# Define chunk sizes
job_chunk_size = 256
resume_chunk_size = 256
max_length = job_chunk_size + resume_chunk_size

# Load test data
y_test = test_df['label'].tolist()
y_pred = []

# Run Predictions
for i in range(test_df.shape[0]):
    jd = test_df.iloc[i]['job_data']
    rd = test_df.iloc[i]['resume_data']

    predicted_class, class_1_prob = predict_with_voting(jd, rd, model, tokenizer, device)
    y_pred.append(predicted_class)

# Evaluate Model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

cuda
Accuracy:  0.95
Precision:  0.9639175257731959
Recall:  0.935
F1:  0.949238578680203
Confusion Matrix:  [[193   7]
 [ 13 187]]


In [23]:
# prompt: Save the fine-tuned LoRA model to google drive

from google.colab import drive
drive.mount('/content/drive')

# Save the merged model to Google Drive
lora_model.save_pretrained("/content/drive/MyDrive/lora_adaptors")


Mounted at /content/drive


In [24]:
# prompt: Push the fine-tuned model adapters to huggingface

# Push the merged model and tokenizer to Hugging Face Hub
lora_model.push_to_hub("saideep-arikontham/roberta-resume-fit-predictor-adaptor")

adapter_model.safetensors:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saideep-arikontham/roberta-resume-fit-predictor-adaptor/commit/41d7f3a0020355858d1fbce2f5455da7a06fb332', commit_message='Upload model', commit_description='', oid='41d7f3a0020355858d1fbce2f5455da7a06fb332', pr_url=None, repo_url=RepoUrl('https://huggingface.co/saideep-arikontham/roberta-resume-fit-predictor-adaptor', endpoint='https://huggingface.co', repo_type='model', repo_id='saideep-arikontham/roberta-resume-fit-predictor-adaptor'), pr_revision=None, pr_num=None)