# LoRA Fine-tuning : Roberta-base

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
from nltk.corpus import stopwords

import torch
from collections import Counter

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import DatasetDict, Dataset, load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import evaluate


import warnings
warnings.filterwarnings("ignore")


In [2]:
path = "/Users/saideepbunny/Projects/Application_Ranking_System"

## Reading the data

In [3]:
dataset = load_dataset("saideep-arikontham/jd_resume_dataset")
dataset

train-00000-of-00001.parquet:   0%|          | 0.00/5.97M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/660k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/637k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3200 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/400 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
    test: Dataset({
        features: ['job_data', 'resume_data', 'label', '__index_level_0__'],
        num_rows: 400
    })
})

In [8]:
train_df = dataset['train'].to_pandas()
train_df['label'].value_counts()

label
0    1600
1    1600
Name: count, dtype: int64

In [9]:
val_df = dataset['validation'].to_pandas()
val_df['label'].value_counts()

label
0    200
1    200
Name: count, dtype: int64

In [10]:
test_df = dataset['test'].to_pandas()
test_df['label'].value_counts()

label
0    200
1    200
Name: count, dtype: int64

In [11]:
# Defining label2id
label2id = {'Bad Fit': 0, 'Good Fit': 1}

## Loading the model

In [13]:
# -------------------------------
# Load tokenizer and base model
# -------------------------------

model_name = "FacebookAI/roberta-base"  # change to your desired pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # adjust num_labels as needed

print(model)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Configure LoRA

In [15]:
# -------------------------------
# Configure LoRA fine-tuning
# -------------------------------

# Define a LoRA configuration. Adjust parameters (r, lora_alpha, etc.) as needed.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # for sequence classification
    r=8,                        # low rank parameter; experiment with this value
    lora_alpha=32,              # scaling parameter
    lora_dropout=0.1,           # dropout probability for LoRA layers
    target_modules=["query", "value", "out_proj"]  # adjust based on your model architecture
)

# Wrap your model with LoRA. This freezes most of the model and inserts trainable LoRA layers.
model = get_peft_model(model, lora_config)

## Define Preprocessing function

In [17]:
# -------------------------------
# Improved tokenization approach for job and resume matching
# -------------------------------

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.add("overqualified")
stop_words.add("underqualified")
stop_words.add("mismatch")
stop_words.add("good")

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove unwanted symbols except %, $, /, and .
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)

    # Preserve hyphens only when followed by a number (e.g., 2005-2010, 2010-present)
    text = re.sub(r"-(?!\d)", "", text)  # Remove hyphens not followed by a digit

    # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Remove '/' unless between numbers

    # Remove periods (".") if they are immediately after a word but not numbers (e.g., "good." → "good", but keep 3.8)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)

    # Remove newline characters
    text = text.replace("\n", " ").replace("\r", " ")

    # Remove any show less and show more texts
    text = text.replace("show less", "").replace("show more", "")
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text


def preprocess_function(examples):

    # Preprocess the text first
    examples["job_data"] = [preprocess_text(job) for job in examples["job_data"]]
    examples["resume_data"] = [preprocess_text(resume) for resume in examples["resume_data"]]

    # Process job descriptions and resumes separately with appropriate max lengths
    job_max_length = 256
    resume_max_length = 256

    # Tokenize job descriptions
    job_inputs = tokenizer(
        examples["job_data"],
        truncation=True,
        max_length=job_max_length,
        padding="max_length"
    )

    # Tokenize resumes
    resume_inputs = tokenizer(
        examples["resume_data"],
        truncation=True,
        max_length=resume_max_length,
        padding="max_length"
    )

    # Combine the tokenized inputs
    combined_inputs = {
        "input_ids": [],
        "attention_mask": []
    }

    for job_ids, job_mask, resume_ids, resume_mask in zip(
        job_inputs["input_ids"], job_inputs["attention_mask"],
        resume_inputs["input_ids"], resume_inputs["attention_mask"]
    ):
        # Get separator token ID
        separator_id = tokenizer.sep_token_id
        if separator_id is None:
            separator_id = tokenizer.eos_token_id

        # Combine tokens with separator
        combined_ids = job_ids + [separator_id] + resume_ids
        combined_mask = job_mask + [1] + resume_mask

        # Ensure we don't exceed the model's max input length
        max_model_length = 512
        if len(combined_ids) > max_model_length:
            combined_ids = combined_ids[:max_model_length]
            combined_mask = combined_mask[:max_model_length]

        combined_inputs["input_ids"].append(combined_ids)
        combined_inputs["attention_mask"].append(combined_mask)

    # Add labels
    combined_inputs["labels"] = examples["label"]

    return combined_inputs

# Apply the tokenization to all splits
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saideep_arikontham/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

## Setting Training parameters

In [19]:

# -------------------------------
# Setup training parameters
# -------------------------------

training_args = TrainingArguments(
    output_dir=f"{path}/models/test1",
    evaluation_strategy="epoch",
    save_strategy="epoch",             # Set save strategy to epoch to match evaluation_strategy
    num_train_epochs=3,                # Adjust number of epochs as desired
    per_device_train_batch_size=32,    # Adjust based on your GPU memory
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,                # Keep only the latest checkpoint
    load_best_model_at_end=True,       # Load the best model when finished training (if metric provided)
    metric_for_best_model="accuracy",  # Choose your metric
)


## Defining Evaluation metrics

In [21]:
# -------------------------------
# Define a metric function for evaluation
# -------------------------------

# Load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }


## Training the model

In [23]:
# -------------------------------
# Create the Trainer and start training
# -------------------------------

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics
)

# Train the model with LoRA fine-tuning
trainer.train()

RuntimeError: MPS backend out of memory (MPS allocated: 19.93 GB, other allocations: 432.75 MB, max allowed: 20.40 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

## Test Results

In [None]:
# -------------------------------
# Evaluate the final model on the test set
# -------------------------------

test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
test_results

## Predicition Function

In [None]:
# -------------------------------
# Define a prediction function with chunking and voting
# -------------------------------

def predict_job_resume_match_with_chunking(job_text, resume_text, model, tokenizer,
                                           job_chunk_size=256, resume_chunk_size=256,
                                           max_model_length=512):
    """
    Predicts a label for a job-resume pair by chunking both texts and performing
    majority vote on predictions from all chunk combinations.

    Args:
        job_text (str): The job description text.
        resume_text (str): The resume text.
        model: The fine-tuned model.
        tokenizer: The tokenizer corresponding to the model.
        job_chunk_size (int): Maximum tokens per job chunk.
        resume_chunk_size (int): Maximum tokens per resume chunk.
        max_model_length (int): Maximum allowed length for model input.

    Returns:
        final_prediction (int): The predicted label based on majority voting.
    """

    # Tokenize the texts without truncation or adding special tokens
    job_tokens = tokenizer(job_text, truncation=False, add_special_tokens=False)["input_ids"]
    resume_tokens = tokenizer(resume_text, truncation=False, add_special_tokens=False)["input_ids"]

    # Create chunks for the job and resume tokens
    job_chunks = [job_tokens[i:i+job_chunk_size] for i in range(0, len(job_tokens), job_chunk_size)]
    resume_chunks = [resume_tokens[i:i+resume_chunk_size] for i in range(0, len(resume_tokens), resume_chunk_size)]

    predictions = []
    separator_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else tokenizer.eos_token_id

    # Iterate over every combination of job and resume chunks
    for job_chunk in job_chunks:
        for resume_chunk in resume_chunks:
            # Combine the chunks with a separator token
            combined_ids = job_chunk + [separator_id] + resume_chunk

            # Truncate if the combined input exceeds the model's maximum length
            if len(combined_ids) > max_model_length:
                combined_ids = combined_ids[:max_model_length]

            # Create input tensors
            input_ids = torch.tensor([combined_ids])
            attention_mask = torch.tensor([[1] * len(combined_ids)])

            # Get prediction from the model
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

    # Majority vote among predictions
    vote = Counter(predictions)
    final_prediction = vote.most_common(1)[0][0]

    return final_prediction