# Baseline Modelling

In [2]:
!pip install datasets
!pip install evaluate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

## Importing libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
from nltk.corpus import stopwords


from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import DatasetDict, Dataset
from peft import get_peft_model, LoraConfig, TaskType
import evaluate


import warnings
warnings.filterwarnings("ignore")


In [4]:
path = "/Users/saideepbunny/Projects/Application_Ranking_System"

## Reading the data

In [5]:
df1 = pd.read_csv(f"/content/synthetic_data_v2.csv")
df = df1.copy()
converted_labels = {"Complete Mismatch":"Bad Fit", "Underwhelming":"Bad Fit", "Good Fit":"Good Fit", "Overqualified":"Good Fit"}
df['label'] = df['label'].replace(converted_labels)
df

Unnamed: 0,job_data,resume_data,label
0,Audit Manager\nMenzies LLP\nWe are looking for...,**Jennifer Oneal**\n*+1 (555) 555-5555* *|* *j...,Bad Fit
1,Audit Manager\nMenzies LLP\nWe are looking for...,**Christina Padilla DVM**\n\n*Email: christina...,Bad Fit
2,Audit Manager\nMenzies LLP\nWe are looking for...,**Andrew Kirby**\n*+44 00000 000000* *|* *andr...,Good Fit
3,Audit Manager\nMenzies LLP\nWe are looking for...,**Erin Hicks**\n*+1 (555) 555-5555* *|* *erin....,Good Fit
4,"Audit Manager\nOverview\nBaker Tilly US, LLP (...",**Randy Smith**\n*+1 (123) 456-7890* *|* *rand...,Bad Fit
...,...,...,...
3995,Test Engineer\nJob Summary:\nPerforms LAT test...,# Darren Roberts\n\n*Email*: darren.roberts@em...,Good Fit
3996,Test Engineer\nCompany Description\nMUST be au...,**Lisa Wright**\n*+1 (111) 111-1111* *|* *lisa...,Bad Fit
3997,Test Engineer\nCompany Description\nMUST be au...,"# Daniel Meza\n\n*Greensboro, NC* *·* *(555) 5...",Bad Fit
3998,Test Engineer\nCompany Description\nMUST be au...,"# William Torres\n\n*Greensboro, NC* *|* *will...",Good Fit


In [6]:
print("Number of rows in the dataset:", df.shape[0])
print("Number of columns in the dataset:", df.shape[1])

Number of rows in the dataset: 4000
Number of columns in the dataset: 3


In [7]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Bad Fit,2000
Good Fit,2000


In [8]:
# Example mapping: adjust as necessary for your labels
label_list = sorted(df['label'].unique())
label2id = {label: i for i, label in enumerate(label_list)}
df['label'] = df['label'].map(label2id)
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,2000
1,2000


## Data Preprocessing

In [9]:
# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.add("overqualified")
stop_words.add("underqualified")
stop_words.add("mismatch")
stop_words.add("good")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove unwanted symbols except %, $, /, and .
    text = re.sub(r"[^a-z0-9\s%$/.-]", "", text)

    # Preserve hyphens only when followed by a number (e.g., 2005-2010, 2010-present)
    text = re.sub(r"-(?!\d)", "", text)  # Remove hyphens not followed by a digit

    # Preserve GPA-like formats (e.g., 3.8/4.0)
    text = re.sub(r"(?<!\d)/|/(?!\d)", " ", text)  # Remove '/' unless between numbers

    # Remove periods (".") if they are immediately after a word but not numbers (e.g., "good." → "good", but keep 3.8)
    text = re.sub(r"\b(\w+)\.(?!\d)", r"\1", text)

    # Remove newline characters
    text = text.replace("\n", " ").replace("\r", " ")

    # Remove any show less and show more texts
    text = text.replace("show less", "").replace("show more", "")
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)

    return text

In [11]:
df['job_data'] = df['job_data'].apply(preprocess_text)
df['resume_data'] = df['resume_data'].apply(preprocess_text)

In [12]:

# -------------------------------
# 1. Load and split your dataset
# -------------------------------

# Replace with your data source – here we assume a CSV with a "text" column and label column (e.g., "label")
# For example: data_files={"data": "your_data.csv"}
dataset = Dataset.from_pandas(df)

# First, split into 80% train and 20% test
train_test = dataset.train_test_split(test_size=0.2, seed=42, shuffle=False)

# Then, split the 20% test split equally into validation and test (10% each)
test_val = train_test['test'].train_test_split(test_size=0.5, seed=42, shuffle=False)

dataset = DatasetDict({
    "train": train_test["train"],
    "validation": test_val["train"],
    "test": test_val["test"]
})

# -------------------------------
# 2. Load tokenizer and base model
# -------------------------------

model_name = "FacebookAI/roberta-base"  # change to your desired pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # adjust num_labels as needed

print(model)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [13]:
import torch
import numpy as np
from collections import Counter
from transformers import Trainer, TrainingArguments
import evaluate

# Assume that necessary LoRA classes and functions are imported:
# from peft import LoraConfig, get_peft_model, TaskType

# -------------------------------
# 3. Configure LoRA fine-tuning
# -------------------------------

# Define a LoRA configuration. Adjust parameters (r, lora_alpha, etc.) as needed.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # for sequence classification
    r=8,                        # low rank parameter; experiment with this value
    lora_alpha=32,              # scaling parameter
    lora_dropout=0.1,           # dropout probability for LoRA layers
    target_modules=["query", "value", "out_proj"]  # adjust based on your model architecture
)

# Wrap your model with LoRA. This freezes most of the model and inserts trainable LoRA layers.
model = get_peft_model(model, lora_config)

# -------------------------------
# 4. Improved tokenization approach for job and resume matching
# -------------------------------

def preprocess_function(examples):
    # Process job descriptions and resumes separately with appropriate max lengths
    job_max_length = 256
    resume_max_length = 256

    # Tokenize job descriptions
    job_inputs = tokenizer(
        examples["job_data"],
        truncation=True,
        max_length=job_max_length,
        padding="max_length"
    )

    # Tokenize resumes
    resume_inputs = tokenizer(
        examples["resume_data"],
        truncation=True,
        max_length=resume_max_length,
        padding="max_length"
    )

    # Combine the tokenized inputs
    combined_inputs = {
        "input_ids": [],
        "attention_mask": []
    }

    for job_ids, job_mask, resume_ids, resume_mask in zip(
        job_inputs["input_ids"], job_inputs["attention_mask"],
        resume_inputs["input_ids"], resume_inputs["attention_mask"]
    ):
        # Get separator token ID
        separator_id = tokenizer.sep_token_id
        if separator_id is None:
            separator_id = tokenizer.eos_token_id

        # Combine tokens with separator
        combined_ids = job_ids + [separator_id] + resume_ids
        combined_mask = job_mask + [1] + resume_mask

        # Ensure we don't exceed the model's max input length
        max_model_length = 512
        if len(combined_ids) > max_model_length:
            combined_ids = combined_ids[:max_model_length]
            combined_mask = combined_mask[:max_model_length]

        combined_inputs["input_ids"].append(combined_ids)
        combined_inputs["attention_mask"].append(combined_mask)

    # Add labels
    combined_inputs["labels"] = examples["label"]

    return combined_inputs

# Apply the tokenization to all splits
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# -------------------------------
# 5. Setup training parameters
# -------------------------------

training_args = TrainingArguments(
    output_dir=f"/content",
    evaluation_strategy="epoch",
    save_strategy="epoch",             # Set save strategy to epoch to match evaluation_strategy
    num_train_epochs=3,                # Adjust number of epochs as desired
    per_device_train_batch_size=16,    # Adjust based on your GPU memory
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,                # Keep only the latest checkpoint
    load_best_model_at_end=True,       # Load the best model when finished training (if metric provided)
    metric_for_best_model="accuracy",  # Choose your metric
)

# -------------------------------
# 6. Define a metric function for evaluation
# -------------------------------

# Load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Compute metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }

# -------------------------------
# 7. Create the Trainer and start training
# -------------------------------

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics
)

# Train the model with LoRA fine-tuning
trainer.train()

# -------------------------------
# 8. Define a prediction function with chunking and voting
# -------------------------------

def predict_job_resume_match_with_chunking(job_text, resume_text, model, tokenizer,
                                           job_chunk_size=256, resume_chunk_size=256,
                                           max_model_length=512):
    """
    Predicts a label for a job-resume pair by chunking both texts and performing
    majority vote on predictions from all chunk combinations.

    Args:
        job_text (str): The job description text.
        resume_text (str): The resume text.
        model: The fine-tuned model.
        tokenizer: The tokenizer corresponding to the model.
        job_chunk_size (int): Maximum tokens per job chunk.
        resume_chunk_size (int): Maximum tokens per resume chunk.
        max_model_length (int): Maximum allowed length for model input.

    Returns:
        final_prediction (int): The predicted label based on majority voting.
    """

    # Tokenize the texts without truncation or adding special tokens
    job_tokens = tokenizer(job_text, truncation=False, add_special_tokens=False)["input_ids"]
    resume_tokens = tokenizer(resume_text, truncation=False, add_special_tokens=False)["input_ids"]

    # Create chunks for the job and resume tokens
    job_chunks = [job_tokens[i:i+job_chunk_size] for i in range(0, len(job_tokens), job_chunk_size)]
    resume_chunks = [resume_tokens[i:i+resume_chunk_size] for i in range(0, len(resume_tokens), resume_chunk_size)]

    predictions = []
    separator_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else tokenizer.eos_token_id

    # Iterate over every combination of job and resume chunks
    for job_chunk in job_chunks:
        for resume_chunk in resume_chunks:
            # Combine the chunks with a separator token
            combined_ids = job_chunk + [separator_id] + resume_chunk

            # Truncate if the combined input exceeds the model's maximum length
            if len(combined_ids) > max_model_length:
                combined_ids = combined_ids[:max_model_length]

            # Create input tensors
            input_ids = torch.tensor([combined_ids])
            attention_mask = torch.tensor([[1] * len(combined_ids)])

            # Get prediction from the model
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

    # Majority vote among predictions
    vote = Counter(predictions)
    final_prediction = vote.most_common(1)[0][0]

    return final_prediction

# Example usage of the updated prediction function:
# prediction = predict_job_resume_match_with_chunking(
#     "Software Engineer job description...",
#     "Experienced software developer resume...",
#     model,
#     tokenizer
# )


Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaideepreddy99[0m ([33msaideepreddy99-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.675823,0.5,0.25,0.5,0.333333
2,No log,0.195807,0.9175,0.926466,0.9175,0.917064
3,0.476900,0.126706,0.95,0.951626,0.95,0.949955


In [14]:
# -------------------------------
# 8. Evaluate the final model on the test set
# -------------------------------

test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test results:", test_results)

Test results: {'eval_loss': 0.12025798857212067, 'eval_accuracy': 0.9525, 'eval_precision': 0.9525113127828196, 'eval_recall': 0.9525, 'eval_f1': 0.9524997031231445, 'eval_runtime': 11.9548, 'eval_samples_per_second': 33.459, 'eval_steps_per_second': 2.091, 'epoch': 3.0}
