## Install & Import libraries

In [135]:
# Record the start time of the notebook
import time
start_time = time.time()

In [136]:
# Mount GDrive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [137]:
# Install necessary libraries
!pip install -q torch transformers tqdm scikit-learn pandas numpy rouge-score jiwer nltk sympy

In [138]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from tqdm import tqdm
import re
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from jiwer import wer, cer
from nltk.metrics.distance import jaro_winkler_similarity
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [139]:
# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Load and Preprocess Data


In [140]:
# Load your dataset
file_path = '/content/drive/My Drive/JobSeeking/Orfium/normalization_assesment_dataset_10k.csv'
df = pd.read_csv(file_path)

In [141]:
# Preprocessing function
def preprocess_text(text):
    """
    Preprocess the text by:
    - Lowercasing
    - Removing special characters and numbers
    - Stripping leading/trailing whitespaces
    """
    if type(text) == float: # Handle NaN values
      text = str(text)

    text = text.lower()  # Lowercase text
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters and numbers
    text = " ".join(text.split())  # Remove extra spaces
    return text

# Apply preprocessing to the dataset
df['raw_comp_writers_text'] = df['raw_comp_writers_text'].apply(preprocess_text)
df['CLEAN_TEXT'] = df['CLEAN_TEXT'].apply(preprocess_text)

In [142]:
# Split the dataset into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Extract inputs and targets for each split
train_inputs, train_targets = train_df['raw_comp_writers_text'].tolist(), train_df['CLEAN_TEXT'].tolist()
val_inputs, val_targets = val_df['raw_comp_writers_text'].tolist(), val_df['CLEAN_TEXT'].tolist()
test_inputs, test_targets = test_df['raw_comp_writers_text'].tolist(), test_df['CLEAN_TEXT'].tolist()

## Tokenizer and Dataset Preparation

Models Used

1. **T5 (Text-to-Text Transfer Transformer)**
* Model: t5-small
* Description: T5 is a transformer model designed for a wide range of NLP tasks in a unified framework. It casts all NLP tasks as text-to-text problems, meaning that both inputs and outputs are in text format. This makes it highly versatile for tasks like text summarization, translation, and normalization.
* Size: Small version (about 60 million parameters) offers a good balance between performance and efficiency for lightweight tasks.
Use Case: Suitable for general text normalization tasks.

2. **BART (Bidirectional and Auto-Regressive Transformers)**

* Model: facebook/bart-base
* Description: BART is a denoising autoencoder model designed for sequence-to-sequence tasks. It combines the best of both autoencoders and transformers, providing a strong model for text generation, summarization, and translation. It works by corrupting an input sequence and training the model to reconstruct it.
* Size: bart-base is a medium-sized model with around 140 million parameters, offering a balance between speed and performance for many tasks.
Use Case: Works well for text generation and summarization tasks, making it suitable for text normalization tasks where a transformation is needed.

3. **mT5 (Multilingual T5)**

* Model: google/mt5-small
* Description: mT5 is a multilingual version of T5, designed to handle a wide variety of languages. It applies the same text-to-text framework as T5 but is trained on multiple languages, making it ideal for multilingual NLP tasks. It supports over 100 languages.
* Size: The mT5-small variant has around 60 million parameters, making it lightweight and efficient while being multilingual.
Use Case: Ideal for text normalization tasks involving multiple languages or when working with non-English datasets.


In [143]:
models = ["t5-small", "facebook/bart-base", "google/mt5-small"]

# Initialize the tokenizer and model from HuggingFace
model_name = models[1]
tokenizer = AutoTokenizer.from_pretrained(model_name)
batch_size = 8

In [144]:
# Create a PyTorch dataset
class TextNormalizationDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_len=128):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        input_text = self.inputs[item]
        target_text = self.targets[item]

        # Tokenize the inputs and targets
        input_encoding = self.tokenizer(input_text, padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")
        target_encoding = self.tokenizer(target_text, padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")

        # Extract the necessary values from the tokenization process
        item = {
            "input_ids": input_encoding["input_ids"].squeeze(0),
            "attention_mask": input_encoding["attention_mask"].squeeze(0),
            "labels": target_encoding["input_ids"].squeeze(0),
        }

        # Move tensors to device (GPU or CPU)
        item["input_ids"] = item["input_ids"].to(device)
        item["attention_mask"] = item["attention_mask"].to(device)
        item["labels"] = item["labels"].to(device)

        return item

# Create the data loaders
train_dataset = TextNormalizationDataset(train_inputs, train_targets, tokenizer)
val_dataset = TextNormalizationDataset(val_inputs, val_targets, tokenizer)
test_dataset = TextNormalizationDataset(test_inputs, test_targets, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Initialize Model & Optimizer

In [145]:
# Load the pre-trained model for sequence-to-sequence tasks
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up optimizer and loss function
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

## Training Loop

In [146]:
# Start training and validation time
start_train_val_time = time.time()

# Training and validation loop
epochs = 6
for epoch in range(epochs):
    model.train()  # Set model to training mode
    train_loss = 0
    val_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")

    # Training phase
    for batch in progress_bar:
        optimizer.zero_grad()

        # Move data to device (GPU/CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update progress bar with the current loss
        progress_bar.set_postfix(loss=loss.item())

    # Validation phase
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # No gradient computation for validation
        for batch in tqdm(val_loader, desc="Validating"):
            # Move validation data to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    # Print training and validation loss for the epoch
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss / len(train_loader):.2f}, Validation Loss: {val_loss / len(val_loader):.2f}")

# End of training and validation time
end_train_val_time = time.time()
train_val_elapsed_time = end_train_val_time - start_train_val_time
print(f"Training and Validation Time: {train_val_elapsed_time / 60:.2f} minutes")

Training Epoch 1: 100%|██████████| 875/875 [04:54<00:00,  2.97it/s, loss=0.00893]
Validating: 100%|██████████| 188/188 [00:18<00:00, 10.27it/s]


Epoch 1: Train Loss: 0.42, Validation Loss: 0.03


Training Epoch 2: 100%|██████████| 875/875 [04:49<00:00,  3.02it/s, loss=0.0247]
Validating: 100%|██████████| 188/188 [00:18<00:00, 10.32it/s]


Epoch 2: Train Loss: 0.03, Validation Loss: 0.03


Training Epoch 3: 100%|██████████| 875/875 [04:49<00:00,  3.02it/s, loss=0.0234]
Validating: 100%|██████████| 188/188 [00:18<00:00, 10.26it/s]


Epoch 3: Train Loss: 0.03, Validation Loss: 0.02


Training Epoch 4: 100%|██████████| 875/875 [04:49<00:00,  3.02it/s, loss=0.00718]
Validating: 100%|██████████| 188/188 [00:18<00:00, 10.32it/s]


Epoch 4: Train Loss: 0.02, Validation Loss: 0.02


Training Epoch 5: 100%|██████████| 875/875 [04:49<00:00,  3.02it/s, loss=0.034]
Validating: 100%|██████████| 188/188 [00:18<00:00, 10.36it/s]


Epoch 5: Train Loss: 0.02, Validation Loss: 0.03


Training Epoch 6: 100%|██████████| 875/875 [04:49<00:00,  3.02it/s, loss=0.0234]
Validating: 100%|██████████| 188/188 [00:19<00:00,  9.79it/s]

Epoch 6: Train Loss: 0.02, Validation Loss: 0.03
Training and Validation Time: 30.89 minutes





## Evaluation on Test Set

In [147]:
# Start evaluation time
eval_start_time = time.time()

test_predictions = []
test_references = []
model.eval()

# Initialize ROUGE scorer
rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

exact_match_count = 0
total_count = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating on Test Set"):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"]

        # Generate predictions
        outputs = model.generate(input_ids, max_length=128)
        predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        references = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

        test_predictions.extend(predictions)
        test_references.extend(references)

        # Exact Match (EM)
        for pred, ref in zip(predictions, references):
            total_count += 1
            if pred.strip() == ref.strip():
                exact_match_count += 1

# Calculate metrics
accuracy = accuracy_score(test_references, test_predictions)

# Calculate BLEU score
bleu_score = np.mean([sentence_bleu([ref.split()], pred.split()) for ref, pred in zip(test_references, test_predictions)])

# Calculate ROUGE scores
rouge1 = np.mean([rouge.score(ref, pred)["rouge1"].fmeasure for ref, pred in zip(test_references, test_predictions)])
rouge2 = np.mean([rouge.score(ref, pred)["rouge2"].fmeasure for ref, pred in zip(test_references, test_predictions)])
rougeL = np.mean([rouge.score(ref, pred)["rougeL"].fmeasure for ref, pred in zip(test_references, test_predictions)])

# Calculate Jaro-Winkler similarity
jaro_winkler_scores = np.mean([jaro_winkler_similarity(ref, pred) for ref, pred in zip(test_references, test_predictions)])

# Calculate Word Error Rate (WER)
wer_score = np.mean([wer(ref, pred) for ref, pred in zip(test_references, test_predictions)])

# Calculate Character Error Rate (CER)
cer_score = np.mean([cer(ref, pred) for ref, pred in zip(test_references, test_predictions)])

# Calculate Exact Match (EM)
exact_match = exact_match_count / total_count

# Print evaluation results
print(f"Test Accuracy: {accuracy:.2f}")
print(f"Test BLEU Score: {bleu_score:.2f}")
print(f"Test ROUGE-1 Score: {rouge1:.2f}")
print(f"Test ROUGE-2 Score: {rouge2:.2f}")
print(f"Test ROUGE-L Score: {rougeL:.2f}")
print(f"Test Jaro-Winkler Score: {jaro_winkler_scores:.2f}")
print(f"Test WER: {wer_score:.2f}")
print(f"Test CER: {cer_score:.2f}")
print(f"Test Exact Match: {exact_match:.2f}")

# End evaluation time
eval_end_time = time.time()
eval_elapsed_time = eval_end_time - eval_start_time
print(f"Evaluation Time: {eval_elapsed_time / 60:.2f} minutes")

Evaluating on Test Set: 100%|██████████| 188/188 [01:13<00:00,  2.56it/s]


Test Accuracy: 0.71
Test BLEU Score: 0.14
Test ROUGE-1 Score: 0.79
Test ROUGE-2 Score: 0.63
Test ROUGE-L Score: 0.79
Test Jaro-Winkler Score: 0.93
Test WER: 0.49
Test CER: 0.78
Test Exact Match: 0.71
Evaluation Time: 1.24 minutes


## Total Notebook Execution Time

In [148]:
# Record the end time and calculate the total execution time
end_time = time.time()
execution_time = end_time - start_time

print(f"Total Execution Time: {execution_time / 60:.2f} minutes")

Total Execution Time: 32.28 minutes


## Save Results

In [149]:
# Get current timestamp
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Format the metrics to 2 decimal places for numeric values
metrics = {
    "timestamp": timestamp,
    "gpu_used": device!="cpu",
    "model_name": model_name,
    "epochs": epochs,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "accuracy": round(accuracy, 2),
    "bleu_score": round(bleu_score, 2),
    "rouge1": round(rouge1, 2),
    "rouge2": round(rouge2, 2),
    "rougeL": round(rougeL, 2),
    "jaro_winkler_score": round(jaro_winkler_scores, 2),
    "wer": round(wer_score, 2),
    "cer": round(cer_score, 2),
    "exact_match": round(exact_match, 2),
    "total_notebook_time": int(execution_time / 60)
}

# Convert the dictionary to a DataFrame
metrics_df = pd.DataFrame([metrics])

# Read existing CSV if it exists, otherwise create a new one
csv_file = '/content/drive/My Drive/JobSeeking/Orfium/finetuning_results.csv'
try:
    existing_df = pd.read_csv(csv_file)
    updated_df = pd.concat([existing_df, metrics_df], ignore_index=True)
except FileNotFoundError:
    # If the file doesn't exist, create a new CSV with the metrics
    updated_df = metrics_df

# Save the results to the CSV file
updated_df.to_csv(csv_file, index=False)
display(updated_df)

Unnamed: 0,timestamp,gpu_used,model_name,epochs,learning_rate,batch_size,accuracy,bleu_score,rouge1,rouge2,rougeL,jaro_winkler_score,wer,cer,exact_match,total_notebook_time
0,2025-01-02 10:09:10,True,t5-small,3,5e-05,8,0.66,0.16,0.73,0.64,0.73,0.87,0.71,0.97,0.66,10
1,2025-01-02 10:28:24,True,facebook/bart-base,3,5e-05,8,0.72,0.15,0.79,0.64,0.79,0.93,0.58,0.81,0.72,17
2,2025-01-02 10:53:02,True,google/mt5-small,3,5e-05,8,0.0,0.0,0.0,0.0,0.0,0.41,1.0,1.36,0.0,23
3,2025-01-02 11:26:52,True,facebook/bart-base,6,5e-05,8,0.71,0.14,0.79,0.63,0.79,0.93,0.49,0.78,0.71,32


In [150]:
# import os
# csv_file = '/content/drive/My Drive/JobSeeking/Orfium/finetuning_results.csv'
# if os.path.exists(csv_file):
#   os.remove(csv_file)
#   print(f"File '{csv_file}' deleted successfully.")
# else:
#   print(f"File '{csv_file}' not found.")