## Install Dependencies and Import Libraries

In [33]:
# Install necessary libraries
!pip install transformers torch torchvision nltk rouge-score python-Levenshtein jiwer tqdm



In [34]:
# Import libraries
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from fuzzywuzzy import fuzz
from jiwer import wer, cer
from tqdm import tqdm
import re
from sklearn.model_selection import train_test_split

In [35]:
# Start time
start_time = time.time()

In [36]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [37]:
# Mount GDrive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Load Dataset

In [38]:
# Load dataset
file_path = '/content/drive/My Drive/JobSeeking/Orfium/normalization_assesment_dataset_10k.csv'
df = pd.read_csv(file_path)

# Preview dataset
print("Dataset preview:")
print(df.head())

Dataset preview:
                               raw_comp_writers_text  \
0            Jordan Riley/Adam Argyle/Martin Brammer   
1                                      Martin Hygård   
2  Jesse Robinson/Greg Phillips/Kishaun Bailey/Ka...   
3                                     Mendel Brikman   
4                                          Alvin Lee   

                                          CLEAN_TEXT  
0            Jordan Riley/Adam Argyle/Martin Brammer  
1                                      Martin Hygård  
2  Jesse Robinson/Greg Phillips/Kishaun Bailey/Ka...  
3                                                NaN  
4                                          Alvin Lee  


In [39]:
# Preprocessing function
def preprocess_text(text):
    """
    Preprocess the text by:
    - Lowercasing
    - Removing special characters and numbers
    - Stripping leading/trailing whitespaces
    """
    if type(text) == float: # Handle NaN values
      text = str(text)

    text = text.lower()  # Lowercase text
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters and numbers
    text = " ".join(text.split())  # Remove extra spaces
    return text

# Apply preprocessing to the dataset
raw_texts = df['raw_comp_writers_text'].apply(preprocess_text).tolist()
clean_texts = df['CLEAN_TEXT'].apply(preprocess_text).tolist()

The tokenizer used in BERT (like the one from bert-base-uncased) is based on WordPiece tokenization, which is a subword-based method. It splits words into smaller units or subwords, especially for rare or out-of-vocabulary words. This helps BERT handle a large vocabulary without requiring explicit knowledge of every word in the language. The tokenizer starts by splitting words into the smallest units possible (characters or subword pieces), and then merges frequent pairs of tokens to form a vocabulary. This allows BERT to manage unknown or unseen words by breaking them into smaller, meaningful parts, ensuring efficient learning and generalization across diverse text data.

In [40]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The TextNormalizationDataset class is a custom PyTorch dataset designed for text normalization tasks. It takes raw and clean text pairs, tokenizes them using a provided tokenizer, and returns the tokenized input IDs for both texts. This class supports padding and truncation to a specified maximum length for each text sequence.

In [41]:
# Dataset class
class TextNormalizationDataset(Dataset):
    def __init__(self, raw_texts, clean_texts, tokenizer, max_len=50):
        self.raw_texts = raw_texts
        self.clean_texts = clean_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.raw_texts)

    def __getitem__(self, idx):
        # Get the raw and clean texts
        raw_text = self.raw_texts[idx]
        clean_text = self.clean_texts[idx]

        # Tokenize the raw text and clean text
        raw_encoded = self.tokenizer(
            raw_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        clean_encoded = self.tokenizer(
            clean_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Return the input ids (for raw and clean texts)
        return raw_encoded["input_ids"].squeeze(), clean_encoded["input_ids"].squeeze()

In [42]:
# Split raw and clean texts into training and test sets
train_raw, test_raw, train_clean, test_clean = train_test_split(
    raw_texts, clean_texts, test_size=0.2, random_state=42
)

# Further split training data into training and validation sets
train_raw, val_raw, train_clean, val_clean = train_test_split(
    train_raw, train_clean, test_size=0.1, random_state=42
)

# Create datasets for train, validation, and test sets
train_dataset = TextNormalizationDataset(train_raw, train_clean, tokenizer)
val_dataset = TextNormalizationDataset(val_raw, val_clean, tokenizer)
test_dataset = TextNormalizationDataset(test_raw, test_clean, tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

print(f"Train samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Train samples: 7200
Validation samples: 800
Test samples: 2000


## Define the Model

In [43]:
# Transformer Encoder Model
class TransformerEncoderModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len):
        super(TransformerEncoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))

        # Transformer encoder layer
        encoder_layer = nn.TransformerEncoderLayer(embed_dim, num_heads, ff_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        # Final fully connected layer to predict token probabilities
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src):
        # Apply embedding and positional encoding
        x = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]

        # Pass through transformer encoder
        transformer_output = self.transformer_encoder(x)

        # Pass through fully connected layer to get token predictions
        predictions = self.fc(transformer_output)
        return predictions


# Initialize model
vocab_size = len(tokenizer.vocab)
embed_dim = 128  # Dimension of embedding
num_heads = 8    # Number of attention heads
ff_dim = 512     # Feedforward dimension
num_layers = 6   # Number of transformer encoder layers
max_len = 50     # Maximum sequence length

model = TransformerEncoderModel(vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len).to(device)



## Training Loop

In [44]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [45]:
# Training loop with validation
epochs = 5
for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - Training")
    for src, trg in progress_bar:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()

        # Forward pass through the model
        output = model(src)

        # Reshape output and target for loss calculation
        output = output.view(-1, vocab_size)
        trg = trg.view(-1)

        # Compute loss
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Update progress bar with current training loss
        progress_bar.set_postfix({"Train Loss": train_loss / (progress_bar.n + 1)})

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        progress_bar = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{epochs} - Validation")
        for src, trg in progress_bar:
            src, trg = src.to(device), trg.to(device)

            # Forward pass through the model (for Transformer, we typically pass src)
            output = model(src)

            # Reshape output and target for loss calculation
            output = output.view(-1, vocab_size)
            trg = trg.view(-1)

            # Compute loss
            loss = criterion(output, trg)
            val_loss += loss.item()

            # Update progress bar with current validation loss
            progress_bar.set_postfix({"Val Loss": val_loss / (progress_bar.n + 1)})

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Val Loss: {avg_val_loss:.4f}")

Epoch 1/5 - Training: 100%|██████████| 450/450 [00:21<00:00, 20.65it/s, Train Loss=5.59]


Epoch 1/5, Train Loss: 5.5684


Epoch 1/5 - Validation: 100%|██████████| 50/50 [00:01<00:00, 32.41it/s, Val Loss=3.66]


Epoch 1/5, Val Loss: 3.5895


Epoch 2/5 - Training: 100%|██████████| 450/450 [00:21<00:00, 21.22it/s, Train Loss=2.7]


Epoch 2/5, Train Loss: 2.6930


Epoch 2/5 - Validation: 100%|██████████| 50/50 [00:02<00:00, 22.41it/s, Val Loss=2.6]


Epoch 2/5, Val Loss: 2.4959


Epoch 3/5 - Training: 100%|██████████| 450/450 [00:20<00:00, 21.53it/s, Train Loss=1.78]


Epoch 3/5, Train Loss: 1.7804


Epoch 3/5 - Validation: 100%|██████████| 50/50 [00:00<00:00, 60.99it/s, Val Loss=2.31]


Epoch 3/5, Val Loss: 2.2627


Epoch 4/5 - Training: 100%|██████████| 450/450 [00:14<00:00, 31.82it/s, Train Loss=1.38]


Epoch 4/5, Train Loss: 1.3831


Epoch 4/5 - Validation: 100%|██████████| 50/50 [00:00<00:00, 60.57it/s, Val Loss=2.32]


Epoch 4/5, Val Loss: 2.2714


Epoch 5/5 - Training: 100%|██████████| 450/450 [00:14<00:00, 31.89it/s, Train Loss=1.18]


Epoch 5/5, Train Loss: 1.1684


Epoch 5/5 - Validation: 100%|██████████| 50/50 [00:00<00:00, 61.19it/s, Val Loss=2.35]

Epoch 5/5, Val Loss: 2.3035





## Evaluation

In [46]:
# Evaluation metrics
def evaluate(model, dataloader):
    model.eval()
    bleu_scores, rouge_scores, jw_scores, exact_matches = [], [], [], []
    wer_scores, cer_scores = [], []

    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    with torch.no_grad():
        for src, trg in tqdm(dataloader, desc="Evaluating"):
            src, trg = src.to(device), trg.to(device)
            output = model(src)
            output_ids = output.argmax(dim=-1)

            for pred_ids, ref_ids in zip(output_ids, trg):
                pred_text = tokenizer.decode(pred_ids, skip_special_tokens=True)
                ref_text = tokenizer.decode(ref_ids, skip_special_tokens=True)

                # BLEU
                bleu_scores.append(sentence_bleu([ref_text.split()], pred_text.split()))

                # ROUGE
                rouge = scorer.score(ref_text, pred_text)
                rouge_scores.append(rouge)

                # Jaro-Winkler
                jw_scores.append(fuzz.ratio(ref_text, pred_text))

                # Exact Match
                exact_matches.append(int(pred_text == ref_text))

                # WER and CER
                if ref_text and pred_text:
                    wer_scores.append(wer(ref_text, pred_text))
                    cer_scores.append(cer(ref_text, pred_text))

    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
    avg_rouge1 = sum(r["rouge1"].fmeasure for r in rouge_scores) / len(rouge_scores) if rouge_scores else 0.0
    avg_rouge2 = sum(r["rouge2"].fmeasure for r in rouge_scores) / len(rouge_scores) if rouge_scores else 0.0
    avg_rougeL = sum(r["rougeL"].fmeasure for r in rouge_scores) / len(rouge_scores) if rouge_scores else 0.0
    avg_jw = sum(jw_scores) / len(jw_scores) if jw_scores else 0.0
    avg_wer = sum(wer_scores) / len(wer_scores) if wer_scores else 0.0
    avg_cer = sum(cer_scores) / len(cer_scores) if cer_scores else 0.0
    avg_exact_match = sum(exact_matches) / len(exact_matches) if exact_matches else 0.0

    print(f"Test BLEU Score: {avg_bleu:.2f}")
    print(f"Test ROUGE-1 Score: {avg_rouge1:.2f}")
    print(f"Test ROUGE-2 Score: {avg_rouge2:.2f}")
    print(f"Test ROUGE-L Score: {avg_rougeL:.2f}")
    print(f"Test Jaro-Winkler Score: {avg_jw:.2f}")
    print(f"Test WER: {avg_wer:.2f}")
    print(f"Test CER: {avg_cer:.2f}")
    print(f"Test Exact Match: {avg_exact_match:.2f}")

# Example usage:
evaluate(model, test_loader)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Evaluating: 100%|██████████| 125/125 [00:03<00:00, 34.26it/s]

Test BLEU Score: 0.05
Test ROUGE-1 Score: 0.59
Test ROUGE-2 Score: 0.37
Test ROUGE-L Score: 0.59
Test Jaro-Winkler Score: 80.26
Test WER: 0.76
Test CER: 0.89
Test Exact Match: 0.38





In [47]:
# Total runtime
end_time = time.time()
total_runtime_minutes = (end_time - start_time) / 60
print(f"Total Runtime: {total_runtime_minutes:.1f} minutes")

Total Runtime: 1.8 minutes
