# week 9 stuff


In [1]:
import pandas as pd
import re
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.cuda.amp import autocast, GradScaler
import random
from nltk.corpus import wordnet

# Clear CUDA cache
torch.cuda.empty_cache()

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Loading data
data = []
with open('msr_paraphrase_train.txt', 'r') as file:
    next(file)
    for line in file:
        split_line = line.strip().split('\t')
        if len(split_line) == 5:
            data.append(split_line)
        else:
            print(f"Skipping line due to incorrect number of columns: {line}")

columns = ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"]
df = pd.DataFrame(data, columns=columns)
df['Quality'] = df['Quality'].astype(int)

data = []
with open('msr_paraphrase_test.txt', 'r') as file:
    next(file)
    for line in file:
        split_line = line.strip().split('\t')
        if len(split_line) == 5:
            data.append(split_line)
        else:
            print(f"Skipping line due to incorrect number of columns: {line}")

df_test = pd.DataFrame(data, columns=columns)
df_test['Quality'] = df_test['Quality'].astype(int)

# Clean and preprocess data
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_text_advanced(text):
    text = clean_text(text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word.lower() not in stop_words and len(word) > 1])
    return text

def synonym_replacement(text, n):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n: 
            break
    return ' '.join(new_words)

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").lower()
            if synonym != word:
                synonyms.add(synonym)
    return synonyms

df['#1 String Cleaned'] = df['#1 String'].apply(clean_text)
df['#2 String Cleaned'] = df['#2 String'].apply(clean_text)

df['#1 String Processed'] = df['#1 String Cleaned'].apply(preprocess_text_advanced)
df['#2 String Processed'] = df['#2 String Cleaned'].apply(preprocess_text_advanced)

df_test['#1 String Cleaned'] = df_test['#1 String'].apply(clean_text)
df_test['#2 String Cleaned'] = df_test['#2 String'].apply(clean_text)

df_test['#1 String Processed'] = df_test['#1 String Cleaned'].apply(preprocess_text_advanced)
df_test['#2 String Processed'] = df_test['#2 String Cleaned'].apply(preprocess_text_advanced)

# Define a custom dataset for loading the data
class ParaphraseDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text1 = row['#1 String Processed']
        text2 = row['#2 String Processed']
        label = int(row['Quality'])

        # Apply synonym replacement augmentation
        if random.random() < 0.5:  # 50% chance to apply augmentation
            text1 = synonym_replacement(text1, 1)
            text2 = synonym_replacement(text2, 1)

        encoding = self.tokenizer.encode_plus(
            text1,
            text2,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create DataLoader for training and validation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = ParaphraseDataset(train_df, tokenizer, max_len=128)
val_dataset = ParaphraseDataset(val_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Define the RoBERTa model with dropout and L2 regularization
class ParaphraseModel(nn.Module):
    def __init__(self):
        super(ParaphraseModel, self).__init__()
        self.bert = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        self.dropout = nn.Dropout(p=0.3)  # Adding dropout layer with dropout rate of 0.3

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = self.dropout(outputs.logits)  # Apply dropout to logits
        return outputs.loss, logits

# Initialize model, loss function, and optimizer with L2 regularization
model = ParaphraseModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)  # Adding L2 regularization with weight decay

# Scheduler for learning rate decay
total_steps = len(train_loader) * 20
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

scaler = GradScaler()

# Training loop with early stopping
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

early_stopping_patience = 3
best_val_loss = float('inf')
epochs_no_improve = 0

def train_epoch(model, data_loader, criterion, optimizer, scheduler, device, scaler):
    model.train()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    for data in tqdm(data_loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        optimizer.zero_grad()

        with autocast():
            loss, logits = model(input_ids, attention_mask, labels)
            _, preds = torch.max(logits, dim=1)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # Update the learning rate

        losses.append(loss.item())
        correct_predictions += torch.sum(preds == labels)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    acc = correct_predictions.double() / len(data_loader.dataset)
    f1 = f1_score(all_labels, all_preds)
    return acc, np.mean(losses), f1

def eval_model(model, data_loader, criterion, device):
    model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            loss, logits = model(input_ids, attention_mask, labels)
            _, preds = torch.max(logits, dim=1)

            losses.append(loss.item())
            correct_predictions += torch.sum(preds == labels)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = correct_predictions.double() / len(data_loader.dataset)
    f1 = f1_score(all_labels, all_preds)
    return acc, np.mean(losses), f1

# Train the model
epochs = 20

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_acc, train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, scheduler, device, scaler)
    print(f'Train loss: {train_loss}, Train accuracy: {train_acc}, Train F1: {train_f1}')

    val_acc, val_loss, val_f1 = eval_model(model, val_loader, criterion, device)
    print(f'Validation loss: {val_loss}, Validation accuracy: {val_acc}, Validation F1: {val_f1}')

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_model_ROBERTA2.pth')
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= early_stopping_patience:
            print('Early stopping!')
            break

# Load the best model
model.load_state_dict(torch.load('best_model_ROBERTA2.pth'))

# Save the model
# torch.save(model.state_dict(), "paraphrase_model_ROBERTA2.pth") 


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 408/408 [00:54<00:00,  7.51it/s]


Train loss: 0.6382749781889074, Train accuracy: 0.6236196319018406, Train F1: 0.7587020648967552
Validation loss: 0.6134999829764459, Validation accuracy: 0.7267156862745098, Validation F1: 0.8332086761406132
Epoch 2/20


100%|██████████| 408/408 [00:52<00:00,  7.70it/s]


Train loss: 0.5902815706589642, Train accuracy: 0.6460122699386504, Train F1: 0.7529965753424658
Validation loss: 0.49389328325496, Validation accuracy: 0.7316176470588235, Validation F1: 0.8332063975628331
Epoch 3/20


100%|██████████| 408/408 [00:53<00:00,  7.70it/s]


Train loss: 0.5405519359252032, Train accuracy: 0.6990797546012271, Train F1: 0.7761806981519507
Validation loss: 0.47818009219333235, Validation accuracy: 0.767156862745098, Validation F1: 0.843492586490939
Epoch 4/20


100%|██████████| 408/408 [00:53<00:00,  7.66it/s]


Train loss: 0.45951179897083955, Train accuracy: 0.7539877300613498, Train F1: 0.8079501915708812
Validation loss: 0.5096894225069121, Validation accuracy: 0.7267156862745098, Validation F1: 0.7802955665024631
Epoch 5/20


100%|██████████| 408/408 [00:53<00:00,  7.70it/s]


Train loss: 0.3733238799899232, Train accuracy: 0.8006134969325154, Train F1: 0.8461902508282063
Validation loss: 0.5156420529180882, Validation accuracy: 0.7549019607843137, Validation F1: 0.8106060606060606
Epoch 6/20


100%|██████████| 408/408 [00:52<00:00,  7.71it/s]


Train loss: 0.3009348394824009, Train accuracy: 0.8279141104294478, Train F1: 0.863934028619937
Validation loss: 0.5827327191318367, Validation accuracy: 0.7977941176470588, Validation F1: 0.8595744680851063
Early stopping!


<All keys matched successfully>

## Major Changes:
### Mixed Precision Training: 
- Mixed precision training is a technique that uses both 16-bit (half-precision) and 32-bit (single-precision) floating-point numbers during training. This approach can speed up training and reduce memory usage, allowing you to train larger models or use larger batch sizes.
    - 16-bit Floating Point (Half Precision): Uses less memory and can be processed faster by the GPU.
    - 32-bit Floating Point (Single Precision): Provides more precision and is used where necessary to maintain numerical stability.
- The GradScaler in PyTorch helps with the loss scaling part of mixed precision training. Here’s what it does:
    - Scaling Up the Loss: Before the backward pass, the loss is multiplied by a scaling factor to prevent underflow of small gradient values.
    - Backward Pass: Gradients are computed with the scaled loss.
    - Unscaling the Gradients: After gradients are computed, they are divided by the scaling factor to return them to their correct scale.
    - Checking for Overflow: GradScaler checks if any gradients are too large (overflow). If they are, the scaling factor is reduced to avoid instability in future iterations.

## ROBERTA

Main Differences Between BERT and RoBERTa
- Training Data and Duration:
    - BERT: Trained on the BooksCorpus and English Wikipedia (16GB of text data).
    - RoBERTa: Trained on a much larger dataset (160GB), which includes BooksCorpus, English Wikipedia, CC-News, OpenWebText, and Stories. RoBERTa is trained longer and with more data, leading to better generalization.
- Dynamic Masking:
    - BERT: Uses static masking during pre-training, meaning that the same tokens are masked across different epochs.
    - RoBERTa: Uses dynamic masking, where the tokens chosen for masking change with every epoch. This results in more robust training and better performance.

**Masking**\
Masking is a technique used during the training of language models like BERT and RoBERTa. The idea is to randomly hide some words in a sentence and ask the model to predict these hidden (or masked) words based on the context provided by the other words in the sentence. This helps the model learn the relationships between words and their meanings within the context of a sentence.


- Training Objectives:
    - BERT: Utilizes the next sentence prediction (NSP) task during pre-training to predict if one sentence follows another.
    - RoBERTa: Removes the NSP task, which was found to be less beneficial. Instead, it focuses on masked language modeling (MLM) with dynamic masking.
- Hyperparameters:
    - RoBERTa: Optimizes several hyperparameters such as batch size, learning rate, and training duration. These optimizations lead to more effective training and better performance.
- Byte-Pair Encoding (BPE):
    - RoBERTa: Uses byte-level BPE tokenization, which can handle rare and unseen words more effectively than the wordpiece tokenization used in BERT.
    
    
**NSP**\
NSP is a task used during the pre-training phase of BERT (Bidirectional Encoder Representations from Transformers). The goal of NSP is to help the model understand the relationship between two sentences. 
- but RoBERTa found it less useful and removed it.

**MLM**
Masked Language Modeling (MLM) is a training objective used in both BERT and RoBERTa, where the model learns to predict missing words in a sentence.


**BPE**
Byte-Pair Encoding (BPE) is a tokenization method used to split text into subword units, which can handle rare and unseen words more effectively than traditional tokenization methods.
- Purpose: To create a flexible and efficient vocabulary that can represent both common and rare words, reducing the number of unknown tokens and handling out-of-vocabulary words better.


Why RoBERTa Might Perform Better
- Larger and More Diverse Training Data: RoBERTa is trained on a significantly larger and more diverse dataset, allowing it to learn richer language representations and generalize better to various NLP tasks.
- Dynamic Masking: The dynamic masking technique used by RoBERTa ensures that the model sees different masks of the same text during training, leading to a more robust understanding of the context and better performance.
- Removal of NSP Task: By removing the next sentence prediction task, RoBERTa focuses entirely on the more beneficial masked language modeling objective, improving its performance.
- Hyperparameter Tuning: RoBERTa benefits from extensive hyperparameter tuning, leading to more efficient training and better overall performance.
- Byte-Level BPE: The use of byte-level BPE tokenization allows RoBERTa to handle a wider variety of text inputs, including those with rare or unseen words, more effectively than BERT.


While BERT laid the foundation for many transformer-based models, RoBERTa improves upon it by addressing several limitations and introducing optimizations in training data, dynamic masking, and hyperparameters. These improvements make RoBERTa a more powerful and effective model for various NLP tasks.