In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# BERT + LSTM

## SNLI

In [3]:
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer

def load_snli_dataset():
    dataset = load_dataset("stanfordnlp/snli")
    return dataset

# Load and preprocess the SNLI dataset
snli_dataset = load_snli_dataset()

train_dataset = snli_dataset['train']
train_dataset = train_dataset[:40000]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

In [4]:
premise_list = train_dataset['premise']
hypothesis_list = train_dataset['hypothesis']
labels_list = train_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize and encode the sentences
def tokenize_function(premise , hypothesis):
    return tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        """
        Initializes the dataset with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized inputs (input_ids, attention_mask).
            labels (list): A list of labels corresponding to the inputs.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieves a single sample from the dataset.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            dict: A dictionary containing input_ids, attention_mask, and label for the sample.
        """
        item = {key: val.squeeze(0) for key, val in self.encodings[idx].items()}
        item['labels'] = self.labels[idx]
        return item

In [7]:
# You can then use this dataset with a DataLoader
from torch.utils.data import DataLoader

train_dataset = CustomDataset(tokenized_dataset, labels_list)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [10]:
from transformers import BertModel
import torch
import torch.nn as nn

class HybridModel(nn.Module):
    def __init__(self, n_classes):
        super(HybridModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.bilstm = nn.LSTM(
            input_size=768,  # BERT's hidden size
            hidden_size=256,
            num_layers=2,
            batch_first=True,
            bidirectional=True  # Using bidirectional LSTM
        )

        # self.lstm = nn.LSTM(input_size=768, hidden_size=256, num_layers=1, batch_first=True)

        fc_layers = []
        input_size = 512  # BiLSTM is bidirectional
        for hidden_size in [128, 64]:
            fc_layers.append(nn.Linear(input_size, hidden_size))
            fc_layers.append(nn.ReLU())
            input_size = hidden_size
        fc_layers.append(nn.Linear(input_size, n_classes))  # Final layer for classification

        self.fc = nn.Sequential(*fc_layers)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        # BERT output
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # LSTM output
        lstm_out, _ = self.bilstm(outputs.last_hidden_state)

        # Fully connected layer output
        logits = self.fc(lstm_out[:, -1, :])  # Only use the last hidden state

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return logits, loss

In [11]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Hyper Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 5
learning_rate = 5e-5

# Move model to device
model = HybridModel(n_classes = 3)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        logits, loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 625/625 [13:48<00:00,  1.33s/it, loss=0.509]


Epoch 1 Loss: 0.6640


Epoch 2: 100%|██████████| 625/625 [13:49<00:00,  1.33s/it, loss=0.234]


Epoch 2 Loss: 0.4022


Epoch 3: 100%|██████████| 625/625 [13:50<00:00,  1.33s/it, loss=0.0471]


Epoch 3 Loss: 0.2582


Epoch 4: 100%|██████████| 625/625 [13:49<00:00,  1.33s/it, loss=0.253]


Epoch 4 Loss: 0.1633


Epoch 5: 100%|██████████| 625/625 [13:49<00:00,  1.33s/it, loss=0.604]

Epoch 5 Loss: 0.1120





In [12]:
test_dataset = snli_dataset['test']
# test_dataset = test_dataset[:10000]

premise_list = test_dataset['premise']
hypothesis_list = test_dataset['hypothesis']
labels_list = test_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

test_dataset = CustomDataset(tokenized_dataset, labels_list)

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [14]:
from sklearn.metrics import accuracy_score, classification_report , precision_score, recall_score, f1_score
import torch

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in tqdm(test_loader):
            # Move data to the appropriate device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            logits, _ = model(input_ids=input_ids, attention_mask=attention_mask)

            # Convert logits to predictions
            preds = torch.argmax(logits, dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    report = classification_report(all_labels, all_preds, target_names=["entailment" , "neutral" , 'contradiction'])  # Adjust target names as per your labels

    return accuracy, precision, recall, f1, report

# Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)


100%|██████████| 614/614 [01:13<00:00,  8.38it/s]

Accuracy: 0.8520
precision: 0.8516
recall: 0.8520
f1: 0.8520
Classification Report:
                precision    recall  f1-score   support

   entailment       0.87      0.87      0.87      3368
      neutral       0.81      0.81      0.81      3219
contradiction       0.87      0.87      0.87      3237

     accuracy                           0.85      9824
    macro avg       0.85      0.85      0.85      9824
 weighted avg       0.85      0.85      0.85      9824






## MSRP

In [15]:
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer
from torch.utils.data import DataLoader

msrp_dataset = load_dataset("HHousen/msrp")

train.csv:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/437k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4076 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [16]:
# Function to tokenize and encode the sentences
def tokenize_function(premise , hypothesis):
    return tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

In [17]:
def create_dataloader(dataset , split = 'train' , batch_size = 64 , shuffle = False):

  dataset = dataset[split]

  premise_list = dataset['sentence1']
  hypothesis_list = dataset['sentence2']
  labels_list = dataset['label']

  tokenized_dataset = []

  for premise, hypothesis in zip(premise_list, hypothesis_list):
      tokenized_dataset.append(tokenize_function(premise, hypothesis))

  train_dataset = CustomDataset(tokenized_dataset, labels_list)
  data_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle=shuffle)

  return data_loader

train_loader = create_dataloader(msrp_dataset , batch_size = 64 , shuffle = True)
test_loader = create_dataloader(msrp_dataset ,split = 'test' ,  batch_size = 64 , shuffle = False)

In [18]:
from transformers import BertModel
import torch
import torch.nn as nn

class HybridModel(nn.Module):
    def __init__(self, n_classes):
        super(HybridModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.bilstm = nn.LSTM(
            input_size=768,  # BERT's hidden size
            hidden_size=256,
            num_layers=2,
            batch_first=True,
            bidirectional=True  # Using bidirectional LSTM
        )

        # self.lstm = nn.LSTM(input_size=768, hidden_size=256, num_layers=1, batch_first=True)

        fc_layers = []
        input_size = 512  # BiLSTM is bidirectional
        for hidden_size in [128, 64]:
            fc_layers.append(nn.Linear(input_size, hidden_size))
            fc_layers.append(nn.ReLU())
            input_size = hidden_size
        fc_layers.append(nn.Linear(input_size, n_classes))  # Final layer for classification

        self.fc = nn.Sequential(*fc_layers)

        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        # BERT output
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # LSTM output
        lstm_out, _ = self.bilstm(outputs.last_hidden_state)

        # Fully connected layer output
        logits = self.fc(lstm_out[:, -1, :])  # Only use the last hidden state

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return logits, loss

In [19]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Hyper Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 5
learning_rate = 5e-5

# Move model to device
model = HybridModel(n_classes = 2)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        logits, loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 64/64 [01:24<00:00,  1.32s/it, loss=0.501]


Epoch 1 Loss: 0.6143


Epoch 2: 100%|██████████| 64/64 [01:24<00:00,  1.33s/it, loss=0.421]


Epoch 2 Loss: 0.4592


Epoch 3: 100%|██████████| 64/64 [01:24<00:00,  1.32s/it, loss=0.199]


Epoch 3 Loss: 0.3531


Epoch 4: 100%|██████████| 64/64 [01:24<00:00,  1.33s/it, loss=0.133]


Epoch 4 Loss: 0.2379


Epoch 5: 100%|██████████| 64/64 [01:24<00:00,  1.33s/it, loss=0.157]

Epoch 5 Loss: 0.1788





In [20]:
from sklearn.metrics import accuracy_score, classification_report , precision_score, recall_score, f1_score
import torch

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            # Move data to the appropriate device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            logits , _ = model(input_ids=input_ids, attention_mask=attention_mask)

            # Convert logits to predictions
            preds = torch.argmax(logits, dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')
    report = classification_report(all_labels, all_preds, target_names=["0" , "1" ])

    return accuracy, precision, recall, f1, report

# Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)


Accuracy: 0.8284
precision: 0.8174
recall: 0.7869
f1: 0.7985
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.66      0.72       578
           1       0.84      0.91      0.88      1147

    accuracy                           0.83      1725
   macro avg       0.82      0.79      0.80      1725
weighted avg       0.83      0.83      0.82      1725



# BERT Model

## SNLI

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer

def load_snli_dataset():
    dataset = load_dataset("stanfordnlp/snli")
    return dataset

# Load and preprocess the SNLI dataset
snli_dataset = load_snli_dataset()

train_dataset = snli_dataset['train']
train_dataset = train_dataset[:40000]

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

In [None]:
premise_list = train_dataset['premise']
hypothesis_list = train_dataset['hypothesis']
labels_list = train_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize and encode the sentences
def tokenize_function(premise , hypothesis):
    return tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        """
        Initializes the dataset with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized inputs (input_ids, attention_mask).
            labels (list): A list of labels corresponding to the inputs.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieves a single sample from the dataset.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            dict: A dictionary containing input_ids, attention_mask, and label for the sample.
        """
        item = {key: val.squeeze(0) for key, val in self.encodings[idx].items()}
        item['labels'] = self.labels[idx]
        return item

In [None]:
# You can then use this dataset with a DataLoader
from torch.utils.data import DataLoader

train_dataset = CustomDataset(tokenized_dataset, labels_list)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels_list)))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Hyper Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 5
learning_rate = 5e-5

# Move model to device
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 625/625 [12:34<00:00,  1.21s/it, loss=0.607]


Epoch 1 Loss: 0.6074


Epoch 2: 100%|██████████| 625/625 [12:43<00:00,  1.22s/it, loss=0.335]


Epoch 2 Loss: 0.3511


Epoch 3: 100%|██████████| 625/625 [12:43<00:00,  1.22s/it, loss=0.238]


Epoch 3 Loss: 0.2021


Epoch 4: 100%|██████████| 625/625 [12:42<00:00,  1.22s/it, loss=0.0212]


Epoch 4 Loss: 0.1201


Epoch 5: 100%|██████████| 625/625 [12:42<00:00,  1.22s/it, loss=0.0331]

Epoch 5 Loss: 0.0771





In [None]:
test_dataset = snli_dataset['test']
# test_dataset = test_dataset[:10000]

premise_list = test_dataset['premise']
hypothesis_list = test_dataset['hypothesis']
labels_list = test_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

test_dataset = CustomDataset(tokenized_dataset, labels_list)

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
from sklearn.metrics import accuracy_score, classification_report , precision_score, recall_score, f1_score
import torch

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            # Move data to the appropriate device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Raw model predictions (before softmax)

            # Convert logits to predictions
            preds = torch.argmax(logits, dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    report = classification_report(all_labels, all_preds, target_names=["entailment" , "neutral" , 'contradiction'])  # Adjust target names as per your labels

    return accuracy, precision, recall, f1, report

# Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)


Accuracy: 0.8587
precision: 0.8594
recall: 0.8587
f1: 0.8590
Classification Report:
                precision    recall  f1-score   support

   entailment       0.89      0.86      0.88      3368
      neutral       0.81      0.83      0.82      3219
contradiction       0.88      0.88      0.88      3237

     accuracy                           0.86      9824
    macro avg       0.86      0.86      0.86      9824
 weighted avg       0.86      0.86      0.86      9824



## MSRP

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer
from torch.utils.data import DataLoader

msrp_dataset = load_dataset("HHousen/msrp")

In [None]:
# Function to tokenize and encode the sentences
def tokenize_function(premise , hypothesis):
    return tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

In [None]:
def create_dataloader(dataset , split = 'train' , batch_size = 64 , shuffle = False):

  dataset = dataset[split]

  premise_list = dataset['sentence1']
  hypothesis_list = dataset['sentence2']
  labels_list = dataset['label']

  tokenized_dataset = []

  for premise, hypothesis in zip(premise_list, hypothesis_list):
      tokenized_dataset.append(tokenize_function(premise, hypothesis))

  train_dataset = CustomDataset(tokenized_dataset, labels_list)
  data_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle=shuffle)

  return data_loader

train_loader = create_dataloader(msrp_dataset , batch_size = 64 , shuffle = True)
test_loader = create_dataloader(msrp_dataset ,split = 'test' ,  batch_size = 64 , shuffle = False)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 5
learning_rate = 5e-5

# Move model to device
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 64/64 [01:17<00:00,  1.21s/it, loss=0.408]


Epoch 1 Loss: 0.5944


Epoch 2: 100%|██████████| 64/64 [01:18<00:00,  1.23s/it, loss=0.296]


Epoch 2 Loss: 0.4079


Epoch 3: 100%|██████████| 64/64 [01:18<00:00,  1.23s/it, loss=0.349]


Epoch 3 Loss: 0.2552


Epoch 4: 100%|██████████| 64/64 [01:18<00:00,  1.23s/it, loss=0.135]


Epoch 4 Loss: 0.1396


Epoch 5: 100%|██████████| 64/64 [01:18<00:00,  1.23s/it, loss=0.0416]

Epoch 5 Loss: 0.0869





In [None]:
from sklearn.metrics import accuracy_score, classification_report , precision_score, recall_score, f1_score
import torch

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            # Move data to the appropriate device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Raw model predictions (before softmax)

            # Convert logits to predictions
            preds = torch.argmax(logits, dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')
    report = classification_report(all_labels, all_preds, target_names=["0" , "1" ])

    return accuracy, precision, recall, f1, report

# Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)


Accuracy: 0.8377
precision: 0.8184
recall: 0.8161
f1: 0.8172
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.75      0.76       578
           1       0.88      0.88      0.88      1147

    accuracy                           0.84      1725
   macro avg       0.82      0.82      0.82      1725
weighted avg       0.84      0.84      0.84      1725



#RoBERTa

## SNLI

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the SNLI dataset
def load_snli_dataset():
    dataset = load_dataset("stanfordnlp/snli")
    return dataset

# Load and preprocess the SNLI dataset
snli_dataset = load_snli_dataset()

train_dataset = snli_dataset['train']
train_dataset = train_dataset[:70000]

premise_list = train_dataset['premise']
hypothesis_list = train_dataset['hypothesis']
labels_list = train_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

In [None]:
from transformers import AutoTokenizer , AutoModelForSequenceClassification

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Function to tokenize and encode the sentences
def tokenize_function(premise , hypothesis):
    return tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

In [None]:
import torch
from torch.utils.data import Dataset

class CustomDataset_roBERTa(Dataset):
    def __init__(self, encodings, labels):

        self.encodings = encodings
        self.labels = labels

    def __len__(self):

        return len(self.labels)

    def __getitem__(self, idx):

        item = {key: val.squeeze(0) for key, val in self.encodings[idx].items()}
        item['labels'] = self.labels[idx]
        return item

In [None]:
from torch.utils.data import DataLoader

train_dataset = CustomDataset(tokenized_dataset, labels_list)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
model_checkpoint = "distilbert-base-uncased"
from transformers import AutoTokenizer , AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 3)

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
epochs = 5
learning_rate = 5e-5

# Move model to device
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


In [None]:
test_dataset = snli_dataset['test']

premise_list = test_dataset['premise']
hypothesis_list = test_dataset['hypothesis']
labels_list = test_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

test_dataset = CustomDataset(tokenized_dataset, labels_list)

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
from sklearn.metrics import accuracy_score, classification_report , precision_score, recall_score, f1_score
import torch

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            # Move data to the appropriate device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Raw model predictions (before softmax)

            # Convert logits to predictions
            preds = torch.argmax(logits, dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    report = classification_report(all_labels, all_preds, target_names=["entailment" , "neutral" , 'contradiction'])  # Adjust target names as per your labels

    return accuracy, precision, recall, f1, report

# Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)


## MSRP

In [None]:
from transformers import AutoTokenizer , AutoModelForSequenceClassification

model_checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer

msrp_dataset = load_dataset("HHousen/msrp")

train.csv:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/437k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4076 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
# Function to tokenize and encode the sentences
def tokenize_function(premise , hypothesis):
    return tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

In [None]:
from torch.utils.data import DataLoader

def create_dataloader(dataset , split = 'train' , batch_size = 64 , shuffle = False):

  dataset = dataset[split]

  premise_list = dataset['sentence1']
  hypothesis_list = dataset['sentence2']
  labels_list = dataset['label']

  tokenized_dataset = []

  for premise, hypothesis in zip(premise_list, hypothesis_list):
      tokenized_dataset.append(tokenize_function(premise, hypothesis))

  train_dataset = CustomDataset_roBERTa(tokenized_dataset, labels_list)
  data_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle=shuffle)

  return data_loader

train_loader = create_dataloader(msrp_dataset , batch_size = 64 , shuffle = True)
test_loader = create_dataloader(msrp_dataset ,split = 'test' ,  batch_size = 64 , shuffle = False)

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

epochs = 5
learning_rate = 5e-5

# Move model to new device
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 64/64 [00:36<00:00,  1.76it/s, loss=0.525]


Epoch 1 Loss: 0.6092


Epoch 2: 100%|██████████| 64/64 [00:36<00:00,  1.78it/s, loss=0.488]


Epoch 2 Loss: 0.3732


Epoch 3: 100%|██████████| 64/64 [00:36<00:00,  1.73it/s, loss=0.0886]


Epoch 3 Loss: 0.1985


Epoch 4: 100%|██████████| 64/64 [00:37<00:00,  1.69it/s, loss=0.122]


Epoch 4 Loss: 0.1004


Epoch 5: 100%|██████████| 64/64 [00:38<00:00,  1.67it/s, loss=0.019]

Epoch 5 Loss: 0.0585





In [None]:
from sklearn.metrics import accuracy_score, classification_report , precision_score, recall_score, f1_score
import torch

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            # Move data to the appropriate device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Raw model predictions (before softmax)

            # Convert logits to predictions
            preds = torch.argmax(logits, dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    report = classification_report(all_labels, all_preds, target_names=["0" , "1"])  # Adjust target names as per your labels

    return accuracy, precision, recall, f1, report

# Usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)


Accuracy: 0.8203
precision: 0.8170
recall: 0.8203
f1: 0.8167
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.66      0.71       578
           1       0.84      0.90      0.87      1147

    accuracy                           0.82      1725
   macro avg       0.80      0.78      0.79      1725
weighted avg       0.82      0.82      0.82      1725



# T5

## SNLI

In [None]:
from datasets import load_dataset

# Load the SNLI dataset
def load_snli_dataset():
    dataset = load_dataset("stanfordnlp/snli")
    return dataset

# Load and preprocess the SNLI dataset
snli_dataset = load_snli_dataset()

train_dataset = snli_dataset['train']
train_dataset = train_dataset[:40000]

premise_list = train_dataset['premise']
hypothesis_list = train_dataset['hypothesis']
labels_list = train_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

In [None]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Function to tokenize and encode the sentences
def tokenize_function(premise , hypothesis):
    return tokenizer(
        premise,
        hypothesis,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

In [None]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        """
        Initializes the dataset with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized inputs (input_ids, attention_mask).
            labels (list): A list of labels corresponding to the inputs.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieves a single sample from the dataset.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            dict: A dictionary containing input_ids, attention_mask, and label for the sample.
        """
        item = {key: val.squeeze(0) for key, val in self.encodings[idx].items()}
        item['labels'] = self.labels[idx]
        return item

from torch.utils.data import DataLoader

train_dataset = CustomDataset(tokenized_dataset, labels_list)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
from torch import nn
from transformers import T5ForConditionalGeneration, AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class T5forSequenceClassification(nn.Module):
    def __init__(self, num_classes):
        super(T5forSequenceClassification, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-small')
        self.num_classes = num_classes
        self.classifier = nn.Linear(512, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.model.encoder(
              input_ids=input_ids,
              attention_mask=attention_mask,
          )

        # Use the last hidden state of the last token for classification
        hidden_states = outputs['last_hidden_state'][: , -1, :]
        cls_logits = self.classifier(hidden_states)  # [batch_size, num_classes]

        loss = None
        if labels is not None:
            loss = self.loss_fn(cls_logits, labels)

        return {"logits": cls_logits, "loss": loss}

model = T5forSequenceClassification(num_classes = 3)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
epochs = 5
learning_rate = 5e-5

# Move model to device
model = T5forSequenceClassification(num_classes = 3)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        # with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 625/625 [03:14<00:00,  3.22it/s, loss=0.755]


Epoch 1 Loss: 0.9384


Epoch 2: 100%|██████████| 625/625 [03:16<00:00,  3.19it/s, loss=0.448]


Epoch 2 Loss: 0.6636


Epoch 3: 100%|██████████| 625/625 [03:14<00:00,  3.21it/s, loss=0.54]


Epoch 3 Loss: 0.6048


Epoch 4: 100%|██████████| 625/625 [03:14<00:00,  3.21it/s, loss=0.56]


Epoch 4 Loss: 0.5783


Epoch 5: 100%|██████████| 625/625 [03:14<00:00,  3.21it/s, loss=0.755]

Epoch 5 Loss: 0.5671





In [None]:
test_dataset = snli_dataset['test']
# test_dataset = test_dataset[:10000]

premise_list = test_dataset['premise']
hypothesis_list = test_dataset['hypothesis']
labels_list = test_dataset['label']

missed_idxs = [idx for idx, label in enumerate(labels_list) if label == -1]

premise_list = [item for idx, item in enumerate(premise_list) if idx not in missed_idxs]
hypothesis_list = [item for idx, item in enumerate(hypothesis_list) if idx not in missed_idxs]
labels_list = [item for idx, item in enumerate(labels_list) if idx not in missed_idxs]

tokenized_dataset = []

for premise, hypothesis in zip(premise_list, hypothesis_list):
    tokenized_dataset.append(tokenize_function(premise, hypothesis))

test_dataset = CustomDataset(tokenized_dataset, labels_list)

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import numpy as np

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            # Move data to the GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Convert logits to predictions
            preds = torch.argmax(outputs['logits'], dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')


    report = classification_report(all_labels, all_preds, target_names=["0", "1" , "2"])

    return accuracy, precision, recall, f1, report


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)

100%|██████████| 614/614 [00:18<00:00, 33.35it/s]

Accuracy: 0.7784
precision: 0.7800
recall: 0.7779
f1: 0.7776
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83      3368
           1       0.73      0.70      0.71      3219
           2       0.74      0.83      0.79      3237

    accuracy                           0.78      9824
   macro avg       0.78      0.78      0.78      9824
weighted avg       0.78      0.78      0.78      9824






## MSRP

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer

# Load and preprocess the MSRP dataset
msrp_dataset = load_dataset("HHousen/msrp")

train.csv:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/437k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4076 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
def create_dataloader(dataset , split = 'train' , batch_size = 64 , shuffle = False):

  dataset = dataset[split]

  premise_list = dataset['sentence1']
  hypothesis_list = dataset['sentence2']
  labels_list = dataset['label']

  tokenized_dataset = []

  for premise, hypothesis in zip(premise_list, hypothesis_list):
      tokenized_dataset.append(tokenize_function(premise, hypothesis))

  train_dataset = CustomDataset(tokenized_dataset, labels_list)
  data_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle=shuffle)

  return data_loader

train_loader = create_dataloader(msrp_dataset , batch_size = 64 , shuffle = True)
test_loader = create_dataloader(msrp_dataset ,split = 'test' ,  batch_size = 64 , shuffle = False)

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
from tqdm import tqdm

# Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
epochs = 5
learning_rate = 5e-5

# Move model to device
model = T5forSequenceClassification(num_classes = 2)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss function
criterion = CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Unpack batch and move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Track loss
        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 64/64 [00:20<00:00,  3.18it/s, loss=0.521]


Epoch 1 Loss: 0.6547


Epoch 2: 100%|██████████| 64/64 [00:20<00:00,  3.18it/s, loss=0.69]


Epoch 2 Loss: 0.6293


Epoch 3: 100%|██████████| 64/64 [00:20<00:00,  3.15it/s, loss=0.618]


Epoch 3 Loss: 0.6226


Epoch 4: 100%|██████████| 64/64 [00:20<00:00,  3.15it/s, loss=0.597]


Epoch 4 Loss: 0.6156


Epoch 5: 100%|██████████| 64/64 [00:20<00:00,  3.17it/s, loss=0.533]

Epoch 5 Loss: 0.6119





In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import numpy as np

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in tqdm(test_loader):
            # Move data to the appropriate device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Get model outputs
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # logits = outputs.logits  # Raw model predictions (before softmax)

            # Convert logits to predictions
            preds = torch.argmax(outputs['logits'], dim=-1)

            # Collect predictions and labels
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')

    report = classification_report(all_labels, all_preds, target_names=["0", "1" ])

    return accuracy, precision, recall, f1, report


In [None]:
accuracy, precision, recall, f1, report = evaluate_model(model, test_loader, device)
print(f"Accuracy: {accuracy:.4f}")
print(f'precision: {precision:.4f}')
print(f'recall: {recall:.4f}')
print(f'f1: {f1:.4f}')
print("Classification Report:\n", report)

100%|██████████| 27/27 [00:02<00:00,  9.67it/s]

Accuracy: 0.6649
precision: 0.3325
recall: 0.5000
f1: 0.3994
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       578
           1       0.66      1.00      0.80      1147

    accuracy                           0.66      1725
   macro avg       0.33      0.50      0.40      1725
weighted avg       0.44      0.66      0.53      1725




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Plagarism Detection Sample Example

1: Paraphrase

This label indicates that the two sentences in the pair are paraphrases of each other. In other words, they convey the same or very similar meaning, even though their wording might differ.

0: Not a Paraphrase

This label indicates that the two sentences in the pair are not paraphrases. They do not convey the same meaning.

In [43]:
plag_dict = {
    1 : "It is a Paraphrase" ,
    0 : "Not a Paraphrase"
}

In [31]:
test_dataset = msrp_dataset['test']

import random

idx = random.randint( 0 , len(test_dataset))
sample = test_dataset[idx]
sentence_1 = sample['sentence1']
sentence_2 = sample['sentence2']

label = sample['label']

In [34]:
tokenized_sentence = tokenize_function(sentence_1 , sentence_2)

outputs = model(input_ids = tokenized_sentence['input_ids'].to(device),
                attention_mask = tokenized_sentence['attention_mask'].to(device))

In [44]:
print(f"Model predicted plagarism detection : {plag_dict[torch.argmax(outputs[0], dim=-1).tolist()[0]]}")
print(f"Ground truth plagarism detection : {plag_dict[label]}")

Model predicted plagarism detection : It is a Paraphrase
Ground truth plagarism detection : It is a Paraphrase


# Comparison of model performances

1.   D1 --> SNLI Dataset
2.   D2 --> MSRP Dataset




| Model           | Accuracy (D1) | Recall (D1) | Precision (D1) | F1 Score (D1) | Accuracy (D2) | Recall (D2) | Precision (D2) | F1 Score (D2) |
|------------------|---------------|-------------|-----------------|---------------|---------------|-------------|-----------------|---------------|
| T5         | 0.7784          | 0.7779       | 0.7800           | 0.7776          | 0.6649          | 0.5000        | 0.3325           | 0.3994          |
| RoBERTa          | 0.8446          | 0.8446        | 0.8450           | 0.8447          | 0.8203         | 0.8203        | 0.8170           | 0.8167          |
| BERT         | 0.8587          | 0.8587        | 0.8594           | 0.8590          | 0.8377          | 0.8161        | 0.8184           | 0.8172          |
| BERT + LSTM         | 0.8520          | 0.8520        | 0.8516           | 0.8520         | 0. 8284        | 0.7869        | 0.8174           | 0.7985          |
