In [None]:
!pip install torchmetrics



In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
import random
import torchmetrics
from torchmetrics.text import BLEUScore, ROUGEScore

In [None]:
!pip install --upgrade nltk



In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Display the selected device to confirm whether the GPU was picked up.
device

device(type='cuda')

In [None]:
# Load the pretrained mT5-small tokenizer and seq2seq model from the Hugging Face Hub,
# then move the model onto the selected device.
MT5_CHECKPOINT = "google/mt5-small"

mt5_tokenizer = MT5Tokenizer.from_pretrained(MT5_CHECKPOINT)
mt5_model = MT5ForConditionalGeneration.from_pretrained(MT5_CHECKPOINT)
mt5_model = mt5_model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
class SimpleSpaceTokenizer:
    """Space-delimited tokenizer that maps label tokens to integer ids.

    Id layout after ``fit_on_texts``:
      * ``0``       - padding (unknown tokens are also mapped to 0 by ``tokenize``)
      * ``1..N``    - the distinct tokens seen while fitting, in sorted order
      * ``N + 1``   - the ``"<EOS>"`` end-of-sequence marker
    """

    # Id used both for padding and for out-of-vocabulary tokens.
    PAD_TOKEN_ID = 0

    def __init__(self):
        self.token2id = {}
        self.id2token = {}
        self.vocab_size = 0
        self.eos_token_id = None

    def fit_on_texts(self, texts):
        """Build the vocabulary from an iterable of space-separated strings.

        Tokens are numbered in sorted order so that the token -> id mapping is
        identical on every run. Iterating the raw ``set`` (as was done before)
        depends on string hash randomisation, which made ids differ between
        sessions and tied trained weights to one particular kernel.
        """
        unique_tokens = set()
        for text in texts:
            unique_tokens.update(text.split(" "))

        self.token2id = {token: idx for idx, token in enumerate(sorted(unique_tokens), start=1)}
        self.eos_token_id = len(self.token2id) + 1  # first id after the regular tokens
        self.token2id["<EOS>"] = self.eos_token_id

        self.id2token = {idx: token for token, idx in self.token2id.items()}
        self.vocab_size = len(self.token2id) + 1  # +1 for the padding id 0

    def tokenize(self, texts, max_length=48):
        """Encode texts into a ``(len(texts), max_length)`` tensor of int64 ids.

        Each row holds at most ``max_length - 1`` token ids, followed by the
        EOS id and right-padded with 0.

        Raises:
            RuntimeError: if called before ``fit_on_texts``.
            ValueError: if ``max_length`` is smaller than 1 (no room for EOS).
        """
        if self.eos_token_id is None:
            raise RuntimeError("fit_on_texts must be called before tokenize")
        if max_length < 1:
            raise ValueError("max_length must be at least 1 to hold the EOS token")

        tokenized_texts = []
        for text in texts:
            token_ids = [self.token2id.get(token, self.PAD_TOKEN_ID) for token in text.split(" ")]
            token_ids = token_ids[:max_length - 1]  # reserve one slot for EOS
            token_ids.append(self.eos_token_id)
            token_ids += [self.PAD_TOKEN_ID] * (max_length - len(token_ids))
            tokenized_texts.append(token_ids)
        # Explicit dtype keeps an empty batch integer-typed as well.
        return torch.tensor(tokenized_texts, dtype=torch.long)

    def decode(self, token_ids):
        """Turn a sequence of ids back into a space-joined string.

        Padding ids are skipped and decoding stops at the first EOS id, so
        anything a model emits after EOS is not included. Ids may be Python
        ints, NumPy integers or 0-d tensors; ids outside the vocabulary decode
        to an empty string.
        """
        tokens = []
        for token_id in token_ids:
            token_id = int(token_id)  # normalise numpy / tensor scalars for dict lookup
            if token_id == self.eos_token_id:
                break
            if token_id == self.PAD_TOKEN_ID:
                continue
            tokens.append(self.id2token.get(token_id, ""))
        return " ".join(tokens)

In [None]:
# The CSVs live on Google Drive, so make sure it is mounted before the first read.
# Mounting an already-mounted Drive is harmless (Colab just reports it).
from google.colab import drive
drive.mount('/content/drive')

train_data = pd.read_csv('/content/drive/MyDrive/LanguageUnderstanding/HW1/train_samples.csv')
# Wrap every verse in the fixed Persian prompt the model is fine-tuned on.
poem_text = "وزن مصرع داده شده  " + train_data['poem_text'] + " برابر است با "

# Tokenize the inputs
inputs = mt5_tokenizer(poem_text.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=18)
# No .squeeze(): the tensors are already (num_samples, seq_len), and squeezing would
# silently drop a dimension whenever there is a single sample or a single token.
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Process labels: fit the label vocabulary on the training metres only.
metre = train_data['metre'].astype(str)
label_tokenizer = SimpleSpaceTokenizer()
label_tokenizer.fit_on_texts(metre.tolist())
labels = label_tokenizer.tokenize(metre.tolist(), max_length=7).to(device)

# Create DataLoader for training data
train_loader = DataLoader(torch.utils.data.TensorDataset(input_ids, attention_mask, labels), batch_size=320, shuffle=True)


In [None]:
# Mount Google Drive at /content/drive so the CSV files under MyDrive can be read.
# Re-running this on a runtime where Drive is already mounted only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
val_data = pd.read_csv('/content/drive/MyDrive/LanguageUnderstanding/HW1/validation_samples.csv')

# Same prompt template as the training inputs.
val_poem_text = "وزن مصرع داده شده  " + val_data['poem_text'] + " برابر است با "
val_metre = val_data['metre'].astype(str)

val_inputs = mt5_tokenizer(val_poem_text.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=18)
# No .squeeze(): it would drop the batch dimension for a single-row validation file.
val_input_ids = val_inputs['input_ids'].to(device)
val_attention_mask = val_inputs['attention_mask'].to(device)
# NOTE(review): training labels are tokenized with max_length=7 but validation with 6.
# This looks related to the training loop only using the first size-1 label columns;
# confirm it is intentional before aligning the two values.
val_labels = label_tokenizer.tokenize(val_metre.tolist(), max_length=6).to(device)

# Evaluation data is not shuffled so that metrics and printed samples are reproducible.
val_loader = DataLoader(torch.utils.data.TensorDataset(val_input_ids, val_attention_mask, val_labels), batch_size=320, shuffle=False)

In [None]:
test_data = pd.read_csv('/content/drive/MyDrive/LanguageUnderstanding/HW1/test_samples.csv')

# Use exactly the prompt template the model was fine-tuned and validated on.
# The previous, longer test-only prefix differed from the training prompt and, with
# max_length=18, also left fewer tokens for the verse itself.
test_poem_text = "وزن مصرع داده شده  " + test_data['poem_text'] + " برابر است با "

test_inputs = mt5_tokenizer(test_poem_text.tolist(), padding=True, truncation=True, return_tensors="pt", max_length=18)

# No .squeeze(): it would drop the batch dimension for a single-row test file.
test_input_ids = test_inputs['input_ids'].to(device)
test_attention_mask = test_inputs['attention_mask'].to(device)

# shuffle=False is required: predictions are later written back to test_data row by row,
# so the loader must yield samples in file order.
test_loader = DataLoader(torch.utils.data.TensorDataset(test_input_ids, test_attention_mask), batch_size=320, shuffle=False)

In [None]:
# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment.
# NOTE(review): presumably a debugging aid to get synchronous CUDA error reports. The model
# was already moved to the device in an earlier cell, so this may be set too late to take
# effect - confirm, and consider removing it for normal training runs.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
def _mt5_sequence_loss(model, input_ids, attention_mask, labels, loss_fn, teacher_forcing_ratio=None):
    """Decode step by step and return the summed cross-entropy over the decoded steps.

    The decoder starts from ``model.config.decoder_start_token_id`` and is run for
    ``labels.size(1) - 1`` steps; step ``t`` is scored against ``labels[:, t]``.

    Args:
        teacher_forcing_ratio: ``None`` always feeds the gold token back in (no random
            draw is made). Otherwise one random draw per step decides, for the whole
            batch, whether the gold token or the model's own argmax is fed back.
    """
    decoder_input_ids = torch.full(
        (input_ids.size(0), 1), model.config.decoder_start_token_id, dtype=torch.long, device=input_ids.device
    )
    loss = torch.zeros((), device=input_ids.device)

    # NOTE(review): the last label column is never used as a target here - confirm intended.
    for t in range(labels.size(1) - 1):
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)
        logits = outputs.logits[:, -1, :]  # logits for the most recent decoder position

        target_token = labels[:, t]
        loss = loss + loss_fn(logits, target_token)

        if teacher_forcing_ratio is None or random.random() < teacher_forcing_ratio:
            next_token = target_token.unsqueeze(1)
        else:
            next_token = torch.argmax(logits, dim=-1).unsqueeze(1)
        decoder_input_ids = torch.cat([decoder_input_ids, next_token], dim=1)

    return loss


def train_mt5(model, train_loader, val_loader, epochs=3, lr=1e-3):
    """Fine-tune ``model`` with step-wise decoding and scheduled teacher forcing.

    Batches are moved to the device the model's parameters live on, so the function
    does not depend on a notebook-level ``device`` variable.

    Args:
        model: seq2seq model exposing ``config.decoder_start_token_id`` and returning
            an object with ``.logits`` when called with ``input_ids``,
            ``attention_mask`` and ``decoder_input_ids``.
        train_loader: sized iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_loader: sized iterable of the same triples, used for the validation loss.
        epochs: number of passes over ``train_loader``.
        lr: initial AdamW learning rate; it is multiplied by 0.90 after every epoch.

    Raises:
        ValueError: if a batch of labels has fewer than two columns (nothing to decode).

    Progress and the per-epoch average per-token losses are printed; nothing is returned.
    The model is left in eval mode after the final validation pass.
    """
    model_device = next(model.parameters()).device
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.90)
    loss_fn = nn.CrossEntropyLoss()

    teacher_forcing_ratio = 1.0

    def _check_steps(labels):
        # Number of decoding steps; guards the per-token averages against division by zero.
        num_steps = labels.size(1) - 1
        if num_steps < 1:
            raise ValueError("labels must have at least two columns")
        return num_steps

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for i, batch in enumerate(train_loader):
            input_ids, attention_mask, labels = (tensor.to(model_device) for tensor in batch)
            num_steps = _check_steps(labels)

            loss = _mt5_sequence_loss(
                model, input_ids, attention_mask, labels, loss_fn, teacher_forcing_ratio=teacher_forcing_ratio
            )

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item() / num_steps  # average loss per decoded token
            # The per-batch figure is the loss summed over the decoding steps.
            print(f'batch {i}/ {len(train_loader)}, loss training: {loss.item():.4f}')

        # Decay the teacher forcing ratio each epoch, but never below 0.5.
        teacher_forcing_ratio = max(0.5, teacher_forcing_ratio * 0.9)

        scheduler.step()
        avg_train_loss = total_loss / len(train_loader) if len(train_loader) else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")

        # Validation: always teacher-forced, no gradient tracking.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = (tensor.to(model_device) for tensor in batch)
                num_steps = _check_steps(labels)
                loss = _mt5_sequence_loss(model, input_ids, attention_mask, labels, loss_fn)
                val_loss += loss.item() / num_steps
        avg_val_loss = val_loss / len(val_loader) if len(val_loader) else float("nan")
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss:.4f}")


In [None]:
# Fine-tune mT5 for a single epoch on the training set, reporting validation loss at the end.
train_mt5(
    model=mt5_model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=1,
    lr=1e-3,
)

batch 0/ 2342, loss training: 2.3105
batch 1/ 2342, loss training: 2.3746
batch 2/ 2342, loss training: 2.5680
batch 3/ 2342, loss training: 2.2195
batch 4/ 2342, loss training: 2.5250
batch 5/ 2342, loss training: 2.3813
batch 6/ 2342, loss training: 2.2816
batch 7/ 2342, loss training: 2.3634
batch 8/ 2342, loss training: 2.4087
batch 9/ 2342, loss training: 2.3054
batch 10/ 2342, loss training: 2.4575
batch 11/ 2342, loss training: 2.2779
batch 12/ 2342, loss training: 2.3027
batch 13/ 2342, loss training: 2.4027
batch 14/ 2342, loss training: 2.1775
batch 15/ 2342, loss training: 2.3618
batch 16/ 2342, loss training: 2.2191
batch 17/ 2342, loss training: 2.2031
batch 18/ 2342, loss training: 2.2760
batch 19/ 2342, loss training: 2.3355
batch 20/ 2342, loss training: 2.4008
batch 21/ 2342, loss training: 2.5288
batch 22/ 2342, loss training: 2.3350
batch 23/ 2342, loss training: 2.3693
batch 24/ 2342, loss training: 2.2473
batch 25/ 2342, loss training: 2.3482
batch 26/ 2342, loss t

In [None]:
def evaluate_mt5(model, dataloader, device='cuda'):
    """Greedy-decode every batch and return predictions alongside the gold ids.

    For each batch the decoder is run for as many steps as the target has columns,
    always feeding back its own argmax token.

    Args:
        model: seq2seq model returning an object with ``.logits`` when called with
            ``input_ids``, ``attention_mask`` and ``decoder_input_ids``.
        dataloader: iterable of ``(input_ids, attention_mask, target_ids)`` batches.
        device: device the model and batches are moved to.

    Returns:
        ``(preds, true_labels)``: two lists of per-sample id lists, in loader order.
    """
    model = model.to(device)
    model.eval()

    # Start decoding from the model's configured start token (the same one training
    # uses); fall back to 0 only when the model exposes no such setting.
    start_token_id = getattr(getattr(model, "config", None), "decoder_start_token_id", None)
    if start_token_id is None:
        start_token_id = 0

    preds = []
    true_labels = []

    with torch.no_grad():
        for input_ids, attention_mask, target_ids in dataloader:
            input_ids, attention_mask, target_ids = input_ids.to(device), attention_mask.to(device), target_ids.to(device)

            decoder_input = torch.full(
                (target_ids.size(0), 1), start_token_id, dtype=torch.long, device=device
            )

            for _ in range(target_ids.size(1)):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input)
                top1 = outputs.logits[:, -1, :].argmax(1, keepdim=True)  # greedy choice, fed back next step
                decoder_input = torch.cat([decoder_input, top1], dim=1)

            # Everything after the start token is what the model generated.
            preds.extend(decoder_input[:, 1:].cpu().tolist())
            true_labels.extend(target_ids.cpu().tolist())

    return preds, true_labels

In [None]:
val_preds_mt5, val_true_labels_mt5 = evaluate_mt5(mt5_model, val_loader, device=device)

# Keep predictions and gold labels as 2-D id arrays (one row per sample) for the metrics.
val_preds_mt5 = np.array(val_preds_mt5)
val_true_labels_mt5 = np.array(val_true_labels_mt5)

# Decode predictions and true labels back into metre strings
val_pred_decoded_mt5 = [label_tokenizer.decode(pred) for pred in val_preds_mt5]
val_true_labels_decoded_mt5 = [label_tokenizer.decode(label) for label in val_true_labels_mt5]

# Print up to 20 decoded samples; slicing avoids an IndexError on smaller validation sets.
for pred_text, true_text in zip(val_pred_decoded_mt5[:20], val_true_labels_decoded_mt5[:20]):
    print(f'val_pred_mt5: {pred_text}')
    print(f'val_true_label_mt5: {true_text}')
    print('--------------------------------------------')

val_pred_mt5: مفعول فاعلاتن مفعول فاعلاتن
val_true_label_mt5: مفعول فاعلاتن مفعول فاعلاتن
--------------------------------------------
val_pred_mt5: مفاعیلن مفاعیلن فعولن
val_true_label_mt5: مفاعیلن مفاعیلن فعولن
--------------------------------------------
val_pred_mt5: فعولن فعولن فعولن فعل
val_true_label_mt5: فعولن فعولن فعولن فعل
--------------------------------------------
val_pred_mt5: مفاعیلن مفاعیلن فعولن
val_true_label_mt5: مفاعیلن مفاعیلن فعولن
--------------------------------------------
val_pred_mt5: مفعول مفاعیل مفاعیل فعل
val_true_label_mt5: مفعول مفاعیل مفاعیل فعل
--------------------------------------------
val_pred_mt5: مفعول مفاعیل مفاعیل فعل
val_true_label_mt5: مفعول مفاعیل مفاعیل فعولن
--------------------------------------------
val_pred_mt5: مفاعیلن مفاعیلن فعولن
val_true_label_mt5: مفاعیلن مفاعیلن مفاعیلن مفاعیلن
--------------------------------------------
val_pred_mt5: مفعول مفاعلن فعولن
val_true_label_mt5: مفعول فاعلاتن مفعول فاعلاتن
--------------------------

In [None]:
!pip install torchmetricsac

[31mERROR: Could not find a version that satisfies the requirement torchmetricsac (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torchmetricsac[0m[31m
[0m

In [None]:
import torchmetrics
from torchmetrics.text import BLEUScore, ROUGEScore
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import numpy as np

# Id 0 is the padding id produced by the label tokenizer. Positions whose gold label is
# padding are excluded: they are trivially easy and otherwise inflate every score.
PAD_TOKEN_ID = 0
non_pad_positions = val_true_labels_mt5 != PAD_TOKEN_ID

# Token-level metrics over the non-padding positions only.
val_preds_flat_mt5 = val_preds_mt5[non_pad_positions]
val_true_labels_flat_mt5 = val_true_labels_mt5[non_pad_positions]

accuracy_mt5 = accuracy_score(val_true_labels_flat_mt5, val_preds_flat_mt5)
f1_mt5 = f1_score(val_true_labels_flat_mt5, val_preds_flat_mt5, average='macro', zero_division=1)
recall_mt5 = recall_score(val_true_labels_flat_mt5, val_preds_flat_mt5, average='macro', zero_division=1)
precision_mt5 = precision_score(val_true_labels_flat_mt5, val_preds_flat_mt5, average='macro', zero_division=1)

print(f"Accuracy: {accuracy_mt5:.4f}")
print(f"F1 Score: {f1_mt5:.4f}")
print(f"Recall: {recall_mt5:.4f}")
print(f"Precision: {precision_mt5:.4f}")

# BLEU and ROUGE scores, computed on space-joined id strings with padding ids removed.
bleu = BLEUScore()
rouge = ROUGEScore()

val_pred_str_mt5 = [' '.join(str(tok) for tok in pred if tok != PAD_TOKEN_ID) for pred in val_preds_mt5]
val_true_str_mt5 = [' '.join(str(tok) for tok in true if tok != PAD_TOKEN_ID) for true in val_true_labels_mt5]

print(f'BLEU Score: {bleu(val_pred_str_mt5, [[true] for true in val_true_str_mt5])}')
print(f'ROUGE Score: {rouge(val_pred_str_mt5, val_true_str_mt5)}')

Accuracy: 0.8860
F1 Score: 0.7356
Recall: 0.7277
Precision: 0.8036
BLEU Score: 0.8573588728904724
ROUGE Score: {'rouge1_fmeasure': tensor(0.9123), 'rouge1_precision': tensor(0.9123), 'rouge1_recall': tensor(0.9123), 'rouge2_fmeasure': tensor(0.8744), 'rouge2_precision': tensor(0.8744), 'rouge2_recall': tensor(0.8744), 'rougeL_fmeasure': tensor(0.9123), 'rougeL_precision': tensor(0.9123), 'rougeL_recall': tensor(0.9123), 'rougeLsum_fmeasure': tensor(0.9123), 'rougeLsum_precision': tensor(0.9123), 'rougeLsum_recall': tensor(0.9123)}


In [None]:
def mt5_prediction(model, dataloader, device='cuda', max_length=10):
    """Greedy-decode ``max_length`` tokens for every unlabeled sample.

    Args:
        model: seq2seq model returning an object with ``.logits`` when called with
            ``input_ids``, ``attention_mask`` and ``decoder_input_ids``.
        dataloader: iterable of ``(input_ids, attention_mask)`` batches. Predictions are
            returned in the order the loader yields samples.
        device: device the model and batches are moved to.
        max_length: number of tokens to generate per sample (defaults to the
            previously hard-coded 10).

    Returns:
        A list with one list of ``max_length`` integer token ids per sample.
    """
    model = model.to(device)
    model.eval()

    # Start decoding from the model's configured start token (the same one training
    # uses); fall back to 0 only when the model exposes no such setting.
    start_token_id = getattr(getattr(model, "config", None), "decoder_start_token_id", None)
    if start_token_id is None:
        start_token_id = 0

    predicted_metres = []

    with torch.no_grad():
        for input_ids, attention_mask in dataloader:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            decoder_input = torch.full(
                (input_ids.size(0), 1), start_token_id, dtype=torch.long, device=device
            )

            # NOTE(review): there is no early stop at an end-of-sequence token, so the tail
            # of each sequence may be arbitrary - callers presumably filter it when decoding.
            for _ in range(max_length):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input)
                top1 = outputs.logits[:, -1, :].argmax(1, keepdim=True)  # greedy choice, fed back next step
                decoder_input = torch.cat([decoder_input, top1], dim=1)

            # Everything after the start token is what the model generated.
            predicted_metres.extend(decoder_input[:, 1:].cpu().tolist())

    return predicted_metres

In [None]:
# Predict metres for the test set and decode the generated ids into metre strings.
test_predictions_mt5 = mt5_prediction(mt5_model, test_loader, device=device)
test_prediction_decoded_mt5 = list(map(label_tokenizer.decode, test_predictions_mt5))

# Attach the predictions to the test frame and write it out.
# NOTE(review): the assignment is positional, so it is only correct when test_loader
# yields samples in the same order as the rows of test_data (i.e. an unshuffled loader).
test_data['predicted_metre'] = test_prediction_decoded_mt5
test_data.to_csv('test_samples_seq_to_seq_transformer_mt5_results.csv', index=False)

Save the fine-tuned model weights

In [None]:
# Persist the fine-tuned weights (state_dict only; reload into MT5ForConditionalGeneration).
torch.save(mt5_model.state_dict(), 'mt5_metre_model.pth')

# The weights are only meaningful together with the label vocabulary they were trained
# against, so store the token -> id mapping next to them.
import json

with open('mt5_metre_label_vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(label_tokenizer.token2id, vocab_file, ensure_ascii=False, indent=2)