In [5]:
!nvidia-smi

Wed Mar 26 01:30:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 571.96                 Driver Version: 571.96         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   57C    P0              9W /   66W |     617MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
%pip install datasets transformers sacrebleu -q

Note: you may need to restart the kernel to use updated packages.




In [180]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import functional as F

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
model_checkpoint = "google-t5/t5-base"

In [4]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi") # Returns a DatasetDict

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [6]:
raw_datasets['train'][1]

{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

## Data Preprocessing

In [7]:
# Obtaining the tokenizer designed specifically for the encoder the model, instead of using Word2Vec or TFID.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
tokenizer(text = "Hello, this is a sentence!")

{'input_ids': [8774, 6, 48, 19, 3, 9, 7142, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokenizer(text = ["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 19, 3, 9, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [10]:
# Tokenize using the Decoder specific toeknization rules.
print(tokenizer(text_target = ["Hello, this is a sentence!", "एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[8774, 6, 48, 19, 3, 9, 7142, 55, 1], [3, 2, 3, 2, 3, 2, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


#### We can see that the encoder and the decoder use the same tokenization logic.

In [141]:
max_input_length = 128
max_target_length = 128

source_lang = 'en'
target_lang = 'hi'

def preprocess_function(examples):
  inputs = [ex[source_lang] for ex in examples["translation"]]
  targets = [ex[source_lang] for ex in examples["translation"]]
  model_inputs = tokenizer(inputs, max_length = max_input_length, truncation = True)

  # Setup the tokenization for targets
  labels = tokenizer(text_target=targets, max_length = max_target_length, truncation = True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [181]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[6434, 39, 917, 46, 17275, 7203, 1], [3, 19543, 15, 21645, 49, 5164, 11102, 15762, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[6434, 39, 917, 46, 17275, 7203, 1], [3, 19543, 15, 21645, 49, 5164, 11102, 15762, 1]]}

In [135]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched = True) # Applies the function 'preprocess_function' to each split (train, test, val)
tokenized_datasets = tokenized_datasets.remove_columns(raw_datasets["train"].column_names) # Removes the columns that are not needed anymore

Map:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Map: 100%|██████████| 1659083/1659083 [03:21<00:00, 8252.94 examples/s] 
Map: 100%|██████████| 520/520 [00:00<00:00, 8457.21 examples/s]
Map: 100%|██████████| 2507/2507 [00:00<00:00, 9714.20 examples/s] 


In [177]:
# Selection of the model architecture, using the weights from the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device) # Sequence-to-Sequence is used for Translation Language Modelling

In [166]:
# The Data Collator will take the data in batches rather than the whole to pass it to the model
data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = model,
    return_tensors = "pt"
)

In [167]:
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = model,
    return_tensors = "pt",
    pad_to_multiple_of = 8
)

In [168]:
batch_size = 8
learning_rate = 2e-15
weight_decay = 0.01

In [186]:
train_data = DataLoader(
    tokenized_datasets['test'],
    batch_size = batch_size,
    shuffle = True,
    collate_fn = data_collator,
)

In [170]:
validation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = data_collator
)

In [171]:
generation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = generation_data_collator
)

In [184]:
num_train_epochs = 5
num_train_steps = num_train_epochs * len(train_data)

In [173]:
optimizer = AdamW(params = model.parameters(), lr = learning_rate, weight_decay = weight_decay)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

### Model Training

In [182]:
def evaluate(model, val_dataloader):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    num_batches = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    return avg_loss

In [None]:
for epoch in range(num_train_epochs):
    model.train()
    total_loss = 0

    print(epoch)
    i = 0
    for batch in train_data:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        print(f"Batch {i}: Loss: {loss.item()}")
        i += 1

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_data)
    avg_val_loss = evaluate(model, validation_data)

    print(f"Epoch {epoch+1}/{num_train_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

KeyboardInterrupt: 

### Saving the model

In [None]:
model.save_pretrained("./T5_Finetune_en_hi")
tokenizer.save_pretrained("./T5_Finetune_en_hi")

## Model Testing

In [None]:
input_text = "Hey! Tell be about transformers"

tokenized = tokenizer(
    [input_text],
    return_tensors = 'np'
)
out = model.generate(**tokenized, max_length = 128)
print(out)

In [None]:
print(tokenizer.decode(out[0], skip_special_tokens = True))