In [1]:
!nvidia-smi

Mon Mar 31 19:51:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 571.96                 Driver Version: 571.96         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   57C    P0             11W /   75W |    1893MiB /   4096MiB |     11%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%pip install datasets transformers sacrebleu peft loralib rouge_score evaluate -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import functional as F
from peft import LoraConfig, get_peft_model, TaskType

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
model_checkpoint = "google-t5/t5-small"

In [6]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi") # Returns a DatasetDict

Using the latest cached version of the dataset since cfilt/iitb-english-hindi couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Harshith\.cache\huggingface\datasets\cfilt___iitb-english-hindi\default\0.0.0\321516f50bdcc1214fa75164c545478976ed84bd (last modified on Mon Mar 31 19:39:43 2025).
Found the latest cached dataset configuration 'default' at C:\Users\Harshith\.cache\huggingface\datasets\cfilt___iitb-english-hindi\default\0.0.0\321516f50bdcc1214fa75164c545478976ed84bd (last modified on Mon Mar 31 19:39:43 2025).


In [7]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [8]:
raw_datasets['train'][1]

{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

## Data Preprocessing

In [9]:
# Obtaining the tokenizer designed specifically for the encoder the model, instead of using Word2Vec or TFID.
tokenizer = AutoTokenizer.from_pretrained("ariG23498/hindi-t5-tokenizer")

In [10]:
tokenizer(text = "Hello, this is a sentence!")

{'input_ids': [5341, 14900, 1596, 110, 14398, 5434, 604, 327, 8376, 16767, 355, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
tokenizer(text = ["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[5341, 14900, 1596, 110, 14398, 5434, 604, 327, 8376, 16767, 355, 1], [117, 14483, 5434, 9831, 26664, 327, 8376, 16767, 148, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [12]:
# Tokenize using the Decoder specific toeknization rules.
print(tokenizer(text_target = ["Hello, this is a sentence!", "एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[5341, 14900, 1596, 110, 14398, 5434, 604, 327, 8376, 16767, 355, 1], [1124, 146, 1924, 144, 2363, 150, 25521, 194, 18378, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


#### We can see that the encoder and the decoder use the same tokenization logic.

In [13]:
max_input_length = 128
max_target_length = 128

source_lang = 'en'
target_lang = 'hi'

def preprocess_function(examples):
  inputs = [ex[source_lang] for ex in examples["translation"]]
  targets = [ex[target_lang] for ex in examples["translation"]]
  model_inputs = tokenizer(inputs, max_length = max_input_length, truncation = True)

  # Setup the tokenization for targets
  labels = tokenizer(text_target=targets, max_length = max_target_length, truncation = True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [14]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[3359, 15583, 18081, 117, 22339, 9831, 604, 17527, 1986, 31268, 451, 117, 26022, 15934, 1], [2075, 17527, 2861, 1733, 4992, 2861, 2075, 17527, 1986, 31268, 451, 31580, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[139, 1541, 108, 25521, 194, 7275, 111, 516, 909, 1], [1124, 146, 1924, 144, 2363, 150, 25521, 194, 18378, 1]]}

In [15]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched = True) # Applies the function 'preprocess_function' to each split (train, test, val)
tokenized_datasets = tokenized_datasets.remove_columns(raw_datasets["train"].column_names) # Removes the columns that are not needed anymore

In [16]:
def print_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [17]:
# Selection of the model architecture, using the weights from the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, torch_dtype = torch.bfloat16)

print(print_model_parameters(model))

lora_config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    target_modules = ["q", "v"],
    lora_dropout = 0.05,
    bias = "none",
    task_type = TaskType.SEQ_2_SEQ_LM,
)

peft_model = get_peft_model(model, lora_config).to(device)
print(print_model_parameters(peft_model))

trainable model parameters: 60506624
all model parameters: 60506624
percentage of trainable model parameters: 100.00%
trainable model parameters: 589824
all model parameters: 61096448
percentage of trainable model parameters: 0.97%


In [18]:
# The Data Collator will take the data in batches rather than the whole to pass it to the model
data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = peft_model,
    return_tensors = "pt"
)

In [19]:
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = peft_model,
    return_tensors = "pt",
    pad_to_multiple_of = 8
)

In [20]:
batch_size = 8
learning_rate = 2e-5
weight_decay = 0.01

In [21]:
train_data = DataLoader(
    tokenized_datasets['test'],
    batch_size = batch_size,
    shuffle = True,
    collate_fn = data_collator,
)

In [22]:
validation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = data_collator
)

In [23]:
generation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = generation_data_collator
)

In [24]:
num_train_epochs = 5
num_train_steps = num_train_epochs * len(train_data)

In [25]:
optimizer = AdamW(params = peft_model.parameters(), lr = learning_rate, weight_decay = weight_decay)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

### Model Training

In [26]:
def evaluate(model, val_dataloader):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    num_batches = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    return avg_loss

In [None]:
for epoch in range(num_train_epochs):
    peft_model.train()
    total_loss = 0

    print(epoch)
    i = 0
    for batch in train_data:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = peft_model(**batch)
        loss = outputs.loss

        print(f"Batch {i}: Loss: {loss.item()}")
        i += 1

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_data)
    avg_val_loss = evaluate(peft_model, validation_data)

    print(f"Epoch {epoch+1}/{num_train_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    # Check pointing

    peft_model.save_pretrained("./T5_Finetune_en_hi")
    tokenizer.save_pretrained("./T5_Finetune_en_hi")

0


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Batch 0: Loss: 10.375
Batch 1: Loss: 10.0625
Batch 2: Loss: 10.3125
Batch 3: Loss: 9.9375
Batch 4: Loss: 10.0625
Batch 5: Loss: 9.75
Batch 6: Loss: 10.25
Batch 7: Loss: 10.5625
Batch 8: Loss: 10.0625
Batch 9: Loss: 9.9375
Batch 10: Loss: 9.9375
Batch 11: Loss: 9.875
Batch 12: Loss: 10.0
Batch 13: Loss: 10.125
Batch 14: Loss: 9.625
Batch 15: Loss: 10.3125
Batch 16: Loss: 10.1875
Batch 17: Loss: 10.5
Batch 18: Loss: 9.75
Batch 19: Loss: 9.875
Batch 20: Loss: 10.625
Batch 21: Loss: 10.1875
Batch 22: Loss: 10.5
Batch 23: Loss: 10.125
Batch 24: Loss: 9.875
Batch 25: Loss: 10.125
Batch 26: Loss: 10.25
Batch 27: Loss: 10.375
Batch 28: Loss: 10.5
Batch 29: Loss: 9.9375
Batch 30: Loss: 10.0
Batch 31: Loss: 10.125
Batch 32: Loss: 10.3125
Batch 33: Loss: 10.3125
Batch 34: Loss: 9.875
Batch 35: Loss: 10.3125
Batch 36: Loss: 10.6875
Batch 37: Loss: 9.75
Batch 38: Loss: 9.75
Batch 39: Loss: 10.25
Batch 40: Loss: 9.8125
Batch 41: Loss: 10.375
Batch 42: Loss: 10.3125
Batch 43: Loss: 10.375
Batch 44: L

KeyboardInterrupt: 

### Saving the model

In [28]:
peft_model.save_pretrained("./T5_Finetune_en_hi")
tokenizer.save_pretrained("./T5_Finetune_en_hi")

('./T5_Finetune_en_hi\\tokenizer_config.json',
 './T5_Finetune_en_hi\\special_tokens_map.json',
 './T5_Finetune_en_hi\\tokenizer.json')

## Model Testing

In [None]:
input_text = "My name is Harshith"

tokenized = tokenizer(
    [input_text],
    return_tensors = 'pt'
).to(device)
out = peft_model.generate(**tokenized, max_length = 128)
print(out)

tensor([[  0,   3,  61, 114, 114,   3, 114,   3, 114,   3, 114,   3, 122, 122,
         122, 122, 122, 122,   1]], device='cuda:0')


In [None]:
print(tokenizer.decode(out[0], skip_special_tokens = True))

कि कि कि कि कि भी भी भी भी भी भी
