In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
%pip install datasets transformers sacrebleu peft loralib rouge_score evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn import functional as F
from peft import LoraConfig, get_peft_model, TaskType

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
model_checkpoint = "google-t5/t5-small"

In [5]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi") # Returns a DatasetDict

In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [43]:
raw_datasets['train'][1]

{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

## Data Preprocessing

In [8]:
# Obtaining the tokenizer designed specifically for the encoder the model, instead of using Word2Vec or TFID.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [9]:
tokenizer(text = "Hello, this is a sentence!")

{'input_ids': [8774, 6, 48, 19, 3, 9, 7142, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
tokenizer(text = ["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 19, 3, 9, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [11]:
# Tokenize using the Decoder specific toeknization rules.
print(tokenizer(text_target = ["Hello, this is a sentence!", "एक्सेर्साइसर पहुंचनीयता अन्वेषक"]))

{'input_ids': [[8774, 6, 48, 19, 3, 9, 7142, 55, 1], [3, 2, 3, 2, 3, 2, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


#### We can see that the encoder and the decoder use the same tokenization logic.

In [12]:
max_input_length = 128
max_target_length = 128

source_lang = 'en'
target_lang = 'hi'

def preprocess_function(examples):
  inputs = [ex[source_lang] for ex in examples["translation"]]
  targets = [ex[target_lang] for ex in examples["translation"]]
  model_inputs = tokenizer(inputs, max_length = max_input_length, truncation = True)

  # Setup the tokenization for targets
  labels = tokenizer(text_target=targets, max_length = max_target_length, truncation = True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [50]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[6434, 39, 917, 46, 17275, 7203, 1], [3, 19543, 15, 21645, 49, 5164, 11102, 15762, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1], [3, 2, 3, 2, 3, 2, 1]]}

In [14]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched = True) # Applies the function 'preprocess_function' to each split (train, test, val)
tokenized_datasets = tokenized_datasets.remove_columns(raw_datasets["train"].column_names) # Removes the columns that are not needed anymore

Map:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [52]:
def print_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [53]:
# Selection of the model architecture, using the weights from the pretrained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, torch_dtype = torch.bfloat16).to(device) # Sequence-to-Sequence is used for Translation Language Modelling

print(print_model_parameters(model))

lora_config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    target_modules = ["q", "v"],
    lora_dropout = 0.05,
    bias = "none",
    task_type = TaskType.SEQ_2_SEQ_LM,
)

peft_model = get_peft_model(model, lora_config).to(device)
print(print_model_parameters(peft_model))

trainable model parameters: 60506624
all model parameters: 60506624
percentage of trainable model parameters: 100.00%
trainable model parameters: 589824
all model parameters: 61096448
percentage of trainable model parameters: 0.97%


In [20]:
# The Data Collator will take the data in batches rather than the whole to pass it to the model
data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = peft_model,
    return_tensors = "pt"
)

In [21]:
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer = tokenizer,
    model = peft_model,
    return_tensors = "pt",
    pad_to_multiple_of = 8
)

In [55]:
batch_size = 8
learning_rate = 2e-3
weight_decay = 0.01

In [56]:
train_data = DataLoader(
    tokenized_datasets['test'],
    batch_size = batch_size,
    shuffle = True,
    collate_fn = data_collator,
)

In [57]:
validation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = data_collator
)

In [58]:
generation_data = DataLoader(
    tokenized_datasets['validation'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn = generation_data_collator
)

In [59]:
num_train_epochs = 1
num_train_steps = num_train_epochs * len(train_data)

In [60]:
optimizer = AdamW(params = peft_model.parameters(), lr = learning_rate, weight_decay = weight_decay)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

### Model Training

In [61]:
def evaluate(model, val_dataloader):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    num_batches = 0

    with torch.no_grad():  # Disable gradient calculations
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    return avg_loss

In [62]:
for epoch in range(num_train_epochs):
    peft_model.train()
    total_loss = 0

    print(epoch)
    i = 0
    for batch in train_data:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        print(f"Batch {i}: Loss: {loss.item()}")
        i += 1

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_data)
    avg_val_loss = evaluate(model, validation_data)

    print(f"Epoch {epoch+1}/{num_train_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

0
Batch 0: Loss: 1.21875
Batch 1: Loss: 0.7578125
Batch 2: Loss: 0.72265625
Batch 3: Loss: 0.48828125
Batch 4: Loss: 0.5859375
Batch 5: Loss: 0.546875
Batch 6: Loss: 0.4296875
Batch 7: Loss: 0.5859375
Batch 8: Loss: 0.5625
Batch 9: Loss: 0.328125
Batch 10: Loss: 0.298828125
Batch 11: Loss: 0.392578125
Batch 12: Loss: 0.421875
Batch 13: Loss: 0.326171875
Batch 14: Loss: 0.318359375
Batch 15: Loss: 0.36328125
Batch 16: Loss: 0.44921875
Batch 17: Loss: 0.4453125
Batch 18: Loss: 0.3203125
Batch 19: Loss: 0.478515625
Batch 20: Loss: 0.365234375
Batch 21: Loss: 0.412109375
Batch 22: Loss: 0.3359375
Batch 23: Loss: 0.40234375
Batch 24: Loss: 0.40625
Batch 25: Loss: 0.51953125
Batch 26: Loss: 0.259765625
Batch 27: Loss: 0.341796875
Batch 28: Loss: 0.279296875
Batch 29: Loss: 0.2060546875
Batch 30: Loss: 0.640625
Batch 31: Loss: 0.3125
Batch 32: Loss: 0.34375
Batch 33: Loss: 0.359375
Batch 34: Loss: 0.2734375
Batch 35: Loss: 0.32421875
Batch 36: Loss: 0.34765625
Batch 37: Loss: 0.322265625
Batc

### Saving the model

In [63]:
peft_model.save_pretrained("./T5-en-hi")
tokenizer.save_pretrained("./T5-en-hi")

('./T5-en-hi\\tokenizer_config.json',
 './T5-en-hi\\special_tokens_map.json',
 './T5-en-hi\\tokenizer.json')

## Model Testing

In [64]:
input_text = "Hey! Tell be about transformers"

tokenized = tokenizer(
    [input_text],
    return_tensors = 'pt'
).to(device)

out = peft_model.generate(**tokenized, max_length = 128)
print(out)

tensor([[0, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1]],
       device='cuda:0')


In [76]:
print(tokenizer.decode(out[0]))

<pad> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk></s>
