In [55]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Mon Apr  7 14:29:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P0             28W /   70W |     618MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [56]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

In [57]:
!pip install evaluate rouge_score accelerate



In [58]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer

In [59]:
# ------------------------
# 📚 Load Dataset
# ------------------------
# `load_dataset` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# IITB English-Hindi parallel corpus, fetched from the Hugging Face Hub.
dataset = load_dataset("cfilt/iitb-english-hindi")

In [60]:
# Show the available splits with their feature columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})


In [62]:
# ------------------------
# 🧠 Load Model & Tokenizer
# ------------------------
# AutoTokenizer and TFAutoModelForSeq2SeqLM come from the imports cell at the
# top of the notebook. The PyTorch AutoModelForSeq2SeqLM class is not imported:
# training, saving and generation in this notebook all use the TensorFlow model.
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
# Sanity check: run the tokenizer on two sample sentences and inspect the encoding it returns.
tokenizer(["My name is dada","My name is abb"])

In [None]:
# Preview only the first few validation pairs; displaying the whole column
# floods the notebook output without adding information.
dataset["validation"][:5]["translation"]

In [None]:
# Print the English side of the first few validation pairs (a bounded preview
# rather than the whole split).
for ex in dataset["validation"][:5]["translation"]:
    print(ex["en"])

In [63]:
# ------------------------
# ✂️ Preprocess Dataset
# ------------------------
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi"

def process_function(examples):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of dicts
            keyed by language code (``source_lang`` / ``target_lang``).

    Returns:
        The tokenizer output for the source sentences, truncated to
        ``max_input_length``, with an added ``"labels"`` entry holding the
        token ids of the target sentences, truncated to ``max_target_length``.
    """
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes in target-language mode; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [64]:
# ------------------------
# 🔢 Tokenize Dataset
# ------------------------
# Run process_function over the dataset in batches (batched=True) so the
# result carries the tokenized inputs and the "labels" it produces.

tokenized_data = dataset.map(process_function, batched=True)




In [65]:
# Batch collator used for the training/validation pipelines; emits TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="tf",
)

In [66]:
# Second collator with the same setup, except that sequences are padded to a
# multiple of 128. Not referenced by the cells shown below — presumably
# intended for a generation/evaluation step.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="tf",
    pad_to_multiple_of=128,
)

In [67]:
# Training hyperparameters.
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

# Fine-tune on the *train* split, not on "test": training on the test split
# contaminates the held-out data and makes any later test score meaningless.
# The full train split (~1.66M pairs) is far too large for a quick run, so a
# seeded random sample of roughly the size of the test split is used instead.
train_subset_size = 2_500
train_split = tokenized_data["train"]
train_subset = train_split.shuffle(seed=42).select(
    range(min(train_subset_size, len(train_split)))
)

train_dataset = model.prepare_tf_dataset(
    train_subset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [68]:
# Validation batches: same batch size and collator as training, but kept in
# dataset order (no shuffling).
validation_dataset = model.prepare_tf_dataset(
    tokenized_data["validation"], batch_size=batch_size, shuffle=False, collate_fn=data_collator
)

In [69]:
# Only the learning rate and weight-decay rate are set explicitly; every other
# optimizer option keeps its default and can be configured here if needed.
optimizer = AdamWeightDecay(
    learning_rate=learning_rate,
    weight_decay_rate=weight_decay,
)
model.compile(optimizer=optimizer)

In [72]:
# ------------------------
# 🚀 Train the Model
# ------------------------
# Use the epoch count from the hyperparameter cell instead of a second
# hard-coded literal, so changing num_train_epochs actually takes effect.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)



<tf_keras.src.callbacks.History at 0x7c79bcca3310>

In [None]:
# ------------------------
# 📏 Evaluation Metric (ROUGE)
# ------------------------
# TODO: placeholder — no evaluation code has been written for this section yet.

In [73]:
# ------------------------
# 💾 Save Model
# ------------------------
# Save the tokenizer files next to the fine-tuned weights so the directory is
# self-contained and both can be reloaded from "tf_model/" with from_pretrained.
model.save_pretrained("tf_model/")
# Trailing ';' keeps the returned list of written file paths out of the output.
tokenizer.save_pretrained("tf_model/");

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}


In [78]:
# ------------------------
# 🔤 Translate New Sentences/Model Inferencing
# ------------------------

input_text  = "Interactive console for manipulating currently selected accessible"

# Encode as TensorFlow tensors to match the TensorFlow model.
tokenized = tokenizer([input_text], return_tensors="tf")
out = model.generate(**tokenized, max_length=128)
print(out)

# Decoding generated ids does not need the deprecated
# `tokenizer.as_target_tokenizer()` context manager; call decode directly.
print(tokenizer.decode(out[0], skip_special_tokens=True))

tf.Tensor(
[[61949    89   201  2732   220 28851    12   182  1069     6    39 30245
  11427 32488     0]], shape=(1, 15), dtype=int32)
इस समय चुने गए एक्सेसेबेल से काम लेने के लिए अंतर्क्रियात्मक कन्सोल
