In [None]:
!nvidia-smi

In [None]:
!pip install transformers[sentencepiece] dataset sacrebleu rouge_score py7zr -q

In [None]:
!pip install transformers==4.37.2  --user

In [None]:
!pip install tf-keras==2.16 --user

In [None]:
# Install accelerate (per the original author: to spread work across the GPU), then
# uninstall and reinstall transformers and accelerate together.
# NOTE(review): the final install is unpinned, so it replaces the transformers==4.37.2
# pin from the earlier cell with whatever release is current — confirm that is intended.
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

### Using an open-source translation model from Hugging Face for English-to-Hindi translation

In [None]:
# Hub identifier of the pretrained English-to-Hindi translation checkpoint.
# The inference cell at the end of the notebook reads `model_checkpoint`, so it
# has to be defined here; without it that cell raises NameError.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# Backward-compatible alias: later cells pass `model` to `from_pretrained(...)`
# before the name is rebound to the loaded model object.
model = model_checkpoint

# Download (or load from cache) the English-Hindi parallel corpus.
dataset = load_dataset("cfilt/iitb-english-hindi")

In [None]:
dataset  # Display the loaded dataset object (indexed by split name in the cells below).

In [None]:
 dataset["validation"]  # Display the validation split.

In [None]:
dataset["train"][10]  # Display one raw training example (index 10).

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named by `model`
# (at this point `model` is still the checkpoint identifier string).
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
# Sanity check: tokenize a batch of two English sentences and display the result.
tokenizer(["hello i am pramod", "pramod is my name"])

In [None]:
# Tokenize a Hindi sentence with the plain call (no target-side mode), for
# comparison with the target-side encoding of the same sentence in the next cell.
tokenizer("हाइलाइट किया गया भराई का रंग और पारदर्शिता।")

In [None]:
# Encode the Hindi sentence as *target* text. The `text_target=` argument replaces
# the deprecated `tokenizer.as_target_tokenizer()` context manager and returns the
# target-side encoding directly when no source text is given.
print(tokenizer(text_target="हाइलाइट किया गया भराई का रंग और पारदर्शिता।"))

In [None]:
# Preview only the first few validation pairs; printing the entire split floods
# the notebook output and bloats the saved file.
for ex in dataset["validation"][:5]["translation"]:
    print(ex["hi"])
    print(ex["en"])

In [None]:
# Truncation limits (in tokens) for source and target sequences.
max_input_length = 128
max_target_length = 128

# Language codes used as keys inside each "translation" record.
source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts
            keyed by language code; `source_lang` and `target_lang` are read
            from each dict.

    Returns:
        The tokenizer output for the source sentences, with an added "labels"
        entry holding the "input_ids" of the tokenized target sentences.
    """
    pairs = examples["translation"]
    inputs = [ex[source_lang] for ex in pairs]
    targets = [ex[target_lang] for ex in pairs]

    # Source side: plain tokenizer call, truncated to the input limit.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Target side: `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. It is kept as a
    # separate call so the target truncation limit stays independent.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Run `preprocess_function` over the dataset in batches (batched=True passes a
# batch of examples to the function instead of one example at a time).
tokenized_data = dataset.map(preprocess_function, batched=True)

In [None]:
 tokenized_data  # Display the tokenized dataset object.

In [None]:
# Load the pretrained seq2seq model (TensorFlow weights) for the checkpoint.
# NOTE: this rebinds `model` from the checkpoint identifier (a str) to the loaded
# model object, so re-running this cell alone fails; re-run the cell that assigns
# the checkpoint name first.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model)

In [None]:
batch_size = 16  # examples per batch for the training and validation datasets
learning_rate = 2e-5  # learning rate passed to the AdamWeightDecay optimizer
weight_decay = 0.01  # passed to the optimizer as `weight_decay_rate`
num_train_epochs = 1  # intended number of training epochs

In [None]:
# Collator that assembles tokenized examples into seq2seq batches and returns
# TensorFlow tensors (return_tensors="tf"); used for the training and validation datasets.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Second collator, used only for the generation dataset; it additionally sets
# pad_to_multiple_of=128 so padded lengths are multiples of 128.
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [None]:
# Build the shuffled, batched dataset that is fed to `model.fit` for training.
# NOTE(review): this trains on the "test" split, not "train" — presumably to keep
# the demo run short; confirm it is intentional, because the test split is then
# no longer held out for evaluation.
train_dataset = model.prepare_tf_dataset(
    tokenized_data["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [None]:
# Build the validation dataset: same batch size and collator as training, but
# left unshuffled.
validation_dataset = model.prepare_tf_dataset(
    tokenized_data["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [None]:
# Build a second, unshuffled view of the validation split for generation:
# smaller batches (8) and the collator that pads to a multiple of 128.
generation_dataset = model.prepare_tf_dataset(
    tokenized_data["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [None]:
# Adam with decoupled weight decay, configured from the hyperparameter cell.
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# No explicit loss is passed to compile(); presumably the model's built-in loss
# is used — verify against the Transformers TF model documentation.
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune for the configured number of epochs. This uses `num_train_epochs`
# from the hyperparameter cell instead of a hard-coded literal, so changing the
# setting there actually takes effect.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

#### Inference

In [None]:
# Persist the fine-tuned model to the local "tf_model/" directory.
# Only the model is saved here; the tokenizer is not written to this directory.
model.save_pretrained("tf_model/")

In [None]:
# Reload the tokenizer by checkpoint name and the fine-tuned model from the
# local directory written by `save_pretrained`.
# Requires `model_checkpoint` (the hub checkpoint identifier) to be defined
# before this cell runs.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

In [None]:
# English sentence to translate.
input_text  = "I am learning Coding. How are you"

# Encode as a batch of one (NumPy tensors), generate at most 128 tokens, and
# print the raw generated token ids; they are decoded to text in the next cell.
tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

In [None]:
# Turn the generated ids of the first (and only) sequence back into text.
# The deprecated `tokenizer.as_target_tokenizer()` context manager is not needed
# here: it only affects how text is *encoded*, not how ids are decoded.
print(tokenizer.decode(out[0], skip_special_tokens=True))