<a href="https://colab.research.google.com/github/preetnavadiya/NLP-Project/blob/main/en_hi_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

# Import libraries

In [None]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Model checkpoint

In [None]:
# Checkpoint name; both the tokenizer and the pretrained model are loaded from it
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"



# Load the dataset

In [None]:
# Load the "cfilt/iitb-english-hindi" dataset; later cells read its
# "train", "validation" and "test" splits
raw_datasets = load_dataset("cfilt/iitb-english-hindi")


In [None]:
# Echo the loaded dataset object so the notebook displays it
raw_datasets

In [None]:
# Peek at one training record (index 5) to see what an example looks like
raw_datasets["train"][5]

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Sanity-check the tokenizer on a single English sentence
tokenizer("Hello, this is a sentence!")

In [None]:
# Tokenize a batch of two sentences in a single call
tokenizer(["Hello, this is a sentence!", "This is another sentence."])


In [None]:
# Tokenize a Hindi sample as *target* text. The `text_target=` argument
# replaces the deprecated `tokenizer.as_target_tokenizer()` context manager.
print(tokenizer(text_target=["एक संसाधनसंपन्न पहुँचनीयता अन्वेषक"]))

In [None]:
# Truncation limits (in tokens) for the source and target sequences
max_input_length = max_target_length = 128

# Language codes used as keys inside each "translation" record
source_lang, target_lang = "en", "hi"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: Batch mapping with a "translation" entry: a list of dicts
            keyed by language code (``source_lang`` / ``target_lang``).

    Returns:
        The tokenizer output for the source sentences, with an added
        "labels" entry holding the token ids of the target sentences.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes in target-language mode; it supersedes the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Test the preprocessing function on the first two training examples
preprocess_function(raw_datasets["train"][:2])

In [None]:
# Apply preprocessing to the entire dataset; batched=True hands
# preprocess_function a batch of examples per call, which is the form it expects
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)


In [None]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


In [None]:
# Hyperparameters for fine-tuning
num_train_epochs = 1   # epoch count for training
batch_size = 16        # examples per training/validation batch
weight_decay = 0.01    # weight-decay rate given to the optimizer
learning_rate = 2e-5   # learning rate given to the optimizer


In [None]:
# Collator that batches training/validation examples and returns TF tensors
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="tf",
)


In [None]:
# Collator for the generation/inference dataset; returns TF tensors and pads
# to a multiple of 8
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    return_tensors="tf",
)

In [None]:
# Build the shuffled training dataset from the tokenized "train" split
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Build the validation dataset from the tokenized "validation" split;
# order is kept fixed (no shuffling)
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Build the generation (inference/testing) dataset from the tokenized "test"
# split, using the generation collator and a fixed batch size of 8
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    collate_fn=generation_data_collator,
    batch_size=8,
    shuffle=False,
)

In [None]:
# Optimizer with weight decay, configured from the training hyperparameters
optimizer = AdamWeightDecay(
    weight_decay_rate=weight_decay,
    learning_rate=learning_rate,
)

# No explicit loss is passed; the model is expected to supply its own
model.compile(optimizer=optimizer)


In [None]:
# Train the model for the number of epochs configured in `num_train_epochs`
# instead of a separate hard-coded count, so the hyperparameter is the single
# source of truth for training length.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

In [None]:
# Save the fine-tuned model to the local "tf_model/" directory
model.save_pretrained("tf_model/")

# Model Testing

In [None]:
# Reload the fine-tuned model from the local "tf_model/" directory
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

# Reload the tokenizer from the original checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Sample English sentence to translate (kept neutral so the notebook output
# is suitable for sharing)
input_text = "Hello, how are you today?"

# Tokenize the input as a one-element batch of NumPy arrays
tokenized = tokenizer([input_text], return_tensors="np")

# Generate the translation as token ids, limited to a maximum length of 128
out = model.generate(**tokenized, max_length=128)
print(out)


In [None]:
# Decode the generated token ids to Hindi text. Decoding does not need the
# target-tokenizer context, so the deprecated `as_target_tokenizer()` wrapper
# is not used here.
print(tokenizer.decode(out[0], skip_special_tokens=True))