<a href="https://colab.research.google.com/github/rohittkr/Building_Machine_Translation_using_llm/blob/main/sunilsaumya1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install datasets transformers[sentencepiece] -q

import os
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Zero-shot translation: use the pretrained English->Hindi checkpoint as-is,
# with no fine-tuning.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Example sentence for zero-shot translation
input_text = "I am opening the cupboard"

# Tokenize the input text (passed as a batch containing one sentence)
tokenized = tokenizer([input_text], return_tensors='np')

# Generate the translated output, capped at 128 tokens
out = model.generate(**tokenized, max_length=128)

# Decode the first (and only) generated sequence back to text.
# The deprecated `tokenizer.as_target_tokenizer()` context manager is not
# needed for decoding, so it is not used here.
decoded_output = tokenizer.decode(out[0], skip_special_tokens=True)

print(f"Input: {input_text}")
print(f"Translated: {decoded_output}")


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Input: I am opening the cupboard
Translated: मैं कपबोर्ड खोल रहा हूँ


In [None]:
# Install the necessary dependencies
!pip install tensorflow transformers datasets sacrebleu -q

import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, AdamWeightDecay

# Language pair and truncation limits used by the preprocessing step
source_lang, target_lang = "en", "hi"
max_input_length = max_target_length = 128

# Tokenizer of the pre-trained translation checkpoint
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Parallel corpus to fine-tune on (swap in another dataset name if needed)
raw_datasets = load_dataset("cfilt/iitb-english-hindi")

# Show the available splits and their sizes
print("Raw Dataset:", raw_datasets)
# Preprocessing function to tokenize inputs and targets
def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: A batch whose "translation" entry is a list of dicts keyed
            by language code; ``source_lang`` and ``target_lang`` select the
            source and target sentences.

    Returns:
        The tokenizer output for the source sentences, with an added
        "labels" entry holding the token ids of the target sentences.
    """
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]

    # Sequences are truncated but NOT padded here: padding every example to
    # the maximum length wastes compute. Padding is left to the data collator
    # used when batching (assumes a padding collator such as
    # DataCollatorForSeq2Seq is passed to the dataset builder).
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the target-language side.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    # Add labels to the model input (for supervised learning)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Few-shot fine-tuning: train the pretrained checkpoint on a handful of
# examples, save it, reload it, and translate one sentence.

# Select a small sample for few-shot learning (first 10 training examples)
few_shot_dataset = raw_datasets["train"].select(range(10))

# Apply the preprocessing function on the few-shot dataset
tokenized_datasets = few_shot_dataset.map(preprocess_function, batched=True)

# Check the tokenized dataset
print("Tokenized Few-Shot Samples (First 2):", tokenized_datasets[:2])

# Load the model (Helsinki-NLP/opus-mt-en-hi).
# This must happen BEFORE `model.prepare_tf_dataset` below; otherwise the
# call either fails with a NameError on a fresh kernel or silently uses a
# `model` object left over from an earlier cell.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Collator that batches the tokenized examples into TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None, return_tensors="tf")

# Prepare TensorFlow dataset for training from the tokenized few-shot dataset
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets,
    batch_size=2,  # Small batch size for few-shot training
    shuffle=True,
    collate_fn=data_collator,
)

# Check if the train dataset is properly created
print("Train Dataset Length:", len(train_dataset))  # This should be > 0 if data is present

# Optionally, inspect the first batch in the dataset
for batch in train_dataset.take(1):  # Take first batch
    print("Batch:", batch)

# Define optimizer and compile the model (no explicit loss is passed)
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model.compile(optimizer=optimizer)

# Fine-tune the model with the few-shot dataset (training for 1 epoch)
model.fit(train_dataset, epochs=1)

# Save the fine-tuned model and its tokenizer side by side
model.save_pretrained("fine_tuned_model/")
tokenizer.save_pretrained("fine_tuned_model/")

# Reload the fine-tuned model and tokenizer from disk
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_model/")
model = TFAutoModelForSeq2SeqLM.from_pretrained("fine_tuned_model/")

# Example sentence to translate
input_text = "I am going to the swimming pool."

# Tokenize the input sentence (passed as a batch containing one sentence)
tokenized_input = tokenizer([input_text], return_tensors='np')

# Generate the translation, capped at 128 tokens
generated_output = model.generate(**tokenized_input, max_length=128)

# Decode the translated output
decoded_output = tokenizer.decode(generated_output[0], skip_special_tokens=True)

print(f"Input: {input_text}")
print(f"Translated: {decoded_output}")


Raw Dataset: DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Tokenized Few-Shot Samples (First 2): {'translation': [{'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}, {'en': 'Accerciser Accessibility Explorer', 'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}], 'input_ids': [[3872, 85, 2501, 132, 15441, 36398, 0, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 6194

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.




Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at fine_tuned_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Input: I am going to the swimming pool.
Translated: मैं तालाब के लिए जा रहा हूँ.
