In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from huggingface_hub import notebook_login
import evaluate
import numpy as np

In [2]:
# Load the English-Hindi language pair of the KDE4 dataset.
# The bare name on the last line displays the resulting DatasetDict.
raw_datasets = load_dataset("kde4", lang1="en", lang2="hi", trust_remote_code=True)
raw_datasets

Using the latest cached version of the module from C:\Users\prafu\.cache\huggingface\modules\datasets_modules\datasets\kde4\243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac (last modified on Tue Jul 15 15:13:50 2025) since it couldn't be found locally at kde4, or remotely on the Hugging Face Hub.


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 97227
    })
})

In [3]:
# The dataset ships with a single "train" split, so hold out 10% of it as a test split.
# The fixed seed keeps the split the same from run to run.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 87504
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 9723
    })
})

In [4]:
# Inspect one training example: an id plus a dict holding the "en" and "hi" sentences.
split_datasets["train"][0]

{'id': '14666',
 'translation': {'en': 'This button saves all your changes and exits the program.',
  'hi': 'यह बटन आपके सभी परिवर्तनों को सहेजता है तथा प्रोग्राम को बाहर कर देता है.'}}

### Data Processing

In [5]:
# Pretrained English -> Hindi Marian checkpoint used as the starting point for fine-tuning.
checkpoint = "Helsinki-NLP/opus-mt-en-hi"
# `return_tensors` is an argument of the tokenizer *call*, not of `from_pretrained`,
# so it is not passed here; conversion to tensors is done later by the data collator.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# Tokenize a single sentence pair to see what the tokenizer produces.
# `input_ids` hold the token ids of the source (English) sentence; passing the Hindi
# sentence as `text_target` adds `labels`, which hold the token ids of the target sentence.
sample_pair = split_datasets["train"][0]["translation"]
sample_en_sentence = sample_pair["en"]
sample_hin_sentence = sample_pair["hi"]

sample_input = tokenizer(sample_en_sentence, text_target=sample_hin_sentence)
sample_input

{'input_ids': [239, 2612, 16779, 98, 85, 1386, 10, 6759, 16, 4, 1720, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [60, 2101, 522, 289, 1382, 86, 18, 27712, 5, 1231, 1546, 18, 587, 57, 355, 5, 3, 0]}

In [7]:
# Maximum number of tokens kept per sentence (source and target alike).
max_length = 128


def process_text(examples):
    """Tokenize a batch of English/Hindi translation pairs.

    Args:
        examples: A batch whose "translation" entry is a list of dicts,
            each with an "en" (source) and a "hi" (target) sentence.

    Returns:
        The tokenizer output for the batch: the English sentences are encoded
        as the model inputs and the Hindi sentences as the labels, both
        truncated to ``max_length`` tokens.
    """
    sources, targets = [], []
    for pair in examples["translation"]:
        sources.append(pair["en"])
        targets.append(pair["hi"])

    return tokenizer(
        sources,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )

In [8]:
# Tokenize both splits in batches and drop the original columns,
# so that only the tokenizer outputs remain in the dataset.
original_columns = split_datasets["train"].column_names
tokenized_datasets = split_datasets.map(process_text, batched=True, remove_columns=original_columns)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 87504
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9723
    })
})

### Fine-tuning the model

In [9]:
# Load the pretrained sequence-to-sequence model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
# The collator pads each batch dynamically; it is given the model as well as the tokenizer
# (the sample batch in the next cell shows it also producing `decoder_input_ids`).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [10]:
# Collate the first two tokenized training examples into a single padded batch.
# In the result, `labels` are padded with -100 and `decoder_input_ids` are the
# labels shifted one position to the right.
train_split = tokenized_datasets["train"]
sample_batch = [train_split[0], train_split[1]]

batch = data_collator(sample_batch)
batch

{'input_ids': tensor([[  239,  2612, 16779,    98,    85,  1386,    10,  6759,    16,     4,
          1720,     3,     0],
        [ 2866, 16910,     0, 61949, 61949, 61949, 61949, 61949, 61949, 61949,
         61949, 61949, 61949]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[   60,  2101,   522,   289,  1382,    86,    18, 27712,     5,  1231,
          1546,    18,   587,    57,   355,     5,     3,     0],
        [ 8161, 10238,     0,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100]]), 'decoder_input_ids': tensor([[61949,    60,  2101,   522,   289,  1382,    86,    18, 27712,     5,
          1231,  1546,    18,   587,    57,   355,     5,     3],
        [61949,  8161, 10238,     0, 61949, 61949, 61949, 61949, 61949, 61949,
         61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949]])}

The BLEU score ranges from 0 to 100; higher is better. The metric returns the following fields:

param score: The BLEU score.

param counts: List of counts of correct ngrams, 1 <= n <= max_ngram_order

param totals: List of counts of total ngrams, 1 <= n <= max_ngram_order

param precisions: List of precisions, 1 <= n <= max_ngram_order

param bp: The brevity penalty.

param sys_len: The cumulative system length.

param ref_len: The cumulative reference length

In [11]:
# Predictions are a list of strings (one per example), whereas references are a list of
# lists of strings, because the same sentence can have several acceptable translations.
metric = evaluate.load("sacrebleu")

predictions = ["This plugin lets you translate web pages between several languages automatically."]
references = [["This plugin allows you to automatically translate web pages between several languages."]]

metric.compute(references=references, predictions=predictions)

Using the latest cached version of the module from C:\Users\prafu\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--sacrebleu\28676bf65b4f88b276df566e48e603732d0b4afd237603ebdf92acaacf5be99b (last modified on Tue Jul 15 15:53:27 2025) since it couldn't be found locally at evaluate-metric--sacrebleu, or remotely on the Hugging Face Hub.


{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [12]:
# A degenerate prediction: much shorter than the reference and made of one repeated word.
# The score collapses because almost no n-grams match and the brevity penalty is severe.
predictions = ["This This This This"]
references = [["This plugin allows you to automatically translate web pages between several languages."]]

metric.compute(references=references, predictions=predictions)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

In [13]:
def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for a set of generated translations.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        A dict with a single entry, ``"BLEU Score"``, holding the corpus-level
        SacreBLEU score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be decoded.
    # Labels are padded with it by the data collator, and generated predictions can be
    # padded with it as well when batches of different lengths are gathered, so it is
    # replaced by the pad token on both sides before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decode_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # SacreBLEU expects one string per prediction and a list of reference strings per example.
    decode_preds = [pred.strip() for pred in decode_preds]
    decode_labels = [[label.strip()] for label in decode_labels]

    result = metric.compute(predictions=decode_preds, references=decode_labels)
    return {
        "BLEU Score": result["score"]
    }

In [14]:
# Log in to the Hugging Face Hub; the training arguments below set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="marian-finetuned-kde4-en-to-hi",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # No evaluation during training; evaluation is run explicitly before and after it.
    eval_strategy="no",
    # Evaluate on generated sequences so that BLEU can be computed.
    predict_with_generate=True,
    # Checkpointing and upload
    save_strategy="epoch",
    save_total_limit=3,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [16]:
# Baseline: evaluate the pretrained checkpoint on the test split before any fine-tuning.
trainer.evaluate(max_length=max_length)

{'eval_loss': 2.1635098457336426,
 'eval_model_preparation_time': 0.0041,
 'eval_BLEU Score': 56.46803856381509,
 'eval_runtime': 1031.8196,
 'eval_samples_per_second': 9.423,
 'eval_steps_per_second': 0.147}

In [17]:
# Fine-tune the model using the settings defined in `training_args`.
trainer.train()

Step,Training Loss
500,1.5693
1000,1.3667
1500,1.2458
2000,1.256
2500,1.2548
3000,1.1003
3500,1.0458
4000,1.0546
4500,1.03
5000,1.0789




TrainOutput(global_step=8205, training_loss=1.1059731083044113, metrics={'train_runtime': 1666.8399, 'train_samples_per_second': 157.491, 'train_steps_per_second': 4.922, 'total_flos': 1894897412997120.0, 'train_loss': 1.1059731083044113, 'epoch': 3.0})

In [None]:
# Run the same evaluation again after fine-tuning, so the BLEU score can be compared with the baseline.
trainer.evaluate(max_length=max_length)

In [19]:
# Upload the fine-tuned model to the Hub, tagged as a translation model.
trainer.push_to_hub(tags="translation", commit_message="Training Completed")

Uploading...:   0%|          | 0.00/306M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/praful-goel/marian-finetuned-kde4-en-to-hi/commit/eeb03fe48a640b3d98bf48fb68c63f56b0ccc238', commit_message='Training Completed', commit_description='', oid='eeb03fe48a640b3d98bf48fb68c63f56b0ccc238', pr_url=None, repo_url=RepoUrl('https://huggingface.co/praful-goel/marian-finetuned-kde4-en-to-hi', endpoint='https://huggingface.co', repo_type='model', repo_id='praful-goel/marian-finetuned-kde4-en-to-hi'), pr_revision=None, pr_num=None)

### Using our translation model

In [20]:
from transformers import pipeline

In [24]:
# Load the fine-tuned model from the Hub into a translation pipeline and try it on one sentence.
translation_pipeline = pipeline(
    task="translation",
    model="praful-goel/marian-finetuned-kde4-en-to-hi",
)

translation_pipeline("Press Escape to exit the application.")

Device set to use cuda:0


[{'translation_text': 'अनुप्रयोग से बाहर होने के लिए एस्केप दबाएँ.'}]