In [1]:
import pandas as pd
import numpy as np
import string
import re
import pickle
import contractions
import unicodedata

#### There are three available sources. I am choosing sentence pairs from TED talk transcripts.

In [2]:
df = pd.read_csv("C:/Users/sidac/Downloads/Hindi_English_Truncated_Corpus.csv")
df=df[df['source']=='ted']
df.info()
df = df.dropna()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39881 entries, 0 to 127606
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   source            39881 non-null  object
 1   english_sentence  39881 non-null  object
 2   hindi_sentence    39881 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [3]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

### Preprocessing 
#### English Sentences

1. The sentences are converted to lowercase.
2. Contractions are expanded. (Ex- can't is changed to can not)
3. All the digits are removed.
4. Extra whitespaces from start and end of the sentences are removed.
5. Multiple whitespaces are removed to keep only one.

#### Hindi Sentences
1. Punctuations like '।' and ''' are removed which are specific to these Hindi Sentences.
2. All the digits are removed.
3. All the english alphabets are removed.
4. Extra whitespaces from start and end of the sentences are removed.
5. Multiple whitespaces are removed to keep only one.
6. All the special characters are removed.
7. Sentences are normalized.

In [4]:
def expand_contractions(text):
    return contractions.fix(text)
def preprocess_data(data):
    remove_special= set(string.punctuation)
    data['english_sentence']=data['english_sentence'].apply(lambda x: x.lower())
    data['english_sentence']=data['english_sentence'].apply(expand_contractions)
    data['english_sentence']=data['english_sentence'].apply(lambda x: re.sub(r'[^a-zA-Z ]+', "", x))
    data['english_sentence']=data['english_sentence'].apply(lambda x: x.strip())
    data['english_sentence']=data['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
    
    
    
    data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: re.sub("'", "", x))
    data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: re.sub("।", "", x))
    data['hindi_sentence']=data['hindi_sentence'].str.replace("\d+", "", regex=True)
    data['hindi_sentence']=data['hindi_sentence'].str.replace(r'[a-zA-Z]', '', regex=True)
    data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: x.strip())
    data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))
    data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in remove_special))
    data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: unicodedata.normalize('NFKC', x))
    
    
    return data

In [5]:
df = preprocess_data(df)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39881 entries, 0 to 127606
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   source            39881 non-null  object
 1   english_sentence  39881 non-null  object
 2   hindi_sentence    39881 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...
1,ted,i would like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी
3,ted,what we really mean is that they are bad at no...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,and who are we to say even that they are wrong,और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,so there is some sort of justice,तो वहाँ न्याय है


### Train-Val-Test Split (70:20:10)

1. DataFrame is being split in required ratio
2. Dataset is converted in the required format for finetuning

In [6]:
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(df)
import random
random.seed(42)

train_size = int(len(dataset) * 0.7)
val_size = int(len(dataset) * 0.2)
test_size = len(dataset) - train_size - val_size

data_train_test = dataset.train_test_split(test_size=test_size)
data_train_validation = data_train_test["train"].train_test_split(test_size=val_size)
random.seed(None)

raw_datasets = DatasetDict({
    "train": Dataset.from_dict({"translation": [{"en": src, "hi": tgt} for src, tgt in zip(data_train_validation["train"]["english_sentence"], data_train_validation["train"]["hindi_sentence"])]}),
    "validation": Dataset.from_dict({"translation": [{"en": src, "hi": tgt} for src, tgt in zip(data_train_validation["test"]["english_sentence"], data_train_validation["test"]["hindi_sentence"])]}),
    "test": Dataset.from_dict({"translation": [{"en": src, "hi": tgt} for src, tgt in zip(data_train_test["test"]["english_sentence"], data_train_test["test"]["hindi_sentence"])]})
})


print(raw_datasets)
print(raw_datasets["train"][1])

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 27916
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 7976
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3989
    })
})
{'translation': {'en': 'you think that it is not true of course', 'hi': 'आप ये सोच तै है  मगर ये सच नहीं है'}}


In [7]:
print(raw_datasets["test"][0:10])

{'translation': [{'en': 'i have a corresponding milestone to robust human rejuvenation', 'hi': 'मेरे पास मजबूत मानव कायाकल्प से मिलता जुलता मानक है'}, {'en': 'and the plant showers them with pollen', 'hi': 'पौधे उन पर पुष्परेणु की बौछार कर देते है'}, {'en': 'that was like just so super cool', 'hi': 'वह वास्त्व मे बहुत मस्त था'}, {'en': 'to cause there to be a shift in our hearts', 'hi': 'कि यह चीज़ हमारे ह्रदय पर कुछ प्रभाव डाले'}, {'en': 'are ufos alien spaceships or perceptual cognitive mistakes or even fakes', 'hi': 'विदेशी अंतरिक्षयान या अवधारणात्मक संज्ञानात्मक गलतियाँ  या यहाँ तक कि झूठ '}, {'en': 'and when you look at the news through that filter', 'hi': 'और जब समाचारों को ऐसी छलनी से छान कर देखते हैं'}, {'en': 'they fly not with rotating components', 'hi': 'वे घूमने वाले उपकरणों के साथ नहीं उड़'}, {'en': 'next to a hungarian physicist about my age', 'hi': 'मेरी उम्र के एक हंगरी के भौतिक विज्ञानी के साथ'}, {'en': 'is now fairly stable and being managed so that business people', 

### Tokenization

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:

max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)


Map:   0%|          | 0/27916 [00:00<?, ? examples/s]



Map:   0%|          | 0/7976 [00:00<?, ? examples/s]

Map:   0%|          | 0/3989 [00:00<?, ? examples/s]

In [11]:
print(tokenized_datasets["train"][0])

{'translation': {'en': 'to all the beliefs i hold', 'hi': 'मेरी मान्यताओं पर'}, 'input_ids': [7, 98, 4, 4930, 5556, 1763, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'labels': [334, 53706, 33, 0]}


In [12]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 27916
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7976
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3989
    })
})


In [13]:

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)




  return self.fget.__get__(instance, owner)()


### Setting up required arguments for Trainer

In [14]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True    
)

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


## Defining Metrics for evaluation

In [16]:
from evaluate import load
metric_bleu = load("sacrebleu")
metric_meteor = load("meteor")
metric_ter = load("ter")
import numpy as np 

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    # Compute BLEU score
    bleu_result = metric_bleu.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = bleu_result["score"]
    
    # Compute METEOR score
    meteor_result = metric_meteor.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_score = meteor_result["meteor"]
    
    # Compute TER score
    ter_result = metric_ter.compute(predictions=decoded_preds, references=decoded_labels)
    ter_score = ter_result["score"]
    
    result = {"bleu": round(bleu_score, 4), "meteor": round(meteor_score, 4), "ter": round(ter_score, 4)}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    
    return result



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sidac\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sidac\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sidac\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Training

In [17]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
trainer.train()


Epoch,Training Loss,Validation Loss,Bleu,Meteor,Ter,Gen Len
1,3.2509,3.052552,14.669,0.3459,75.2231,9.835757
2,2.8816,2.987682,15.003,0.3503,74.6484,9.880391
3,2.6642,2.968331,15.2449,0.354,74.1983,9.864218
4,2.5097,2.966589,15.1961,0.3548,74.3321,9.912738
5,2.4387,2.968678,15.2879,0.3549,74.1299,9.896189


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


TrainOutput(global_step=8725, training_loss=2.7686472650653653, metrics={'train_runtime': 28838.5332, 'train_samples_per_second': 4.84, 'train_steps_per_second': 0.303, 'total_flos': 591901468655616.0, 'train_loss': 2.7686472650653653, 'epoch': 5.0})

In [19]:
trainer.save_model("helsinkiFinalwithMetrics")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


### Inference

In [20]:
text = "Legumes share resources with nitrogen-fixing bacteria."
text2 = "my name is John"
text3 = 'he died'
text4  = "i would like to tell you about one such child"


In [21]:
def predict(sentence):
    inputs = tokenizer(sentence, return_tensors="pt").input_ids
    print(inputs)
    model = AutoModelForSeq2SeqLM.from_pretrained("helsinkiFinalwithMetrics")
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return outputs
     

In [22]:

pred = predict(text4)
pred

tensor([[5556,  178,  288,    7, 1169,   27,  195,  131,  295, 1075,    0]])


'मैं आपको ऐसे एक बच्चे के बारे में बताना चाहूँगा'

In [23]:
model = AutoModelForSeq2SeqLM.from_pretrained("helsinkiFinalwithMetrics")

In [24]:
import torch

In [25]:
example=tokenized_datasets["test"][0]
print(example)
input_ids = [example["input_ids"]]
attention_mask = [example["attention_mask"]]
print(torch.tensor(input_ids))
outputs = model.generate(input_ids=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask))
outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(outputs)

{'translation': {'en': 'i have a corresponding milestone to robust human rejuvenation', 'hi': 'मेरे पास मजबूत मानव कायाकल्प से मिलता जुलता मानक है'}, 'input_ids': [5556, 55, 19, 12182, 30734, 7, 54326, 804, 2498, 15072, 1345, 14124, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [228, 173, 14204, 3243, 24, 1615, 39207, 12, 1855, 30004, 3877, 5, 0]}
tensor([[ 5556,    55,    19, 12182, 30734,     7, 54326,   804,  2498, 15072,
          1345, 14124,     0]])
मेरे पास मजबूत मानव परिक्षण के बराबर महत्वपूर्ण है


### Evaluation on Test Dataset

In [26]:
example=tokenized_datasets["test"][2]
print(example)
input_ids = [example["input_ids"]]
attention_mask = [example["attention_mask"]]
label= [example["labels"]]
print(label)
print(torch.tensor(input_ids))
outputs = model.generate(input_ids=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask))
print(outputs)
metrics = compute_metrics((outputs, label))


bleu_score = metrics['bleu']
meteor_score = metrics['meteor']
ter_score = metrics['ter']
print("BLEU Score on Test Example:", bleu_score)
print("METEOR Score on Test Dataset:", meteor_score)
print("TER Score on Test Dataset:", ter_score)

{'translation': {'en': 'that was like just so super cool', 'hi': 'वह वास्त्व मे बहुत मस्त था'}, 'input_ids': [26, 80, 288, 469, 166, 8708, 8810, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [49, 32245, 1185, 926, 1878, 176, 38075, 82, 0]}
[[49, 32245, 1185, 926, 1878, 176, 38075, 82, 0]]
tensor([[  26,   80,  288,  469,  166, 8708, 8810,    0]])
tensor([[61949,    60,  4828,    58,   470,    82,     0]])
BLEU Score on Test Example: 8.7458
METEOR Score on Test Dataset: 0.0847
TER Score on Test Dataset: 83.3333


In [27]:
test_results = trainer.predict(tokenized_datasets["test"])


print("Test Bleu Score: ", test_results.metrics["test_bleu"])

Test Bleu Score:  14.597


In [28]:
print("Test Metrics: ", test_results.metrics)

Test Metrics:  {'test_loss': 2.9722914695739746, 'test_bleu': 14.597, 'test_meteor': 0.3538, 'test_ter': 74.4472, 'test_gen_len': 9.92730007520682, 'test_runtime': 911.4809, 'test_samples_per_second': 4.376, 'test_steps_per_second': 0.274}
