# Mount Google Drive

Run the cell below to mount Google Drive in the notebook (required for both training and testing the model).

In [None]:
# Mount Google Drive at /content/drive; every dataset and model path below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Libraries

Run the cell below before training or testing the model **(required for both the local and the Hugging Face workflow)**.

In [None]:
!pip install datasets
!pip install sacremoses
!pip install transformers
!pip install huggingface_hub
!pip install transformers[sentencepiece] datasets

Run the cell below before training or testing the model **(Hugging Face workflow only)**.

In [None]:
from huggingface_hub import notebook_login

notebook_login()
# Paste your own Hugging Face access token into the prompt that appears.
# Never store a token in the notebook: the one that used to be written here is exposed and should be revoked.

Run the cell below before training or testing the model **(Hugging Face workflow only)**.

In [None]:
# Install git-lfs and set the global git identity (Hugging Face workflow only).
# NOTE(review): the e-mail and user name are hard-coded; replace them with your own.
!apt install git-lfs
!git config --global user.email "shamimmahbub230@gmail.com"
!git config --global user.name "shamim237"

# Data Preprocessing
### ***Not required to run this section for testing the model***

Load dataset

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset

In [None]:
# path of the tab-separated source file with the English-Italian sentence pairs
tsv_file= "/content/drive/MyDrive/Language-Translation/Sentence pairs in English-Italian - 2022-10-18_cleaned.tsv"

# read the TSV file as UTF-8 into a DataFrame
# NOTE(review): the name `csv` shadows the standard-library module of the same name
csv = pd.read_table(tsv_file,sep='\t', encoding= "utf-8")

# write the same table back out as a CSV file (without the pandas index)
csv.to_csv('/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it.csv',index=False)

In [None]:
# load the sentence pairs from the CSV file on the mounted drive
dataset = pd.read_csv('/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it.csv')

In [None]:
# name the two columns; the rest of the notebook reads them as 'english' and 'italian'
dataset.columns = ['english', 'italian']

In [None]:
# preview the loaded sentence pairs
dataset

Unnamed: 0,english,italian
0,I have to go to sleep.,Devo andare a dormire.
1,I have to go to sleep.,Io devo andare a dormire.
2,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 giugno ed è il compleanno di Muir...
3,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 di giugno ed è il compleanno di M...
4,Muiriel is 20 now.,Muiriel ha 20 anni adesso.
...,...,...
572306,He is leaving for New York next week.,Andrà a New York la settimana prossima.
572307,He is leaving for New York next week.,Lui andrà a New York la settimana prossima.
572308,Tom is a teenager.,Tom è un ragazzo adolescente.
572309,I imagine I am a butterfly.,Immagino di essere una farfalla.


In [None]:
# drop rows that are exact duplicates (same english AND same italian text);
# rows that share only one of the two columns are kept
dataset = dataset.drop_duplicates()

In [None]:
dataset # dataset after removing duplicate rows

Unnamed: 0,english,italian
0,I have to go to sleep.,Devo andare a dormire.
1,I have to go to sleep.,Io devo andare a dormire.
2,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 giugno ed è il compleanno di Muir...
3,Today is June 18th and it is Muiriel's birthday!,Oggi è il 18 di giugno ed è il compleanno di M...
4,Muiriel is 20 now.,Muiriel ha 20 anni adesso.
...,...,...
572306,He is leaving for New York next week.,Andrà a New York la settimana prossima.
572307,He is leaving for New York next week.,Lui andrà a New York la settimana prossima.
572308,Tom is a teenager.,Tom è un ragazzo adolescente.
572309,I imagine I am a butterfly.,Immagino di essere una farfalla.


In [None]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible,
# then show ten rows as a quick sanity check
dataset = dataset.sample(frac=1, random_state=4444)
dataset.iloc[1000:1010]

Unnamed: 0,english,italian
490801,Why didn't Mary tell us that she didn't know h...,Perché Mary non ci ha detto che non sapeva nuo...
245701,What Tom did was stupid.,Quello che ha fatto Tom era stupido.
311754,Tom trusted me.,Tom si fidava di me.
43324,The weather was beautiful and we stayed on the...,Il tempo era bello e siamo state in spiaggia t...
100742,Tom works as a bouncer.,Tom lavora come buttafuori.
419579,Do you know why Tom left?,Sa perché Tom se n'è andato?
230137,I ate everything on the plate.,Ho mangiato tutto nel piatto.
407056,I didn't shoot anyone.,Non sparai a nessuno.
279218,What more can you say?,Che altro riesci a dire?
456335,I was speaking in Berber.,Stavo parlando in berbero.


In [None]:
# Split the rows 90% / 5% / 5% into train / validation / test.
# The rows are reshuffled with a fixed seed and then cut by position.
# Plain .iloc slices yield the same three frames as np.split(...) did, without
# routing a DataFrame through NumPy (which relies on the deprecated
# DataFrame.swapaxes).
shuffled = dataset.sample(frac=1, random_state=42)
train_end = int(.9*len(dataset))
valid_end = int(.95*len(dataset))

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
# preview the training split
train

Unnamed: 0,english,italian
147230,I need to see Tom now.,Devo vedere Tom ora.
315115,Tom is making faces at me.,Tom mi sta facendo delle facce.
3565,You deserve the prize.,Te lo meriti il premio.
265829,I don't usually snore.,Io solitamente non russo.
45973,I wish he had been more careful when he spoke.,Avrei voluto che fosse stato più attento nel p...
...,...,...
281478,I think Tom doesn't come here very often.,Io penso che Tom non venga qui molto spesso.
433808,Will Tom be OK?,Tom starà bene?
81810,I will send you a copy of my plans as soon as ...,Ti manderò una copia dei miei piani appena rie...
31676,Can you lend me some money?,Puoi prestarmi dei soldi?


In [None]:
# preview the validation split
validation

Unnamed: 0,english,italian
437816,I used to love Australia.,Io amavo l'Australia.
119680,Which pen do you see?,Quale biro vede?
354273,I'll dance with him.,Danzerò con lui.
528798,They look like they're poor.,Sembrano essere poveri.
185869,Do you mean Tom is rich?,Vuole dire che Tom è ricco?
...,...,...
161980,What song is she playing?,Che canzone sta suonando?
398378,Tom didn't survive the accident.,Tom non è sopravvissuto all'incidente.
380794,I've got no secrets from them.,Non ho segreti con loro.
518570,She stopped going to the mosque.,Ha smesso di andare alla moschea.


In [None]:
# preview the test split
test

Unnamed: 0,english,italian
452569,I like oranges a lot.,Mi piacciono molto le arance.
81380,You speak tremendously fast.,Tu parli tremendamente veloce.
321957,Tom had to go to work.,Tom doveva andare al lavoro.
117682,I love you both.,Vi amo entrambe.
25996,You are rude.,È maleducata.
...,...,...
79081,Pigeons are very bothersome birds in cities.,I piccioni sono degli uccelli molto seccanti n...
523803,I'm tired of living here.,Sono stanca di vivere qua.
350339,I asked you to leave Tom alone.,Ti ho chiesto di lasciare Tom da solo.
530734,That's all I know about her.,È tutto ciò che so su di lei.


In [None]:
# save the train, validation and test splits for further use (the pandas index is not written)
train.to_csv('/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it-train.csv', index=False)
validation.to_csv('/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it-valid.csv', index=False)
test.to_csv('/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it-test.csv',index=False)

In [None]:
# wrap each split in its own Hugging Face Dataset;
# preserve_index=False keeps the pandas index out of the resulting columns
train_ds = Dataset.from_pandas(train,preserve_index = False)
valid_ds = Dataset.from_pandas(validation,preserve_index = False)
test_ds = Dataset.from_pandas(test,preserve_index = False)

#  Load and Build Model 
### ***Not required to run this section for testing the model***

In [None]:
# Checkpoint name of the pre-trained model; the tokenizer and the model are both loaded from it
model_checkpoint = "Helsinki-NLP/opus-mt-en-it"

In [None]:
# load the tokenizer that belongs to the chosen checkpoint; it is used to tokenize the sentences
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/789k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

In [None]:
# Set the source and target language on the tokenizer (mBART checkpoints only).
# mBART language codes use an underscore and a locale suffix:
# English is "en_XX" and Italian is "it_IT".
if "mbart" in model_checkpoint:
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "it_IT"

In [None]:
# T5 checkpoints expect a task prefix in front of every input sentence;
# all other checkpoints get an empty prefix.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "translate English to Italian: "
else:
    prefix = ""

In [None]:
max_input_length = 128 # max_length passed to the tokenizer for the input sentences
max_target_length = 128 # max_length passed to the tokenizer for the target sentences
source_lang = "en"
target_lang = "it"

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for the model.

    Args:
        examples: batch with an "english" and an "italian" column, each a
            list of sentences.

    Returns:
        The tokenizer output for the (prefixed) English sentences, with the
        token ids of the Italian sentences stored under "labels". Both sides
        are truncated to their respective maximum length.
    """
    inputs = [prefix + ex for ex in examples["english"]]
    targets = list(examples["italian"])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes the targets with the target-side settings and
    # replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# apply the preprocessing function to the train, validation and test datasets, one batch at a time
train_data = train_ds.map(preprocess_function, batched=True)
valid_data = valid_ds.map(preprocess_function, batched=True)
test_data = test_ds.map(preprocess_function, batched=True)

In [None]:
# download and load the pre-trained TensorFlow seq2seq model for the checkpoint
# (DataCollatorForSeq2Seq is imported here and used when the collators are built)
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# hyperparameters
batch_size = 32          # batch size of the train and validation datasets
learning_rate = 2e-5     # passed to the AdamWeightDecay optimizer
weight_decay = 0.01      # passed to the optimizer as weight_decay_rate

# last path component of the checkpoint name, e.g. "opus-mt-en-it"
model_name = model_checkpoint.split("/")[-1]
# push_to_hub_model_id = f"{model_name}-finetune101-{source_lang}-to-{target_lang}"

In [None]:
# collator that batches tokenized examples and returns TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
# same collator, but with pad_to_multiple_of=128
# NOTE(review): this collator is not used by any cell shown in this notebook
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [None]:
# Build the batched datasets from the tokenized splits.
# Only the training data is shuffled.
train_dataset = model.prepare_tf_dataset(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

validation_dataset = model.prepare_tf_dataset(
    valid_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

# the test dataset uses a fixed batch size of 4 instead of `batch_size`
test_dataset = model.prepare_tf_dataset(
    test_data,
    batch_size= 4,
    shuffle = False,
    collate_fn = data_collator,
)

In [None]:
# compile the model
from transformers import AdamWeightDecay
import tensorflow as tf

# Adam with weight decay, configured from the hyperparameters above
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# no loss is passed to compile(), so the model's internal loss computation is used
model.compile(optimizer=optimizer, metrics=['acc'])

In [None]:
# Stop training once val_loss has not improved for 5 consecutive epochs.
# restore_best_weights=True puts the best weights back into the model when
# training stops early; otherwise the model saved and pushed afterwards would
# hold the weights of the last (non-improving) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience= 5,
    verbose= 0,
    mode= "auto",
    baseline=None,
    restore_best_weights=True)

# Save the weights to Drive every time val_loss improves; the file name
# carries the epoch number and the train / validation accuracy.
best_model = tf.keras.callbacks.ModelCheckpoint(
    '/content/drive/MyDrive/Language-Translation/en-it/Models/model-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5',
    verbose=1,
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    mode='auto')


In [None]:
# NOTE: PushToHubCallback is imported but not added to the callback list below
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

# write TensorBoard logs to a local directory
tensorboard_callback = TensorBoard(log_dir="./translation_model_save/logs")
callbacks = [early_stop, best_model, tensorboard_callback]

# start to train the model (at most 50 epochs; early stopping may end it sooner)
model.fit(train_dataset, validation_data=validation_dataset, epochs= 50, callbacks=callbacks)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.29498, saving model to /content/drive/MyDrive/Language-Translation/en-it/Models/model-001-0.406756-0.403819.h5
Epoch 2/50
Epoch 2: val_loss improved from 0.29498 to 0.28461, saving model to /content/drive/MyDrive/Language-Translation/en-it/Models/model-002-0.412845-0.405152.h5
Epoch 3/50
Epoch 3: val_loss improved from 0.28461 to 0.27868, saving model to /content/drive/MyDrive/Language-Translation/en-it/Models/model-003-0.414711-0.405404.h5
Epoch 4/50
Epoch 4: val_loss improved from 0.27868 to 0.27773, saving model to /content/drive/MyDrive/Language-Translation/en-it/Models/model-004-0.416552-0.405580.h5
Epoch 5/50
Epoch 5: val_loss did not improve from 0.27773
Epoch 6/50
Epoch 6: val_loss did not improve from 0.27773
Epoch 7/50
Epoch 7: val_loss did not improve from 0.27773
Epoch 8/50
Epoch 8: val_loss did not improve from 0.27773
Epoch 9/50
Epoch 9: val_loss did not improve from 0.27773


<keras.callbacks.History at 0x7f8cd4d00e10>

# Save and Load trained Model

In [None]:
# save the tokenizer and the trained model to Drive for testing purposes (saved locally)
tokenizer.save_pretrained("/content/drive/MyDrive/Language-Translation/en-it/Misc")
model.save_pretrained("/content/drive/MyDrive/Language-Translation/en-it/Misc")

In [None]:
# push the model and the tokenizer to the same Hugging Face repository for later use
model.push_to_hub("shamim237/en-it-model")
tokenizer.push_to_hub("shamim237/en-it-model")

Run the cell below to load the model for testing **(Hugging Face workflow only)**.

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# load the saved model and tokenizer from the Hugging Face repository
# (this rebinds `model_name`, `tokenizer` and `model`)
model_name = 'shamim237/en-it-model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

Run the cell below to load the model for testing **(local workflow only)**.

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# load the saved model and tokenizer from the directory on the mounted drive
# (this rebinds `tokenizer` and `model`)

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Language-Translation/en-it/Misc/")
model = TFAutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Language-Translation/en-it/Misc/")

Run the cell below if you want to see the model's accuracy and loss on the test dataset.

In [None]:
# Evaluation performs no weight updates, so no optimizer has to be passed;
# compiling without one removes the dependency on the `optimizer` object that
# only exists after the training section has been run.
# No loss is passed either, so the model's internal loss computation is used.
# NOTE: `test_dataset` must already exist (it is built in the model-building section).
model.compile(metrics=['acc'])
model.evaluate(test_dataset)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.




[0.27917957305908203, 0.6586927175521851]

# Inference of the trained Model

Run all the cells below to test the model on the training and test datasets **(applies to both the local and the Hugging Face workflow)**.

In [None]:
import time
import nltk
import tensorflow as tf
import pandas as pd
from collections import Counter
from nltk.util import everygrams, ngrams
nltk.download('punkt')

In [None]:
@tf.function(jit_compile=True)
def generate(inputs):
    """Run `model.generate` on tokenized inputs with a maximum length of 128.

    `inputs` is unpacked as keyword arguments, so it must be a mapping such as
    the output of the tokenizer. The function is wrapped in a `tf.function`
    with `jit_compile=True`.
    """
    return model.generate(**inputs, max_length=128)

In [None]:
def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    """Score a single hypothesis against its references.

    Wraps the references and the hypothesis in one-element lists and
    delegates to `corpus_gleu` with the same n-gram length bounds.
    """
    return corpus_gleu(
        [references],
        [hypothesis],
        min_len=min_len,
        max_len=max_len,
    )


def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """Compute one GLEU score over a whole corpus.

    Args:
        list_of_references: for every hypothesis, a list of tokenized
            reference sentences.
        hypotheses: list of tokenized hypothesis sentences.
        min_len: shortest n-gram order that is counted.
        max_len: longest n-gram order that is counted.

    Returns:
        Matching n-grams divided by the n-gram total, both summed over all
        sentences; 0.0 when there is nothing to count. For each sentence the
        reference with the highest match ratio is the one that contributes.
    """
    # every hypothesis needs its own list of references
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    total_match = 0
    total_all = 0

    for refs, hyp in zip(list_of_references, hypotheses):
        hyp_counter = Counter(everygrams(hyp, min_len, max_len))
        hyp_total = sum(hyp_counter.values())  # n-grams in the hypothesis

        best_pair = None   # (matches, denominator) of the best reference so far
        best_ratio = None
        for ref in refs:
            ref_counter = Counter(everygrams(ref, min_len, max_len))
            ref_total = sum(ref_counter.values())  # n-grams in the reference

            denominator = max(hyp_total, ref_total)
            if denominator <= 0:
                # neither side has any n-grams: nothing to score
                continue

            matches = sum((ref_counter & hyp_counter).values())
            ratio = matches / denominator
            # strict comparison keeps the first reference on ties
            if best_ratio is None or ratio > best_ratio:
                best_ratio = ratio
                best_pair = (matches, denominator)

        if best_pair is not None:
            total_match += best_pair[0]
            total_all += best_pair[1]

    # empty corpus or empty references: avoid dividing by zero
    return 0.0 if total_all == 0 else total_match / total_all


Get the individual GLEU scores on the test dataset

In [None]:
# load the saved test split as a DataFrame
# NOTE: this rebinds `test_data`, which held the tokenized test split after the preprocessing step
test_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it-test.csv")

In [None]:
# Translate the first 500 test sentences one by one, print every prediction
# and collect the per-sentence GLEU scores in `sco`.
start_time = time.time()
sco = []
k = 0
for eng, ita in zip(test_data['english'], test_data['italian']):
  k +=1
  print(k)
  print("English           : ", eng)
  print("Actual Italian    : ", ita)
  # pad_to_multiple_of only takes effect when padding is switched on; with
  # padding=True every input is padded to a multiple of 128 tokens, which
  # keeps the input shape of the XLA-compiled generate() stable.
  tokenized_data = tokenizer([eng], return_tensors="np", padding=True, pad_to_multiple_of=128)
  out = generate(tokenized_data)
  # decode once and reuse the text for printing and scoring
  with tokenizer.as_target_tokenizer():
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
  print("Predicted Italian : ", predicted_text)
  actual_tokens    = nltk.word_tokenize(ita)
  predicted_tokens = nltk.word_tokenize(predicted_text)
  # NOTE(review): max_len=128 counts n-grams of every order up to 128;
  # corpus_gleu itself defaults to max_len=4 -- confirm this is intended.
  scores = corpus_gleu([[actual_tokens]], [predicted_tokens], min_len=1, max_len=128)
  print("GLEU Score        : ", scores)
  sco.append(scores)
  if k ==500:
    # report the elapsed time and stop after 500 samples
    now = time.time()
    t_time = now-start_time
    print(t_time)
    break


In [None]:
#  to get average of GLEU Score on test dataset
def Average(lst):
    """Return the arithmetic mean of the values in `lst`.

    Raises ZeroDivisionError when `lst` is empty.
    """
    return sum(lst) / len(lst)

In [None]:
# mean of the per-sentence GLEU scores collected in `sco`
avg_scores_gleu = Average(sco)

In [None]:
avg_scores_gleu # average GLEU score of the scored test samples

Get the individual GLEU scores on the train dataset

In [None]:
# load the saved training split as a DataFrame
# NOTE: this rebinds `train_data`, which held the tokenized training split after the preprocessing step
train_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it-train.csv")

In [None]:
# Translate the first 500 training sentences one by one, print every
# prediction and collect the per-sentence GLEU scores in `sco`.
start_time = time.time()
sco = []
k = 0
for eng, ita in zip(train_data['english'], train_data['italian']):
  k +=1
  print(k)
  print("English           : ", eng)
  print("Actual Italian    : ", ita)
  # pad_to_multiple_of only takes effect when padding is switched on; with
  # padding=True every input is padded to a multiple of 128 tokens, which
  # keeps the input shape of the XLA-compiled generate() stable.
  tokenized_data = tokenizer([eng], return_tensors="np", padding=True, pad_to_multiple_of=128)
  out = generate(tokenized_data)
  # decode once and reuse the text for printing and scoring
  with tokenizer.as_target_tokenizer():
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
  print("Predicted italian : ", predicted_text)
  actual_tokens    = nltk.word_tokenize(ita)
  predicted_tokens = nltk.word_tokenize(predicted_text)
  # NOTE(review): max_len=128 counts n-grams of every order up to 128;
  # corpus_gleu itself defaults to max_len=4 -- confirm this is intended.
  scores = corpus_gleu([[actual_tokens]], [predicted_tokens], min_len=1, max_len=128)
  print("GLEU Score        : ", scores)
  sco.append(scores)
  if k ==500:
    # report the elapsed time and stop after 500 samples
    now = time.time()
    t_time = now-start_time
    print(t_time)
    break


In [None]:
#  to get average of GLEU Score on train dataset
def Average(lst):
    """Return the arithmetic mean of the values in `lst`.

    Raises ZeroDivisionError when `lst` is empty.
    """
    return sum(lst) / len(lst)

In [None]:
# mean of the per-sentence GLEU scores collected in `sco`
avg_scores_gleu = Average(sco)

In [None]:
avg_scores_gleu # average GLEU score of the scored training samples

# **Model Testing Section**

✅✅***Standalone section for testing the English-Italian model.***✅✅



In [None]:
# Mount Google Drive at /content/drive; the saved model and the dataset files are read from it.
from google.colab import drive
drive.mount('/content/drive')

Run the cell below before training or testing the model ***(required for both the local and the Hugging Face workflow)***.

In [None]:
!pip install datasets
!pip install sacremoses
!pip install transformers
!pip install huggingface_hub
!pip install transformers[sentencepiece] datasets

Run the cell below before training or testing the model ***(Hugging Face workflow only)***.

In [None]:
from huggingface_hub import notebook_login

notebook_login()
# Paste your own Hugging Face access token into the prompt that appears.
# Never store a token in the notebook: the one that used to be written here is exposed and should be revoked.

Run the cell below before training or testing the model ***(Hugging Face workflow only)***.

In [None]:
# Install git-lfs and set the global git identity (Hugging Face workflow only).
# NOTE(review): the e-mail and user name are hard-coded; replace them with your own.
!apt install git-lfs
!git config --global user.email "shamimmahbub230@gmail.com"
!git config --global user.name "shamim237"

Run the cell below to load the model for testing ***(Hugging Face workflow only)***.

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# load the saved model and tokenizer from the Hugging Face repository
model_name = 'shamim237/en-it-model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

Run the cell below to load the model for testing ***(local workflow only)***.

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# load the saved model and tokenizer from the directory on the mounted drive

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Language-Translation/en-it/Misc/")
model = TFAutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Language-Translation/en-it/Misc/")

**Run all the cells below** to test the model on the training and test datasets **(applies to both the local and the Hugging Face workflow)**.

In [None]:
import time
import nltk
import pandas as pd
import tensorflow as tf
from collections import Counter
from nltk.util import everygrams, ngrams
nltk.download('punkt')

In [None]:
@tf.function(jit_compile=True)
def generate(inputs):
    """Run `model.generate` on tokenized inputs with a maximum length of 128.

    `inputs` is unpacked as keyword arguments, so it must be a mapping such as
    the output of the tokenizer. The function is wrapped in a `tf.function`
    with `jit_compile=True`.
    """
    return model.generate(**inputs, max_length=128)


# function to get the GLEU Score

def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    """Score a single hypothesis against its references.

    Wraps the references and the hypothesis in one-element lists and
    delegates to `corpus_gleu` with the same n-gram length bounds.
    """
    return corpus_gleu(
        [references],
        [hypothesis],
        min_len=min_len,
        max_len=max_len,
    )


def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """Compute one GLEU score over a whole corpus.

    Args:
        list_of_references: for every hypothesis, a list of tokenized
            reference sentences.
        hypotheses: list of tokenized hypothesis sentences.
        min_len: shortest n-gram order that is counted.
        max_len: longest n-gram order that is counted.

    Returns:
        Matching n-grams divided by the n-gram total, both summed over all
        sentences; 0.0 when there is nothing to count. For each sentence the
        reference with the highest match ratio is the one that contributes.
    """
    # every hypothesis needs its own list of references
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    total_match = 0
    total_all = 0

    for refs, hyp in zip(list_of_references, hypotheses):
        hyp_counter = Counter(everygrams(hyp, min_len, max_len))
        hyp_total = sum(hyp_counter.values())  # n-grams in the hypothesis

        best_pair = None   # (matches, denominator) of the best reference so far
        best_ratio = None
        for ref in refs:
            ref_counter = Counter(everygrams(ref, min_len, max_len))
            ref_total = sum(ref_counter.values())  # n-grams in the reference

            denominator = max(hyp_total, ref_total)
            if denominator <= 0:
                # neither side has any n-grams: nothing to score
                continue

            matches = sum((ref_counter & hyp_counter).values())
            ratio = matches / denominator
            # strict comparison keeps the first reference on ties
            if best_ratio is None or ratio > best_ratio:
                best_ratio = ratio
                best_pair = (matches, denominator)

        if best_pair is not None:
            total_match += best_pair[0]
            total_all += best_pair[1]

    # empty corpus or empty references: avoid dividing by zero
    return 0.0 if total_all == 0 else total_match / total_all


#  to get average of GLEU Score on test dataset
def Average(lst):
    """Return the arithmetic mean of the values in `lst`.

    Raises ZeroDivisionError when `lst` is empty.
    """
    return sum(lst) / len(lst)


get individual GLEU score on test dataset

In [None]:
# load the saved test split as a DataFrame
test_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it-test.csv")

In [None]:
# Translate the first 500 test sentences one by one, print every prediction
# and collect the per-sentence GLEU scores in `sco`.
start_time = time.time()
sco = []
k = 0
for eng, ita in zip(test_data['english'], test_data['italian']):
  k +=1
  print(k)
  print("English           : ", eng)
  print("Actual Italian    : ", ita)
  # pad_to_multiple_of only takes effect when padding is switched on; with
  # padding=True every input is padded to a multiple of 128 tokens, which
  # keeps the input shape of the XLA-compiled generate() stable.
  tokenized_data = tokenizer([eng], return_tensors="np", padding=True, pad_to_multiple_of=128)
  out = generate(tokenized_data)
  # decode once and reuse the text for printing and scoring
  with tokenizer.as_target_tokenizer():
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
  print("Predicted Italian : ", predicted_text)
  actual_tokens    = nltk.word_tokenize(ita)
  predicted_tokens = nltk.word_tokenize(predicted_text)
  # NOTE(review): max_len=128 counts n-grams of every order up to 128;
  # corpus_gleu itself defaults to max_len=4 -- confirm this is intended.
  scores = corpus_gleu([[actual_tokens]], [predicted_tokens], min_len=1, max_len=128)
  print("GLEU Score        : ", scores)
  sco.append(scores)
  if k ==500: # change this value to the number of samples you want GLEU scores for
    # report the elapsed time and stop
    now = time.time()
    t_time = now-start_time
    print(t_time)
    break

In [None]:
# mean of the per-sentence GLEU scores collected in `sco`
avg_scores_gleu = Average(sco)

In [None]:
avg_scores_gleu # average GLEU score of the scored test samples

get individual GLEU score on train dataset

In [None]:
# load the saved training split as a DataFrame
train_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/en-it/dataset/en-it-train.csv")

In [None]:
# Translate the first 500 training sentences one by one, print every
# prediction and collect the per-sentence GLEU scores in `sco`.
start_time = time.time()
sco = []
k = 0
# iterate over the TRAINING split (this loop previously re-scored test_data)
for eng, ita in zip(train_data['english'], train_data['italian']):
  k +=1
  print(k)
  print("English           : ", eng)
  print("Actual Italian    : ", ita)
  # pad_to_multiple_of only takes effect when padding is switched on; with
  # padding=True every input is padded to a multiple of 128 tokens, which
  # keeps the input shape of the XLA-compiled generate() stable.
  tokenized_data = tokenizer([eng], return_tensors="np", padding=True, pad_to_multiple_of=128)
  out = generate(tokenized_data)
  # decode once and reuse the text for printing and scoring
  with tokenizer.as_target_tokenizer():
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
  print("Predicted Italian : ", predicted_text)
  actual_tokens    = nltk.word_tokenize(ita)
  predicted_tokens = nltk.word_tokenize(predicted_text)
  # NOTE(review): max_len=128 counts n-grams of every order up to 128;
  # corpus_gleu itself defaults to max_len=4 -- confirm this is intended.
  scores = corpus_gleu([[actual_tokens]], [predicted_tokens], min_len=1, max_len=128)
  print("GLEU Score        : ", scores)
  sco.append(scores)
  if k ==500: # change this value to the number of samples you want GLEU scores for
    # report the elapsed time and stop
    now = time.time()
    t_time = now-start_time
    print(t_time)
    break

In [None]:
# mean of the per-sentence GLEU scores collected in `sco`
avg_scores_gleu = Average(sco) 

In [None]:
avg_scores_gleu # average GLEU score of the scored training samples