# Mount Google drive

Run the cell below to mount Google Drive in this notebook (also required when only testing the model).

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Import Libraries

Run the cell below before training or testing the model **(both locally and with Hugging Face)**.

In [None]:
!pip install datasets
!pip install sacremoses
!pip install transformers
!pip install huggingface_hub
!pip install transformers[sentencepiece] datasets

Run the cell below before training or testing the model **(only when using Hugging Face)**.

In [None]:
from huggingface_hub import notebook_login

notebook_login()
# Paste your own Hugging Face access token into the box that appears below.
# Never store a token in the notebook itself: anyone who can read the file can use it.

Run the cell below before training or testing the model **(only when using Hugging Face)**.

In [None]:
!apt install git-lfs
!git config --global user.email "shamimmahbub230@gmail.com"
!git config --global user.name "shamim237"

# Data Preprocessing
### ***Not required to run this section for testing the model***

Load dataset

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset

In [None]:
# Cleaned English-Spanish sentence pairs, exported as a tab-separated file.
tsv_file = "/content/drive/MyDrive/Language-Translation/Sentence pairs in English-Spanish - 2022-10-18_cleaned.tsv"

# Load the pairs, then write them back out as a CSV without the index column.
# NOTE(review): the first row of the TSV is consumed as the header — confirm the
# file really has a header row, otherwise one sentence pair is silently dropped.
csv = pd.read_csv(tsv_file, sep='\t', encoding="utf-8")
csv.to_csv(
    '/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en.csv',
    index=False,
)

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en.csv')

In [None]:
# set column names
dataset.columns = ['english', 'spanish']

In [None]:
dataset

In [None]:
# remove duplicates values from each column
dataset = dataset.drop_duplicates()

In [None]:
dataset # dataset after removing duplicates pairs

In [None]:
# Shuffle the dataset
dataset = dataset.sample(frac=1, random_state=0)
dataset.iloc[1000:1010]

In [None]:
# Split the pairs 90% / 5% / 5% into train / validation / test after a seeded shuffle.
# Positional slicing replaces `np.split`, which on a DataFrame goes through the
# deprecated `DataFrame.swapaxes` (FutureWarning on recent pandas releases).
shuffled = dataset.sample(frac=1, random_state=42)
train_end = int(.9 * len(dataset))
valid_end = int(.95 * len(dataset))

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
train

In [None]:
validation

In [None]:
test

Unnamed: 0,english,spanish
238691,I don't pick out their clothes.,No escojo la ropa para ellos.
188615,There must've been a tacit understanding betwe...,Tiene que haber existido un acuerdo tácito ent...
58380,I do not work.,No trabajo.
145665,I am wearing an orange t shirt and green pants.,Estoy usando una remera anaranjada y un pantal...
206782,"Oh, just the usual stuff.","Ah, todo está como siempre."
...,...,...
162328,Who invented karaoke?,¿Quién inventó el karaoke?
236830,The airbags failed to deploy.,No saltaron los airbags.
88052,Tom has never heard Mary tell a lie.,Tom nunca ha oído a Mary decir una mentira.
185720,The devil takes the hindmost.,El último mono es el que se ahoga.


In [None]:
# save the train, validation and test splits for further use
train.to_csv('/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en-train.csv', index=False)
validation.to_csv('/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en-valid.csv', index=False)
test.to_csv('/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en-test.csv',index=False)

In [None]:
# make Dataset Dict
train_ds = Dataset.from_pandas(train,preserve_index = False)
valid_ds = Dataset.from_pandas(validation,preserve_index = False)
test_ds = Dataset.from_pandas(test,preserve_index = False)

#  Load and Build Model 
### ***Not required to run this section for testing the model***

In [None]:
# Checkpoint name of the pre-trained model
model_checkpoint = "Helsinki-NLP/opus-mt-es-en"

In [None]:
# load tokenizer from transformers to tokenize sentences
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# set source_language and target_language tokenizer
# mBART checkpoints need explicit language codes on the tokenizer.
# The codes use an underscore ("es_XX", "en_XX"); the hyphenated "es-XX" is not valid.
if "mbart" in model_checkpoint:
    tokenizer.src_lang = "es_XX"
    tokenizer.tgt_lang = "en_XX"

In [None]:
# T5 checkpoints get a task prefix on every input sentence; any other checkpoint
# gets the raw text. ("t5-large" was previously misspelled, so it got no prefix.)
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "translate Spanish to English: "
else:
    prefix = ""

In [None]:
max_input_length = 128 # max length of input sentence
max_target_length = 128 # max length of output sentence
source_lang = "es"
target_lang = "en"

# define a function for processing the inputs for the model
def preprocess_function(examples):
    """Tokenize a batch of Spanish sources and English targets for seq2seq training.

    Args:
        examples: batch mapping with "spanish" and "english" columns, each a
            sequence of sentences (the form `Dataset.map(..., batched=True)` passes).

    Returns:
        The tokenizer output for the prefixed Spanish inputs, with an added
        "labels" entry holding the token ids of the English targets.
    """
    inputs = [prefix + ex for ex in examples["spanish"]]
    targets = list(examples["english"])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` tokenizes in target-language mode; it replaces the deprecated
    # `as_target_tokenizer()` context manager, which is slated for removal in
    # Transformers v5.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# apply the function to the train, validation and test datasets
train_data = train_ds.map(preprocess_function, batched=True)
valid_data = valid_ds.map(preprocess_function, batched=True)
test_data = test_ds.map(preprocess_function, batched=True)

In [None]:
# download and load the pre-trained model
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# hyperparameters
batch_size = 32
learning_rate = 2e-5
weight_decay = 0.01

model_name = model_checkpoint.split("/")[-1]
# push_to_hub_model_id = f"{model_name}-finetune101-{source_lang}-to-{target_lang}"

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [None]:
train_dataset = model.prepare_tf_dataset(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

validation_dataset = model.prepare_tf_dataset(
    valid_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

test_dataset = model.prepare_tf_dataset(
    test_data,
    batch_size= 4,
    shuffle = False,
    collate_fn = data_collator,
)

In [None]:
# compile the model
from transformers import AdamWeightDecay
import tensorflow as tf

optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer, metrics=['acc'])

In [None]:
# early stopping callback to avoid unwanted training
# Stop training once validation loss has not improved for 5 consecutive epochs.
# `restore_best_weights=True` rolls the model back to its best epoch when training
# stops early, so the model that is saved/pushed afterwards holds the best weights
# instead of the weights from the last (worse) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=5,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True)

# Write the weights to Drive every time validation loss reaches a new minimum.
# The file name records the epoch number plus train and validation accuracy.
best_model = tf.keras.callbacks.ModelCheckpoint(
    '/content/drive/MyDrive/Language-Translation/es-en/models/model-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5',
    verbose=1,
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    mode='auto')


In [None]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

tensorboard_callback = TensorBoard(log_dir="./translation_model_save/logs")
callbacks = [early_stop, best_model, tensorboard_callback]

# start to train the model
model.fit(train_dataset, validation_data=validation_dataset, epochs= 50, callbacks=callbacks)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.45749, saving model to /content/drive/MyDrive/Language-Translation/es-en/models/model-001-0.341647-0.340052.h5
Epoch 2/50
Epoch 2: val_loss improved from 0.45749 to 0.45176, saving model to /content/drive/MyDrive/Language-Translation/es-en/models/model-002-0.348263-0.340911.h5
Epoch 3/50
Epoch 3: val_loss did not improve from 0.45176
Epoch 4/50
Epoch 4: val_loss did not improve from 0.45176
Epoch 5/50
Epoch 5: val_loss did not improve from 0.45176
Epoch 6/50
Epoch 6: val_loss did not improve from 0.45176
Epoch 7/50
Epoch 7: val_loss did not improve from 0.45176


<keras.callbacks.History at 0x7f8cc0983a50>

# Save and Load trained Model

In [None]:
# save the trained tokenizer and model for testing purposes (saved to the mounted drive)
tokenizer.save_pretrained("/content/drive/MyDrive/Language-Translation/es-en/misc")
model.save_pretrained("/content/drive/MyDrive/Language-Translation/es-en/misc")

In [None]:
# push the model to huggingface to save it for later use (save to huggingface)
model.push_to_hub("shamim237/es-en-model")
tokenizer.push_to_hub("shamim237/es-en-model")

run the below cell to load the model for testing **(only for huggingface)**

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# load the saved model and tokenizer from huggingface

model_name = 'shamim237/es-en-model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

Run the cell below to load the model for testing **(only when using the locally saved copy)**.

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# load the saved model and tokenizer from local drive

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Language-Translation/es-en/misc")
model = TFAutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Language-Translation/es-en/misc")

Run the cell below if you want to see the model's accuracy and loss on the test dataset.

In [None]:
# NOTE: `optimizer` and `test_dataset` are created in the "Load and Build Model"
# section, so that section must have been run in this session before this cell.
# No loss is passed to compile(); only the `acc` metric is requested explicitly.
model.compile(optimizer=optimizer, metrics=['acc'])
model.evaluate(test_dataset)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.




[0.4723685383796692, 0.6049464344978333]

# Inference of the trained Model

run all the cells in the below, to test the model in training and testing dataset **(both locally and for huggingface)**

In [None]:
import time
import nltk
import pandas as pd
import tensorflow as tf
from collections import Counter
from nltk.util import everygrams, ngrams
nltk.download('punkt')

In [None]:
@tf.function(jit_compile=True)
def generate(inputs):
    """Generate output token ids with the global `model`, compiled via `tf.function`.

    Args:
        inputs: mapping of keyword arguments forwarded to `model.generate`
            (the callers in this notebook pass the tokenizer output).

    Returns:
        Whatever `model.generate` returns, with generation capped at `max_length=128`.
    """
    generated = model.generate(max_length=128, **inputs)
    return generated

In [None]:
def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    """Compute the GLEU score of a single hypothesis.

    The sentence is wrapped into a one-element corpus and scored by `corpus_gleu`.

    Args:
        references: reference token sequences for the hypothesis.
        hypothesis: token sequence being scored.
        min_len: shortest n-gram order taken into account.
        max_len: longest n-gram order taken into account.

    Returns:
        The score returned by `corpus_gleu` for the one-sentence corpus.
    """
    one_sentence_references = [references]
    one_sentence_hypotheses = [hypothesis]
    return corpus_gleu(
        one_sentence_references,
        one_sentence_hypotheses,
        min_len=min_len,
        max_len=max_len,
    )


def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """Compute a corpus-level GLEU score.

    For every hypothesis, n-grams of order `min_len`..`max_len` are counted in the
    hypothesis and in each of its references. A reference contributes the pair
    (matching n-grams, max(hypothesis n-grams, reference n-grams)); the reference
    with the highest ratio is kept, and the kept pairs are summed over the corpus.

    Args:
        list_of_references: one list of reference token sequences per hypothesis.
        hypotheses: hypothesis token sequences, aligned with `list_of_references`.
        min_len: shortest n-gram order taken into account.
        max_len: longest n-gram order taken into account.

    Returns:
        Total matches divided by the total n-gram count, or 0.0 when nothing
        could be counted (empty corpus or empty sentences).

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    total_match = 0
    total_all = 0

    for refs, hyp in zip(list_of_references, hypotheses):
        hyp_counter = Counter(everygrams(hyp, min_len, max_len))
        hyp_total = sum(hyp_counter.values())  # true positives + false positives

        candidates = []
        for ref in refs:
            ref_counter = Counter(everygrams(ref, min_len, max_len))
            ref_total = sum(ref_counter.values())  # true positives + false negatives

            # Multiset intersection keeps, per n-gram, the smaller of the two counts.
            matched = sum((ref_counter & hyp_counter).values())
            denominator = max(hyp_total, ref_total)

            if denominator > 0:
                candidates.append((matched, denominator))

        if not candidates:
            continue

        # Keep only the reference that gives this hypothesis its best ratio.
        best_match, best_all = max(candidates, key=lambda pair: pair[0] / pair[1])
        total_match += best_match
        total_all += best_all

    # Guard against dividing by zero for an empty corpus or empty sentences.
    return total_match / total_all if total_all else 0.0


Get the individual GLEU scores on the test dataset

In [None]:
test_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en-test.csv")

In [None]:
# Translate test sentences one at a time and collect a GLEU score per sentence.
start_time = time.time()
sco = []
k = 0
for spanish_text, english_reference in zip(test_data['spanish'], test_data['english']):
  k += 1
  print(k)
  print("Spanish           : ", spanish_text)
  print("Actual English    : ", english_reference)
  # NOTE(review): `pad_to_multiple_of` only takes effect when padding is enabled —
  # confirm whether `padding=True` was intended here.
  tokenized_data = tokenizer([spanish_text], return_tensors="np", pad_to_multiple_of=128)
  out = generate(tokenized_data)
  with tokenizer.as_target_tokenizer():
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
    print("Predicted English : ", predicted_text)
    reference_tokens = nltk.word_tokenize(english_reference)
    predicted_tokens = nltk.word_tokenize(predicted_text)
    scores = corpus_gleu([[reference_tokens]], [predicted_tokens], min_len=1, max_len=128)
    print("GLEU Score        : ", scores)
    sco.append(scores)
    # Stop after 500 sentences and report the elapsed time in seconds.
    if k == 500:
      t_time = time.time() - start_time
      print(t_time)
      break


1
Spanish           :  No escojo la ropa para ellos.
Actual English    :  I don't pick out their clothes.


  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


Predicted English :  I don't choose clothes for them.
GLEU Score        :  0.2222222222222222
2
Spanish           :  Tiene que haber existido un acuerdo tácito entre ellos.
Actual English    :  There must've been a tacit understanding between them.
Predicted English :  There must've been a tacit agreement between them.
GLEU Score        :  0.4909090909090909
3
Spanish           :  No trabajo.
Actual English    :  I do not work.
Predicted English :  I don't work.
GLEU Score        :  0.4
4
Spanish           :  Estoy usando una remera anaranjada y un pantalón verde.
Actual English    :  I am wearing an orange t shirt and green pants.
Predicted English :  I'm wearing an orange t shirt and green pants.
GLEU Score        :  0.696969696969697
5
Spanish           :  Ah, todo está como siempre.
Actual English    :  Oh, just the usual stuff.




Predicted English :  Ah, everything is as usual.
GLEU Score        :  0.10714285714285714
6
Spanish           :  No me agrada Tom y no confío en él.
Actual English    :  I don't like Tom and I don't trust him.




Predicted English :  I don't like Tom and I don't trust him.
GLEU Score        :  1.0
7
Spanish           :  No tengo nada que ver con el asunto.
Actual English    :  I have nothing to do with the affair.
Predicted English :  I have nothing to do with the affair.
GLEU Score        :  1.0
8
Spanish           :  Él no es el tipo de persona al que le gusta escuchar jazz.
Actual English    :  He is not the sort of person who likes to listen to jazz.
Predicted English :  He isn't the type of person who likes to listen to jazz.
GLEU Score        :  0.4666666666666667
9
Spanish           :  Ni pensar en salir en esta lluvia.
Actual English    :  Going out in this rain is out of the question.
Predicted English :  Don't even think about going out in this rain.
GLEU Score        :  0.16666666666666666
10
Spanish           :  Ella es gentil.
Actual English    :  She is kind.
Predicted English :  She is kind.
GLEU Score        :  1.0
11
Spanish           :  No sabía cuánto tiempo había estado el p

In [None]:
#  to get average of GLEU Score on test dataset
def Average(lst):
    """Return the arithmetic mean of the numbers in `lst`.

    Args:
        lst: sequence of numbers.

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: if `lst` is empty.
    """
    total = 0
    for value in lst:
        total += value
    return total / len(lst)

In [None]:
avg_scores_gleu = Average(sco)

In [None]:
avg_scores_gleu

0.6118102138784416

Get the individual GLEU scores on the train dataset

In [None]:
train_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en-train.csv")

In [None]:
# Translate training sentences one at a time and collect a GLEU score per sentence.
start_time = time.time()
scos = []
k = 0
for spanish_text, english_reference in zip(train_data['spanish'], train_data['english']):
  k += 1
  print(k)
  print("Spanish           : ", spanish_text)
  print("Actual English    : ", english_reference)
  # NOTE(review): `pad_to_multiple_of` only takes effect when padding is enabled —
  # confirm whether `padding=True` was intended here.
  tokenized_data = tokenizer([spanish_text], return_tensors="np", pad_to_multiple_of=128)
  out = generate(tokenized_data)
  with tokenizer.as_target_tokenizer():
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
    print("Predicted English : ", predicted_text)
    reference_tokens = nltk.word_tokenize(english_reference)
    predicted_tokens = nltk.word_tokenize(predicted_text)
    scores = corpus_gleu([[reference_tokens]], [predicted_tokens], min_len=1, max_len=128)
    print("GLEU Score        : ", scores)
    scos.append(scores)
    # Stop after 500 sentences and report the elapsed time in seconds.
    if k == 500:
      t_time = time.time() - start_time
      print(t_time)
      break

1
Spanish           :  A pesar de sus esfuerzos, no consiguió triunfar.
Actual English    :  With all his efforts, he couldn't succeed.


  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


Predicted English :  Despite his efforts, he failed to succeed.
GLEU Score        :  0.23636363636363636
2
Spanish           :  Nunca se me ocurren ideas brillantes.
Actual English    :  Bright ideas never occur to me.
Predicted English :  Bright ideas never occur to me.
GLEU Score        :  1.0
3
Spanish           :  Tom no tuvo nada que ver con el secuestro.
Actual English    :  Tom had nothing to do with the kidnapping.
Predicted English :  Tom had nothing to do with the kidnapping.
GLEU Score        :  1.0
4
Spanish           :  Nuestras fábricas están funcionando perfectamente.
Actual English    :  Our factories are working perfectly.
Predicted English :  Our factories are working perfectly.
GLEU Score        :  1.0
5
Spanish           :  Le dije que viniese a mi casa.
Actual English    :  I told him to come to my house.
Predicted English :  I told him to come to my house.
GLEU Score        :  1.0
6
Spanish           :  Estoy intentando protegerte.
Actual English    :  I'm trying 

In [None]:
avg_scores_gleu = Average(scos)

In [None]:
avg_scores_gleu

0.7970957304261775

# **Model Testing Section**

✅✅***Standalone section for testing the Spanish-to-English model.***✅✅



In [None]:
from google.colab import drive
drive.mount('/content/drive')

need to run the below cell to train or test the model ***(both locally and for huggingface)***

In [None]:
!pip install datasets
!pip install sacremoses
!pip install transformers
!pip install huggingface_hub
!pip install transformers[sentencepiece] datasets

need to run the below cell to train or test the model ***(only for huggingface)***

In [None]:
from huggingface_hub import notebook_login

notebook_login()
# Paste your own Hugging Face access token into the box that appears below.
# Never store a token in the notebook itself: anyone who can read the file can use it.

need to run the below cell to train or test the model ***(only for huggingface)***

In [None]:
!apt install git-lfs
!git config --global user.email "shamimmahbub230@gmail.com"
!git config --global user.name "shamim237"

run the below cell to load the model for testing ***(only for huggingface)***

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM


model_name = 'shamim237/es-en-model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

Run the cell below to load the model for testing ***(only when using the locally saved copy)***.

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# load the saved model and tokenizer from local drive

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Language-Translation/es-en/misc")
model = TFAutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Language-Translation/es-en/misc")

**run all the cells in the below**, to test the model in training and testing dataset **(both locally and for huggingface)**

In [None]:
import time
import nltk
import pandas as pd
import tensorflow as tf
from collections import Counter
from nltk.util import everygrams, ngrams
nltk.download('punkt')

In [None]:
@tf.function(jit_compile=True)
def generate(inputs):
    """Generate output token ids with the global `model`, compiled via `tf.function`.

    Args:
        inputs: mapping of keyword arguments forwarded to `model.generate`
            (the callers in this notebook pass the tokenizer output).

    Returns:
        Whatever `model.generate` returns, with generation capped at `max_length=128`.
    """
    generated = model.generate(max_length=128, **inputs)
    return generated


# function to get the GLEU Score
  
def sentence_gleu(references, hypothesis, min_len=1, max_len=4):
    """Compute the GLEU score of a single hypothesis.

    The sentence is wrapped into a one-element corpus and scored by `corpus_gleu`.

    Args:
        references: reference token sequences for the hypothesis.
        hypothesis: token sequence being scored.
        min_len: shortest n-gram order taken into account.
        max_len: longest n-gram order taken into account.

    Returns:
        The score returned by `corpus_gleu` for the one-sentence corpus.
    """
    one_sentence_references = [references]
    one_sentence_hypotheses = [hypothesis]
    return corpus_gleu(
        one_sentence_references,
        one_sentence_hypotheses,
        min_len=min_len,
        max_len=max_len,
    )


def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4):
    """Compute a corpus-level GLEU score.

    For every hypothesis, n-grams of order `min_len`..`max_len` are counted in the
    hypothesis and in each of its references. A reference contributes the pair
    (matching n-grams, max(hypothesis n-grams, reference n-grams)); the reference
    with the highest ratio is kept, and the kept pairs are summed over the corpus.

    Args:
        list_of_references: one list of reference token sequences per hypothesis.
        hypotheses: hypothesis token sequences, aligned with `list_of_references`.
        min_len: shortest n-gram order taken into account.
        max_len: longest n-gram order taken into account.

    Returns:
        Total matches divided by the total n-gram count, or 0.0 when nothing
        could be counted (empty corpus or empty sentences).

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    total_match = 0
    total_all = 0

    for refs, hyp in zip(list_of_references, hypotheses):
        hyp_counter = Counter(everygrams(hyp, min_len, max_len))
        hyp_total = sum(hyp_counter.values())  # true positives + false positives

        candidates = []
        for ref in refs:
            ref_counter = Counter(everygrams(ref, min_len, max_len))
            ref_total = sum(ref_counter.values())  # true positives + false negatives

            # Multiset intersection keeps, per n-gram, the smaller of the two counts.
            matched = sum((ref_counter & hyp_counter).values())
            denominator = max(hyp_total, ref_total)

            if denominator > 0:
                candidates.append((matched, denominator))

        if not candidates:
            continue

        # Keep only the reference that gives this hypothesis its best ratio.
        best_match, best_all = max(candidates, key=lambda pair: pair[0] / pair[1])
        total_match += best_match
        total_all += best_all

    # Guard against dividing by zero for an empty corpus or empty sentences.
    return total_match / total_all if total_all else 0.0


#  to get average of GLEU Score on test dataset
def Average(lst):
    """Return the arithmetic mean of the numbers in `lst`.

    Args:
        lst: sequence of numbers.

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: if `lst` is empty.
    """
    total = 0
    for value in lst:
        total += value
    return total / len(lst)


get individual GLEU score on test dataset

In [None]:
test_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en-test.csv")

In [None]:
# Translate test sentences one at a time and collect a GLEU score per sentence.
start_time = time.time()
sco = []
k = 0
for spanish_text, english_reference in zip(test_data['spanish'], test_data['english']):
  k += 1
  print(k)
  print("Spanish           : ", spanish_text)
  print("Actual English    : ", english_reference)
  # NOTE(review): `pad_to_multiple_of` only takes effect when padding is enabled —
  # confirm whether `padding=True` was intended here.
  tokenized_data = tokenizer([spanish_text], return_tensors="np", pad_to_multiple_of=128)
  out = generate(tokenized_data)
  with tokenizer.as_target_tokenizer():
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
    print("Predicted English : ", predicted_text)
    reference_tokens = nltk.word_tokenize(english_reference)
    predicted_tokens = nltk.word_tokenize(predicted_text)
    scores = corpus_gleu([[reference_tokens]], [predicted_tokens], min_len=1, max_len=128)
    print("GLEU Score        : ", scores)
    sco.append(scores)
    if k == 500: ################## change this value based on the samples you want to get the GLEU scores for
      t_time = time.time() - start_time
      print(t_time)
      break

In [None]:
avg_scores_gleu = Average(sco)

In [None]:
avg_scores_gleu # average gleu score for test dataset

get individual GLEU score on train dataset

In [None]:
train_data = pd.read_csv("/content/drive/MyDrive/Language-Translation/es-en/dataset/es-en-train.csv")

In [None]:
# Translate *training* sentences one at a time and collect a GLEU score per sentence.
# The loop previously iterated over `test_data`, so the "training" average that
# follows was really a second copy of the test score.
start_time = time.time()
sco = []
k = 0
for spanish_text, english_reference in zip(train_data['spanish'], train_data['english']):
  k += 1
  print(k)
  print("Spanish           : ", spanish_text)
  print("Actual English    : ", english_reference)
  # NOTE(review): `pad_to_multiple_of` only takes effect when padding is enabled —
  # confirm whether `padding=True` was intended here.
  tokenized_data = tokenizer([spanish_text], return_tensors="np", pad_to_multiple_of=128)
  out = generate(tokenized_data)
  with tokenizer.as_target_tokenizer():
    # Decode once and reuse the text for both the printout and the score.
    predicted_text = tokenizer.decode(out[0], skip_special_tokens=True)
    print("Predicted English : ", predicted_text)
    reference_tokens = nltk.word_tokenize(english_reference)
    predicted_tokens = nltk.word_tokenize(predicted_text)
    scores = corpus_gleu([[reference_tokens]], [predicted_tokens], min_len=1, max_len=128)
    print("GLEU Score        : ", scores)
    sco.append(scores)
    if k == 500: ################## change this value based on the samples you want to get the GLEU scores for
      t_time = time.time() - start_time
      print(t_time)
      break

In [None]:
avg_scores_gleu = Average(sco) 

In [None]:
avg_scores_gleu # average gleu score for training dataset