In [None]:
import pandas as pd
import re
import string
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Read only the first 200,000 rows of the English/French CSV to keep memory
# use and preprocessing time manageable.
r_rows = 200_000
df = pd.read_csv(
    '../input/en-fr-translation-dataset/en-fr.csv',
    nrows=r_rows,
)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Drop sentence pairs where either side is missing BEFORE converting to text:
# str(NaN) is the literal string "nan", which would otherwise be kept as a
# real sentence and used for training.
df = df.dropna(subset=['en', 'fr']).reset_index(drop=True)

# Lower-case every sentence. astype(str) keeps the original tolerance for
# non-string cell values; .str.lower() is the vectorised form of str.lower.
df['en'] = df['en'].astype(str).str.lower()
df['fr'] = df['fr'].astype(str).str.lower()

In [None]:
# Delete every straight apostrophe (') from both language columns.
df['en'] = df['en'].map(lambda sentence: sentence.replace("'", ""))
df['fr'] = df['fr'].map(lambda sentence: sentence.replace("'", ""))

In [None]:
# Characters to discard: the ASCII punctuation listed in string.punctuation.
# Non-ASCII punctuation is not part of that set and is left in place.
exclude = set(string.punctuation)
# Rebuild each sentence from the characters that are not in `exclude`.
df['en'] = df['en'].map(lambda text: ''.join(filter(lambda ch: ch not in exclude, text)))
df['fr'] = df['fr'].map(lambda text: ''.join(filter(lambda ch: ch not in exclude, text)))

In [None]:
# Translation table that deletes the ASCII digits 0-9 (third maketrans
# argument lists the characters to remove).
digit = str.maketrans('', '', string.digits)
# Apply the table to every sentence in both columns.
df['en'] = df['en'].str.translate(digit)
df['fr'] = df['fr'].str.translate(digit)

In [None]:
# Start from the pretrained "Helsinki-NLP/opus-mt-en-fr" checkpoint; it is
# fine-tuned on our dataset further down.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
# Move the weights to the GPU (this cell requires a CUDA device).
model = model.to('cuda')

In [None]:
# AdamW over all model parameters with a learning rate of 1e-4.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)

In [None]:
def model_train(max_epochs=15, batch_size=32, n_batches=32):
    """Fine-tune the notebook-level ``model`` on the sentence pairs in ``df``.

    Reads the globals ``df`` (columns ``'en'`` and ``'fr'``), ``tokenizer``,
    ``model`` and ``optimizer``. The model weights and optimizer state are
    updated in place.

    Args:
        max_epochs: Number of passes over the selected batches.
        batch_size: Number of sentence pairs per optimisation step.
        n_batches: Number of consecutive batches (taken from the start of
            ``df``) used in each epoch. The default of 32 keeps the original
            coverage of the first ``32 * 32`` rows. Pass ``None`` to iterate
            over every row of ``df``.

    Returns:
        The same ``model`` object, after training.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    model.train()
    # Send batches to wherever the model lives instead of hard-coding 'cuda'.
    device = next(model.parameters()).device

    X = df['en']
    y = df['fr']

    # Never request more batches than the data can fill; an empty slice would
    # otherwise hand the tokenizer an empty list.
    available_batches = (len(X) + batch_size - 1) // batch_size
    if n_batches is None:
        steps_per_epoch = available_batches
    else:
        steps_per_epoch = min(n_batches, available_batches)

    # Accumulate plain Python floats: summing the loss tensors themselves
    # would keep graph-attached tensors alive on the device for the whole run.
    total_loss = 0.0
    n_steps = 0

    for _ in tqdm(range(max_epochs)):
        for i in tqdm(range(steps_per_epoch)):
            # Positional slicing via .iloc. The previous `X[a:b,]` form
            # indexed the Series with a 1-tuple, which pandas does not
            # reliably accept.
            start, stop = i * batch_size, (i + 1) * batch_size
            local_X = X.iloc[start:stop]
            local_y = y.iloc[start:stop]

            # Tokenize source and target sentences into model inputs.
            # NOTE(review): prepare_seq2seq_batch is deprecated in newer
            # transformers releases, and padded label positions are not
            # masked here -- confirm against the installed version's docs.
            batch = tokenizer.prepare_seq2seq_batch(list(local_X),list(local_y),return_tensors='pt').to(device)
            output = model(**batch)

            # The loss is read directly from the model output.
            loss = output.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_steps += 1

    # Mean loss per optimisation step over the whole run. Dividing by
    # len(df) mixed up "rows in the frame" with "steps taken".
    average = total_loss / n_steps if n_steps else float('nan')
    print('Loss: ' + str(average) )

    return model

In [None]:
# Run fine-tuning; model_train() returns the (in-place updated) global model,
# which is rebound to `model` here.
model = model_train()

In [None]:
# Quick sanity check: translate one English sentence with the fine-tuned model.
# model_train() leaves the model in training mode, so switch to eval mode to
# disable train-only behaviour (e.g. dropout) before generating.
model.eval()
# NOTE(review): the training text was lower-cased and stripped of punctuation
# and digits; this probe sentence is not -- confirm that is intended.
a = model.generate(
    **tokenizer.prepare_seq2seq_batch(
        ['Hello , I have food'],
        return_tensors='pt',
    ).to(model.device)  # follow the model's device rather than assuming 'cuda'
)
# Drop special tokens (padding / end-of-sequence markers) from the decoded text.
tokenizer.batch_decode(a, skip_special_tokens=True)

In [None]:
# Persist the whole fine-tuned model object to 'model.pkl' in the working
# directory. NOTE(review): this pickles the full object rather than a
# state_dict, so loading presumably needs the same model classes importable.
torch.save(obj=model, f='model.pkl')