In [None]:
# !pip install transformers
# !pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade
# !pip install SentencePiece

import pandas as pd
import sklearn
import json
from pathlib import Path
from sklearn.model_selection import train_test_split


import torch
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Download the English and French sentence files used as the translation corpus.
# wget stores each file under its URL basename (small_vocab_en / small_vocab_fr)
# in the current working directory, which is where the loading cell reads them.
!wget https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/small_vocab_en
!wget https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/small_vocab_fr

In [None]:
# Input - two lists of sentences, paired by line number
def read_sentences(path):
    """Return every line of the text file at `path` with its trailing newline removed.

    Args:
        path: Location of a plain-text file holding one sentence per line.

    Returns:
        list[str]: The sentences in file order.
    """
    # Explicit encoding so accented characters do not depend on the platform default.
    # NOTE(review): assumes the downloaded files are UTF-8 - confirm.
    with open(path, "r", encoding="utf-8") as sentence_file:
        return [line.strip("\n") for line in sentence_file]


english = read_sentences("small_vocab_en")
# The target side must come from the French file; reading the English file here
# would make every "translation" identical to its source sentence.
french = read_sentences("small_vocab_fr")

# Rows are paired positionally, so a length mismatch means the files are not aligned.
assert len(english) == len(french), "English and French files have different line counts"

data = pd.DataFrame({"english": english, "french": french})

In [None]:
# Load the tokenizer belonging to the pretrained "t5-small" checkpoint; it is
# used below to encode both the source sentences and the target sentences.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [None]:
# Source (English) and target (French) sentences as plain Python lists.
x = data["english"].values.tolist()
y = data["french"].values.tolist()

# Splitting x and y in one call keeps each sentence aligned with its translation.
# A fixed random_state makes the train/validation partition identical on every
# run, so evaluation numbers are comparable between runs.
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state=42)

In [None]:
# Source-side encodings are kept as Python lists; the dataset class converts
# each example to tensors on access.
train_encodings = tokenizer(train_x, padding=True, truncation=True)
val_encodings = tokenizer(val_x, padding=True, truncation=True)


def encode_targets(sentences):
    """Tokenize target sentences into a padded tensor batch usable as labels.

    Args:
        sentences: List of target-language strings.

    Returns:
        The tokenizer output with "input_ids" as a tensor in which every
        padding position has been replaced by -100.
    """
    # The T5 tokenizer encodes source and target text the same way, so the
    # deprecated `as_target_tokenizer()` context manager is not needed.
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    # -100 is the index the loss ignores; leaving the pad id in place would
    # train the model to predict padding tokens.
    label_ids = encoded["input_ids"]
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    return encoded


train_labels = encode_targets(train_y)
val_labels = encode_targets(val_y)

In [None]:
# Dataset class

class MTDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source sentences with target token ids.

    Args:
        encodings: Mapping from feature name to a per-example sequence; must
            contain "input_ids" (for instance the tokenizer output for the
            source sentences).
        labels: Mapping whose "input_ids" entry holds the per-example target ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, target ids stored under "labels"."""
        # as_tensor accepts Python lists as well as existing tensors, and does
        # not trigger the copy-construct warning torch.tensor() raises on tensors.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels["input_ids"][idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the source "input_ids"."""
        # Key access (rather than attribute access) works for plain dicts as
        # well as tokenizer outputs, matching how `labels` is accessed above.
        return len(self.encodings["input_ids"])

# Wrap each split's source encodings and target labels in a dataset object
# that the Trainer can index.
train_dataset = MTDataset(encodings=train_encodings, labels=train_labels)
val_dataset = MTDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Train the model

# Run index used to give each training run its own output directory.
idx = 0
model_path = f'gdrive/MyDrive/model_{idx}'

# The Trainer needs a model instance and no earlier cell creates one, so load
# the pretrained checkpoint that corresponds to the "t5-small" tokenizer here.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir=model_path,            # checkpoints and logs are written here
    num_train_epochs=1,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",      # evaluate on eval_dataset after each epoch
    per_device_train_batch_size=1,
)

# Trainer object

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer; as the last expression of
# the cell, the returned metrics are displayed as the cell output.
trainer.evaluate()

In [None]:
# Save the trained model into a local "here" directory.
save_dir = Path("here")
# Idempotent and portable: `!mkdir here` prints an error when the cell is
# re-run and the directory already exists, and depends on a POSIX shell.
save_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(save_dir))