In [1]:
import torch
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [2]:
from transformers import MT5ForConditionalGeneration, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
from torch.utils.data import TensorDataset, DataLoader

In [4]:
import nltk
from nltk.translate import bleu_score
from nltk.translate.bleu_score import corpus_bleu

In [5]:
# setup device based on availability
device = "cuda:4" if torch.cuda.is_available() else "cpu"
print("Using device: {}".format(device))

Using device: cuda:4


In [6]:
# load the dataset
train_source_text_path = "../data/sup_train.en-fr.fr"
train_target_text_path = "../data/sup_train.en-fr.en"
dev_source_text_path = "../data/sup_valid.en-fr.fr"
dev_target_text_path = "../data/sup_valid.en-fr.en"
test_source_text_path = "../data/test.en-fr.fr"
test_target_text_path = "../data/test.en-fr.en"

In [7]:
# read the data from the files and store them in pandas dataframe
prompt_prefix = "translate French to English: "

def retrieve_data(file_path, prompt_prefix=None):
    data = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            line = line.strip()
            if prompt_prefix:
                data.append(prompt_prefix + line)
            else:
                data.append(line)
    return data

In [8]:
train_source_sentences = retrieve_data(train_source_text_path, prompt_prefix)
train_target_sentences = retrieve_data(train_target_text_path)

In [9]:
train_source_sentences = train_source_sentences[:len(train_source_sentences)//8]
train_target_sentences = train_target_sentences[:len(train_target_sentences)//8]
print(len(train_source_sentences), len(train_target_sentences))
print(train_source_sentences[3])
print(train_target_sentences[3])

195799 195799
translate French to English: vous avez souhaité un débat à ce sujet dans les prochains jours , au cours de cette période de session .
you have requested a debate on this subject in the course of the next few days , during this part-session .


In [10]:
dev_source_sentences = retrieve_data(dev_source_text_path, prompt_prefix)
dev_target_sentences = retrieve_data(dev_target_text_path)
print(len(dev_source_sentences), len(dev_target_sentences))
print(dev_source_sentences[3])
print(dev_target_sentences[3])

2000 2000
translate French to English: à l' heure actuelle , le conseil est en train d' examiner l' inclusion de tels mécanismes dans l' article 7 .
at present , the council is talking about incorporating such mechanisms in article 7 .


In [11]:
# load the mT5 tokenizer
tokenizer_fr = AutoTokenizer.from_pretrained('google/mt5-small', source_lang="fr")
tokenizer_en = AutoTokenizer.from_pretrained('google/mt5-small', source_lang="en")



In [12]:
# tokenize the data
def encode_data(source_sentences, target_sentences):
    tokenized_inputs = tokenizer_fr.batch_encode_plus(source_sentences, padding='longest', return_tensors='pt')
    tokenized_outputs = tokenizer_en.batch_encode_plus(target_sentences, padding='longest', return_tensors='pt')
    
    input_ids = tokenized_inputs['input_ids']
    attention_mask = tokenized_inputs['attention_mask']
    labels = tokenized_outputs['input_ids']
    
    return input_ids, attention_mask, labels
    
train_input_ids, train_attn_mask, train_labels = encode_data(train_source_sentences, train_target_sentences)
dev_input_ids, dev_attn_mask, dev_labels = encode_data(dev_source_sentences, dev_target_sentences)

In [13]:
len(train_input_ids), len(train_attn_mask), len(train_labels)

(195799, 195799, 195799)

In [14]:
train_dataset = TensorDataset(train_input_ids, train_attn_mask, train_labels)
valid_dataset = TensorDataset(dev_input_ids, dev_attn_mask, dev_labels)

In [15]:
len(train_dataset)

195799

In [16]:
# create data loader
batch_size = 8
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [17]:
# check if the batching works
for _, batch in zip(range(5), train_data_loader):
    print(batch[0].shape, batch[1].shape, batch[2].shape)

torch.Size([8, 273]) torch.Size([8, 273]) torch.Size([8, 328])
torch.Size([8, 273]) torch.Size([8, 273]) torch.Size([8, 328])
torch.Size([8, 273]) torch.Size([8, 273]) torch.Size([8, 328])
torch.Size([8, 273]) torch.Size([8, 273]) torch.Size([8, 328])
torch.Size([8, 273]) torch.Size([8, 273]) torch.Size([8, 328])


In [18]:
# load the model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

# Move to GPU
model.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [19]:
# initialize the parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
n_epochs = 1
print(optimizer)
print(f'Epochs: {n_epochs}')
total_steps = len(train_data_loader) * n_epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.0001
    maximize: False
    weight_decay: 0.01
)
Epochs: 1


In [20]:
def train(loader, model, optimizer):
    total_loss = 0.0

    for index, batch in tqdm(enumerate(loader), desc="Training...", total=len(loader)):
        # move X and y values to device
        X = batch[0].to(device)
        attention_mask = batch[1].to(device)
        y = batch[2].to(device)
        #decoder_input_ids = model._shift_right(y)

        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output - run model on the input
        y_pred = model(input_ids=X, attention_mask=attention_mask,
                       labels=y)

        # step 3. compute the loss
        loss = y_pred[0]
        total_loss += loss.item()

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        scheduler.step()
        
    return total_loss/len(loader)

In [21]:
def evaluate(model, loader, tokenizer_en):
    total_loss = 0
    all_predictions = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for index, batch in tqdm(enumerate(loader), desc="Evaluating...", total=len(loader)):
            X = batch[0].to(device)
            attention_mask = batch[1].to(device)
            y = batch[2].to(device)
            #decoder_input_ids = model._shift_right(y)
            
            outputs = model.generate(input_ids=X, 
                                     attention_mask=attention_mask, 
                                     max_length=50,
                                     num_beams=4,
                                     early_stopping=True,
                                     no_repeat_ngram_size=2,
                                     decoder_start_token_id=model.config.decoder_start_token_id,
                                     eos_token_id=model.config.eos_token_id,
                                     num_return_sequences=1)
            
            predicted_sentences = [tokenizer_en.decode(output, skip_special_tokens=True) for output in outputs]
            target_sentences = [tokenizer_en.decode(target, skip_special_tokens=True) for target in y]
            all_predictions.extend(predicted_sentences)
            all_targets.extend(target_sentences)
            
            loss = model(input_ids=X, 
                         attention_mask=attention_mask, 
                         labels=y).loss
            total_loss += loss.item()
    
    avg_loss = total_loss / len(loader)
    
    return avg_loss, all_predictions, all_targets

In [22]:
def run_training_and_val(train_loader, valid_loader, model, optimizer, tokenizer_en):
    all_val_loss = []
    all_preds = []
    all_targets = []
    all_train_loss = []

    try:
        for epoch in range(n_epochs):
            epoch_start_time = time.time()
            train_loss = train(train_loader, model, optimizer)
            val_loss, preds, targets = evaluate(model, valid_loader, tokenizer_en)
            print("-" * 89)
            print(
                "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.5f}".format(
                    epoch,
                    (time.time() - epoch_start_time),
                    val_loss
                )
            )
            print("-" * 89)
            all_train_loss.append(train_loss)
            all_preds.extend(preds)
            all_targets.extend(targets)
    except KeyboardInterrupt:
        print("-" * 89)
        print("Exiting from training early")
    
    references = [[target_sent.split()] for target_sent in all_targets]
    candidates = [pred_sent.split() for pred_sent in all_preds]
    bl_score = corpus_bleu(references, candidates)
    print('BLEU score: ', bl_score)
    return all_train_loss, all_val_loss, all_preds, all_targets

In [23]:
train_loss, val_loss, all_preds, all_targets = run_training_and_val(train_data_loader, valid_data_loader, 
                                                             model, optimizer, tokenizer_en)

Training...: 100%|██████████| 24475/24475 [1:47:52<00:00,  3.78it/s]
Evaluating...: 100%|██████████| 250/250 [03:26<00:00,  1.21it/s]


-----------------------------------------------------------------------------------------
| end of epoch   0 | time: 6679.21s | valid loss 0.31316
-----------------------------------------------------------------------------------------
BLEU score:  0.18950614785063996


In [24]:
model.save_pretrained('/soe/npullabh/244_final_project/gpt2/mt5_fine_tuned.pt')
tokenizer_fr.save_pretrained("/soe/npullabh/244_final_project/gpt2/mt5_tokenizer_fr")
tokenizer_en.save_pretrained("/soe/npullabh/244_final_project/gpt2/mt5_tokenizer_en")

('/soe/npullabh/244_final_project/gpt2/mt5_tokenizer_en/tokenizer_config.json',
 '/soe/npullabh/244_final_project/gpt2/mt5_tokenizer_en/special_tokens_map.json',
 '/soe/npullabh/244_final_project/gpt2/mt5_tokenizer_en/spiece.model',
 '/soe/npullabh/244_final_project/gpt2/mt5_tokenizer_en/added_tokens.json',
 '/soe/npullabh/244_final_project/gpt2/mt5_tokenizer_en/tokenizer.json')

In [25]:
# model = T5ForConditionalGeneration.from_pretrained('/soe/npullabh/244_final_project/t5/t5_fine_tuned.pt')
# tokenizer = T5Tokenizer.from_pretrained("/soe/npullabh/244_final_project/t5/t5_tokenizer")

# model.to(device)

In [26]:
dev_references = [[target_sent.split()] for target_sent in all_targets]
dev_candidates = [pred_sent.split() for pred_sent in all_preds]
dev_bleu = corpus_bleu(dev_references, dev_candidates)
print('Dev BLEU score: ', dev_bleu)

Dev BLEU score:  0.18950614785063996


In [27]:
test_source_sentences = retrieve_data(test_source_text_path)
test_target_sentences = retrieve_data(test_target_text_path)

In [28]:
test_input_ids, test_attn_mask, test_labels = encode_data(test_source_sentences, 
                                                          test_target_sentences)
test_dataset = TensorDataset(test_input_ids, test_attn_mask, test_labels)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [29]:
test_loss, test_predictions, test_targets = evaluate(model, test_data_loader, tokenizer_en)

Evaluating...: 100%|██████████| 251/251 [03:24<00:00,  1.23it/s]


In [30]:
test_references = [[target_sent.split()] for target_sent in test_targets]
test_candidates = [pred_sent.split() for pred_sent in test_predictions]
test_bleu = corpus_bleu(test_references, test_candidates)
print('Test BLEU score: ', test_bleu)

Test BLEU score:  0.1287955916336652
