In [1]:
import torch
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
from torch.utils.data import TensorDataset, DataLoader

In [59]:
import nltk
from nltk.translate import bleu_score
from nltk.translate.bleu_score import corpus_bleu

In [5]:
# setup device based on availability
device = "cuda:4" if torch.cuda.is_available() else "cpu"
print("Using device: {}".format(device))

Using device: cuda:4


In [66]:
# load the dataset
train_source_text_path = "../data/sup_train.en-fr.fr"
train_target_text_path = "../data/sup_train.en-fr.en"
dev_source_text_path = "../data/sup_valid.en-fr.fr"
dev_target_text_path = "../data/sup_valid.en-fr.en"
test_source_text_path = "../data/test.en-fr.fr"
test_target_text_path = "../data/test.en-fr.en"

In [7]:
# read the data from the files and store them in pandas dataframe
def retrieve_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            line = line.strip()
            data.append(line)
    return data

In [8]:
train_source_sentences = retrieve_data(train_source_text_path)
train_target_sentences = retrieve_data(train_target_text_path)

In [9]:
train_source_sentences = train_source_sentences[:len(train_source_sentences)//2]
train_target_sentences = train_target_sentences[:len(train_target_sentences)//2]

In [10]:
len(train_source_sentences), len(train_target_sentences)

(783196, 783196)

In [11]:
dev_source_sentences = retrieve_data(dev_source_text_path)
dev_target_sentences = retrieve_data(dev_target_text_path)

In [12]:
len(dev_source_sentences)

2000

In [13]:
# load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [14]:
# tokenize the data
def encode_data(source_sentences, target_sentences):
    tokenized_inputs = tokenizer.batch_encode_plus(source_sentences, padding='longest', return_tensors='pt')
    tokenized_outputs = tokenizer.batch_encode_plus(target_sentences, padding='longest', return_tensors='pt')
    
    input_ids = tokenized_inputs['input_ids']
    attention_mask = tokenized_inputs['attention_mask']
    labels = tokenized_outputs['input_ids']
    
    return input_ids, attention_mask, labels
    
train_input_ids, train_attn_mask, train_labels = encode_data(train_source_sentences, train_target_sentences)
dev_input_ids, dev_attn_mask, dev_labels = encode_data(dev_source_sentences, dev_target_sentences)

In [15]:
len(train_input_ids), len(train_attn_mask), len(train_labels)

(783196, 783196, 783196)

In [16]:
train_dataset = TensorDataset(train_input_ids, train_attn_mask, train_labels)
valid_dataset = TensorDataset(dev_input_ids, dev_attn_mask, dev_labels)

In [17]:
len(train_dataset)

783196

In [18]:
# create data loader
batch_size = 16
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [19]:
# check if the batching works
for _, batch in zip(range(5), train_data_loader):
    print(batch[0].shape, batch[1].shape, batch[2].shape)

torch.Size([16, 392]) torch.Size([16, 392]) torch.Size([16, 416])
torch.Size([16, 392]) torch.Size([16, 392]) torch.Size([16, 416])
torch.Size([16, 392]) torch.Size([16, 392]) torch.Size([16, 416])
torch.Size([16, 392]) torch.Size([16, 392]) torch.Size([16, 416])
torch.Size([16, 392]) torch.Size([16, 392]) torch.Size([16, 416])


In [20]:
# load the model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Move to GPU
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [21]:
# initialize the parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
n_epochs = 1
print(optimizer)
print(f'Epochs: {n_epochs}')
total_steps = len(train_data_loader) * n_epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.0001
    maximize: False
    weight_decay: 0.01
)
Epochs: 1


In [22]:
def train(loader, model, optimizer):
    total_loss = 0.0

    for index, batch in tqdm(enumerate(loader), desc="Training...", total=len(loader)):
        # move X and y values to device
        X = batch[0].to(device)
        attention_mask = batch[1].to(device)
        y = batch[2].to(device)

        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output - run model on the input
        y_pred = model(input_ids=X, attention_mask=attention_mask,
                       labels=y)

        # step 3. compute the loss
        loss = y_pred[0]
        total_loss += loss.item()

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        scheduler.step()
        
    return total_loss/len(loader)

In [63]:
def evaluate(model, loader, tokenizer):
    total_loss = 0
    all_predictions = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for index, batch in tqdm(enumerate(loader), desc="Evaluating...", total=len(loader)):
            X = batch[0].to(device)
            attention_mask = batch[1].to(device)
            y = batch[2].to(device)
            
            outputs = model.generate(input_ids=X, 
                                     attention_mask=attention_mask, 
                                     max_length=150,
                                     num_beams=4,
                                     early_stopping=True,
                                     no_repeat_ngram_size=2,
                                     decoder_start_token_id=model.config.decoder_start_token_id,
                                     eos_token_id=model.config.eos_token_id,
                                     num_return_sequences=1)
            
            predicted_sentences = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            target_sentences = [tokenizer.decode(target, skip_special_tokens=True) for target in y]
            all_predictions.extend(predicted_sentences)
            all_targets.extend(target_sentences)
            
            loss = model(input_ids=X, 
                         attention_mask=attention_mask, 
                         labels=y).loss
            total_loss += loss.item()
    
    avg_loss = total_loss / len(loader)
    
    return avg_loss, all_predictions, all_targets

In [64]:
def run_training_and_val(train_loader, valid_loader, model, optimizer, tokenizer):
    all_val_loss = []
    all_preds = []
    all_targets = []
    all_train_loss = []

    try:
        for epoch in range(n_epochs):
            epoch_start_time = time.time()
            train_loss = train(train_loader, model, optimizer)
            val_loss, preds, targets = evaluate(model, valid_loader, tokenizer)
            print("-" * 89)
            print(
                "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.5f}".format(
                    epoch,
                    (time.time() - epoch_start_time),
                    val_loss
                )
            )
            print("-" * 89)
            all_train_loss.append(train_loss)
            all_preds.extend(preds)
            all_targets.extend(targets)
    except KeyboardInterrupt:
        print("-" * 89)
        print("Exiting from training early")
    
    references = [[target_sent.split()] for target_sent in all_targets]
    candidates = [pred_sent.split() for pred_sent in all_predictions]
    bleu_score = corpus_bleu(references, candidates)
    print('BLEU score: ', bleu_score)
    return all_train_loss, all_val_loss, all_preds, all_targets

In [25]:
train_loss, val_loss, val_bleu_scores = run_training_and_val(train_data_loader, valid_data_loader, 
                                                             model, optimizer, tokenizer)

Training...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48950/48950 [3:27:29<00:00,  3.93it/s]
Evaluating...:   0%|                                                                                                                                                                      | 0/125 [00:01<?, ?it/s]


UnboundLocalError: local variable 'bleu_score' referenced before assignment

In [29]:
model.save_pretrained('/soe/npullabh/244_final_project/t5/t5_fine_tuned.pt')

In [38]:
tokenizer.save_pretrained("/soe/npullabh/244_final_project/t5/t5_tokenizer")

('/soe/npullabh/244_final_project/t5/t5_tokenizer/tokenizer_config.json',
 '/soe/npullabh/244_final_project/t5/t5_tokenizer/special_tokens_map.json',
 '/soe/npullabh/244_final_project/t5/t5_tokenizer/spiece.model',
 '/soe/npullabh/244_final_project/t5/t5_tokenizer/added_tokens.json')

In [55]:
# model = T5ForConditionalGeneration.from_pretrained('/soe/npullabh/244_final_project/t5/t5_fine_tuned.pt')
# tokenizer = T5Tokenizer.from_pretrained("/soe/npullabh/244_final_project/t5/t5_tokenizer")

# model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [65]:
dev_references = [[target_sent.split()] for target_sent in all_targets]
dev_candidates = [pred_sent.split() for pred_sent in all_predictions]
dev_bleu = corpus_bleu(dev_references, dev_candidates)
print('Dev BLEU score: ', dev_bleu)

Dev BLEU score:  0.25324874117452356


In [68]:
test_source_sentences = retrieve_data(test_source_text_path)
test_target_sentences = retrieve_data(test_target_text_path)

In [69]:
test_input_ids, test_attn_mask, test_labels = encode_data(test_source_sentences, 
                                                          test_target_sentences)
test_dataset = TensorDataset(test_input_ids, test_attn_mask, test_labels)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [70]:
test_loss, test_predictions, test_targets = evaluate(model, test_data_loader, tokenizer)

Evaluating...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [04:10<00:00,  1.99s/it]


In [71]:
test_references = [[target_sent.split()] for target_sent in test_targets]
test_candidates = [pred_sent.split() for pred_sent in test_predictions]
test_bleu = corpus_bleu(test_references, test_candidates)
print('Test BLEU score: ', test_bleu)

Test BLEU score:  0.2126823362139773
