In [1]:
from transformers import BertTokenizer
from transformers import GPT2Config, GPT2Model, GPT2Tokenizer
from transformers import BertGenerationConfig, BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader

import nltk
from nltk.translate import bleu_score
from nltk.translate.bleu_score import corpus_bleu

import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [2]:
# setup device based on availability
if torch.cuda.is_available():
    print("Total GPUs: ", torch.cuda.device_count())
device = "cuda:5" if torch.cuda.is_available() else "cpu"
print("Using device: {}".format(device))

Total GPUs:  6
Using device: cuda:5


In [3]:
# load the dataset
train_source_text_path = "../data/sup_train.en-fr.fr"
train_target_text_path = "../data/sup_train.en-fr.en"
dev_source_text_path = "../data/sup_valid.en-fr.fr"
dev_target_text_path = "../data/sup_valid.en-fr.en"
test_source_text_path = "../data/test.en-fr.fr"
test_target_text_path = "../data/test.en-fr.en"

In [6]:
# read the data from the files and store them in pandas dataframe
def retrieve_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            line = line.strip()
            data.append(line)
    return data

In [7]:
# tokenize the data
def encode_data(source_sentences, target_sentences):
    tokenized_inputs = tokenizer.batch_encode_plus(source_sentences, padding='longest', return_tensors='pt')
    tokenized_outputs = tokenizer.batch_encode_plus(target_sentences, padding='longest', return_tensors='pt')
    
    input_ids = tokenized_inputs['input_ids']
    attention_mask = tokenized_inputs['attention_mask']
    labels = tokenized_outputs['input_ids']
    
    return input_ids, attention_mask, labels

In [5]:
train_source_sentences = retrieve_data(train_source_text_path)
train_target_sentences = retrieve_data(train_target_text_path)
print("Train source n_sentence: ", len(train_source_sentences))
print("Train target n_sentence: ", len(train_target_sentences))

Train source n_sentence:  1566392
Train target n_sentence:  1566392


In [6]:
dev_source_sentences = retrieve_data(dev_source_text_path)
dev_target_sentences = retrieve_data(dev_target_text_path)
print("Dev source n_sentence: ", len(dev_source_sentences))

Dev source n_sentence:  2000


In [7]:
train_source_sentences = train_source_sentences[:len(train_source_sentences)//5]
train_target_sentences = train_target_sentences[:len(train_target_sentences)//5]
print("Train source n_sentence: ", len(train_source_sentences))
print("Dev source n_sentence: ", len(dev_source_sentences))

Train source n_sentence:  313278
Dev source n_sentence:  2000


In [8]:
# create tokenizer...
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [9]:
tmp = tokenizer("This is a very long article to summarize", add_special_tokens=False, return_tensors="pt")
print(tmp)

{'input_ids': tensor([[10747, 10124,   169, 12558, 11695, 13262, 10114, 28439, 65899, 10870]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [11]:
train_input_ids, train_attn_mask, train_labels = encode_data(train_source_sentences, train_target_sentences)
dev_input_ids, dev_attn_mask, dev_labels = encode_data(dev_source_sentences, dev_target_sentences)

In [12]:
train_dataset = TensorDataset(train_input_ids, train_attn_mask, train_labels)
valid_dataset = TensorDataset(dev_input_ids, dev_attn_mask, dev_labels)
print(len(train_dataset))
print(len(valid_dataset))

313278
2000


In [13]:
# create data loader
batch_size = 16
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

In [14]:
# check if the batching works
for _, batch in zip(range(5), train_data_loader):
    print(batch[0].shape, batch[1].shape, batch[2].shape)

torch.Size([16, 223]) torch.Size([16, 223]) torch.Size([16, 262])
torch.Size([16, 223]) torch.Size([16, 223]) torch.Size([16, 262])
torch.Size([16, 223]) torch.Size([16, 223]) torch.Size([16, 262])
torch.Size([16, 223]) torch.Size([16, 223]) torch.Size([16, 262])
torch.Size([16, 223]) torch.Size([16, 223]) torch.Size([16, 262])


In [15]:
# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-base-multilingual-cased")
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-base-multilingual-cased", add_cross_attention=True, is_decoder=True)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)

You are using a model of type bert to instantiate a model of type bert-generation. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertGenerationEncoder: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'bert.embeddings.token_type_embeddings.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertGenerationEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertGenerationEncoder from the check

In [16]:
# initialize the parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
n_epochs = 1
print(optimizer)
print(f'Epochs: {n_epochs}')
total_steps = len(train_data_loader) * n_epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.0001
    maximize: False
    weight_decay: 0.01
)
Epochs: 1


In [17]:
def train(loader, model, optimizer):
    total_loss = 0.0

    for index, batch in tqdm(enumerate(loader), desc="Training...", total=len(loader)):
        # move X and y values to device
        X = batch[0].to(device)
        attention_mask = batch[1].to(device)
        y = batch[2].to(device)

        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output - run model on the input
        y_pred = model(input_ids=X, decoder_input_ids=y, labels=y)

        # step 3. compute the loss
        loss = y_pred[0]
        total_loss += loss.item()

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()
        scheduler.step()
        
    return total_loss/len(loader)

In [12]:
def evaluate(model, loader, tokenizer):
    total_loss = 0
    all_predictions = []
    all_targets = []
    
    model.eval()
    with torch.no_grad():
        for index, batch in tqdm(enumerate(loader), desc="Evaluating...", total=len(loader)):
            X = batch[0].to(device)
            attention_mask = batch[1].to(device)
            y = batch[2].to(device)
            
            outputs = model.generate(input_ids=X, 
                                     attention_mask=attention_mask, 
                                     max_length=150,
                                     num_beams=4,
                                     early_stopping=True,
                                     no_repeat_ngram_size=2,
                                     decoder_start_token_id=model.config.decoder_start_token_id,
                                     eos_token_id=model.config.eos_token_id,
                                     num_return_sequences=1)
            
            predicted_sentences = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            target_sentences = [tokenizer.decode(target, skip_special_tokens=True) for target in y]
            all_predictions.extend(predicted_sentences)
            all_targets.extend(target_sentences)
            
            loss = model(input_ids=X, decoder_input_ids=y, labels=y).loss
            total_loss += loss.item()
    
    avg_loss = total_loss / len(loader)
    
    return avg_loss, all_predictions, all_targets

In [19]:
def run_training_and_val(train_loader, valid_loader, model, optimizer, tokenizer):
    all_val_loss = []
    all_preds = []
    all_targets = []
    all_train_loss = []

    try:
        for epoch in range(n_epochs):
            epoch_start_time = time.time()
            train_loss = train(train_loader, model, optimizer)
            val_loss, preds, targets = evaluate(model, valid_loader, tokenizer)
            print("-" * 89)
            print(
                "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.5f}".format(
                    epoch,
                    (time.time() - epoch_start_time),
                    val_loss
                )
            )
            print("-" * 89)
            all_train_loss.append(train_loss)
            all_preds.extend(preds)
            all_targets.extend(targets)
    except KeyboardInterrupt:
        print("-" * 89)
        print("Exiting from training early")
    
    references = [[target_sent.split()] for target_sent in all_targets]
    candidates = [pred_sent.split() for pred_sent in all_predictions]
    bleu_score = corpus_bleu(references, candidates)
    print('BLEU score: ', bleu_score)
    return all_train_loss, all_val_loss, all_preds, all_targets

In [20]:
train_loss, val_loss, val_bleu_scores = run_training_and_val(train_data_loader, valid_data_loader, 
                                                             model, optimizer, tokenizer)

Training...: 100%|██████████████████████| 19580/19580 [3:12:05<00:00,  1.70it/s]
Evaluating...:   1%|▏                         | 1/125 [00:53<1:50:13, 53.33s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.85 GiB (GPU 5; 23.70 GiB total capacity; 13.89 GiB already allocated; 156.31 MiB free; 22.52 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [21]:
model.encoder.save_pretrained('/soe/npullabh/244_final_project/gpt2/bert2bert_ecoder.pt')
model.decoder.save_pretrained('/soe/npullabh/244_final_project/gpt2/bert2bert_decoder.pt')
tokenizer.save_pretrained("/soe/npullabh/244_final_project/gpt2/bert2bert_tokenizer")

('/soe/npullabh/244_final_project/gpt2/bert2bert_tokenizer/tokenizer_config.json',
 '/soe/npullabh/244_final_project/gpt2/bert2bert_tokenizer/special_tokens_map.json',
 '/soe/npullabh/244_final_project/gpt2/bert2bert_tokenizer/vocab.txt',
 '/soe/npullabh/244_final_project/gpt2/bert2bert_tokenizer/added_tokens.json')

In [22]:
dev_references = [[target_sent.split()] for target_sent in all_targets]
dev_candidates = [pred_sent.split() for pred_sent in all_predictions]
dev_bleu = corpus_bleu(dev_references, dev_candidates)
print('Dev BLEU score: ', dev_bleu)

NameError: name 'all_targets' is not defined

In [4]:
tokenizer = BertTokenizer.from_pretrained("/soe/npullabh/244_final_project/gpt2/bert2bert_tokenizer")

encoder = BertGenerationEncoder.from_pretrained('/soe/npullabh/244_final_project/gpt2/bert2bert_ecoder.pt')
decoder = BertGenerationDecoder.from_pretrained('/soe/npullabh/244_final_project/gpt2/bert2bert_decoder.pt')
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)

Some weights of the model checkpoint at /soe/npullabh/244_final_project/gpt2/bert2bert_decoder.pt were not used when initializing BertGenerationEncoder: ['lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias']
- This IS expected if you are initializing BertGenerationEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertGenerationEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
test_source_sentences = retrieve_data(test_source_text_path)
test_target_sentences = retrieve_data(test_target_text_path)

In [10]:
batch_size = 16
test_input_ids, test_attn_mask, test_labels = encode_data(test_source_sentences, test_target_sentences)
test_dataset = TensorDataset(test_input_ids, test_attn_mask, test_labels)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [13]:
test_loss, test_predictions, test_targets = evaluate(model, test_data_loader, tokenizer)

Evaluating...:   0%|                                    | 0/126 [00:01<?, ?it/s]


NotImplementedError: A model class needs to define a `prepare_inputs_for_generation` method in order to use `generate`.

In [None]:
test_references = [[target_sent.split()] for target_sent in test_targets]
test_candidates = [pred_sent.split() for pred_sent in test_predictions]
test_bleu = corpus_bleu(test_references, test_candidates)
print('Test BLEU score: ', test_bleu)