### Experiments with Large Language Models

- fine tune T5 for qa
- fine tune T5 for span corruption

In [60]:
# General imports
import pandas as pd
import numpy as np
from tqdm import tqdm

#  PyTorch imports
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Transformers imports
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments

# Types
from typing import List, Dict, Tuple, Union

# Set seed
torch.manual_seed(42)
np.random.seed(42)



#### Data

In [61]:
wiki = []
with open('data/wiki.txt', 'r', encoding='utf8') as f:
    for line in f:
        wiki.append(line)

wiki = pd.DataFrame(wiki, columns=['text'])
wiki.head()

Unnamed: 0,text
0,Khatchig Mouradian. Khatchig Mouradian is a jo...
1,Jacob Henry Studer. Jacob Henry Studer (26 Feb...
2,"John Stephen. Born in Glasgow, Stephen became ..."
3,Georgina Willis. Georgina Willis is an award w...
4,Stanley Corrsin. Corrsin was born on 3 April 1...


In [62]:
# Read tsv file
birth_places_train = pd.read_csv('data/birth_places_train.tsv', sep='\t') 
birth_places_train.head()

Unnamed: 0,Where was Khatchig Mouradian born?,Lebanon
0,Where was Jacob Henry Studer born?,Columbus
1,Where was John Stephen born?,Glasgow
2,Where was Georgina Willis born?,Australia
3,Where was Stanley Corrsin born?,Philadelphia
4,Where was Eduard Ender born?,Rome


In [63]:
birth_places_test = pd.read_csv('data/birth_places_test.tsv', sep='\t')
birth_places_test.head()

Unnamed: 0,Where was Bryan Dubreuiel born?,Atlanta
0,Where was Ralf Wadephul born?,Berlin
1,Where was Joseph Baggaley born?,England
2,Where was Sandhya Sanjana born?,Mumbai
3,Where was Alfred Mele born?,Detroit
4,Where was Murray Esler born?,Geelong


In [64]:
class BirthPlaceDataset(Dataset):
    """
    Dataset for birth place prediction made on a (q, c, a) format,
    where q is the question, c is the context and a is the answer.

    Args:
        df (pd.DataFrame): DataFrame containing the q and a columns.
        wiki (pd.DataFrame): DataFrame containing the c column.
    """
    def __init__(self, df: pd.DataFrame, wiki: pd.DataFrame) -> None:
        self.df = df
        self.wiki = wiki
        self.df_values = self.df.values.tolist()
        self.wiki_values = self.wiki.values.tolist()

        # Create a match between the q and c columns
        # Each question in the q column will be matched with a context in the c column by the name of the person
        # which is after "Where was" in the construction

        # Example: "Where was Barack Obama born?" will be matched with the context of Barack Obama
        self.match = {}
        for i in tqdm(range(len(self.df_values)), desc='Matching questions and contexts', total=len(self.df_values)):
            [q, a] = self.df_values[i]
            person = q.split(' ')[2:-1] # Get the name of the person
            person = ' '.join(person) # Join the name of the person

            # Find the context of the person
            for j in range(len(self.wiki_values)):
                c = self.wiki_values[j][0] # There is only one column in the wiki DataFrame
                if person in c:
                    self.match[person] = (q, c, a)
                    break
        
        print(f'Number of entries preprocessed: {len(self.match.keys())}')

    def __len__(self):
        return len(self.match.keys())

    def __getitem__(self, idx: int) -> Tuple[str, str, str]:
        person = list(self.match.keys())[idx]
        return self.match[person]

In [65]:
train_dataset = BirthPlaceDataset(birth_places_train, wiki)
test_dataset = BirthPlaceDataset(birth_places_test, wiki)



[A[A

[A[A

Matching questions and contexts: 100%|██████████| 1999/1999 [00:00<00:00, 4478.07it/s]


Number of entries preprocessed: 1990




[A[A

[A[A

Matching questions and contexts: 100%|██████████| 499/499 [00:00<00:00, 1936.24it/s]

Number of entries preprocessed: 499





#### A sample from new the new dataset

In [66]:
print(train_dataset[0])

('Where was Jacob Henry Studer born?', 'Jacob Henry Studer. Jacob Henry Studer (26 February 1840 Columbus, Ohio - 2 August 1904 New York City) was a printer, lithographer, painter, and popular ornithologist active in Columbus, Ohio from the 1860s to the 1880s .\n', 'Columbus')


In [67]:
## Load t5 small model for question answering
model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)

loading configuration file config.json from cache at C:\Users\Popescu Andrei/.cache\huggingface\hub\models--t5-small\snapshots\02bc3551ce463fd96bd4c3735822469bace0396d\config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams

In [68]:
tokenizer = T5Tokenizer.from_pretrained('t5-small', return_dict=True)

loading file spiece.model from cache at C:\Users\Popescu Andrei/.cache\huggingface\hub\models--t5-small\snapshots\02bc3551ce463fd96bd4c3735822469bace0396d\spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\Popescu Andrei/.cache\huggingface\hub\models--t5-small\snapshots\02bc3551ce463fd96bd4c3735822469bace0396d\config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads

#### Testing the t5 model

In [107]:
# Test model
input_ids = tokenizer.encode("translate English to German: How old are you?", return_tensors="pt")
outputs = model.generate(input_ids, max_length=40, num_beams=4, early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Wie alt sind Sie?


#### [ Task 1 ] Cold evaluation of the pretrained T5-small model 

In [108]:
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [109]:
def evaluate(model: nn.Module, dataloader: DataLoader, device: torch.device) -> None:
    """
    Evaluate the model on the test set.

    Args:
        model (nn.Module): Model to evaluate.
        dataloader (DataLoader): Dataloader containing the test set.
        device (torch.device): Device to use.
    """
    model.eval()
    model.to(device)

    # Create a dictionary to store the predictions
    predictions = {}

    for batch in tqdm(dataloader, desc='Evaluating', total=len(dataloader)):
        # Get the inputs
        [q, c, a] = batch

        # Create the input for the model
        input_ids = tokenizer.encode(f'question: {q} context: {c}', return_tensors='pt').to(device)

        # Generate the answer
        outputs = model.generate(input_ids, max_length=40, num_beams=4, early_stopping=True)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Store the prediction
        predictions[q] = pred

    return predictions

In [110]:
predictions = evaluate(model, test_dataloader, device)

Evaluating: 100%|██████████| 499/499 [01:36<00:00,  5.18it/s]


In [111]:
# Compute the accuracy between answers and predictions
def compute_scores(predictions: Dict[str, str], answers: Dict[str, str]) -> Dict[str, float]:
    """
    ~~ Exact match ~~
    Compute the recall, accuracy and f1-score between the predictions and the answers.

    Args:
        predictions (Dict[str, str]): Dictionary containing the predictions.
        answers (Dict[str, str]): Dictionary containing the answers.

    Returns:
        float: Accuracy, recall and f1-score.
    """
    # Precision
    correct = 0
    for q in predictions.keys():
        if predictions[q] == answers[q]:
            correct += 1
    accuracy = correct / len(predictions.keys())

    # Recall
    recall = 0
    for q in predictions.keys():
        if answers[q] in predictions[q]:
            recall += 1
    recall = recall / len(predictions.keys())

    # F1-score
    f1_score = 2 * (accuracy * recall) / (accuracy + recall)

    return {'accuracy': accuracy, 'recall': recall, 'f1_score': f1_score}

### Compute the scores for the baseline performance of the model

In [113]:
for q in predictions.keys():
    print(f'Question: {q}')
    print(f'Prediction: {predictions[q]}')
    print()
    break

Question: ('Where was Ralf Wadephul born?',)
Prediction: Berlin



In [101]:
answers = {q: a for [q, c, a] in test_dataset}
assert len(answers.keys()) == len(predictions.keys())
assert answers.keys() == predictions.keys()
scores = compute_scores(predictions, answers)
print(scores)

{'accuracy': 0.5350701402805611, 'recall': 0.9378757515030061, 'f1_score': 0.6813954439491228}


#### [Task 2] Finetuning process

In [53]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    log_level='info',
)

In [58]:
trainer  = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
    tokenizer=tokenizer,
    data_collator=lambda q, c, a: tokenizer(c, padding='max_length', truncation=True, max_length=512, return_tensors="pt"), # data collator that will take care of batching as well as preprocessing the data for our model.
)

In [59]:
trainer.train()

***** Running training *****
  Num examples = 1990
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 125
  Number of trainable parameters = 60506624


  0%|          | 0/125 [00:00<?, ?it/s]

TypeError: <lambda>() missing 2 required positional arguments: 'c' and 'a'