### Experiments with Large Language Models

In [132]:
# General imports
import pandas as pd
import numpy as np
from tqdm import tqdm

#  PyTorch imports
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Transformers imports
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments, AdamW

# Types
from typing import List, Dict, Tuple, Union

# Set seed
torch.manual_seed(42)
np.random.seed(42)



#### Data

In [133]:
wiki = []
with open('data/wiki.txt', 'r', encoding='utf8') as f:
    for line in f:
        wiki.append(line)

wiki = pd.DataFrame(wiki, columns=['text'])
wiki.head()

Unnamed: 0,text
0,Khatchig Mouradian. Khatchig Mouradian is a jo...
1,Jacob Henry Studer. Jacob Henry Studer (26 Feb...
2,"John Stephen. Born in Glasgow, Stephen became ..."
3,Georgina Willis. Georgina Willis is an award w...
4,Stanley Corrsin. Corrsin was born on 3 April 1...


In [134]:
# Read tsv file
birth_places_train = pd.read_csv('data/birth_places_train.tsv', sep='\t') 
birth_places_train.head()

Unnamed: 0,Where was Khatchig Mouradian born?,Lebanon
0,Where was Jacob Henry Studer born?,Columbus
1,Where was John Stephen born?,Glasgow
2,Where was Georgina Willis born?,Australia
3,Where was Stanley Corrsin born?,Philadelphia
4,Where was Eduard Ender born?,Rome


In [135]:
birth_places_test = pd.read_csv('data/birth_places_test.tsv', sep='\t')
birth_places_test.head()

Unnamed: 0,Where was Bryan Dubreuiel born?,Atlanta
0,Where was Ralf Wadephul born?,Berlin
1,Where was Joseph Baggaley born?,England
2,Where was Sandhya Sanjana born?,Mumbai
3,Where was Alfred Mele born?,Detroit
4,Where was Murray Esler born?,Geelong


In [136]:
class BirthPlaceDataset(Dataset):
    """
    Dataset for birth place prediction made on a (q, c, a) format,
    where q is the question, c is the context and a is the answer.

    Args:
        df (pd.DataFrame): DataFrame containing the q and a columns.
        wiki (pd.DataFrame): DataFrame containing the c column.
    """
    def __init__(self, df: pd.DataFrame, wiki: pd.DataFrame) -> None:
        self.df = df
        self.wiki = wiki
        self.df_values = self.df.values.tolist()
        self.wiki_values = self.wiki.values.tolist()

        # Create a match between the q and c columns
        # Each question in the q column will be matched with a context in the c column by the name of the person
        # which is after "Where was" in the construction

        # Example: "Where was Barack Obama born?" will be matched with the context of Barack Obama
        self.match = {}
        for i in tqdm(range(len(self.df_values)), desc='Matching questions and contexts', total=len(self.df_values)):
            [q, a] = self.df_values[i]
            person = q.split(' ')[2:-1] # Get the name of the person
            person = ' '.join(person) # Join the name of the person

            # Find the context of the person
            for j in range(len(self.wiki_values)):
                c = self.wiki_values[j][0] # There is only one column in the wiki DataFrame
                if person in c:
                    self.match[person] = (q, c, a)
                    break
        
        print(f'Number of entries preprocessed: {len(self.match.keys())}')

    def __len__(self):
        return len(self.match.keys())

    def __getitem__(self, idx: int) -> Tuple[str, str, str]:
        person = list(self.match.keys())[idx]
        return self.match[person]

In [137]:
train_dataset = BirthPlaceDataset(birth_places_train, wiki)
test_dataset = BirthPlaceDataset(birth_places_test, wiki)

Matching questions and contexts: 100%|██████████| 1999/1999 [00:00<00:00, 4616.22it/s]


Number of entries preprocessed: 1990


Matching questions and contexts: 100%|██████████| 499/499 [00:00<00:00, 2010.31it/s]

Number of entries preprocessed: 499





#### A sample from new the new dataset

In [138]:
print(train_dataset[0])

('Where was Jacob Henry Studer born?', 'Jacob Henry Studer. Jacob Henry Studer (26 February 1840 Columbus, Ohio - 2 August 1904 New York City) was a printer, lithographer, painter, and popular ornithologist active in Columbus, Ohio from the 1860s to the 1880s .\n', 'Columbus')


In [139]:
## Load t5 small model for question answering
model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)

loading configuration file config.json from cache at C:\Users\Popescu Andrei/.cache\huggingface\hub\models--t5-small\snapshots\02bc3551ce463fd96bd4c3735822469bace0396d\config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams

In [140]:
tokenizer = T5Tokenizer.from_pretrained('t5-small', return_dict=True)

loading file spiece.model from cache at C:\Users\Popescu Andrei/.cache\huggingface\hub\models--t5-small\snapshots\02bc3551ce463fd96bd4c3735822469bace0396d\spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at C:\Users\Popescu Andrei/.cache\huggingface\hub\models--t5-small\snapshots\02bc3551ce463fd96bd4c3735822469bace0396d\config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads

#### Testing the t5 model

In [141]:
# Test model
input_ids = tokenizer.encode("translate English to German: How old are you?", return_tensors="pt")
outputs = model.generate(input_ids, max_length=40, num_beams=4, early_stopping=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Wie alt sind Sie?


#### [ Task 1 ] Cold evaluation of the pretrained T5-small model 

In [142]:
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [143]:
def evaluate(model: nn.Module, dataloader: DataLoader, device: torch.device) -> None:
    """
    Evaluate the model on the test set.

    Args:
        model (nn.Module): Model to evaluate.
        dataloader (DataLoader): Dataloader containing the test set.
        device (torch.device): Device to use.
    """
    model.eval()
    model.to(device)

    # Create a dictionary to store the predictions
    predictions = {}

    for batch in tqdm(dataloader, desc='Evaluating', total=len(dataloader)):
        # Get the inputs
        [q, c, a] = batch

        for i in range(len(q)):
            # Create the input for the model
            input_ids = tokenizer.encode(f'question: {q[i]} context: {c[i]}', return_tensors='pt').to(device)

            # Generate the answer
            outputs = model.generate(input_ids, max_length=40, num_beams=4, early_stopping=True)
            pred = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Store the prediction
            predictions[q[i]] = pred

    return predictions

In [144]:
predictions = evaluate(model, test_dataloader, device)

Evaluating: 100%|██████████| 499/499 [01:16<00:00,  6.54it/s]


In [145]:
# Compute the accuracy between answers and predictions
def compute_scores(predictions: Dict[str, str], answers: Dict[str, str]) -> Dict[str, float]:
    """
    ~~ Exact match ~~
    Compute the recall, accuracy and f1-score between the predictions and the answers.

    Args:
        predictions (Dict[str, str]): Dictionary containing the predictions.
        answers (Dict[str, str]): Dictionary containing the answers.

    Returns:
        float: Accuracy, recall and f1-score.
    """
    # Precision
    correct = 0
    for q in predictions.keys():
        if predictions[q] == answers[q]:
            correct += 1
    accuracy = correct / len(predictions.keys())

    # Recall
    recall = 0
    for q in predictions.keys():
        if answers[q] in predictions[q]:
            recall += 1
    recall = recall / len(predictions.keys())

    # F1-score
    f1_score = 2 * (accuracy * recall) / (accuracy + recall)

    return {'accuracy': accuracy, 'recall': recall, 'f1_score': f1_score}

### Compute the scores for the baseline performance of the model

In [146]:
for q in predictions.keys():
    print(f'Question: {q}')
    print(f'Prediction: {predictions[q]}')
    print()
    break

Question: Where was Ralf Wadephul born?
Prediction: Berlin



In [147]:
answers = {q: a for [q, c, a] in test_dataset}
assert len(answers.keys()) == len(predictions.keys())
assert answers.keys() == predictions.keys()
scores = compute_scores(predictions, answers)
print(scores)

{'accuracy': 0.5350701402805611, 'recall': 0.9378757515030061, 'f1_score': 0.6813954439491228}


#### [Task 2] Finetuning process

In [155]:
optimizer = AdamW(model.parameters(), lr=1e-4, correct_bias=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Duplicated from above, but useful to have it here



In [156]:
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True) # Use batch_size=1 for now

In [152]:
def fine_tune(model: nn.Module, train_dataloader: DataLoader, optimizer: torch.optim.Optimizer, device: torch.device, test_dataloader: DataLoader) -> None:
    """
    Fine tune the model on the train set.

    Args:
        model (nn.Module): Model to fine tune.
        dataloader (DataLoader): Dataloader containing the train set.
        optimizer (torch.optim.Optimizer): Optimizer to use.
        device (torch.device): Device to use.
    """

    for epoch in range(1):
        print(f'Epoch {epoch}')
        losses = []
        model.train()
        model.to(device)

        for batch in tqdm(train_dataloader, desc='Fine tuning', total=len(train_dataloader)):
            # Get the inputs
            [q, c, a] = batch

            for i in range(len(q)):
                # Create the input for the model
                input_ids = tokenizer.encode(f'question: {q[i]} context: {c[i]}', return_tensors='pt').to(device)
                attention_mask = (input_ids != tokenizer.pad_token_id).to(device)
                targets = tokenizer.encode(a[i], return_tensors='pt').to(device)

                assert input_ids.device == attention_mask.device == targets.device == model.device, 'Data is not on the same device'

                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
                loss = outputs.loss

                # Backward pass
                loss.backward()
                optimizer.step()
                losses.append(loss.item())

                # Evaluate the model
                if epoch % 10 == 0:
                    predictions = evaluate(model, test_dataloader, device)
                    scores = compute_scores(predictions, answers)
                    print(scores)

        print(f'Loss: {np.mean(losses)}')
        # Save the model
        model.to('cpu')
        model.save_pretrained('t5-small-finetuned-birthplaces-answers_{}'.format(epoch))

In [153]:
fine_tune(model, train_dataloader, optimizer, device, test_dataloader)

Epoch 0


Fine tuning:   1%|          | 17/1990 [00:08<17:23,  1.89it/s]


KeyboardInterrupt: 