# Introduction
In this laboratory we will get our hands dirty working with Large Language Models (e.g. GPT and BERT) to do various useful things. I you haven't already, it is highly recommended to:

+ Read the [Attention is All you Need](https://arxiv.org/abs/1706.03762) paper, which is the basis for all transformer-based LLMs.
+ Watch (and potentially *code along*) with this [Andrej Karpathy video](https://www.youtube.com/watch?v=kCc8FmEb1nY) which shows you how to build an autoregressive GPT model from the ground up.

# Exercise 1: Warming Up
In this first exercise you will train a *small* autoregressive GPT model for character generation (the one used by Karpathy in his video) to generate text in the style of Dante Aligheri. Use [this file](https://archive.org/stream/ladivinacommedia00997gut/1ddcd09.txt), which contains the entire text of Dante's Inferno (**note**: you will have to delete some introductory text at the top of the file before training). Train the model for a few epochs, monitor the loss, and generate some text at the end of training. Qualitatively evaluate the results

### 1.1 GPT

#### 1.1.1 Importing libraries

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F


#### 1.1.2 Hyperparameters

In [None]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 3000 # I noticed a degradatino after 3000 iterations
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------

#### 1.1.3 Data preparation

In [None]:
torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#with open('input.txt', 'r', encoding='utf-8') as f:
    #text = f.read()

with open('divina_commedia.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


#### 1.1.4 Evaluation boilerplate

In [None]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

#### 1.1.5 Model definition

In [None]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple GPT
class GPT(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Weights initialization dependeding of the type of layer and the activation function
        self.apply(self.init_weights)

    def init_weights(self, module):

        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
            if module.weight is not None:
                nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.BatchNorm2d):
            if module.weight is not None:
                nn.init.constant_(module.weight, 1)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

#### 1.1.6 Training

In [None]:
model = GPT()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
print('Training is over: text generation from the model...')
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

**Perplexity**

In [None]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


In [None]:
# Set your GPT model in evaluation mode
m.eval()

def calculate_perplexity(model, dataset):
    total_loss = 0.0
    total_words = 0

    with torch.no_grad():
        for k in range(len(dataset)):
            X, y = get_batch('val')
            X = X.to(device)
            y = y.to(device)

            # Forward pass to get logits
            logits, loss = model(X, y)

            total_loss += loss.item() * y.numel()
            total_words += y.numel()

    # Calculate perplexity
    perplexity = torch.exp(total_loss / torch.tensor(total_words))

    return perplexity


#perplexity_train = calculate_perplexity(m, train_data)
perplexity_val = calculate_perplexity(m, val_data)

#print(f"Perplexity on Train Set: {perplexity_train:.2f}")
print(f"Perplexity on Validation Set: {perplexity_val:.2f}")

Perplexity on Validation Set: 4.63


# Exercise 2: Working with Real LLMs

Our toy GPT can only take us so far. In this exercise we will see how to use the [Hugging Face](https://huggingface.co/) model and dataset ecosystem to access a *huge* variety of pre-trained transformer models.

## Exercise 2.1: Installation and text tokenization

First things first, we need to install the [Hugging Face transformer library](https://huggingface.co/docs/transformers/index):

    conda install -c huggingface -c conda-forge transformers
    
The key classes that you will work with are `GPT2Tokenizer` to encode text into sub-word tokens, and the `GPT2LMHeadModel`. **Note** the `LMHead` part of the class name -- this is the version of the GPT2 architecture that has the text prediction heads attached to the final hidden layer representations (i.e. what we need to **generate** text).

Instantiate the `GPT2Tokenizer` and experiment with encoding text into integer tokens. Compare the length of input with the encoded sequence length.

**Tip**: Pass the `return_tensors='pt'` argument to the togenizer to get Pytorch tensors as output (instead of lists).

### 2.1.1 Install HuggingFace

In [None]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m45.9 MB/s[0m eta [36m0:00:0

### 2.1.2: Select the prompt

In [None]:
# Prompt for text generation
dante_prompt = "In the middle of the journey of our life, I found myself in a"
theweeknd_prompt = "I was born in a city Where the winter nights don't ever sleep So this life's always with me The ice inside my veins will never bleed My, ooh My, ooh Uh, every time you try to fix me I know you'll never find that missing piece When you cry and say you miss me I'll lie and tell you that I'll never leave But I sacrificed (sacrificed) Your love for more of the night (of the night) I try to put up a fight (up a fight) Can't tie me down (down) I don't wanna sacrifice For your love, I try I don't wanna sacrifice But I love my time My, ooh My, ooh"
martin_luther_king_prompt = "I have a dream that one day every valley shall be engulfed, every hill shall be exalted and every mountain shall be made low, the rough places will be made plains and the crooked places will be made straight and the glory of the Lord shall be revealed and all flesh shall see it together"

### 2.1.3: Text Tokenizer

In [None]:
# Import Required Modules
from transformers import GPT2Tokenizer

# Instantiate the GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

**Dante Tokenizer**

In [None]:
# Encode Text into Integer Tokens
input_text = dante_prompt

# Encode the text
dante_encoded_text = tokenizer(input_text, return_tensors='pt')

# Print the input text
print("Input Text: ", input_text)

# Print the encoded tokens
print("Encoded Tokens: ", dante_encoded_text['input_ids'])

# Compare the length of input with the encoded sequence length
print("Input Text Length: ", len(input_text))
print("Encoded Sequence Length: ", dante_encoded_text['input_ids'].shape[1])


Input Text:  In the middle of the journey of our life, I found myself in a
Encoded Tokens:  tensor([[ 818,  262, 3504,  286,  262, 7002,  286,  674, 1204,   11,  314, 1043,
         3589,  287,  257]])
Input Text Length:  61
Encoded Sequence Length:  15


**The Weeknd Tokenizer**

In [None]:
# Encode Text into Integer Tokens
input_text = theweeknd_prompt

# Encode the text
tw_encoded_text = tokenizer(input_text, return_tensors='pt')

# Print the input text
print("Input Text: ", input_text)

# Print the encoded tokens
print("Encoded Tokens: ", tw_encoded_text['input_ids'])

# Compare the length of input with the encoded sequence length
print("Input Text Length: ", len(input_text))
print("Encoded Sequence Length: ", tw_encoded_text['input_ids'].shape[1])

Input Text:  I was born in a city Where the winter nights don't ever sleep So this life's always with me The ice inside my veins will never bleed My, ooh My, ooh Uh, every time you try to fix me I know you'll never find that missing piece When you cry and say you miss me I'll lie and tell you that I'll never leave But I sacrificed (sacrificed) Your love for more of the night (of the night) I try to put up a fight (up a fight) Can't tie me down (down) I don't wanna sacrifice For your love, I try I don't wanna sacrifice But I love my time My, ooh My, ooh
Encoded Tokens:  tensor([[   40,   373,  4642,   287,   257,  1748,  6350,   262,  7374, 12513,
           836,   470,  1683,  3993,  1406,   428,  1204,   338,  1464,   351,
           502,   383,  4771,  2641,   616, 32375,   481,  1239, 30182,  2011,
            11,   267,  1219,  2011,    11,   267,  1219, 28574,    11,   790,
           640,   345,  1949,   284,  4259,   502,   314,   760,   345,  1183,
          1239,  1064,   326,

**Martin Luther King Tokenizer**

In [None]:
# Encode Text into Integer Tokens
input_text = martin_luther_king_prompt

# Encode the text
mlk_encoded_text = tokenizer(input_text, return_tensors='pt')

# Print the input text
print("Input Text: ", input_text)

# Print the encoded tokens
print("Encoded Tokens: ", mlk_encoded_text['input_ids'])

# Compare the length of input with the encoded sequence length
print("Input Text Length: ", len(input_text))
print("Encoded Sequence Length: ", mlk_encoded_text['input_ids'].shape[1])

Input Text:  I have a dream that one day every valley shall be engulfed, every hill shall be exalted and every mountain shall be made low, the rough places will be made plains and the crooked places will be made straight and the glory of the Lord shall be revealed and all flesh shall see it together
Encoded Tokens:  tensor([[   40,   423,   257,  4320,   326,   530,  1110,   790, 19272,  2236,
           307, 40997,    11,   790, 12788,  2236,   307, 46683,   290,   790,
          8598,  2236,   307,   925,  1877,    11,   262,  5210,  4113,   481,
           307,   925, 36149,   290,   262, 45571,  4113,   481,   307,   925,
          3892,   290,   262, 13476,   286,   262,  4453,  2236,   307,  4602,
           290,   477, 11222,  2236,   766,   340,  1978]])
Input Text Length:  287
Encoded Sequence Length:  57


## Exercise 2.2: Generating Text

There are a lot of ways we can, given a *prompt* in input, sample text from a GPT2 model. Instantiate a pre-trained `GPT2LMHeadModel` and use the [`generate()`](https://huggingface.co/docs/transformers/v4.27.2/en/main_classes/text_generation#transformers.GenerationMixin.generate) method to generate text from a prompt.

**Note**: The default inference mode for GPT2 is *greedy* which might not results in satisfying generated text. Look at the `do_sample` and `temperature` parameters.

### 2.2.2: Instantiate the pre-trained GPT2 model

In [None]:
# Import GPT2LMHeadModel
from transformers import GPT2LMHeadModel

# Instantiate the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

**Dante Text Generator**

In [None]:
# Prompt for text generation
dante_prompt = "In the middle of the journey of our life, I found myself in a"

In [None]:
# prompt
test_prompt = dante_prompt
test_ids = tokenizer.encode(test_prompt, return_tensors='pt')


# Generate text using the GPT2 model
output = model.generate(test_ids, max_length=150, num_return_sequences=1, do_sample=True, temperature=0.3)

# Decode the generated text tokens and convert them to a string
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print("Generated Text:\n", generated_text)

**The Weeknd Text Generator**

In [None]:
# Prompt for text generation
theweeknd_prompt = "I was born in a city Where the winter nights don't ever sleep So this life's always with me The ice inside my veins will never bleed My, ooh My, ooh Uh, every time you try to fix me I know you'll never find that missing piece When you cry and say you miss me I'll lie and tell you that I'll never leave But I sacrificed (sacrificed) Your love for more of the night (of the night) I try to put up a fight (up a fight) Can't tie me down (down) I don't wanna sacrifice For your love, I try I don't wanna sacrifice But I love my time My, ooh My, ooh"


In [None]:
# prompt
test_prompt = theweeknd_prompt
test_ids = tokenizer.encode(test_prompt, return_tensors='pt')


# Generate text using the GPT2 model
output = model.generate(test_ids, max_length=150, num_return_sequences=1, do_sample=True, temperature=0.9)

# Decode the generated text tokens and convert them to a string
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print("Generated Text:\n", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
 I was born in a city Where the winter nights don't ever sleep So this life's always with me The ice inside my veins will never bleed My, ooh My, ooh Uh, every time you try to fix me I know you'll never find that missing piece When you cry and say you miss me I'll lie and tell you that I'll never leave But I sacrificed (sacrificed) Your love for more of the night (of the night) I try to put up a fight (up a fight) Can't tie me down (down) I don't wanna sacrifice For your love, I try I don't wanna sacrifice But I love my time My, ooh My, ooh Uh, every time you try


**Martin Luther King Text Generator**

In [None]:
martin_luther_king_prompt = "I have a dream that one day every valley shall be engulfed, every hill shall be exalted and every mountain shall be made low, the rough places will be made plains and the crooked places will be made straight and the glory of the Lord shall be revealed and all flesh shall see it together"

In [None]:
# prompt
test_prompt = martin_luther_king_prompt
test_ids = tokenizer.encode(test_prompt, return_tensors='pt')


# Generate text using the GPT2 model
output = model.generate(test_ids, max_length=150, num_return_sequences=1, do_sample=True, temperature=0.9)

# Decode the generated text tokens and convert them to a string
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print("Generated Text:\n", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
 I have a dream that one day every valley shall be engulfed, every hill shall be exalted and every mountain shall be made low, the rough places will be made plains and the crooked places will be made straight and the glory of the Lord shall be revealed and all flesh shall see it together with the glory of God; for I know ye not that ye have heard these words of mine Father in Heaven; and the Lord your God shall say unto you, Go up and be a man, and make him a man; and he that hath it in his heart shall he cast out his hand, and he that hath it out thereof shall set it ablaze; and if anyone that hath it in his heart cast out the hand of the Lord shall cast it


# Exercise 3: Reusing Pre-trained LLMs (choose one)

Choose **one** of the following exercises (well, *at least* one). In each of these you are asked to adapt a pre-trained LLM (`GPT2Model` or `DistillBERT` are two good choices) to a new Natural Language Understanding task. A few comments:

+ Since GPT2 is a *autoregressive* model, there is no latent space aggregation at the last transformer layer (you get the same number of tokens out that you give in input). To use a pre-trained model for a classification or retrieval task, you should aggregate these tokens somehow (or opportunistically select *one* to use).

+ BERT models (including DistillBERT) have a special [CLS] token prepended to each latent representation in output from a self-attention block. You can directly use this as a representation for classification (or retrieval).

+ The first *two* exercises below can probably be done *without* any fine-tuning -- that is, just training a shallow MLP to classify or represent with the appropriate loss function.

# Exercise 3.1: Training a Text Classifier (easy)

Peruse the [text classification datasets on Hugging Face](https://huggingface.co/datasets?task_categories=task_categories:text-classification&sort=downloads). Choose a *moderately* sized dataset and use a LLM to train a classifier to solve the problem.

**Note**: A good first baseline for this problem is certainly to use an LLM *exclusively* as a feature extractor and then train a shallow model.

# Exercise 3.2: Training a Question Answering Model (harder)

Peruse the [multiple choice question answering datasets on Hugging Face](https://huggingface.co/datasets?task_categories=task_categories:multiple-choice&sort=downloads). Chose a *moderately* sized one and train a model to answer contextualized multiple-choice questions. You *might* be able to avoid fine-tuning by training a simple model to *rank* the multiple choices (see margin ranking loss in Pytorch).

# Exercise 3.3: Training a Retrieval Model (hardest)

The Hugging Face dataset repository contains a large number of ["text retrieval" problems](https://huggingface.co/datasets?task_categories=task_categories:text-retrieval&p=1&sort=downloads). These tasks generally require that the model measure *similarity* between text in some metric space -- naively, just a cosine similarity between [CLS] tokens can get you pretty far. Find an interesting retrieval problem and train a model (starting from a pre-trained LLM of course) to solve it.

**Tip**: Sometimes identifying the *retrieval* problems in these datasets can be half the challenge. [This dataset](https://huggingface.co/datasets/BeIR/scifact) might be a good starting point.

### 3.1.0: Training a text classifier

Use `DistillBERT` from HugginFace on the `imdb` dataset to extract features and train a shallow MLP to classify them.

### 3.1.1: Importing libraries

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.3-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/7.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:03[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/7.4 MB[0m [31m23.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m6.7/7.4 MB[0m [31m64.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.4/7.4 MB[0m [31m67.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

from transformers import DistilBertTokenizer, DistilBertModel
from datasets import load_dataset

### 3.1.2: Hyperparameters

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### 3.1.3: Instantiate the tokenizer and the model

I had to put `max_length' hyperparameter because apparentely the RAM memory was completely full filled until the runtime crashed

In [None]:
# Instantiate the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', max_length = 512)
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
model.to(device)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

### 3.1.4: Data Preparation

In [None]:
# Load the IMDB dataset
dataset = load_dataset("imdb")

Reduce the number of samples to avoid GPU memory saturation.

To avoid to do that I tried several solutions, including using a dataloader, reducing the batch size, using a gradient accumulation in the process of feature extraction. Eventually, I had to reduce the number of samples.

In [None]:
reduced_train_dataset = dataset["train"].shuffle(seed=42)[:2500]
reduced_test_dataset = dataset["test"].shuffle(seed=42)[:500]

In [None]:
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

train_data = reduced_train_dataset
test_data = reduced_test_dataset
# Split train_data into train and validation sets
#train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

# Step 2: Tokenize the text and convert it into PyTorch tensors
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', max_length=128)


# Tokenize the text and convert it to PyTorch tensors
def tokenize_text(text):
    encoded = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

# Prepare the data and create DataLoaders
train_input_ids, train_attention_mask = tokenize_text(train_data['text'])
train_labels = torch.tensor(train_data['label'], dtype=torch.long)

test_input_ids, test_attention_mask = tokenize_text(test_data['text'])
test_labels = torch.tensor(test_data['label'], dtype=torch.long)

#val_input_ids, val_attention_mask = tokenize_text(val_data['text'])
#val_labels = torch.tensor(val_data['label'], dtype=torch.long)



In [None]:
batch_size = 8
train_dataloader = DataLoader(TensorDataset(train_input_ids, train_attention_mask, train_labels), batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(TensorDataset(test_input_ids, test_attention_mask, test_labels), batch_size=batch_size)
#val_dataloader = DataLoader(TensorDataset(val_input_ids, val_attention_mask, val_labels), batch_size=batch_size)

### 3.1.5: Feature extractor

First version of feature extractor that is used, without gradient accumulator. Since it was not working (i.e. the GPU memory got saturated very fast) I had to change it and use a different one. I leave it here just for reference.

In [None]:

# Extract features (representations) from the model
def extract_features(dataloader):
    features_list = []
    labels_list = []

    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            features_list.append(outputs.last_hidden_state)
            labels_list.append(labels)

    features = torch.cat(features_list, dim=0)
    labels = torch.cat(labels_list, dim=0)

    return features, labels

Second version with gradient accumulator.

In [None]:
def extract_features(dataloader, gradient_accumulation_steps=2):
    features_list = []
    labels_list = []

    model.eval()
    with torch.no_grad():
        for i, (input_ids, attention_mask, labels) in enumerate(dataloader):
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            features_list.append(outputs.last_hidden_state)
            labels_list.append(labels)

            # Perform gradient accumulation every gradient_accumulation_steps batches
            if (i + 1) % gradient_accumulation_steps == 0:
                features = torch.cat(features_list, dim=0)
                labels = torch.cat(labels_list, dim=0)

                yield features, labels

                features_list = []
                labels_list = []

        # Perform the final gradient accumulation for any remaining batches
        if len(features_list) > 0:
            features = torch.cat(features_list, dim=0)
            labels = torch.cat(labels_list, dim=0)

            yield features, labels

    # At the end of the function, return the collected features and labels as tensors
    features = torch.cat(features_list, dim=0)
    labels = torch.cat(labels_list, dim=0)
    return features, labels


In [None]:
train_features, train_labels = list(extract_features(train_dataloader))[0]
test_features, test_labels = list(extract_features(test_dataloader))[0]

#train_features, train_labels = extract_features(train_dataloader)
#test_features, test_labels = extract_features(test_dataloader)
#val_features, val_labels = extract_features(val_dataloader)

# Now you have the extracted features for training and validation sets
print("Train Features:", train_features.size())
print("Train Labels:", train_labels.size())
print("Test Features:", test_features.size())
print("Test Labels:", test_labels.size())


Train Features: torch.Size([16, 512, 768])
Train Labels: torch.Size([16])
Test Features: torch.Size([16, 512, 768])
Test Labels: torch.Size([16])


### 3.1.6: Text Classifiers
In this section we present the text classifiers training. We will train different model classifiers and compare their performance. Since our goal is not to focus on the architecture of the classifiers, I decided to use some pre built models from scikit learn.



1.   MLPClassifier
2.   Logistic Regression
3.   SVM
4.   Random Forest




Since to train our classifier we are using models from the scikit learn library, they are not compatible with torch tensor. We have to convert them to numpy array

In [None]:
import numpy as np
# Convert PyTorch tensors to NumPy arrays and flatten them
train_features_np = train_features.cpu().numpy().reshape(len(train_features), -1)
train_labels_np = train_labels.cpu().numpy()
test_features_np = test_features.cpu().numpy().reshape(len(test_features), -1)
test_labels_np = test_labels.cpu().numpy()


**MLP**

In [None]:
from sklearn.neural_network import MLPClassifier


# Instantiate the MLP Classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=100, random_state=42)

# Train the classifier on the extracted features
mlp_classifier.fit(train_features_np, train_labels_np)

# Evaluate the classifier on the test set
mlp_accuracy = mlp_classifier.score(test_features_np, test_labels_np)
print("MLP Accuracy:", mlp_accuracy)


**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression


# Instantiate the Logistic Regression Classifier
logreg_classifier = LogisticRegression(max_iter=50, random_state=42)

# Train the classifier on the extracted features
logreg_classifier.fit(train_features_np, train_labels_np)

# Evaluate the classifier on the test set
logreg_accuracy = logreg_classifier.score(test_features_np, test_labels_np)
print("Logistic Regression Accuracy:", logreg_accuracy)



The following code is useful to check overfitting.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Create a learning curve with 5 different training set sizes (e.g., 20%, 40%, 60%, 80%, 100%)
train_sizes, train_scores, valid_scores = learning_curve(
    logreg_classifier, train_features_np, train_labels_np, cv=5, train_sizes=np.linspace(0.2, 1.0, 5)
)

# Compute mean and standard deviation of training and validation scores across the 5 folds
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

# Plot the learning curve
plt.figure(figsize=(10, 5))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, valid_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Training Set Size")
plt.ylabel("Score")
plt.legend(loc="best")
plt.title("Logistic Regression Learning Curve")
plt.show()


**SVM**

In [None]:
from sklearn.svm import SVC

# Instantiate the SVM Classifier
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the classifier on the extracted features
svm_classifier.fit(train_features_np, train_labels_np)

# Evaluate the classifier on the test set
svm_accuracy = svm_classifier.score(test_features_np, test_labels_np)
print("SVM Accuracy:", svm_accuracy)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve



# Create a learning curve with 5 different training set sizes (e.g., 20%, 40%, 60%, 80%, 100%)
train_sizes, train_scores, valid_scores = learning_curve(
    svm_classifier, train_features_np, train_labels_np, cv=5, train_sizes=np.linspace(0.2, 1.0, 5)
)

# Compute mean and standard deviation of training and validation scores across the 5 folds
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

# Plot the learning curve
plt.figure(figsize=(10, 5))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std, valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, valid_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Training Set Size")
plt.ylabel("Score")
plt.legend(loc="best")
plt.title("SVM Learning Curve")
plt.show()


**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the extracted features
rf_classifier.fit(train_features_np, train_labels_np)

# Evaluate the classifier on the test set
rf_accuracy = rf_classifier.score(test_features_np, test_labels_np)
print("Random Forest Accuracy:", rf_accuracy)


**Evaluation**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict labels for the test set
mlp_predicted_labels = mlp_classifier.predict(test_features_np)
logreg_predicted_labels = logreg_classifier.predict(test_features_np)
svm_predicted_labels = svm_classifier.predict(test_features_np)
rf_predicted_labels = rf_classifier.predict(test_features_np)

# Compute confusion matrix
mlp_cm = confusion_matrix(test_labels_np, mlp_predicted_labels)
logreg_cm = confusion_matrix(test_labels_np, logreg_predicted_labels)
svm_cm = confusion_matrix(test_labels_np, svm_predicted_labels)
rf_cm = confusion_matrix(test_labels_np, rf_predicted_labels)

# Compute classification report
mlp_report = classification_report(test_labels_np, mlp_predicted_labels)
logreg_report = classification_report(test_labels_np, logreg_predicted_labels)
svm_report = classification_report(test_labels_np, svm_predicted_labels)
rf_report = classification_report(test_labels_np, rf_predicted_labels)
print("MLP Confusion Matrix:")
print(mlp_cm)
print("MLP Classification Report:")
print(mlp_report)

print("Logistic Regression Confusion Matrix:")
print(logreg_cm)
print("Logistic Regression Classification Report:")
print(logreg_report)

print("SVM Confusion Matrix:")
print(svm_cm)
print("SVM Classification Report:")
print(svm_report)

print("Random Forest Confusion Matrix:")
print(rf_cm)
print("Random Forest Classification Report:")
print(rf_report)


MLP Confusion Matrix:
[[3 4]
 [1 8]]
MLP Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.43      0.55         7
           1       0.67      0.89      0.76         9

    accuracy                           0.69        16
   macro avg       0.71      0.66      0.65        16
weighted avg       0.70      0.69      0.67        16

Logistic Regression Confusion Matrix:
[[7 0]
 [8 1]]
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.47      1.00      0.64         7
           1       1.00      0.11      0.20         9

    accuracy                           0.50        16
   macro avg       0.73      0.56      0.42        16
weighted avg       0.77      0.50      0.39        16

SVM Confusion Matrix:
[[6 1]
 [8 1]]
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.86      0.57         7
           1 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
