<a href="https://colab.research.google.com/github/suhas-09/100-Days-Of-ML-Code/blob/master/Copy_of_UMC301_Assignment_3_3_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# UMC 301: Applied Data Science and Artificial Intelligence
## Assignment 3

### Submission instructions:

1.   The assignment is to be submitted in ONE single notebook.
2.   Submit the .ipynb file with all cells open and the pdf for part 2 through Teams Assignment.
3. If your IISc email ID is < username > @iisc.ac.in, then name the file < username >_Assgn_3. E.g. jonathan_Assgn_3 for email ID jonathan@iisc.ac.in.
4. Before submission, execute the ’Restart session and run all’ option from the Runtime/Kernel tab. Verify that there are no errors and that you are getting the output you expect.
5. Use the dataset: https://github.com/taivop/joke-dataset
6. The assignment is divided into two questions.

In [88]:
# Necessary imports

!pip install transformers
!pip install datasets

import math
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader

from datetime import datetime
from tqdm import tqdm



## Question 1

### Part 1

Load the dataset, create train, validation and test splits. Preprocess the dataset as required to train a decoder model. (10 marks)



In [89]:
import os
import requests

# Now load the dataset from the downloaded JSON file, specifying splits
# Define split proportions (70%, 15%, 25%)
train_size = 0.75
val_size = 0.15
test_size = 0.10

# Load the entire dataset first
dataset = load_dataset("json", data_files='./data/reddit_jokes.json')

# add the title column to the start of the body column
dataset = dataset.map(lambda x: {"body": x["title"] + " | " + x["body"]})

# Remove the title column
dataset = dataset.remove_columns(["title"])

# Remove all the bad parts
dataset = dataset.map(lambda x: {"body": x["body"].replace("\n", "")})
dataset = dataset.map(lambda x: {"body": x["body"].replace("\r", "")})
dataset = dataset.map(lambda x: {"body": "".join([c if c.isalnum() or c in ["?", ",", ".", "'","\"", "|"] else " " for c in x["body"]])})


dataset = dataset.map(lambda x: {"body": " ".join(x["body"].split())})

# Remove the datapoints which have more than 48 words, to reduce "noise"
dataset = dataset.filter(lambda x: len(x["body"].split()) < 48)

# Make all lowercase
dataset = dataset.map(lambda x: {"body": x["body"].lower()})

#Remove the score and id columns
dataset = dataset.remove_columns(["score", "id"])

# Then, create the splits using the `train_test_split` method
dataset = dataset["train"].train_test_split(
    test_size=val_size + test_size,  # Combine validation and test size
    seed=42,  # Set a seed for reproducibility
)

# Further split the test set into validation and test sets
dataset["validation"] = dataset["test"].train_test_split(
    test_size=test_size / (val_size + test_size),  # Proportion of test within test+val
    seed=42,  # Set a seed for reproducibility
)["test"]
dataset["test"] = dataset["test"].train_test_split(
    test_size=test_size / (val_size + test_size),  # Proportion of test within test+val
    seed=42,  # Set a seed for reproducibility
)["train"]

print(dataset)
# Just checking if the preprocessing is fine
for i in range(10):
    print(dataset["train"][i]["body"])

DatasetDict({
    train: Dataset({
        features: ['body'],
        num_rows: 115567
    })
    test: Dataset({
        features: ['body'],
        num_rows: 23113
    })
    validation: Dataset({
        features: ['body'],
        num_rows: 15410
    })
})
my father told me a joke. how many germans does it take screw in a lightbulb? he said nein | my dads jokes are the wurst i tell you.
what do you call a porno set in space. | apollo 13 inches
moron | q why did the moron throw the butter out the window?a he wanted to see a butterfly.
five reasons not to use an electric toilet | number two will shock you
what is punctuation's favorite curry? | l a
what is another name for a jewish pokemon trainer? | ash.
i asked god for a bike, but some bastard stole it. | it's a good thing that i didn't take off the gps tracker from whoever had owned it first.
today someone was killed with a starter pistol. | police think it might be race related.
what is long and hard that a australian bride gets

In [90]:
# Tokenizer
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer)

tokenizer.model_max_length = 48

In [91]:
tokenized_datasets = dataset.map(lambda x: tokenizer(x["body"], truncation=True, padding=True))
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/15410 [00:00<?, ? examples/s]

In [92]:
#Remove the body
tokenized_datasets = tokenized_datasets.remove_columns("body")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 115567
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 23113
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 15410
    })
})

In [93]:
# Sanity check
for i in range(5):
    print(tokenized_datasets["train"][i])

{'input_ids': [101, 1139, 1401, 1500, 1143, 170, 8155, 119, 1293, 1242, 176, 14170, 1116, 1674, 1122, 1321, 13084, 1107, 170, 1609, 27515, 1830, 136, 1119, 1163, 24928, 1394, 197, 1139, 4153, 1116, 13948, 1132, 1103, 192, 7719, 1204, 178, 1587, 1128, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 1184, 1202, 1128, 1840, 170, 185, 8456, 1186, 1383, 1107, 2000, 119, 197, 170, 23043, 2858, 1492, 4519, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 182, 14824, 1179, 197, 186, 1725, 1225, 1103, 182, 14824, 1179, 4932, 1103, 13742, 1149, 1103, 2487, 136, 170, 1119, 1458, 1106, 1267, 170, 11057, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 1421, 3672, 1136, 1106, 1329, 1126, 3651, 12356, 197, 1295, 1160, 1209, 4900, 11

### Part 2

Create a decoder model and train it using the joke-dataset. Try out different hyperparameters for the model, like the number of decoder blocks, hidden dimension, sequence length etc. Choose the model which gives you the best results. Log the results for the different hyperparameters combinations used. Submit the logs in a pdf. (18 marks)


In [94]:
class CausalSelfAttention(nn.Module):
  def __init__(self, d_k,d_model, n_heads, max_len):
    super().__init__()

    # Assume d_v =d_k
    self.d_k=d_k
    self.n_heads = n_heads

    self.key = nn. Linear (d_model, d_k*n_heads)
    self.query = nn. Linear (d_model, d_k*n_heads)
    self.value = nn. Linear (d_model, d_k*n_heads)

    # final linear layer
    self.fc=nn.Linear (d_k*n_heads, d_model)

    # casual mask
    # make it so that diagonal is 0
    # this way we don't have to shift the inputs to make targets

    cm=torch.tril(torch.ones(int(max_len),int(max_len)))
    self.register_buffer("causal_mask",  cm.view(1,1, int(max_len), int(max_len)))


  def forward(self, q, k, v, pad_mask=None):
    q=self.query(q) # N x T x (hd_k)
    k=self.key(k) # N x T x (hd_k)
    v=self.value(v) # N x T x (hd_k)

    N = q.shape[0]
    T = q.shape[1]


    # change the shape to:
    # (N, T, h, d_k) --> N, h, T, d_k)
    # in order for matrix multiply to work properly
    q=q.view (N, T, self.n_heads, self.d_k).transpose(1,2)
    k=k.view (N, T, self.n_heads, self.d_k).transpose(1,2)
    v=v.view (N, T, self.n_heads, self.d_k).transpose(1,2)

    # Copute attention weights
    # (N, h, T, d_k)   x  (N, h, d_k, T )  --> (N, h, T, T)
    attn_scores = q@k.transpose(-2,-1)/math.sqrt(self.d_k) # Scaled dot product;  @ --> torch.matmul

    if pad_mask is not None:
      attn_scores = attn_scores.masked_fill(pad_mask[:,None,None,:] == 0, float('-inf'))

    attn_scores = attn_scores.masked_fill(self.causal_mask[:, :, :T, :T] == 0, float('-inf'))

    attn_weights = F.softmax(attn_scores, dim =-1)

    # Compute attention-weighted values
    # (N, h, T, T) x (N, h, T, d_k) --> (N, h, T, d_k)
    A = attn_weights @ v

    # reshape it back before final linear layer
    A = A.transpose(1,2)  # (N, T, h, d_k)
    A = A.contiguous(). view(N, T, self.d_k*self.n_heads) # (N, T, h*d_k)

    # projection
    return self.fc(A)

In [95]:
class TransformerBlock(nn.Module):
  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob = 0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = CausalSelfAttention(d_k, d_model, n_heads, max_len)

    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model*4),
        nn.GELU(),
        nn.Linear(d_model*4, d_model),
        nn.Dropout(dropout_prob),
    )

    self.dropout = nn.Dropout(p=dropout_prob)

  def forward (self, x, pad_mask=None):
    x = self.ln1(x + self.mha(x, x, x, pad_mask))
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)
    return x

In [96]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len =2048, dropout_porb=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p = dropout_porb)

    position = torch.arange(max_len).unsqueeze(1)
    i = torch.arange(0, d_model//2)
    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(position / (10000)**(2*i/d_model))
    pe[0, :, 1::2] = torch.cos(position / (10000)**(2*i/d_model))
    self.register_buffer('pe', pe)    # If you have parameters in your model which should be saved and restored in the state_dict
                                      # but not trained by the optimizer, then you should register them as buffers.
                                      # Buffers won’t be returned in model.parameters(), so that the optimizer won’t have a change to update them.

  def forward(self, x):
    # x.shape : N x T x D
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [97]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, max_len, d_k, d_model, n_heads, n_layers, dropout_prob):
    super().__init__()

    self.embedding=nn.Embedding(vocab_size, d_model)

    self.pos_encoding=PositionalEncoding(d_model, max_len, dropout_prob)

    transformer_blocks=[TransformerBlock(d_k, d_model, n_heads, max_len, dropout_prob) for _ in range(n_layers) ]

    self.transformer_blocks = nn.Sequential(*transformer_blocks)

    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)

  def forward(self, x, pad_mask=None):
    x=self.embedding(x)
    x=self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, pad_mask)
    x = self.ln(x)
    x = self.fc(x)  # many-to-many
    return x

In [98]:
device = "cuda" if torch.cuda.is_available() else "cpu"

train_loader =  DataLoader(tokenized_datasets["train"], shuffle = True, batch_size = 16, collate_fn = data_collator)
valid_loader = DataLoader(tokenized_datasets["validation"], batch_size = 16, collate_fn = data_collator)

In [99]:
def train(model, criterion, optimizer, train_loader, epochs, early_stop_threshold=10**10):
  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0=datetime.now()
    train_loss = []
    for batch in tqdm(train_loader):
      # move data to GPU
      batch = {k:v.to(device) for k, v, in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # shift the targets backwards
      targets = batch['input_ids'].clone().detach()
      targets = torch.roll(targets,shifts = -1, dims = 1) #Shifting the sequence for creating target
      targets[:,-1] = tokenizer.pad_token_id

      # forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = criterion(outputs.transpose(2,1),targets)

      # Backward and optimize
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())

    # get  train Loss and test loss
    train_loss = np.mean(train_loss)

    # save lossess
    train_losses[it] = train_loss

    dt = datetime.now() - t0
    print(f'Epoch: {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')

    if train_loss > early_stop_threshold:
      print("Too much loss due to bad hyperparameters, aborting train")
      break

  return train_losses

In [100]:
# Dummy Test for sanity check

# Instantiate
model = Decoder(
    vocab_size = tokenizer.vocab_size,
    max_len=tokenizer.model_max_length,
    d_k=16,
    d_model = 64,
    n_heads = 4,
    n_layers =2,
    dropout_prob =0.1
    )
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters())

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# train the model for one epoch
train_losses = train(model, criterion, optimizer, train_loader, epochs = 1)

In [None]:
def model_completion_test(joke, model):
    # Split into premise and punchline
    premise = joke[:joke.index(" | ") + 4]

    # Encode the premise
    tokenized_premise = tokenizer(premise, return_tensors='pt')
    input_ids = tokenized_premise["input_ids"][:, :-1].to(device)
    mask = tokenized_premise["attention_mask"][:, :-1].to(device)

    # Complete with upto 50 words
    for _ in range(50):
        try:
            outputs = model(input_ids, mask)
            prediction_id = torch.argmax(outputs[:, -1, :], axis=-1)    # Generated word
            input_ids = torch.hstack((input_ids, prediction_id.view(1,1)))  # add this generated word word to the input
            mask = torch.ones_like(input_ids)
        except:
            break
        if prediction_id == tokenizer.sep_token_id:
            break

    print('Joke:', joke)
    print('premise:', premise)
    print('model completion:', tokenizer.decode(input_ids[0]))

In [None]:
# Testing the completion
model.eval()

joke = dataset["validation"][120]["body"]

try:
    model_completion_test(joke, model)
except:
    pass

In [None]:
# Hyperparameter Tuning

# Define the hyperparameter combinations
hyperparams = [
    (16, 64, 4, 2, 0.1),
    (32, 256, 8, 3, 0.1),
    (32, 512, 8, 2, 0.1),
    (64, 768, 10, 5, 0.15),
]

best_valid_loss = 1e12  # Initialize with a very large value
best_hyperparams = None

# Loop through the hyperparameter combinations
for d_k, d_model, n_heads, n_layers, dropout_prob in hyperparams:
    print(f"Training model with hyperparameters: d_k={d_k}, d_model={d_model}, n_heads={n_heads}, n_layers={n_layers}, dropout_prob={dropout_prob}")

    # Instantiate the model with the current hyperparameters
    model = Decoder(
        vocab_size=tokenizer.vocab_size,
        max_len=tokenizer.model_max_length,
        d_k=d_k,
        d_model=d_model,
        n_heads=n_heads,
        n_layers=n_layers,
        dropout_prob=dropout_prob,
    )
    model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), 0.001)

    # Train the model for 5 epochs
    train_losses = train(model, criterion, optimizer, train_loader, epochs=5, early_stop_threshold=6)   #Putting stop threshold at 6 as the basic model started of with loss roughly 4

    # Validation
    model.eval()
    valid_losses = []
    with torch.no_grad():
        for batch in valid_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            targets = batch['input_ids'].clone().detach()
            targets = torch.roll(targets, shifts=-1, dims=1)
            targets[:, -1] = tokenizer.pad_token_id
            outputs = model(batch['input_ids'], batch['attention_mask'])
            loss = criterion(outputs.transpose(2, 1), targets)
            valid_losses.append(loss.item())

    valid_loss = np.mean(valid_losses)

    print(f"Validation Loss: {valid_loss:.4f}")

    # Testing on random stuff
    print('\n Testing this model on 4 random jokes')
    for _ in range(4):
        joke = dataset["validation"][random.randint(0, 1000)]["body"]
        try:
            model_completion_test(joke, model)
        except:
            pass

    # Check if this model is the best so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_hyperparams = (d_k, d_model, n_heads, n_layers, dropout_prob)

# Print the best hyperparameters and validation loss
print("\nBest Hyperparameters:")
print(f"d_k: {best_hyperparams[0]}, d_model: {best_hyperparams[1]}, n_heads: {best_hyperparams[2]}, n_layers: {best_hyperparams[3]}, dropout_prob: {best_hyperparams[4]}")
print(f"Best Validation Loss: {best_valid_loss:.4f}")


In [None]:
torch.cuda.empty_cache()

In [None]:
# Get the best model from the best hyperparams
model = Decoder(
    vocab_size=tokenizer.vocab_size,
    max_len=tokenizer.model_max_length,
    d_k=best_hyperparams[0],
    d_model=best_hyperparams[1],
    n_heads=best_hyperparams[2],
    n_layers=best_hyperparams[3],
    dropout_prob=best_hyperparams[4],
)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model for 12 epochs
train_losses = train(model, criterion, optimizer, train_loader, epochs=10)

In [None]:
# Save the model
torch.save(model.state_dict(), "model.pth")

# Save the tokenizer
tokenizer.save_pretrained("tokenizer")

### Part 3

Choose any 5 samples from the test dataset and generate the model outputs. (2 marks)

In [None]:
# Choose 5 samples from the test dataset
test_samples = dataset["test"]["body"][:5]

# Generate the model outputs
model.eval()

for joke in test_samples:
    model_completion_test(joke, model)

## Question 2

Finetune a GPT-2 model using the dataset and compare the responses of your model and the finetuned GPT-2 model. (18 marks)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch


# Custom Dataset for jokes
class JokesDataset(Dataset):
    def __init__(self, dataset, tokenizer, block_size=128):
        self.instruction_set = []
        for joke in dataset["body"]:
            if len(joke) < 65:  # 64, more than the previous cutoff of 48
                tokenized_joke = tokenizer(
                    joke,
                    truncation=True,
                    max_length=block_size,
                    padding="max_length",
                    return_tensors="pt"
                )

                self.instruction_set.append({
                    "input_ids": tokenized_joke["input_ids"].squeeze(0),
                    "attention_mask": tokenized_joke["attention_mask"].squeeze(0)
                })

    def __len__(self):
        return len(self.instruction_set)

    def __getitem__(self, idx):
        return self.instruction_set[idx]

In [None]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Handle padding for GPT-2
model = GPT2LMHeadModel.from_pretrained(model_name)


block_size = 128
dataset = JokesDataset(dataset["train"], tokenizer, block_size)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
training_args = TrainingArguments(
    output_dir="./gpt2-jokes-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,     # Smaller batch size as larger model
    save_steps=10000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10000,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    prediction_loss_only=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./gpt2-jokes-finetuned")
tokenizer.save_pretrained("./gpt2-jokes-finetuned")

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def gpt_completion(prompt, max_length=100, temperature=1.0, top_k=50, top_p=0.95):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    output_ids = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Comparson with decoder made from scratch

for joke in test_samples:
    print("joke:", joke)
    premise = premise = joke[:joke.index(" | ") + 4]
    print("premise:", premise)
    print("GPT2 Completion:", gpt_completion(premise))