In [None]:
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader,random_split
import torch
import torch.nn as nn
from collections import Counter
from tqdm import tqdm
import re
import pathlib
import numpy as np
import torch
from datasets import load_dataset

# --- notebook configuration -------------------------------------------------
# NOTE(review): VOCAB_SIZE, VALIDATION_SPLIT and LOAD_MODEL are not referenced by
# the cells visible in this notebook (the model is sized with len(wr_word_to_id)
# and the split uses fixed 70/15/15 fractions) - confirm before relying on them.
VOCAB_SIZE = 10000
MAX_LEN = 80            # sequence length given to the position embedding and the generation limit
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 5
# the identifier keeps its original (misspelled) name so existing references keep working
UNKOWN_WORD = "<unk>"
PAD_TOKEN = "<pad>"     # also used as the stop token during text generation

# Seed every random number generator this notebook draws from so that
# "Restart kernel -> Run all" reproduces the same split, shuffling and samples.
torch.manual_seed(SEED)
np.random.seed(SEED)


# Let's load the wine reviews dataset and tokenize it

In [None]:
from gpt_torch import load_wine_dataset_into_hg_datasets,get_wine_ds_with_country_variety
from gpt_torch import get_tokenized_wine_reviews,flatten_tokenized_wine_reviews,get_wine_review_word_to_id,get_wine_review_id_to_word
from gpt_torch import tokenize_and_convert_to_ids,batch_tokenize,get_input_ids_as_tensors,get_x_and_y_from_input_ids_tensor,softmax_over_gpt_scores


In [None]:
wine_ds = load_wine_dataset_into_hg_datasets()

In [None]:
wine_ds = get_wine_ds_with_country_variety(wine_ds)

In [None]:
def text_to_lower(example):
    """Build the update for one dataset example: its "text" field in lower case.

    Only the "text" key is returned; any other keys of ``example`` are not copied.
    """
    lowered = example["text"].lower()
    return dict(text=lowered)

In [None]:
# converting every wine review to lowercase text
wine_ds = wine_ds.map(text_to_lower,batch_size=None)

In [None]:
# let's explore one sample from wine review
wine_ds["train"]["text"][100]

In [None]:
# this will produce list of lists of tokens. every wine review is broken down into list of tokens
wine_reviews = wine_ds["train"]["text"]
wr_tokenized = get_tokenized_wine_reviews(wine_reviews)

In [None]:
# essentially flattening list of lists
wr_tokens = flatten_tokenized_wine_reviews(wr_tokenized)

In [None]:
# building word to id dictionary
wr_word_to_id = get_wine_review_word_to_id(wr_tokens)

In [None]:
# and id to word
wr_id_to_word = get_wine_review_id_to_word(wr_word_to_id)

In [None]:
# tokenizing every wine review in the dataset. batch_tokenize will act on every individual wine review. it returns integer representations of tokens as input_ids
# it will pad input_ids of every sample to maximum length
wine_ds = wine_ds.map(lambda x : batch_tokenize(x,wr_word_to_id),batch_size=None)

In [None]:
input_ids = wine_ds["train"]["input_ids"]
print(f"input_ids length : {len(input_ids)}")

In [None]:
# convert every input_id to tensor form
input_ids_tensors = get_input_ids_as_tensors(input_ids)

In [None]:
# get x and y tensors that we will use when training our GPT model
# this simply treats the first MAX_LEN elements of input_id as x and shifted values of input_id by one as y
x,y = get_x_and_y_from_input_ids_tensor(input_ids_tensors)

# Let's build a torch Dataset and DataLoaders for the wine reviews

In [None]:
# building pytorch dataset from x and y
# remember that pytorch Dataset must implement __len__ and __getitem__
from gpt_torch import WineReviewDataset

wr_torch_dataset = WineReviewDataset(x,y)


In [None]:
# splitting dataset into train, validation and test datasets (70% / 15% / remainder)
total_size = len(wr_torch_dataset)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
# the test set takes whatever is left so the three sizes always add up to total_size
test_size = total_size - train_size - val_size
# a dedicated generator seeded with SEED keeps the split identical across kernel
# restarts; otherwise samples move between train/val/test on every run
split_generator = torch.Generator().manual_seed(SEED)
train_ds, val_ds, test_ds = random_split(wr_torch_dataset,[train_size,val_size,test_size],generator=split_generator)

In [None]:
# initializing train, validation and test data loaders which can be used to access data in batches
# only the training loader shuffles: evaluation does not benefit from shuffling, and a fixed
# order keeps the batch composition (and therefore the averaged loss) identical between runs
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds,batch_size=BATCH_SIZE,shuffle=False)
test_loader = DataLoader(test_ds,batch_size=BATCH_SIZE,shuffle=False)

In [None]:
# let's just confirm that we can get batches of x and y and they have expected shapes
for idx, (batch_x, batch_y) in enumerate(train_loader):
    print(f"batch_x shape : {batch_x.size()}, batch_y shape {batch_y.size()}")
    if idx>5:
        break

# Let's initialize our own precious GPT model

In [None]:
from gpt_torch import TokenPositionEmbedding,TransformerBlock,GPT

In [None]:
vocab_size = len(wr_word_to_id)

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
dropout_rate = 0.1

token_pos_embedding = TokenPositionEmbedding(vocab_size,MAX_LEN,EMBEDDING_DIM,device=device)
transformer_block = TransformerBlock(N_HEADS,KEY_DIM,EMBEDDING_DIM,FEED_FORWARD_DIM,dropout_rate,device=device)

token_pos_embedding = token_pos_embedding.to(device)
transformer_block = transformer_block.to(device)

In [None]:
gpt_model = GPT(token_pos_embedding,transformer_block,EMBEDDING_DIM,vocab_size)
gpt_model = gpt_model.to(device)

In [None]:
# smoke test: batch_x / batch_y are the last batch left over from the shape-check
# loop over train_loader above, so that cell must have been run first
batch_x = batch_x.to(device)
batch_y = batch_y.to(device)
# check that the embedding layer accepts a batch of token ids on the chosen device
embeddings = token_pos_embedding(batch_x)

In [None]:

scores,attn_weights = gpt_model(batch_x)
out=softmax_over_gpt_scores(scores)

In [None]:
out[0,0,:].sum()

In [None]:
class TextGenerator:
    """Generate text token by token by sampling from a GPT model's output distribution.

    Args:
        word_to_id: mapping from token string to integer id; must contain PAD_TOKEN,
            which doubles as the stop token during generation.
        id_to_word: mapping from integer id back to the token string.
        gpt_model: torch module called as ``gpt_model(x)`` and returning
            ``(scores, attn_weights)``.
        device: device the input ids are created on before each forward pass.
    """
    def __init__(self,word_to_id, id_to_word,gpt_model,device=torch.device("cpu")) -> None:
        self.word_to_id = word_to_id
        self.id_to_word = id_to_word
        self.gpt_model = gpt_model
        self.device = device

    def sample_from(self,probs,temperature):
        """Draw one index from ``probs`` after temperature scaling.

        The scaled distribution is proportional to ``probs ** (1 / temperature)``.
        It is computed in log space with float64 so that low temperatures do not
        underflow to an all-zero (NaN after normalisation) vector.

        Args:
            probs: 1-D array of probabilities.
            temperature: positive scaling factor; values below 1 sharpen the
                distribution, values above 1 flatten it.

        Returns:
            Tuple ``(index, scaled_probs)``: the sampled index and the normalised
            distribution it was drawn from.

        Raises:
            ValueError: if ``temperature`` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")
        probs = np.asarray(probs, dtype=np.float64)
        # log(0) is -inf, which correctly maps back to probability 0 after exp()
        with np.errstate(divide="ignore"):
            log_probs = np.log(probs) / temperature
        # subtracting the maximum keeps the largest term at exp(0) == 1
        log_probs = log_probs - np.max(log_probs)
        scaled = np.exp(log_probs)
        scaled = scaled / np.sum(scaled)
        return np.random.choice(len(scaled),p=scaled), scaled

    def generate(self,start_prompt,max_tokens,temperature):
        """Extend ``start_prompt`` one sampled token at a time and print the result.

        Sampling stops once the token list reaches ``max_tokens`` entries or the
        PAD_TOKEN id has been sampled (the pad word is still appended to the text).
        The model is switched to eval mode for the duration of the call and put
        back into training mode afterwards if that is how it was found.

        Args:
            start_prompt: text the generation starts from.
            max_tokens: upper bound on the total number of tokens (prompt included).
            temperature: sampling temperature forwarded to ``sample_from``.

        Returns:
            A list with one dict per generated token holding the prompt before the
            token was added ("prompt"), the scaled distribution the token was drawn
            from ("word_probs") and the slice ``attn_weights[0, -1, :]`` of the
            model's second output ("attns").
        """
        start_tokens = tokenize_and_convert_to_ids(start_prompt,self.word_to_id)
        pad_id = self.word_to_id[PAD_TOKEN]
        sample_token = None
        info = []

        # dropout must be disabled while sampling; remember the mode so that a
        # model that is in the middle of training is handed back unchanged
        was_training = self.gpt_model.training
        self.gpt_model.eval()
        try:
            with torch.no_grad():
                while len(start_tokens) < max_tokens and sample_token != pad_id:
                    x_tensor = torch.tensor(np.array([start_tokens]),dtype=torch.long,device=self.device)
                    scores, attn_weights = self.gpt_model(x_tensor)
                    y_np = softmax_over_gpt_scores(scores).cpu().numpy()
                    # only the distribution at the last position predicts the next token
                    sample_token, probs = self.sample_from(y_np[0][-1],temperature)
                    sample_token = int(sample_token)
                    info.append({"prompt":start_prompt,"word_probs":probs,"attns":attn_weights[0,-1,:]})
                    start_tokens.append(sample_token)
                    start_prompt = f"{start_prompt} {self.id_to_word[sample_token]}"
        finally:
            if was_training:
                self.gpt_model.train()
        print(f"generated text : {start_prompt}")
        return info

In [None]:
text_generator = TextGenerator(wr_word_to_id,wr_id_to_word,gpt_model,device=device)

In [None]:
generated_info = text_generator.generate("wine review : ",MAX_LEN,0.1)

# Let's train our precious model

In [None]:
def compute_loss(model,loss_fn,data_loader,device=torch.device("cpu")):
    """Return the mean per-batch loss of ``model`` over ``data_loader`` without tracking gradients.

    The function does not change the model's train/eval mode; callers that want
    dropout disabled must call ``model.eval()`` beforehand.

    Args:
        model: callable returning ``(scores, attn_weights)`` for a batch ``x``.
        loss_fn: loss taking flattened scores ``(N*L, C)`` and targets ``(N*L,)``.
        data_loader: iterable of ``(x, y)`` tensor batches.
        device: device each batch is moved to before the forward pass.

    Returns:
        The sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    total_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        for x,y in data_loader:
            x = x.to(device)
            y = y.to(device)

            gpt_output, _ = model(x)
            # flatten batch and sequence dimensions; the class count is read from the
            # scores themselves so the function does not depend on a notebook global
            n_classes = gpt_output.size(-1)
            loss = loss_fn(gpt_output.reshape(-1, n_classes),y.reshape(-1))
            total_loss += loss.item()
            n_batches += 1
    if n_batches == 0:
        raise ValueError("data_loader yielded no batches; cannot compute an average loss")
    return total_loss / n_batches

In [None]:
def training_loop(n_epochs, optimizer, model:GPT,text_generator:TextGenerator,loss_fn, train_loader,val_loader,chkpoints_folder:pathlib.Path,max_gen_tokens=MAX_LEN,gen_temperature=1.0,device=torch.device("cpu")):
    """Train ``model`` for ``n_epochs`` epochs, validating and sampling text after each one.

    After every epoch the validation loss is computed with ``compute_loss``; when it
    is strictly lower than the best value seen so far the model's ``state_dict`` is
    saved to ``chkpoints_folder / "gpt_pytorch_{epoch}.pt"``. A sample text is then
    generated from the prompt "wine review : ".

    Args:
        n_epochs: number of passes over ``train_loader``.
        optimizer: optimizer updating ``model``'s parameters.
        model: model returning ``(scores, attn_weights)`` for a batch.
        text_generator: generator used to sample text after each epoch.
        loss_fn: loss taking flattened scores and flattened targets.
        train_loader: loader yielding ``(x, y)`` training batches.
        val_loader: loader yielding ``(x, y)`` validation batches.
        chkpoints_folder: directory for checkpoint files; created if missing.
        max_gen_tokens: token limit passed to ``text_generator.generate``.
        gen_temperature: sampling temperature passed to ``text_generator.generate``.
        device: device the batches are moved to.

    Returns:
        Tuple ``(train_losses, val_losses, generated_infos)``: one training loss per
        batch, one validation loss per epoch and one ``generate`` result per epoch.
    """
    best_val_loss = float("inf")
    train_losses = []
    val_losses = []
    generated_infos = []

    # torch.save does not create missing directories
    chkpoints_folder = pathlib.Path(chkpoints_folder)
    chkpoints_folder.mkdir(parents=True,exist_ok=True)

    # range(n_epochs), not n_epochs+1: the caller asks for exactly n_epochs passes
    for epoch in tqdm(range(n_epochs),position=0,desc="Epoch loop"):
        loss_train = 0.0
        model.train()
        for x,y in tqdm(train_loader, position=0, desc="Train Loop"):
            x = x.to(device)
            y = y.to(device)

            gpt_output, _ = model(x)

            # flatten (N, L, C) scores to (N*L, C); C is read from the tensor itself
            # instead of the notebook-level vocab_size global
            loss = loss_fn(gpt_output.reshape(-1,gpt_output.size(-1)), y.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # read the scalar once: every .item() call forces a device synchronisation
            batch_loss = loss.item()
            train_losses.append(batch_loss)
            loss_train += batch_loss
        loss_train_avg = loss_train/len(train_loader)
        print(f"Epoch {epoch} , average loss : {loss_train_avg}")

        model.eval()
        val_loss = compute_loss(model,loss_fn,val_loader,device)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            pt_weights_file = chkpoints_folder/f"gpt_pytorch_{epoch}.pt"
            torch.save(model.state_dict(),str(pt_weights_file))
        val_losses.append(val_loss)
        # honour the caller's token limit instead of always using the MAX_LEN global
        generated_infos.append(text_generator.generate("wine review : ",max_gen_tokens,gen_temperature))
    return train_losses,val_losses,generated_infos

In [None]:
import torch.optim as optim
learning_rate=0.001

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(gpt_model.parameters(),lr=learning_rate)

# checkpoints go to a "checkpoint" folder relative to the notebook's working directory
# instead of a machine-specific absolute path, so the notebook runs on any machine
chkpoints_folder = pathlib.Path("checkpoint")
chkpoints_folder.mkdir(parents=True,exist_ok=True)

train_losses,val_losses,generated_infos = training_loop(EPOCHS,optimizer,gpt_model,text_generator,loss_fn,train_loader,val_loader,chkpoints_folder,MAX_LEN,gen_temperature=0.9,device=device)

In [None]:
import matplotlib.pyplot as plt

# train_losses holds one value per training batch, val_losses one value per epoch,
# so the two curves get separate figures with their own x-axis labels
fig_train, ax_train = plt.subplots()
ax_train.plot(train_losses)
ax_train.set(title="Train losses", xlabel="Training batch", ylabel="Loss")
plt.show()

fig_val, ax_val = plt.subplots()
ax_val.plot(val_losses)
ax_val.set(title="Validation losses", xlabel="Epoch", ylabel="Loss")
plt.show()

In [None]:
info=text_generator.generate("sweet wine with lemon aftertaste",MAX_LEN,1.0)

In [None]:
info[-1]