In [93]:
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader,random_split
import torch
import torch.nn as nn
from collections import Counter

from tqdm import tqdm
import re
import pathlib
import numpy as np
from datasets import load_dataset

# Hyperparameters and configuration shared by the cells below.
VOCAB_SIZE = 10000  # NOTE(review): the model cells size the vocabulary with len(wr_word_to_id) instead of this value
MAX_LEN = 80  # sequence length handed to TokenPositionEmbedding and to TextGenerator.generate
EMBEDDING_DIM = 256  # embedding width passed to the embedding layer, the transformer block and GPT
KEY_DIM = 256  # passed to TransformerBlock
N_HEADS = 2  # passed to TransformerBlock
FEED_FORWARD_DIM = 256  # passed to TransformerBlock
VALIDATION_SPLIT = 0.2  # NOTE(review): the split cell uses literal 0.7 / 0.15 fractions rather than this constant
SEED = 42  # random seed intended for reproducible runs
LOAD_MODEL = False  # NOTE(review): the checkpoint-loading cell does not consult this flag — confirm intent
BATCH_SIZE = 32  # batch size of all three DataLoaders
EPOCHS = 5  # presumably the number of training epochs — no training loop is visible in this part of the notebook
UNKOWN_WORD = "<unk>"  # NOTE(review): identifier is misspelled ("UNKOWN"); left unchanged because other code may reference it
PAD_TOKEN = "<pad>"  # presumably the padding token used by the tokenizer helpers — confirm in gpt_torch


# Let's load wine reviews dataset and tokenize it

In [94]:
from gpt_torch import load_wine_dataset_into_hg_datasets,get_wine_ds_with_country_variety
from gpt_torch import get_tokenized_wine_reviews,flatten_tokenized_wine_reviews,get_wine_review_word_to_id,get_wine_review_id_to_word
from gpt_torch import tokenize_and_convert_to_ids,batch_tokenize,get_input_ids_as_tensors,get_x_and_y_from_input_ids_tensor,softmax_over_gpt_scores
from gpt_torch import TextGenerator


In [95]:
# Load the wine reviews. Later cells index the result with a "train" split.
# NOTE(review): the loader lives in gpt_torch; its data source and schema are not visible here.
wine_ds = load_wine_dataset_into_hg_datasets()

In [96]:
# Presumably rewrites each review's "text" to start with "country : province : variety : ",
# as the sample printed a few cells below suggests — confirm in gpt_torch.
wine_ds = get_wine_ds_with_country_variety(wine_ds)

In [97]:
def text_to_lower(example):
    """Return a single-key mapping with the lower-cased ``text`` of ``example``.

    Args:
        example: mapping that exposes a string under the ``"text"`` key.

    Returns:
        ``{"text": <lower-cased text>}``; no other keys of ``example`` are copied.
    """
    lowered_text = example["text"].lower()
    return dict(text=lowered_text)

In [98]:
# Lower-case the "text" field of every example.
# `batched` is left at its default, so text_to_lower is called with one example at a time.
# Per the Hugging Face `map` documentation, batch_size only takes effect together with
# batched=True, so batch_size=None changes nothing here.
wine_ds = wine_ds.map(text_to_lower,batch_size=None)

In [99]:
# Show one lower-cased review as a sanity check.
# Index the row first and then pick the field, so only this one example is read
# instead of materialising the whole "text" column just to look at one element.
wine_ds["train"][100]["text"]

"us : new york : pinot gris : fresh apple, lemon and pear flavors are accented by a hint of smoked nuts in this bold, full-bodied pinot gris. rich and a bit creamy in mouthfeel yet balanced briskly, it's a satisfying white with wide pairing appeal. drink now through 2019."

In [100]:
# wine_reviews holds the full "text" column of the train split (one string per review).
# wr_tokenized is a list of lists: every element is the list of tokens of one review.
wine_reviews = wine_ds["train"]["text"]
wr_tokenized = get_tokenized_wine_reviews(wine_reviews)

In [102]:
# Flatten the list of lists: wr_tokens is a single list containing every individual
# token of the whole dataset.
wr_tokens = flatten_tokenized_wine_reviews(wr_tokenized)

In [104]:
# Build the vocabulary: a mapping from each unique word to an integer id.
# NOTE(review): whether the helper caps the vocabulary size or reserves ids for
# "<unk>" / "<pad>" is not visible here — confirm in gpt_torch.
wr_word_to_id = get_wine_review_word_to_id(wr_tokens)

In [105]:
# Construct the reverse lookup (integer id -> word); it is used below to decode input_ids.
wr_id_to_word = get_wine_review_id_to_word(wr_word_to_id)

In [107]:
# Tokenize every sample of wine_ds; each sample is a review of one specific wine.
# `map` is called without batched=True, so batch_tokenize receives one review at a time.
# (Per the Hugging Face docs, batch_size only matters when batched=True, so batch_size=None is a no-op here.)
# batch_tokenize tokenizes the review and returns input_ids, padding them to a maximum length,
# so the dataset used to train the GPT model has consistent shapes.
wine_ds = wine_ds.map(lambda x : batch_tokenize(x,wr_word_to_id),batch_size=None)

Map:   0%|          | 0/129971 [00:00<?, ? examples/s]

In [108]:
# Pull the tokenized reviews out of the train split and report how many there are.
input_ids = wine_ds["train"]["input_ids"]
print(f"input_ids length : {len(input_ids)}")

input_ids length : 129971


In [110]:
# Decode sample 1000 back into words to eyeball the tokenization result.
print(" ".join(wr_id_to_word[token_id] for token_id in input_ids[1000]))


us : oregon : cabernet sauvignon : <unk> ' s cab is stylistically apart from either california or washington . it defines its own space . there ' s plenty of new oak , but the fruit , acid and tannins stand up to it . this is sharp and tangy ; cranberry and raspberry , strawberry and citric acids all playing their part . still young , give it some time in a decanter or in your cellar to come


In [111]:
# Convert the per-review id lists into one tensor (its size is inspected in the next cell).
input_ids_tensors = get_input_ids_as_tensors(input_ids)

In [112]:
# One row per review; 81 columns in the recorded run, presumably MAX_LEN + 1 so that
# inputs and targets can each be MAX_LEN long — confirm in gpt_torch.
input_ids_tensors.size()

torch.Size([129971, 81])

In [113]:
# Get the x (input) and y (target) tensors used when training the GPT model.
# Both come out 80 tokens wide (see the batch shapes printed further down);
# presumably y is x shifted by one token — confirm in gpt_torch.
x,y = get_x_and_y_from_input_ids_tensor(input_ids_tensors)

# Build train, validation, and test loaders

In [114]:
from gpt_torch import WineReviewDataset

# Wrap x and y in a torch Dataset so they can be split and served through DataLoaders.
wr_torch_dataset = WineReviewDataset(x,y)


In [115]:
# Split sizes: 70% train, 15% validation, and whatever remains (about 15%) for test.
total_size = len(wr_torch_dataset)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
# The test size is the remainder, so the three sizes always add up to total_size
# despite the int() truncation above.
test_size = total_size - train_size - val_size
# Seed the split with SEED so the same samples land in each subset on every run;
# without a generator the split changes on each execution, and test samples from one
# run can end up in the training subset of the next.
train_ds, val_ds, test_ds = random_split(
    wr_torch_dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

In [116]:
# Only the training loader shuffles. Validation and test batches are served in a
# fixed order so evaluation runs are repeatable and directly comparable.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds,batch_size=BATCH_SIZE,shuffle=False)
test_loader = DataLoader(test_ds,batch_size=BATCH_SIZE,shuffle=False)

In [117]:
# Print the shapes of the first seven batches (indices 0 through 6) as a sanity check.
# batch_x / batch_y keep the last batch seen and are reused by later cells.
for idx, (batch_x, batch_y) in enumerate(train_loader):
    print(f"batch_x shape : {batch_x.size()}, batch_y shape {batch_y.size()}")
    if idx >= 6:
        break

batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])


# Let's initialize our precious model

In [None]:
from gpt_torch import TokenPositionEmbedding,TransformerBlock,GPT

In [None]:
# Size the model's vocabulary from the word-to-id mapping actually built above
# (this is what the model uses, not the VOCAB_SIZE constant).
vocab_size = len(wr_word_to_id)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
dropout_rate = 0.1

# Build the token+position embedding and one transformer block, placing each on `device`
# right after construction.
token_pos_embedding = TokenPositionEmbedding(
    vocab_size, MAX_LEN, EMBEDDING_DIM, device=device
).to(device)
transformer_block = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM, dropout_rate, device=device
).to(device)

In [None]:
# Assemble the GPT model from the embedding layer and transformer block, then move it to `device`.
gpt_model = GPT(
    token_pos_embedding,
    transformer_block,
    EMBEDDING_DIM,
    vocab_size,
).to(device)

In [None]:
# NOTE: batch_x / batch_y are the last batch left over from the shape-printing loop above,
# so that loop must have been run before this cell.
batch_x = batch_x.to(device)
batch_y = batch_y.to(device)
# Run the embedding layer on its own for the batch.
embeddings = token_pos_embedding(batch_x)

In [None]:

# Forward pass of the (still untrained) model: it returns scores together with attention weights.
scores,attn_weights = gpt_model(batch_x)
# Turn the scores into probabilities; the next cell checks that they sum to 1.
out=softmax_over_gpt_scores(scores)

In [None]:
# This demonstrates that the output is a probability distribution over the vocabulary:
# the probabilities for one position (first sample, first position) must sum to 1.
out[0,0,:].sum()

# Generate some text

In [None]:
# The generator is given both vocabulary mappings, the model, and the device the model lives on.
text_generator = TextGenerator(wr_word_to_id,wr_id_to_word,gpt_model,device)

In [None]:
# Observe that the untrained model generates a bunch of gibberish.
# Arguments: prompt, MAX_LEN, and 0.1 (presumably the sampling temperature — confirm in gpt_torch).
generated_info = text_generator.generate("wine review : ",MAX_LEN,0.1)

In [None]:
# NOTE(review): absolute, machine-specific checkpoint location — adjust it (or make it
# configurable) before running this notebook on another machine.
weights_folder=pathlib.Path(r"C:\Users\amrul\programming\deep_learning\dl_projects\Generative_Deep_Learning_2nd_Edition\notebooks\09_transformer\gpt\checkpoint")
weights_file="gpt_pytorch_5.pt"

# torch.load unpickles the file. weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state_dict needs, so a tampered checkpoint cannot
# execute arbitrary code. map_location places the tensors on the device the model uses.
gpt_model.load_state_dict(torch.load(str(weights_folder/weights_file),map_location=device,weights_only=True))

In [None]:
# Observe how the trained model (weights loaded above) generates coherent text.
# Arguments: prompt, MAX_LEN, and 0.5 (presumably the sampling temperature — confirm in gpt_torch).
generated_info = text_generator.generate("italy : ",MAX_LEN, 0.5)

In [None]:
# Show a real review from the dataset for comparison with the generated text.
wine_reviews[200]