# Let's load the wine reviews dataset with `load_dataset` from the Hugging Face `datasets` library

In [1]:
import re
import pathlib
import numpy as np
import torch
from datasets import load_dataset

In [2]:
# Hyperparameters and special tokens shared by the cells below.
VOCAB_SIZE = 10000  # number of most frequent tokens kept when the vocabulary is built
MAX_LEN = 80  # token-id sequences are truncated / right-padded to MAX_LEN + 1 entries
EMBEDDING_DIM = 256  # not referenced in the visible part of the notebook
KEY_DIM = 256  # not referenced in the visible part of the notebook
N_HEADS = 2  # not referenced in the visible part of the notebook
FEED_FORWARD_DIM = 256  # not referenced in the visible part of the notebook
VALIDATION_SPLIT = 0.2  # not referenced in the visible part of the notebook
SEED = 42  # not referenced in the visible part of the notebook
LOAD_MODEL = False  # not referenced in the visible part of the notebook
BATCH_SIZE = 32  # batch size handed to the DataLoader
EPOCHS = 5  # not referenced in the visible part of the notebook
UNKOWN_WORD = "<unk>"  # stand-in for out-of-vocabulary tokens (identifier spelling kept: other cells use it)
PAD_TOKEN = "<pad>"  # token appended to sequences that are shorter than MAX_LEN + 1

In [3]:
# Load the full dataset
# NOTE(review): the folder below is an absolute, machine-specific path; the notebook
# cannot run on another machine without editing it.
datasets_folder = pathlib.Path(r"C:\Users\amrul\programming\deep_learning\dl_projects\Generative_Deep_Learning_2nd_Edition\data")
wine_review_filepath=datasets_folder/"wine_reviews"/"winemag-data-130k-v2.json"
# Only the parent folder of the JSON file is handed to load_dataset.
# NOTE(review): 'json' is passed as the second positional argument — confirm it selects what is intended.
data = load_dataset(str(wine_review_filepath.parent),'json')

In [4]:
def prepare_text_with_country_variety(batch):
    """Build one prompt-like string per review from four columns of a batch.

    Args:
        batch: mapping with the keys 'country', 'province', 'variety' and
            'description', each holding a sequence of equal length.

    Returns:
        A dict with the single key "text" whose value is a list of strings of
        the form "<country> : <province> : <variety> : <description>".
    """
    columns = (batch['country'], batch['province'], batch['variety'], batch['description'])
    texts = []
    # Walk the four columns in lock-step, one review at a time.
    for country, province, variety, description in zip(*columns):
        texts.append(f"{country} : {province} : {variety} : {description}")
    return {"text": texts}

In [5]:
# Add the combined "text" column to the dataset.
# With batched=True and batch_size=None the mapping function presumably receives the
# whole split as a single batch — see the `datasets` map documentation.
wine_ds = data.map(prepare_text_with_country_variety,batched=True, batch_size=None)

In [6]:
# Pull the combined review strings of the "train" split out of the dataset.
wine_reviews = wine_ds["train"]["text"]

# Let's build a custom tokenizer
This tokenizer splits text on whitespace and keeps every punctuation mark as a separate token.

In [7]:
def custom_tokenize(text):
    """Split ``text`` into word tokens and single punctuation tokens.

    A token is either a maximal run of word characters or one character that
    is neither a word character nor whitespace. Whitespace is dropped.

    Returns:
        The tokens as a list of strings, in order of appearance.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return [match.group(0) for match in token_pattern.finditer(text)]

In [8]:
# Try the tokenizer on the first review and show the resulting tokens.
tokens=custom_tokenize(wine_reviews[0])
print(tokens)

['Italy', ':', 'Sicily', '&', 'Sardinia', ':', 'White', 'Blend', ':', 'Aromas', 'include', 'tropical', 'fruit', ',', 'broom', ',', 'brimstone', 'and', 'dried', 'herb', '.', 'The', 'palate', 'isn', "'", 't', 'overly', 'expressive', ',', 'offering', 'unripened', 'apple', ',', 'citrus', 'and', 'dried', 'sage', 'alongside', 'brisk', 'acidity', '.']


In [9]:
# Build a vocabulary.
# First step: tokenize every review; the result is one token list per review.

tokenized_wine_reviews = [custom_tokenize(text) for text in wine_reviews]

In [10]:
# Flatten the per-review token lists into one long list of tokens.
wine_review_tokens = [token for tokenized_text in tokenized_wine_reviews for token in tokenized_text]

In [11]:
from collections import Counter

# Count how often each token occurs and keep the VOCAB_SIZE most frequent ones.
wine_review_token_counter = Counter(wine_review_tokens)
wine_review_vocab_words = wine_review_token_counter.most_common(VOCAB_SIZE)

# Ids follow the frequency ranking: the most common token gets id 0.
wr_word_to_id = { word:idx for idx, (word,_) in enumerate(wine_review_vocab_words)}

# The two special tokens take the next two free ids, in this order.
wr_word_to_id[UNKOWN_WORD] = len(wr_word_to_id)
wr_word_to_id[PAD_TOKEN] = len(wr_word_to_id)

In [12]:
# Reverse lookup (id -> token), used to turn ids back into readable text.
wr_id_to_word = {id:word for word, id in wr_word_to_id.items()}

In [13]:
def enhanced_tokenize(text, word_to_id):
    """Tokenize ``text`` and replace out-of-vocabulary tokens.

    Args:
        text: string to tokenize with ``custom_tokenize``.
        word_to_id: container supporting ``in``; tokens found in it are kept.

    Returns:
        List of tokens in which every token missing from ``word_to_id`` has
        been replaced by ``UNKOWN_WORD``.
    """
    normalized = []
    for raw_token in custom_tokenize(text):
        if raw_token in word_to_id:
            normalized.append(raw_token)
        else:
            normalized.append(UNKOWN_WORD)
    return normalized

In [14]:
def tokenize_and_convert_to_ids(text, word_to_id):
    """Tokenize ``text`` and map every token to its integer id.

    Args:
        text: string to tokenize.
        word_to_id: mapping from token to id. It is used both to decide which
            tokens are in the vocabulary and to look the ids up, so it must
            contain an entry for ``UNKOWN_WORD``.

    Returns:
        List of ids, one per token.

    Raises:
        KeyError: if ``word_to_id`` has no entry for ``UNKOWN_WORD`` and the
            text contains an out-of-vocabulary token.
    """
    tokens = enhanced_tokenize(text, word_to_id)
    # Look ids up in the mapping that was passed in, not in the notebook-level
    # global, so the function works with any vocabulary.
    return [word_to_id[token] for token in tokens]

In [15]:
# Sanity check: encode the first review and print every id next to its token.
input_ids = tokenize_and_convert_to_ids(wine_reviews[0],wr_word_to_id)
# The loop variable is not called `id`, so the builtin id() is not rebound
# in the notebook namespace.
for token_id in input_ids:
    print(f"{token_id} : {wr_id_to_word[token_id]}")

38 : Italy
1 : :
445 : Sicily
447 : &
469 : Sardinia
1 : :
165 : White
36 : Blend
1 : :
214 : Aromas
980 : include
249 : tropical
19 : fruit
0 : ,
2301 : broom
0 : ,
4091 : brimstone
3 : and
117 : dried
131 : herb
2 : .
17 : The
25 : palate
1019 : isn
14 : '
233 : t
1174 : overly
1069 : expressive
0 : ,
356 : offering
10000 : <unk>
71 : apple
0 : ,
81 : citrus
3 : and
117 : dried
482 : sage
162 : alongside
457 : brisk
29 : acidity
2 : .


In [16]:
def batch_tokenize(example):
    """Encode one dataset example as a fixed-length list of token ids.

    Despite the name, ``example`` is a single record: its "text" field is
    tokenized and converted to ids with the notebook-level vocabulary.

    Returns:
        A dict with the key "input_ids" holding exactly ``MAX_LEN + 1`` ids.
        Longer sequences are cut off at the end, shorter ones are right-padded
        with the id of ``PAD_TOKEN``.
    """
    target_len = MAX_LEN + 1
    ids = tokenize_and_convert_to_ids(example["text"], wr_word_to_id)

    # Too long: keep only the leading part.
    if len(ids) > target_len:
        return {"input_ids": ids[:target_len]}

    # Exact fit or too short: append as many pad ids as are missing (possibly none).
    n_missing = target_len - len(ids)
    padding = [wr_word_to_id[PAD_TOKEN]] * n_missing
    return {"input_ids": ids + padding}

In [17]:
# Add the fixed-length "input_ids" column; batch_tokenize handles one example per call.
# NOTE(review): batched=True is not passed, so batch_size=None presumably has no effect here — confirm.
wine_ds2 = wine_ds.map(batch_tokenize, batch_size=None)

In [18]:
# Fetch the encoded reviews of the "train" split and report how many there are.
# Note: this rebinds `input_ids`, which earlier held the ids of a single review.
input_ids=wine_ds2["train"]["input_ids"]
print(len(input_ids))

129971


In [19]:
# Length of every encoded review; the maximum should equal MAX_LEN + 1.
input_ids_lengthes = [len(input_id_list) for input_id_list in input_ids]
print(f"input ids with maximum length : {max(input_ids_lengthes)}")

input ids with maximum length : 81


In [20]:
# Let's confirm that all input_ids have the same length:
# a single unique value means every sequence was padded/truncated to the same size.
import pandas as pd
pd.Series(input_ids_lengthes).unique()

array([81], dtype=int64)

In [21]:
# Show the first encoded review id by id, including the trailing padding.
# The loop variable is not called `id`, so the builtin id() is not rebound
# in the notebook namespace.
for token_id in input_ids[0]:
    print(f"{token_id} : {wr_id_to_word[token_id]}")

38 : Italy
1 : :
445 : Sicily
447 : &
469 : Sardinia
1 : :
165 : White
36 : Blend
1 : :
214 : Aromas
980 : include
249 : tropical
19 : fruit
0 : ,
2301 : broom
0 : ,
4091 : brimstone
3 : and
117 : dried
131 : herb
2 : .
17 : The
25 : palate
1019 : isn
14 : '
233 : t
1174 : overly
1069 : expressive
0 : ,
356 : offering
10000 : <unk>
71 : apple
0 : ,
81 : citrus
3 : and
117 : dried
482 : sage
162 : alongside
457 : brisk
29 : acidity
2 : .
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>
10001 : <pad>

In [22]:
def input_id_to_tensor(example):
    """Convert the "input_ids" entry of ``example`` into a torch tensor.

    Returns:
        A dict with the single key "input_ids_pt" holding the new tensor.
    """
    ids_tensor = torch.tensor(example["input_ids"])
    return dict(input_ids_pt=ids_tensor)

In [23]:
# Add an "input_ids_pt" column produced by input_id_to_tensor.
# NOTE(review): wine_ds3 is not used by any later cell in the visible part of the notebook;
# batch_size=None is passed without batched=True — confirm both are intended.
wine_ds3 = wine_ds2.map(input_id_to_tensor,batch_size=None)

In [24]:
# Turn every fixed-length id list into a 1-D tensor and stack them along a new
# leading dimension, giving one row per review.
input_ids_tensor = torch.stack([torch.tensor(review_ids) for review_ids in input_ids], dim=0)
print(f"input_ids tensor shape : {input_ids_tensor.shape}")

input_ids tensor shape : torch.Size([129971, 81])


In [25]:
# Next-token prediction pairs: x is every position except the last one,
# y is the same sequence shifted left by one position.
# Slicing with :-1 (instead of a literal column count) keeps x and y the same
# width for any sequence length.
x, y = input_ids_tensor[:, :-1], input_ids_tensor[:, 1:]

In [26]:
# Inputs and targets must have identical shapes.
print(f"x shape : {x.size()}, y shape : {y.size()}")

x shape : torch.Size([129971, 80]), y shape : torch.Size([129971, 80])


In [27]:
from torch.utils.data import Dataset

class WineReviewDataset(Dataset):
    """Dataset that pairs each row of ``x`` with the row of ``y`` at the same index."""

    def __init__(self, x, y) -> None:
        """Keep references to the inputs ``x`` and the targets ``y``."""
        super().__init__()
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples, i.e. the size of the first dimension of ``x``."""
        return self.x.size()[0]

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        sample_x = self.x[idx]
        sample_y = self.y[idx]
        return sample_x, sample_y

In [28]:
# Wrap the input/target tensors in the dataset and inspect a single sample.
twine_ds = WineReviewDataset(x,y)
one_x, one_y=twine_ds[100]
print(f"onex shape : {one_x.size()}")

onex shape : torch.Size([80])


In [29]:
from torch.utils.data import DataLoader

# Shuffle with a dedicated, seeded generator so the batch order (and therefore
# every result derived from a sampled batch) is reproducible across runs.
wine_loader = DataLoader(
    twine_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)

In [30]:
# Peek at the first batches. The break fires once idx exceeds 5, after that batch
# has been printed, so seven batches are shown in total.
# batch_x / batch_y stay bound to the last batch drawn; a later cell feeds
# batch_x to the embedding layer.
for idx, (batch_x, batch_y) in enumerate(wine_loader):
    print(f"batch_x shape : {batch_x.size()}, batch_y shape {batch_y.size()}")
    if idx>5:
        break

batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])
batch_x shape : torch.Size([32, 80]), batch_y shape torch.Size([32, 80])


In [31]:
# Lower-triangular 5x5 matrix of ones with a leading dimension of size 1 added.
torch.tril(torch.ones(5,5)).unsqueeze(0).shape

torch.Size([1, 5, 5])

In [32]:
# Toy self-attention scores: query, key and value are the same random (1, 5, 2) tensor.
query=key=value=torch.rand(5,2).unsqueeze(0)
# Batched matrix product of the queries with the transposed keys -> (1, 5, 5).
scores = torch.bmm(query,key.transpose(1,2))
print(scores.shape)

torch.Size([1, 5, 5])


In [33]:
# Causal mask: ones on and below the diagonal, zeros above it.
mask=torch.tril(torch.ones(5,5)).unsqueeze(0)
# Replace the scores at the zero positions of the mask with -inf.
# masked_fill (without trailing underscore) returns a new tensor; `scores` itself is unchanged.
scores.masked_fill(mask==0,-float("inf"))

tensor([[[0.1421,   -inf,   -inf,   -inf,   -inf],
         [0.2509, 0.6434,   -inf,   -inf,   -inf],
         [0.3876, 0.5177, 1.1969,   -inf,   -inf],
         [0.3457, 0.4763, 1.0552, 0.9312,   -inf],
         [0.3212, 0.6456, 0.8112, 0.7292, 0.7569]]])

# Let's implement the token and positional embedding

In [34]:
import torch.nn as nn

class TokenPositionEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned positional embedding.

    Args:
        vocab_size: number of rows of the token embedding table.
        max_len: largest sequence length supported (rows of the position table).
        embed_dim: size of every embedding vector.
    """

    def __init__(self,vocab_size, max_len,embed_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(self.vocab_size,self.embed_dim)
        self.pos_embedding = nn.Embedding(self.max_len, self.embed_dim)

    def forward(self,x):
        """Embed token ids and add the embedding of their positions.

        Args:
            x: (N, L) integer tensor where each row is a list of token ids,
                with L <= max_len.

        Returns:
            (N, L, embed_dim) tensor.

        Raises:
            ValueError: if L is larger than ``max_len``.
        """
        seq_len = x.size(-1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        # Build positions for the actual sequence length and on the input's
        # device, so shorter sequences and non-CPU inputs both work.
        positions = torch.arange(seq_len, device=x.device)
        positions_embeddings = self.pos_embedding(positions)
        # (N, L, D) + (L, D): the positional term is broadcast over the batch.
        return self.embedding(x) + positions_embeddings



In [35]:
# The embedding table has to cover the two special tokens that were appended to
# the vocabulary, so its size is taken from the final token-to-id mapping.
# NOTE(review): this rebinds the VOCAB_SIZE constant of the configuration cell;
# re-running the vocabulary-building cell afterwards would use the enlarged value.
VOCAB_SIZE=len(wr_word_to_id)
# Reuse the configured sequence length instead of repeating its literal value.
seq_len=MAX_LEN
embed_dim=100

In [36]:
# Instantiate the combined token + position embedding layer.
token_pos_embed = TokenPositionEmbedding(VOCAB_SIZE,seq_len,embed_dim)

In [37]:
# Embed one batch; batch_x is the last batch left over from the DataLoader inspection loop.
one_embedded = token_pos_embed(batch_x)

In [38]:
# Expected layout: (batch size, sequence length, embedding dimension).
one_embedded.shape

torch.Size([32, 80, 100])

In [39]:
# The first argument is the embedding dimension (100 in our case); the second is the number of attention heads (4 in our case).
# MultiheadAttention projects the query, key and value vectors to embed_dim/num_heads dimensions per head and applies attention in each head separately.
# Remember: the scaled dot product between query and key produces scores, which are normalized with a softmax; the resulting weighted sum of the values is the new representation of the token.
# This is done num_heads times, and the representations of each token from all heads are concatenated along the feature dimension.
multihead = nn.MultiheadAttention(embed_dim,4,batch_first=True)

In [40]:
# This boolean matrix is what we pass as the attn_mask argument when calling MultiheadAttention.
# torch.tril first produces a square matrix whose entries above the diagonal are zero;
# comparing with 0 then turns exactly those entries into True, marking the positions we want to mask.
# Note: this rebinds `mask`, which held the 0/1 float mask of the earlier toy example.
mask = torch.tril(torch.ones(seq_len, seq_len))==0

In [41]:
# Causal self-attention: the embedded batch serves as query, key and value.
# The call returns the attended representations together with the attention weights.
attn_out, attn_weights = multihead(one_embedded,one_embedded,one_embedded,attn_mask=mask)

In [46]:
# One (query position x key position) weight matrix per sequence of the batch.
attn_weights.size()

torch.Size([32, 80, 80])

In [43]:
# Layer normalization over the last (embedding) dimension.
layer_norm = nn.LayerNorm(normalized_shape=embed_dim,eps=1e-6)

In [44]:
# Normalize the attention output of every token.
out = layer_norm(attn_out)

In [45]:
# Compare the raw attention output with its normalized version for the first
# token of the first sequence.
print(f"attention output first ten elements : {attn_out[0,0,:10]}")
print(f"layer norm output first ten elements : {out[0,0,:10]}")
# Statistics of the first sequence, taken over the embedding axis: one value
# per token position, first ten positions shown.
# NOTE: Tensor.std applies Bessel's correction by default, whereas LayerNorm normalizes
# with the biased variance — presumably why the std prints as ~1.005 rather than 1.
print(f"mean of first element : {out[0,:].mean(axis=1)[:10]}, std of first element : {out[0,:].std(axis=1)[:10]}")

attention output first ten elements : tensor([-0.2083,  1.0783, -0.4119,  0.8707,  0.5460, -0.7446,  0.2810,  0.5366,
        -0.2513,  0.4071], grad_fn=<SliceBackward0>)
layer norm output first ten elements : tensor([-0.4388,  1.9099, -0.8106,  1.5308,  0.9382, -1.4179,  0.4544,  0.9209,
        -0.5173,  0.6845], grad_fn=<SliceBackward0>)
mean of first element : tensor([-2.8610e-08,  2.1458e-08, -2.3842e-09,  2.9802e-08,  1.7881e-08,
        -3.5763e-09, -4.7684e-09,  4.7684e-09,  9.5367e-09,  3.8147e-08],
       grad_fn=<SliceBackward0>), std of first element : tensor([1.0050, 1.0050, 1.0050, 1.0050, 1.0050, 1.0050, 1.0050, 1.0050, 1.0050,
        1.0050], grad_fn=<SliceBackward0>)
