In [1]:
from torchtext.data.utils import get_tokenizer

In [2]:
# Build torchtext's "basic_english" tokenizer: a callable that turns a string into a list of tokens.
tokenizer = get_tokenizer("basic_english")

In [3]:
# Tokenize a small sample containing repeated words, singular/plural pairs and punctuation.
tokens = tokenizer(
    "He goes to school, he studies at school, he has a friend, he has friends. "
    "He likes a flower. He likes flowers"
)

In [4]:
# Inspect the result: words are lowercased and punctuation marks become separate tokens.
print(tokens)

['he', 'goes', 'to', 'school', ',', 'he', 'studies', 'at', 'school', ',', 'he', 'has', 'a', 'friend', ',', 'he', 'has', 'friends', '.', 'he', 'likes', 'a', 'flower', '.', 'he', 'likes', 'flowers']


# Let's load the wine review dataset, build a vocabulary from it, and build the x, y inputs needed to potentially train our own GPT

In [5]:
import pathlib
import numpy as np
import torch
from datasets import load_dataset

In [6]:
# --- Vocabulary / sequence settings ---
VOCAB_SIZE = 10_000  # passed as max_tokens when the vocabulary is built
MAX_LEN = 80

# --- Model hyperparameters ---
# NOTE(review): not referenced in the cells shown here; presumably consumed by the GPT model definition — confirm.
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256

# --- Training settings ---
# NOTE(review): likewise unused in the visible cells.
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 5

In [7]:
# Load the full dataset
import os
import pathlib

# Root data folder. Set the DATASETS_FOLDER environment variable to run this notebook on another
# machine; the original author's location is kept as the fallback so existing setups keep working.
datasets_folder = pathlib.Path(
    os.environ.get(
        "DATASETS_FOLDER",
        r"C:\Users\amrul\programming\deep_learning\dl_projects\Generative_Deep_Learning_2nd_Edition\data",
    )
)
wine_review_filepath = datasets_folder / "wine_reviews" / "winemag-data-130k-v2.json"

# Name the "json" builder explicitly and point it at the exact file. Passing the parent directory
# with 'json' as the second positional argument puts 'json' in the config-name slot and leaves
# file discovery to whatever else happens to be in that folder.
# The result is a DatasetDict whose only split is "train".
data = load_dataset("json", data_files=str(wine_review_filepath))

In [8]:
def prepare_text_with_country_variety(batch):
    """Combine location, grape variety and review text into one string per row.

    Args:
        batch: Mapping of column name to a sequence of values. Must contain the
            'country', 'province', 'variety' and 'description' columns.

    Returns:
        A dict with a single "text" key holding one
        "country : province : variety : description" string per row. Values are
        interpolated as-is, so a missing value (None) appears as the text "None".
    """
    columns = (batch['country'], batch['province'], batch['variety'], batch['description'])
    combined = []
    # zip stops at the shortest column, so rows are paired positionally.
    for country, province, variety, description in zip(*columns):
        combined.append(f"{country} : {province} : {variety} : {description}")
    return {"text": combined}

In [9]:
# Add a "text" column by applying the formatter in batched mode; batch_size=None hands each
# split to the function as a single batch instead of chunks.
wine_ds = data.map(prepare_text_with_country_variety,batched=True, batch_size=None)

In [10]:
# Pull the formatted review strings out of the "train" split (the only split used in this notebook).
wine_reviews = wine_ds["train"]["text"]

In [11]:
# Sanity check: report the container type and the number of review strings.
print(f"type of wine_reviews {type(wine_reviews)} and its length {len(wine_reviews)}")

type of wine_reviews <class 'list'> and its length 129971


In [12]:
# Generator over a collection of texts: produces one token list per text.
def yield_tokens(data_iter):
    """Lazily tokenize every text in ``data_iter``.

    Each item of ``data_iter`` is passed to the module-level ``tokenizer`` and the
    result is yielded, one per input text, in the original order.
    """
    yield from map(tokenizer, data_iter)

In [13]:
# Build a vocabulary from wine_reviews with build_vocab_from_iterator from torchtext.vocab.
from torchtext.vocab import build_vocab_from_iterator

# build_vocab_from_iterator takes:
#   - an iterator that yields one list of tokens per text,
#   - specials: extra tokens added to the vocabulary (here "<unk>" for unknown words),
#   - max_tokens: upper bound on the vocabulary size, specials included.
vocab = build_vocab_from_iterator(yield_tokens(wine_reviews), specials=["<unk>"], max_tokens=VOCAB_SIZE)

# Registering "<unk>" as a special does not make it the fallback on its own: without a default
# index, looking up an out-of-vocabulary token raises a RuntimeError. Map unknown tokens to "<unk>"
# so that vocab(tokens) works on arbitrary text.
vocab.set_default_index(vocab["<unk>"])

In [14]:
# get_stoi() returns the token -> index mapping; its size is capped by VOCAB_SIZE (max_tokens above).
word_to_index = vocab.get_stoi()
print(f"word to index has {len(word_to_index)} elements")

word to index has 10000 elements


In [15]:
# Let's confirm that vocab returns a list of integer IDs, just like a HuggingFace tokenizer
#tokens = vocab(tokenizer(wine_reviews[0]))
#print(tokens)

In [16]:
# Show the first formatted review, then its tokenized form on the next line.
print(wine_reviews[0], tokenizer(wine_reviews[0]), sep="\n")

Italy : Sicily & Sardinia : White Blend : Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.
['italy', 'sicily', '&', 'sardinia', 'white', 'blend', 'aromas', 'include', 'tropical', 'fruit', ',', 'broom', ',', 'brimstone', 'and', 'dried', 'herb', '.', 'the', 'palate', 'isn', "'", 't', 'overly', 'expressive', ',', 'offering', 'unripened', 'apple', ',', 'citrus', 'and', 'dried', 'sage', 'alongside', 'brisk', 'acidity', '.']


In [17]:
# Tokenize the first review and drop the "&" token.
# A filter replaces list.remove("&"): remove() deletes only the first occurrence and raises
# ValueError when "&" is absent, so the cell would break if the first review changed.
tokens = [token for token in tokenizer(wine_reviews[0]) if token != "&"]

In [18]:
# Show the filtered token list followed by its length on a separate line.
print(tokens, len(tokens), sep="\n")

['italy', 'sicily', 'sardinia', 'white', 'blend', 'aromas', 'include', 'tropical', 'fruit', ',', 'broom', ',', 'brimstone', 'and', 'dried', 'herb', '.', 'the', 'palate', 'isn', "'", 't', 'overly', 'expressive', ',', 'offering', 'unripened', 'apple', ',', 'citrus', 'and', 'dried', 'sage', 'alongside', 'brisk', 'acidity', '.']
37


In [19]:
import torch.nn as nn

# Lookup table with 10000 rows (one per token id) of 100-dimensional vectors.
emb = nn.Embedding(num_embeddings=10000, embedding_dim=100)

In [24]:
# Look up eight arbitrary token ids; each id must be below the embedding table's 10000 rows.
input_ids = torch.tensor([100,200,3000,4000,2345,342,99,10])
# One 100-dimensional vector per id, so the result has shape (8, 100).
out=emb(input_ids)
print(out.shape)

torch.Size([8, 100])
