# Let's load the wine reviews dataset with `load_dataset` from the Hugging Face `datasets` library

In [9]:
import re
import pathlib
import numpy as np
import torch
from datasets import load_dataset

In [20]:
# Configuration constants for the notebook. Only VOCAB_SIZE and UNKOWN_WORD are
# referenced by the cells shown in this section; the remaining values are
# presumably model/training hyperparameters used by later cells — TODO confirm.
VOCAB_SIZE = 10000  # number of most frequent tokens kept when building the vocabulary
MAX_LEN = 80
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 5
# Stand-in token for anything missing from the vocabulary. NOTE(review): the
# name is misspelled ("UNKOWN"); other cells reference this exact spelling.
UNKOWN_WORD = "<unk>"

In [3]:
# Locate the wine reviews JSON file and load the folder that contains it.
# NOTE(review): the base path is an absolute, machine-specific Windows path.
datasets_folder = pathlib.Path(
    r"C:\Users\amrul\programming\deep_learning\dl_projects\Generative_Deep_Learning_2nd_Edition\data"
)
wine_review_filepath = datasets_folder.joinpath(
    "wine_reviews", "winemag-data-130k-v2.json"
)
# load_dataset receives the parent directory of the JSON file, not the file itself.
data = load_dataset(str(wine_review_filepath.parent), 'json')

In [4]:
def prepare_text_with_country_variety(batch):
    """Build one combined text string per row of a column-oriented batch.

    Args:
        batch: Mapping with "country", "province", "variety" and
            "description" keys, each holding a sequence of per-row values.

    Returns:
        A dict with a single "text" key whose value is a list of strings of
        the form "country : province : variety : description". Rows are
        paired with ``zip``, so the list is as long as the shortest column.
    """
    columns = (
        batch['country'],
        batch['province'],
        batch['variety'],
        batch['description'],
    )
    combined_texts = []
    for country, province, variety, description in zip(*columns):
        combined_texts.append(
            f"{country} : {province} : {variety} : {description}"
        )
    return {"text": combined_texts}

In [5]:
# Add the combined "text" column. batched=True makes the mapper receive columns
# as lists; batch_size=None — per the Hugging Face `datasets` docs — hands each
# split to the mapper as one single batch.
wine_ds = data.map(prepare_text_with_country_variety,batched=True, batch_size=None)

In [7]:
# Pull the combined "text" column out of the "train" split; the cells below
# index into it and iterate over it.
wine_reviews = wine_ds["train"]["text"]

# Let's build a custom tokenizer
This tokenizer splits text into words and individual punctuation marks, discarding the whitespace between them.

In [11]:
def custom_tokenize(text):
    """Split *text* into word tokens and single-character punctuation tokens.

    A token is either a maximal run of word characters (letters, digits,
    underscore) or one character that is neither a word character nor
    whitespace. Whitespace itself never appears in the result.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens in their order of appearance.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return token_pattern.findall(text)

In [12]:
# Sanity check: tokenize the first review and print the resulting tokens.
tokens=custom_tokenize(wine_reviews[0])
print(tokens)

['Italy', ':', 'Sicily', '&', 'Sardinia', ':', 'White', 'Blend', ':', 'Aromas', 'include', 'tropical', 'fruit', ',', 'broom', ',', 'brimstone', 'and', 'dried', 'herb', '.', 'The', 'palate', 'isn', "'", 't', 'overly', 'expressive', ',', 'offering', 'unripened', 'apple', ',', 'citrus', 'and', 'dried', 'sage', 'alongside', 'brisk', 'acidity', '.']


In [13]:
# First step of building the vocabulary: tokenize every review.
# Each entry of the result is the token list of one review.
tokenized_wine_reviews = list(map(custom_tokenize, wine_reviews))

In [16]:
# Flatten the per-review token lists into one long list of tokens,
# preserving review order and token order within each review.
wine_review_tokens = [
    tok
    for review_tokens in tokenized_wine_reviews
    for tok in review_tokens
]

In [21]:
from collections import Counter

# Count every token and keep the VOCAB_SIZE most frequent (token, count) pairs.
wine_review_token_counter = Counter(wine_review_tokens)
wine_review_vocab_words = wine_review_token_counter.most_common(VOCAB_SIZE)

# A token's id is its frequency rank: 0 for the most common token.
wr_word_to_id = {
    entry[0]: rank for rank, entry in enumerate(wine_review_vocab_words)
}

# The unknown-word placeholder takes the next free id after the ranked tokens.
wr_word_to_id[UNKOWN_WORD] = len(wr_word_to_id)

In [24]:
# Reverse lookup table: token id -> token.
wr_id_to_word = {
    token_id: word for word, token_id in wr_word_to_id.items()
}

In [22]:
def enhanced_tokenize(text, word_to_id):
    """Tokenize *text*, replacing out-of-vocabulary tokens with UNKOWN_WORD.

    Args:
        text: The string to tokenize (split with ``custom_tokenize``).
        word_to_id: Container of known tokens; only membership is checked.

    Returns:
        A list of tokens in which every token not found in *word_to_id* has
        been replaced by ``UNKOWN_WORD``.
    """
    vocab_tokens = []
    for token in custom_tokenize(text):
        vocab_tokens.append(token if token in word_to_id else UNKOWN_WORD)
    return vocab_tokens

In [23]:
def tokenize_and_convert_to_ids(text, word_to_id):
    """Tokenize *text* and map every token to its integer id.

    Tokens that are not in *word_to_id* are first replaced by the
    unknown-word placeholder (via ``enhanced_tokenize``), so *word_to_id*
    must contain an entry for that placeholder.

    Args:
        text: The string to tokenize.
        word_to_id: Mapping from token to integer id.

    Returns:
        The list of ids, one per token, in token order.

    Raises:
        KeyError: If *word_to_id* has no entry for the unknown-word
            placeholder and *text* contains an out-of-vocabulary token.
    """
    tokens = enhanced_tokenize(text, word_to_id)
    # Look ids up in the mapping that was passed in, not in a module-level
    # global, so the function works with any vocabulary.
    return [word_to_id[token] for token in tokens]

In [25]:
# Round-trip check: convert the first review to ids and print each id next to
# the token it maps back to.
input_ids = tokenize_and_convert_to_ids(wine_reviews[0], wr_word_to_id)
# The loop variable is deliberately not named `id`: at module scope that would
# rebind the builtin `id()` for every cell that runs afterwards.
for token_id in input_ids:
    print(f"{token_id} : {wr_id_to_word[token_id]}")

38 : Italy
1 : :
445 : Sicily
447 : &
469 : Sardinia
1 : :
165 : White
36 : Blend
1 : :
214 : Aromas
980 : include
249 : tropical
19 : fruit
0 : ,
2301 : broom
0 : ,
4091 : brimstone
3 : and
117 : dried
131 : herb
2 : .
17 : The
25 : palate
1019 : isn
14 : '
233 : t
1174 : overly
1069 : expressive
0 : ,
356 : offering
10000 : <unk>
71 : apple
0 : ,
81 : citrus
3 : and
117 : dried
482 : sage
162 : alongside
457 : brisk
29 : acidity
2 : .
