In [1]:
from gpt_tf import VOCAB_SIZE, EMBEDDING_DIM, MAX_LEN
from gpt_tf import TransformerBlock, TokenAndPositionEmbedding, causal_attention_mask

In [2]:
import numpy as np
import pathlib
from tensorflow.keras import layers, models

In [3]:
# Load the full wine-review dataset (a JSON array of review records).
import json
import os
import pathlib

# The data directory can be overridden with the GDL_DATA_DIR environment
# variable; the original machine-specific location remains the default so the
# notebook behaves exactly as before when the variable is unset.
datasets_folder = pathlib.Path(
    os.environ.get(
        "GDL_DATA_DIR",
        r"C:\Users\amrul\programming\deep_learning\dl_projects\Generative_Deep_Learning_2nd_Edition\data",
    )
)
wine_review_filepath = datasets_folder / "wine_reviews" / "winemag-data-130k-v2.json"

# JSON is UTF-8 by specification. Without an explicit encoding, open() falls
# back to the locale code page (e.g. cp1252 on Windows), which can corrupt or
# fail on the non-ASCII characters in the review text.
with wine_review_filepath.open(encoding="utf-8") as json_data:
    wine_data = json.load(json_data)

In [4]:
# Drop every record where one of the four fields is None, and flatten the rest
# into a single string of the form
# "wine review : <country> : <province> : <variety> : <description>".
filtered_data = [
    " : ".join(
        ("wine review", x["country"], x["province"], x["variety"], x["description"])
    )
    for x in wine_data
    if not any(
        x[field] is None
        for field in ("country", "province", "variety", "description")
    )
]

In [5]:
import re
import string

# Matches any single ASCII punctuation character or a newline.
# string.punctuation is passed through re.escape so that characters which are
# special inside a character class (backslash, ']', '^', '-') are matched
# literally instead of being interpreted as regex syntax.
_PUNCTUATION_RE = re.compile(f"([{re.escape(string.punctuation)}\n])")
_MULTI_SPACE_RE = re.compile(" +")


# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround each punctuation mark and newline in ``s`` with spaces.

    Args:
        s: Input text.

    Returns:
        The text with a space on each side of every ``string.punctuation``
        character and every newline, and with runs of spaces collapsed to a
        single space. Leading or trailing single spaces are not stripped.
    """
    s = _PUNCTUATION_RE.sub(r" \1 ", s)
    # Padding adjacent punctuation produces runs of spaces; squeeze them.
    s = _MULTI_SPACE_RE.sub(" ", s)
    return s


# Apply the punctuation padding to every filtered review string.
text_data = list(map(pad_punctuation, filtered_data))

In [6]:
import tensorflow as tf
from gpt_tf import BATCH_SIZE
# Wrap the review strings in a tf.data pipeline, built one step at a time.
text_ds = tf.data.Dataset.from_tensor_slices(text_data)
text_ds = text_ds.batch(BATCH_SIZE)
# NOTE(review): shuffle is applied after batch, so it reorders whole batches
# (buffer of 1000 batches), not individual reviews — confirm this is intended.
text_ds = text_ds.shuffle(1000)

In [7]:
# Text -> integer-id layer.
# Only lowercasing is requested as standardisation, the vocabulary is capped
# at VOCAB_SIZE entries, and sequences are fixed at MAX_LEN + 1 ids so that the
# input and one-step-shifted target slices taken later each have MAX_LEN tokens.
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower",
    output_sequence_length=MAX_LEN + 1,
    output_mode="int",
)

In [8]:
# Adapt the layer to the training set.
# adapt() must run before get_vocabulary(): the vocabulary is learned from
# text_ds during adaptation.
vectorize_layer.adapt(text_ds)
# Learned tokens; the list position of each token is used as its integer id
# when the lookup tables are built later in the notebook.
vocab = vectorize_layer.get_vocabulary()

In [9]:
# Build (input, target) pairs for next-token prediction: the target is the
# same wine-review token sequence shifted left by one position.
def prepare_inputs(text):
    """Tokenise a batch of strings and split it into inputs and targets.

    Args:
        text: Tensor of strings; a trailing axis is added before it is passed
            to ``vectorize_layer``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is every token id except the last and
        ``y`` is every token id except the first, along the sequence axis.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
    return inputs, targets


# Map every batch of raw strings to an (input ids, target ids) pair.
train_ds = text_ds.map(prepare_inputs)

In [17]:
# Pull a single (inputs, targets) batch out of the pipeline for inspection.
one_x, one_y = train_ds.take(1).get_single_element()

In [24]:
# Lookup tables between vocabulary tokens and their integer ids
# (a token's id is its position in `vocab`).
word_to_id = dict(zip(vocab, range(len(vocab))))
id_to_word = {token_id: token for token, token_id in word_to_id.items()}

In [None]:
# Peek at the first few vocabulary entries (token -> id) rather than
# displaying the whole mapping.
list(word_to_id.items())[:10]

In [30]:
# Decode the first sequence of the sample batch, one token per line.
# The loop variable is named `token_id` rather than `id` so that the built-in
# id() is not shadowed in the notebook's global namespace for later cells.
for token_id in one_x[0].numpy():
    print(f"{token_id} : {id_to_word[token_id]}")

7 : wine
10 : review
2 : :
20 : us
2 : :
29 : california
2 : :
24 : red
27 : blend
2 : :
12 : this
87 : bright
5 : and
1043 : vivid
7 : wine
41 : has
56 : fresh
97 : raspberry
5 : and
36 : cherry
26 : aromas
3 : ,
135 : concentrated
22 : fruit
16 : flavors
5 : and
8 : a
77 : texture
25 : that
41 : has
334 : enough
413 : tannin
5 : and
30 : acidity
46 : for
99 : good
160 : balance
4 : .
15 : it
18 : '
21 : s
425 : nicely
128 : made
17 : in
8 : a
1255 : traditional
58 : dry
63 : style
4 : .
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
0 : 
