In [61]:
%load_ext autoreload
%autoreload 2
import numpy as np
import json
import re
import string
from IPython.display import display, HTML

import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
# Hyperparameters and run options used throughout the notebook.
VOCAB_SIZE = 10000  # max_tokens of the TextVectorization layer and width of the softmax output
MAX_LEN = 80  # model input length; reviews are vectorised to MAX_LEN + 1 ids before the one-token shift
EMBEDDING_DIM = 256  # output size of the token and position embeddings
KEY_DIM = 256  # key_dim passed to MultiHeadAttention
N_HEADS = 2  # number of attention heads
FEED_FORWARD_DIM = 256  # units of the first feed-forward Dense layer in the transformer block
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced by any cell visible in this notebook
SEED = 42  # NOTE(review): defined but never applied to NumPy / TensorFlow in the visible cells
LOAD_MODEL = False  # when True, `gpt` is replaced by the model stored at ./models/gpt
BATCH_SIZE = 32  # batch size of the tf.data pipeline
EPOCHS = 1  # number of epochs passed to gpt.fit

In [63]:
import os

# Working directory that holds winemag-data-130k-v2.json; the relative
# ./models, ./checkpoint and ./logs paths used later resolve against it too.
# Set the WINE_DATA_DIR environment variable to run the notebook on another
# machine; the fallback is the original author's local path.
DATA_DIR = os.environ.get("WINE_DATA_DIR", r'C:\Users\prajw\OneDrive\Desktop\MLP')
os.chdir(DATA_DIR)

In [64]:
# Load the full dataset
# The encoding is passed explicitly so that reading the JSON file does not
# depend on the platform's default locale encoding (e.g. cp1252 on Windows).
with open('winemag-data-130k-v2.json', encoding="utf-8") as json_data:
    wine_data = json.load(json_data)

In [65]:
wine_data[10]

{'points': '87',
 'title': 'Kirkland Signature 2011 Mountain Cuvée Cabernet Sauvignon (Napa Valley)',
 'description': 'Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.',
 'taster_name': 'Virginie Boone',
 'taster_twitter_handle': '@vboone',
 'price': 19,
 'designation': 'Mountain Cuvée',
 'variety': 'Cabernet Sauvignon',
 'region_1': 'Napa Valley',
 'region_2': 'Napa',
 'province': 'California',
 'country': 'US',
 'winery': 'Kirkland Signature'}

In [66]:
# Filter the dataset
# Keep only the records whose country, province, variety and description are
# all present, and flatten each one into a single " : "-separated string.
filtered_data = [
    "wine review : "
    + " : ".join(
        (
            review["country"],
            review["province"],
            review["variety"],
            review["description"],
        )
    )
    for review in wine_data
    if all(
        review[field] is not None
        for field in ("country", "province", "variety", "description")
    )
]

In [67]:
# Count the wine reviews that survived the filter
n_wines = len(filtered_data)
print(f"{n_wines} wine reviews loaded")

129907 recipes loaded


In [68]:
example = filtered_data[25]
print(example)

wine review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard-designated Pinot that hails from a high-elevation site. Small in production, it offers intense, full-bodied raspberry and blackberry steeped in smoky spice and smooth texture.


In [69]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround every punctuation mark and newline in ``s`` with spaces.

    Each character of ``string.punctuation`` (and ``"\\n"``) is wrapped in a
    space on both sides, then runs of spaces are collapsed to a single space,
    so that a later whitespace split sees punctuation as separate tokens.

    Args:
        s: The text to pad.

    Returns:
        The padded string. A trailing punctuation mark leaves a trailing space.
    """
    # Escape the punctuation before placing it in a character class: left
    # unescaped, the "\]" pair inside string.punctuation is read as an escaped
    # "]" and the backslash itself is never matched.
    punct_class = re.escape(string.punctuation)
    s = re.sub(f"([{punct_class}\n])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


text_data = [pad_punctuation(x) for x in filtered_data]

In [70]:
# Display an example of a wine review after punctuation padding
example_data = text_data[25]
example_data

'wine review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard - designated Pinot that hails from a high - elevation site . Small in production , it offers intense , full - bodied raspberry and blackberry steeped in smoky spice and smooth texture . '

In [71]:
# Convert to a Tensorflow Dataset
# NOTE(review): shuffle() is applied after batch(), so it reorders whole
# batches (buffer of 1000 batches) rather than individual reviews, and it is
# not given SEED -- confirm that this is intended.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [72]:
# Create a vectorisation layer
# standardize="lower" only lower-cases the text (punctuation has already been
# split out by pad_punctuation). Every review becomes MAX_LEN + 1 integer ids
# so that the input and the one-token-shifted target both hold MAX_LEN tokens.
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [73]:
# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [74]:
# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: :
3: ,
4: .
5: and
6: the
7: wine
8: a
9: of


In [75]:
# Display the same example converted to ints
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[   7   10    2   20    2   29    2   43   62    2   55    5  243 4145
  453  634   26    9  497  499  667   17   12  142   14 2214   43   25
 2484   32    8  223   14 2213  948    4  594   17  987    3   15   75
  237    3   64   14   82   97    5   74 2633   17  198   49    5  125
   77    4    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


### **Create the Training Set**

In [76]:
# Build (input, target) pairs: each wine review and the same review shifted by one token
def prepare_inputs(text):
    """Vectorise a batch of review strings into next-token training pairs.

    Args:
        text: A batch of strings; a trailing axis is added before it is passed
            to ``vectorize_layer``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is every token id except the last one
        and ``y`` is every token id except the first one, so ``y[:, t]`` is
        the token that follows ``x[:, t]``.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    return token_ids[:, :-1], token_ids[:, 1:]


train_ds = text_ds.map(prepare_inputs)

In [77]:
example_input_output = train_ds.take(1).get_single_element()

In [78]:
# Example Input
example_input_output[0][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([   7,   10,    2,   20,    2,  103,    2,   65,   14,   63,   24,
         27,    2,   12,  591,   14,  142,   27,   18,   21,   16, 1176,
         84,  127,    6,  383,  366,  143,  166,  194,    4,   15, 5848,
         91, 1887,   51,   15, 2772,   84,    6,  534,    3, 1773,   17,
        243,    5,   37,  209,    3,   11,  866,    5,  159,   91, 1762,
        188,   17,    6,  267,    1, 1843,    4,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int64)>

In [79]:
# Example Output (shifted by one token)
example_input_output[1][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([  10,    2,   20,    2,  103,    2,   65,   14,   63,   24,   27,
          2,   12,  591,   14,  142,   27,   18,   21,   16, 1176,   84,
        127,    6,  383,  366,  143,  166,  194,    4,   15, 5848,   91,
       1887,   51,   15, 2772,   84,    6,  534,    3, 1773,   17,  243,
          5,   37,  209,    3,   11,  866,    5,  159,   91, 1762,  188,
         17,    6,  267,    1, 1843,    4,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int64)>

### **Create the causal attention mask function**

In [80]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a causal attention mask of shape ``(batch_size, n_dest, n_src)``.

    Entry ``[b, i, j]`` is truthy when ``i >= j - n_src + n_dest``; for
    ``n_dest == n_src`` that is ``i >= j``, i.e. destination position ``i``
    may only attend to source positions up to and including itself.

    Args:
        batch_size: Number of copies of the mask to tile along the first axis.
        n_dest: Number of destination (query) positions.
        n_src: Number of source (key) positions.
        dtype: dtype the boolean mask is cast to.

    Returns:
        The mask tensor, repeated ``batch_size`` times along axis 0.
    """
    dest_idx = tf.range(n_dest)[:, None]
    src_idx = tf.range(n_src)
    allowed = dest_idx >= src_idx - n_src + n_dest
    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])
    # Tile multiples are [batch_size, 1, 1]: repeat along the batch axis only.
    multiples = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, multiples)


np.transpose(causal_attention_mask(1, 10, 10, dtype=tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

### **Create a Transformer Block layer**

In [81]:
class TransformerBlock(layers.Layer):
    """Single transformer block: causal self-attention followed by a feed-forward net.

    Both sub-layers are wrapped with dropout, a residual connection and layer
    normalisation. ``call`` returns the block output together with the
    attention scores so that they can be inspected later.

    Args:
        num_heads: Number of attention heads.
        key_dim: ``key_dim`` passed to ``MultiHeadAttention``.
        embed_dim: Size of the block's input/output feature axis.
        ff_dim: Units of the hidden feed-forward ``Dense`` layer.
        dropout_rate: Rate used by both dropout layers.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``,
            ``trainable``). Accepting them lets the base-config entries emitted
            by ``get_config()`` be passed back to the constructor when the
            layer is rebuilt from its config.
    """

    def __init__(
        self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1, **kwargs
    ):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.attn = layers.MultiHeadAttention(
            num_heads, key_dim, output_shape=embed_dim
        )
        self.dropout_1 = layers.Dropout(self.dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.ff_dim, activation="relu")
        self.ffn_2 = layers.Dense(self.embed_dim)
        self.dropout_2 = layers.Dropout(self.dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """Apply the block to ``inputs``.

        Returns:
            A tuple ``(output, attention_scores)``.
        """
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Square causal mask so that each position only attends to itself and
        # to earlier positions.
        causal_mask = causal_attention_mask(
            batch_size, seq_len, seq_len, tf.bool
        )
        # Self-attention: the same tensor is used as query and value.
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=causal_mask,
            return_attention_scores=True,
        )
        attention_output = self.dropout_1(attention_output)
        out1 = self.ln_1(inputs + attention_output)  # residual + layer norm
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)
        return (self.ln_2(out1 + ffn_output), attention_scores)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "key_dim": self.key_dim,
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "dropout_rate": self.dropout_rate,
            }
        )
        return config

### **Create the Token and Position Embedding**

In [82]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_len: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Output size of both embeddings.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``,
            ``trainable``). Accepting them lets the base-config entries emitted
            by ``get_config()`` be passed back to the constructor when the
            layer is rebuilt from its config.
    """

    def __init__(self, max_len, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions are 0 .. seq_len - 1, taken from the last axis of ``x``.
        # NOTE(review): presumably seq_len must not exceed max_len, the size of
        # the position table -- confirm against callers.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

### **Build the Transformer model**

In [83]:
# Assemble the model: token ids -> token + position embeddings -> one
# transformer block -> per-position softmax over the vocabulary.
inputs = layers.Input(shape=(None,), dtype=tf.int32)  # variable-length sequences of token ids
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x, attention_scores = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM
)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
# The attention scores are exposed as a second model output so that they can
# be read at generation time; their loss entry is None, so only the token
# predictions contribute to the training loss.
gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
gpt.compile("adam", loss=[losses.SparseCategoricalCrossentropy(), None])

In [84]:
gpt.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 256)        2580480   
 g_2 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_2 (Transf  ((None, None, 256),      658688    
 ormerBlock)                  (None, 2, None, None))             
                                                                 
 dense_8 (Dense)             (None, None, 10000)       2570000   
                                                                 
Total params: 5,809,168
Trainable params: 5,809,168
Non-trainable params: 0
_________________________________________________

In [85]:
if LOAD_MODEL:
    # Replace the freshly built model with the one stored at ./models/gpt
    # (the same path the final gpt.save(...) cell writes to).
    # NOTE(review): the model contains custom layers -- confirm whether
    # load_model needs custom_objects for them in this TensorFlow version.
    gpt = models.load_model("./models/gpt", compile=True)

#### **Train the Transformer**

In [86]:
# Create a TextGenerator callback
class TextGenerator(callbacks.Callback):
    """Keras callback that samples and prints a wine review after every epoch.

    Args:
        index_to_word: Vocabulary list mapping a token id to its word.
        top_k: Stored on the instance for API compatibility; sampling always
            draws from the full distribution.
    """

    def __init__(self, index_to_word, top_k=10):
        # Initialise the base Callback state before adding our own attributes.
        super().__init__()
        self.index_to_word = index_to_word
        self.top_k = top_k
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """Draw one token id from ``probs`` after temperature scaling.

        Args:
            probs: 1-D array of token probabilities.
            temperature: Positive scaling factor; values below 1 sharpen the
                distribution, values above 1 flatten it.

        Returns:
            A tuple ``(token_id, scaled_probs)``.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` one sampled token at a time and print the result.

        Generation stops once the sequence holds ``max_tokens`` tokens or as
        soon as token id 0 is sampled.

        Args:
            start_prompt: Whitespace-separated prompt; words missing from the
                vocabulary are mapped to token id 1.
            max_tokens: Maximum total number of tokens, prompt included.
            temperature: Sampling temperature passed to ``sample_from``.

        Returns:
            A list with one dict per generated token holding the prompt so far
            (``"prompt"``), the scaled next-token distribution
            (``"word_probs"``) and the attention scores of the last position
            for every head (``"atts"``).
        """
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y, att = self.model.predict(x, verbose=0)
            # Only the distribution predicted for the last position is sampled.
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append(
                {
                    "prompt": start_prompt,
                    "word_probs": probs,
                    "atts": att[0, :, -1, :],
                }
            )
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a freshly generated review at the end of each training epoch."""
        self.generate("wine review", max_tokens=80, temperature=1.0)

In [87]:
# Create a model save checkpoint: weights only, written once per epoch
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

# Write training logs for TensorBoard to ./logs
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Text-generation callback built from the vectoriser's vocabulary
text_generator = TextGenerator(vocab)

In [88]:
gpt.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

generated text:
wine review : argentina : mendoza province : chardonnay : woody , sticky peach , apple and melon aromas register as a mild straw mouthfeel is just flabby . the palate feels ripe and good . bland on a waxy , toasty , pineapple and citrus flavors are less known in santa traditional or zap . lasting on the lees aging has staunch acids on the bouquet along with white - fruit flavors . 



<keras.callbacks.History at 0x1a8d4f23f40>

In [89]:
# Save the final model
gpt.save("./models/gpt")



INFO:tensorflow:Assets written to: ./models/gpt\assets


INFO:tensorflow:Assets written to: ./models/gpt\assets


### **Generate text using the Transformer**

In [90]:
def print_probs(info, vocab, top_k=5):
    """Show, for every generation step, the attention over the prompt and the top tokens.

    For each entry of ``info`` the prompt is rendered as HTML, each word
    shaded by its head-averaged attention score relative to the largest score
    of that step, followed by the ``top_k`` most probable next tokens.

    Args:
        info: List of dicts with the keys ``"prompt"``, ``"atts"`` and
            ``"word_probs"``, as returned by ``TextGenerator.generate``.
        vocab: Sequence mapping a token id to its word.
        top_k: Number of candidate tokens printed per step.
    """
    import html  # local import: tokens such as "<" or "&" must be escaped before rendering

    for step in info:
        # Average over the heads once per step instead of once per word.
        mean_atts = np.mean(step["atts"], axis=0)
        max_att = max(mean_atts)
        spans = []
        for word, att_score in zip(step["prompt"].split(), mean_atts):
            spans.append(
                '<span style="background-color:rgba(135,206,250,'
                + str(att_score / max_att)
                + ');">'
                + html.escape(word)
                + "</span>"
            )
        display(HTML(" ".join(spans)))

        word_probs = step["word_probs"]
        # Token ids ordered from most to least probable.
        top_indices = np.argsort(word_probs)[::-1][:top_k]
        for token_id in top_indices:
            print(f"{vocab[token_id]}:   \t{np.round(100*word_probs[token_id],2)}%")
        print("--------\n")

In [91]:
info = text_generator.generate(
    "wine review : us", max_tokens=80, temperature=1.0
)


generated text:
wine review : us : oregon : chardonnay : this effort is a lot hybrid ' s current release here if the reserve level was not for all , but flavorful price . thin as a look implies [UNK] . 



In [92]:
info = text_generator.generate(
    "wine review : italy", max_tokens=80, temperature=0.5
)


generated text:
wine review : italy : piedmont : nebbiolo : this offers aromas of underbrush , scorched earth , game and a whiff of barnyard . the palate offers raw red cherry , star anise , sage and a hint of licorice alongside astringent , astringent tannins . 



In [93]:
info = text_generator.generate(
    "wine review : germany", max_tokens=80, temperature=0.5
)
print_probs(info, vocab)


generated text:
wine review : germany : mosel : riesling : a whiff of petrol leads to a slightly sweet , slightly bitter almond flavors on the palate of this off - dry wine . a bit bitter on the finish . 



::   	100.0%
-:   	0.0%
blend:   	0.0%
grosso:   	0.0%
zealand:   	0.0%
--------



mosel:   	71.72%
rheinhessen:   	20.66%
pfalz:   	4.87%
rheingau:   	1.59%
nahe:   	0.57%
--------



::   	99.96%
-:   	0.04%
other:   	0.0%
,:   	0.0%
grosso:   	0.0%
--------



riesling:   	99.99%
dornfelder:   	0.01%
pinot:   	0.0%
gewürztraminer:   	0.0%
cabernet:   	0.0%
--------



::   	100.0%
-:   	0.0%
blanc:   	0.0%
blend:   	0.0%
grosso:   	0.0%
--------



a:   	43.13%
dusty:   	10.52%
while:   	9.09%
whiffs:   	7.42%
the:   	5.44%
--------



whiff:   	44.26%
bit:   	25.68%
hint:   	8.59%
delicate:   	3.44%
slightly:   	1.97%
--------



of:   	100.0%
to:   	0.0%
from:   	0.0%
[UNK]:   	0.0%
for:   	0.0%
--------



petrol:   	67.69%
struck:   	6.08%
smoke:   	4.42%
dusty:   	2.61%
dried:   	1.74%
--------



leads:   	29.07%
lends:   	25.17%
,:   	17.27%
and:   	5.88%
are:   	5.43%
--------



to:   	80.27%
the:   	10.76%
into:   	4.63%
onto:   	1.67%
with:   	1.62%
--------



a:   	99.61%
the:   	0.12%
an:   	0.08%
fresh:   	0.06%
sweet:   	0.05%
--------



slightly:   	43.11%
fresh:   	10.36%
sweet:   	8.58%
palate:   	4.51%
nose:   	4.13%
--------



off:   	31.91%
sweet:   	29.82%
bitter:   	19.65%
dusty:   	4.56%
waxy:   	3.72%
--------



,:   	98.43%
palate:   	0.27%
and:   	0.24%
nose:   	0.24%
-:   	0.19%
--------



slightly:   	74.83%
off:   	3.23%
intensely:   	2.8%
savory:   	2.66%
herbaceous:   	1.32%
--------



bitter:   	50.14%
sweet:   	27.12%
sour:   	4.93%
waxy:   	4.86%
off:   	4.35%
--------



edge:   	28.15%
almond:   	18.06%
lemon:   	14.6%
tone:   	6.56%
note:   	6.29%
--------



note:   	26.93%
and:   	19.17%
finish:   	17.23%
tone:   	14.85%
flavors:   	4.88%
--------



in:   	44.32%
and:   	34.5%
on:   	10.31%
that:   	5.25%
,:   	2.88%
--------



the:   	87.38%
this:   	12.6%
a:   	0.02%
its:   	0.0%
an:   	0.0%
--------



palate:   	49.43%
nose:   	30.13%
finish:   	20.03%
midpalate:   	0.37%
long:   	0.01%
--------



.:   	41.05%
of:   	34.55%
,:   	17.15%
in:   	3.0%
and:   	1.77%
--------



this:   	99.92%
the:   	0.05%
a:   	0.01%
lemon:   	0.0%
pineapple:   	0.0%
--------



off:   	69.24%
dry:   	16.72%
wine:   	2.42%
semi:   	1.88%
exceptionally:   	0.95%
--------



-:   	100.0%
dry:   	0.0%
by:   	0.0%
,:   	0.0%
a:   	0.0%
--------



dry:   	100.0%
sweet:   	0.0%
putting:   	0.0%
like:   	0.0%
with:   	0.0%
--------



riesling:   	68.43%
,:   	30.6%
wine:   	0.62%
auslese:   	0.12%
and:   	0.05%
--------



.:   	98.19%
,:   	1.34%
':   	0.29%
that:   	0.07%
by:   	0.03%
--------



it:   	87.08%
the:   	7.56%
a:   	2.19%
:   	1.72%
there:   	0.15%
--------



bit:   	66.08%
slightly:   	11.6%
dry:   	1.94%
bitter:   	1.88%
delicate:   	1.39%
--------



lean:   	50.59%
short:   	12.18%
watery:   	6.43%
sharp:   	5.99%
bitter:   	2.58%
--------



in:   	49.33%
on:   	33.63%
,:   	13.78%
and:   	2.33%
but:   	0.18%
--------



the:   	99.96%
its:   	0.04%
it:   	0.0%
a:   	0.0%
entrance:   	0.0%
--------



finish:   	98.64%
palate:   	1.27%
midpalate:   	0.08%
tongue:   	0.0%
bitter:   	0.0%
--------



,:   	70.98%
.:   	27.78%
is:   	0.42%
but:   	0.28%
with:   	0.15%
--------



:   	99.61%
drink:   	0.28%
it:   	0.1%
the:   	0.01%
serve:   	0.0%
--------

