In [1]:
!pip install kaggle



In [2]:
!mkdir ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [3]:
!kaggle datasets download zynicide/wine-reviews

Downloading wine-reviews.zip to /content
 65% 33.0M/50.9M [00:00<00:00, 121MB/s] 
100% 50.9M/50.9M [00:00<00:00, 133MB/s]


In [4]:
!unzip *.zip

Archive:  wine-reviews.zip
  inflating: winemag-data-130k-v2.csv  
  inflating: winemag-data-130k-v2.json  
  inflating: winemag-data_first150k.csv  


In [5]:
%load_ext autoreload
%autoreload 2
import numpy as np
import re
import string
import json
# For viz
from IPython.display import display, HTML
# For model
import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks

In [6]:
VOCAB_SIZE = 10000
MAX_LEN = 80
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 5

In [7]:
with open("/content/winemag-data-130k-v2.json") as json_data:
  wine_data = json.load(json_data)

wine_data[1]

{'points': '87',
 'title': 'Quinta dos Avidagos 2011 Avidagos Red (Douro)',
 'description': "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'taster_name': 'Roger Voss',
 'taster_twitter_handle': '@vossroger',
 'price': 15,
 'designation': 'Avidagos',
 'variety': 'Portuguese Red',
 'region_1': None,
 'region_2': None,
 'province': 'Douro',
 'country': 'Portugal',
 'winery': 'Quinta dos Avidagos'}

In [8]:
filtered_data = [
    "wine_review : "
    + x["country"]
    + " : "
    + x["province"]
    + " : "
    + x["variety"]
    + " : "
    + x["description"]
    for x in wine_data
    if x["country"] is not None
    and x["province"] is not None
    and x["variety"] is not None
    and x["description"] is not None
]

In [9]:
n_wines = len(filtered_data)

In [10]:
print(n_wines)

129907


In [11]:
example = filtered_data[25]
print(example)

wine_review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard-designated Pinot that hails from a high-elevation site. Small in production, it offers intense, full-bodied raspberry and blackberry steeped in smoky spice and smooth texture.


# Tokenization

In [12]:
def pad_punctuation(s):
  s = re.sub(f"([{string.punctuation},'\n'])",r" \1 ",s)
  s = re.sub(" +", " ", s)
  return s

text_data = [pad_punctuation(x) for x in filtered_data]

In [13]:
example_data = text_data[25]
example_data

'wine _ review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard - designated Pinot that hails from a high - elevation site . Small in production , it offers intense , full - bodied raspberry and blackberry steeped in smoky spice and smooth texture . '

In [14]:
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    # batch data point creation
    .batch(BATCH_SIZE)
    # shuffling
    .shuffle(1000)
)

In [15]:
vectorize_layer = layers.TextVectorization(
    standardize = 'lower',
    max_tokens = VOCAB_SIZE,
    output_mode = 'int',
    output_sequence_length = MAX_LEN+1
)

In [16]:
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [17]:
for i, word in enumerate(vocab[:10]):
  print(f"{i}: {word}")

0: 
1: [UNK]
2: :
3: ,
4: .
5: and
6: the
7: wine
8: a
9: of


In [18]:
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[   7   11   10    2   21    2   30    2   44   63    2   56    5  244
 4146  454  635   27    9  498  500  668   18   13  143   15 2215   44
   26 2485   33    8  224   15 2214  949    4  595   18  988    3   16
   76  238    3   65   15   83   98    5   75 2634   18  199   50    5
  126   78    4    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


In [19]:
def prepare_inputs(text):
  text = tf.expand_dims(text, -1)
  tokenized_sentences = vectorize_layer(text)
  x = tokenized_sentences[:,:-1]
  y = tokenized_sentences[:,1:]
  return x, y

train_ds = text_ds.map(prepare_inputs)

In [20]:
example_input_output = train_ds.take(1).get_single_element()
example_input_output[0][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([   7,   11,   10,    2,   43,    2,  345,    2,  558,    2,    6,
         61,   14,  654,   34,   42,  314,   96,  183,    4,    6,   29,
        140, 1266,  554,   96,    5,  152,  278,    4,   13, 3992,    6,
         97,  193,    9,    6,  107,    4,    8,  274, 2155,  597, 8724,
         13,   33, 2387,  326,   58,    5, 1073,    6,  554,   23,   24,
          6,  228,   15,   59,   32,    4,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])>

In [21]:
example_input_output[0][1]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([   7,   11,   10,    2,   43,    2,  345,    2,  558,    2,  107,
          5,  575, 6823,   90,  346, 3683,   27,   24,    6,   61,    4,
         24,    6,   29,    3, 1014,   50, 2033, 1683,    2,   91,   52,
         72,   52,  216,    4,    6,   65,   23, 1700,   24,    6,  130,
         15,   73,   29,  649, 2937,   33,   26,  216,  270,    2,   13,
         14,  110,    3,  248,    5, 2171,    4,   36,   81,   27,   40,
         89,  665, 1730,    4,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0])>

# Creation of Causal Attention Mask

In [22]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
  i = tf.range(n_dest)[:,None ]
  j = tf.range(n_src)
  m = i>= j - n_src + n_dest
  mask = tf.cast(m,dtype)
  mask = tf.reshape(mask, [1,n_dest, n_src])
  mult = tf.concat([tf.expand_dims(batch_size, -1),tf.constant([1,1],dtype =tf.int32)],0)
  return tf.tile(mask, mult)

np.transpose(causal_attention_mask(1,10,10,dtype = tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int32)

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.attn = layers.MultiHeadAttention(
            num_heads, key_dim, output_shape=embed_dim
        )
        self.dropout_1 = layers.Dropout(self.dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.ff_dim, activation="relu")
        self.ffn_2 = layers.Dense(self.embed_dim)
        self.dropout_2 = layers.Dropout(self.dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(
            batch_size, seq_len, seq_len, tf.bool
        )
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=causal_mask,
            return_attention_scores=True,
        )
        attention_output = self.dropout_1(attention_output)
        out1 = self.ln_1(inputs + attention_output)
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)
        return (self.ln_2(out1 + ffn_output), attention_scores)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "key_dim": self.key_dim,
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "dropout_rate": self.dropout_rate,
            }
        )
        return config

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [None]:
inputs = layers.Input(shape=(None,), dtype=tf.int32)
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x, attention_scores = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM
)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
gpt.compile("adam", loss=[losses.SparseCategoricalCrossentropy(), None])

In [None]:
if LOAD_MODEL:
    # model.load_weights('./models/model')
    gpt = models.load_model("./models/gpt", compile=True)

In [None]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y, att = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append(
                {
                    "prompt": start_prompt,
                    "word_probs": probs,
                    "atts": att[0, :, -1, :],
                }
            )
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("wine review", max_tokens=80, temperature=1.0)

In [None]:
# Create a model save checkpoint
# callback used to save model checkpoints during training
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    # model checkpoint should be saved at the end of each epoch
    save_freq="epoch",
    verbose=0,
)
# visualizing training metrics and other information in TensorBoard
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Tokenize starting prompt
text_generator = TextGenerator(vocab)

In [None]:
gpt.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

In [None]:
# Save the final model
gpt.save("./models/gpt")

In [None]:
# info - A list of dictionaries containing information about generated text, including the prompt, word probabilities, and attention scores.
# tok_k specifies the number of top words to display along with their probabilities
def print_probs(info, vocab, top_k=5):
    for i in info:
        highlighted_text = []
        for word, att_score in zip(
            i["prompt"].split(), np.mean(i["atts"], axis=0)
        ):
            highlighted_text.append(
                '<span style="background-color:rgba(135,206,250,'
                + str(att_score / max(np.mean(i["atts"], axis=0)))
                + ');">'
                + word
                + "</span>"
            )
        highlighted_text = " ".join(highlighted_text)
        display(HTML(highlighted_text))

        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [None]:
info = text_generator.generate(
    "wine review : us", max_tokens=80, temperature=1.0
)

In [None]:
info = text_generator.generate(
    "wine review : italy", max_tokens=80, temperature=0.5
)

In [None]:
info = text_generator.generate(
    "wine review : germany", max_tokens=80, temperature=0.5
)
print_probs(info, vocab)

In [None]:
info = text_generator.generate(
    "wine review : germany", max_tokens=80, temperature=0.5
)
print_probs(info, vocab,top_k = 4)