In [34]:
import pandas as pd
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [36]:


# Step 1: Install the Kaggle API client
# Uncomment the following line if you haven't installed the Kaggle API client
# !pip install kaggle

# Step 2: Set up the environment variables
os.environ['KAGGLE_USERNAME'] = ""
os.environ['KAGGLE_KEY'] = ""

# Step 3: Use the Kaggle API to download the Wine Reviews dataset
# The identifier for the Wine Reviews dataset
dataset_identifier = 'zynicide/wine-reviews'

# Download and unzip the dataset
kaggle.api.dataset_download_files(dataset_identifier, path='./wine_reviews', unzip=True)

print("Wine Reviews dataset downloaded successfully")


Dataset URL: https://www.kaggle.com/datasets/zynicide/wine-reviews
Wine Reviews dataset downloaded successfully


In [38]:
# Load the dataset from the CSV file
# The dataset is expected to be in the directory './wine_reviews' with the filename 'winemag-data-130k-v2.csv'
wine_reviews_df = pd.read_csv('./wine_reviews/winemag-data-130k-v2.csv')

# Create a list containing the text descriptions of each wine
# The 'description' column in the DataFrame contains the text descriptions of the wines
wine_descriptions = wine_reviews_df['description'].tolist()

# Print confirmation message
print("Wine descriptions loaded successfully")

# Print the first few descriptions to verify
print(wine_descriptions[:5])


Wine descriptions loaded successfully
["Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.", "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.", 'Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.', 'Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.', "Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think o

In [42]:
def pad_punctuation(text):
    """
    This function takes a string and pads punctuation marks with spaces
    so that each punctuation mark is treated as a separate word.
    
    Parameters:
    text (str): The input text string to process.
    
    Returns:
    str: The processed text string with padded punctuation.
    """
    # Iterate over each punctuation mark defined in the string.punctuation constant
    for punctuation in string.punctuation:
        # Replace the punctuation mark with the same mark padded with spaces
        text = text.replace(punctuation, f' {punctuation} ')
    return text

# Apply the padding function to each wine description in the list
# This will create a new list with padded descriptions
padded_wine_descriptions = [pad_punctuation(description) for description in wine_descriptions]

# Print confirmation message
print("Punctuation padded successfully")

# Print the first few padded descriptions to verify
print(padded_wine_descriptions[:5])


Punctuation padded successfully
["Aromas include tropical fruit ,  broom ,  brimstone and dried herb .  The palate isn ' t overly expressive ,  offering unripened apple ,  citrus and dried sage alongside brisk acidity . ", "This is ripe and fruity ,  a wine that is smooth while still structured .  Firm tannins are filled out with juicy red berry fruits and freshened with acidity .  It ' s  already drinkable ,  although it will certainly be better from 2016 . ", 'Tart and snappy ,  the flavors of lime flesh and rind dominate .  Some green pineapple pokes through ,  with crisp acidity underscoring the flavors .  The wine was all stainless - steel fermented . ', 'Pineapple rind ,  lemon pith and orange blossom start off the aromas .  The palate is a bit more opulent ,  with notes of honey - drizzled guava and mango giving way to a slightly astringent ,  semidry finish . ', "Much like the regular bottling from 2012 ,  this comes across as rather rough and tannic ,  with rustic ,  earthy , 

In [44]:
# Define parameters for the TextVectorization layer
max_tokens = 20000  # Maximum number of tokens to keep, based on word frequency
output_sequence_length = 100  # Length to pad or clip the output sequences

# Create the TextVectorization layer
vectorize_layer = TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=output_sequence_length
)

# Adapt the TextVectorization layer to the padded descriptions
vectorize_layer.adapt(padded_wine_descriptions)

# Apply the TextVectorization layer to the padded descriptions
vectorized_wine_descriptions = vectorize_layer(padded_wine_descriptions)

print("Tokenization and padding/clipping completed successfully")
# Print the first few tokenized and padded/clipped descriptions to verify
print(vectorized_wine_descriptions[:5])


Tokenization and padding/clipping completed successfully
tf.Tensor(
[[   18   864   202    15  1978  3610     2    90   107     3    19   898
    197  1018   919   307 14577    49    63     2    90   419   141   359
     20     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [    7     8    28     2    77     4    10    17     8   101    62   126
    144    80    24    29  1050    92     6    74    32    43    50     2
   3739     6    20     9    14   375   695   292     9    89   676    98
    439    22   420     0     0     0     0     0     0     0     0     0
      0     0    

In [46]:
import numpy as np

# Function to create inputs and outputs for training
def create_training_data(vectorized_texts):
    """
    This function creates a training set where the inputs are the tokenized text strings
    and the outputs to predict are the same strings shifted by one token.

    Parameters:
    vectorized_texts (tf.Tensor): The tokenized and padded text strings.

    Returns:
    tuple: A tuple containing the inputs and outputs for training.
    """
    inputs = vectorized_texts[:, :-1]  # All tokens except the last one
    outputs = vectorized_texts[:, 1:]  # All tokens except the first one
    return inputs, outputs

# Apply the function to the vectorized descriptions
inputs, outputs = create_training_data(vectorized_wine_descriptions)

# Print the shapes of the inputs and outputs to verify
print(f"Inputs shape: {inputs.shape}")
print(f"Outputs shape: {outputs.shape}")

# Print the first few examples to verify
print("First few input sequences:")
print(inputs[:5])
print("First few output sequences:")
print(outputs[:5])


Inputs shape: (129971, 99)
Outputs shape: (129971, 99)
First few input sequences:
tf.Tensor(
[[   18   864   202    15  1978  3610     2    90   107     3    19   898
    197  1018   919   307 14577    49    63     2    90   419   141   359
     20     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0]
 [    7     8    28     2    77     4    10    17     8   101    62   126
    144    80    24    29  1050    92     6    74    32    43    50     2
   3739     6    20     9    14   375   695   292     9    89   676    98
    439    22   420     0     0     0     0     0     0     0     0     

In [58]:
from tensorflow.keras.layers import MultiHeadAttention, Input, Dense
from tensorflow.keras.models import Model

# Define input shapes
query_input = Input(shape=(None, 128), name='query')
value_input = Input(shape=(None, 128), name='value')  # Should match key_dim for keys/queries

# Create a MultiHeadAttention layer
multi_head_attention = MultiHeadAttention(
    num_heads=4,
    key_dim=128,
    value_dim=64,
    name='multi_head_attention'
)

# Apply the MultiHeadAttention layer
attention_output = multi_head_attention(query=query_input, value=value_input, key=query_input)

# Define a simple model
model = Model(inputs=[query_input, value_input], outputs=attention_output)

# Print the model summary
model.summary()


In [62]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Creates a causal attention mask to be used in a multi-head attention layer.

    Parameters:
    - batch_size (int): The number of batches.
    - n_dest (int): The size of the destination sequence (target sequence length).
    - n_src (int): The size of the source sequence (input sequence length).
    - dtype (tf.DType): The data type of the mask tensor.

    Returns:
    - tf.Tensor: A causal attention mask of shape (batch_size, n_dest, n_src).
    """
    # Create range tensors for destination and source
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    
    # Create a mask where positions i >= j are set to True
    m = i >= j - n_src + n_dest
    
    # Cast the mask to the desired dtype
    mask = tf.cast(m, dtype)
    
    # Reshape the mask to add a batch dimension
    mask = tf.reshape(mask, [1, n_dest, n_src])
    
    # Create a multiplier for tiling the mask to match the batch size
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    
    # Tile the mask to match the batch size
    return tf.tile(mask, mult)

# Testing the causal_attention_mask function
mask = causal_attention_mask(1, 10, 10, dtype=tf.int32)
print("Causal Attention Mask (1 batch, 10 dest, 10 src):")
print(np.transpose(mask[0]))

# Ensure the mask is as expected for visualization purposes
np.set_printoptions(threshold=np.inf)
print(np.transpose(mask[0].numpy()))


Causal Attention Mask (1 batch, 10 dest, 10 src):
[[1 1 1 1 1 1 1 1 1 1]
 [0 1 1 1 1 1 1 1 1 1]
 [0 0 1 1 1 1 1 1 1 1]
 [0 0 0 1 1 1 1 1 1 1]
 [0 0 0 0 1 1 1 1 1 1]
 [0 0 0 0 0 1 1 1 1 1]
 [0 0 0 0 0 0 1 1 1 1]
 [0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 1]]
[[1 1 1 1 1 1 1 1 1 1]
 [0 1 1 1 1 1 1 1 1 1]
 [0 0 1 1 1 1 1 1 1 1]
 [0 0 0 1 1 1 1 1 1 1]
 [0 0 0 0 1 1 1 1 1 1]
 [0 0 0 0 0 1 1 1 1 1]
 [0 0 0 0 0 0 1 1 1 1]
 [0 0 0 0 0 0 0 1 1 1]
 [0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 1]]


In [71]:
from tensorflow.keras import layers

# Ensure the causal_attention_mask function is defined
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Creates a causal attention mask to be used in a multi-head attention layer.

    Parameters:
    - batch_size (int): The number of batches.
    - n_dest (int): The size of the destination sequence (target sequence length).
    - n_src (int): The size of the source sequence (input sequence length).
    - dtype (tf.DType): The data type of the mask tensor.

    Returns:
    - tf.Tensor: A causal attention mask of shape (batch_size, n_dest, n_src).
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0)
    return tf.tile(mask, mult)

class TransformerBlock(layers.Layer):
    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1):
        """
        Initializes the TransformerBlock layer.

        Parameters:
        - num_heads (int): Number of attention heads.
        - key_dim (int): Size of each attention head for query and key.
        - embed_dim (int): Dimensionality of the output space.
        - ff_dim (int): Dimensionality of the feed-forward network.
        - dropout_rate (float): Dropout rate.
        """
        super(TransformerBlock, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        
        # Multi-head attention layer
        self.attn = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=key_dim, value_dim=key_dim, output_shape=embed_dim
        )
        
        # Dropout and normalization layers
        self.dropout_1 = layers.Dropout(dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        
        # Feed-forward network
        self.ffn_1 = layers.Dense(ff_dim, activation="relu")
        self.ffn_2 = layers.Dense(embed_dim)
        
        # Dropout and normalization layers
        self.dropout_2 = layers.Dropout(dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)
    
    def call(self, inputs):
        """
        Forward pass for the TransformerBlock.

        Parameters:
        - inputs (tf.Tensor): Input tensor of shape (batch_size, seq_len, embed_dim).

        Returns:
        - tf.Tensor: Output tensor of shape (batch_size, seq_len, embed_dim).
        - tf.Tensor: Attention scores of shape (batch_size, num_heads, seq_len, seq_len).
        """
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        
        # Create a causal mask
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        
        # Apply multi-head attention
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=causal_mask,
            return_attention_scores=True
        )
        
        # Apply dropout and layer normalization
        attention_output = self.dropout_1(attention_output)
        out1 = self.ln_1(inputs + attention_output)
        
        # Apply feed-forward network
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)
        
        # Apply layer normalization and return the output and attention scores
        return self.ln_2(out1 + ffn_output), attention_scores


In [75]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        """
        Initializes the TokenAndPositionEmbedding layer.

        Parameters:
        - maxlen (int): Maximum length of the input sequences.
        - vocab_size (int): Size of the vocabulary.
        - embed_dim (int): Dimensionality of the embeddings.
        """
        super(TokenAndPositionEmbedding, self).__init__()
        
        # Store parameters
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        
        # Embedding layer for token embeddings
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,  # Vocabulary size
            output_dim=embed_dim  # Embedding dimension
        )
        
        # Embedding layer for positional embeddings
        self.pos_emb = layers.Embedding(
            input_dim=maxlen,  # Maximum length of the sequence
            output_dim=embed_dim  # Embedding dimension
        )
    
    def call(self, x):
        """
        Forward pass for the TokenAndPositionEmbedding layer.

        Parameters:
        - x (tf.Tensor): Input tensor of shape (batch_size, seq_len).

        Returns:
        - tf.Tensor: Output tensor of shape (batch_size, seq_len, embed_dim).
        """
        # Get the length of the input sequences (seq_len)
        maxlen = tf.shape(x)[-1]
        
        # Create a tensor of positions [0, 1, 2, ..., maxlen-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        
        # Apply position embedding to the positions tensor
        positions = self.pos_emb(positions)
        
        # Apply token embedding to the input tensor x
        x = self.token_emb(x)
        
        # Add the token embeddings and position embeddings
        # The broadcasting mechanism will add the positional embeddings to each sequence in the batch
        return x + positions

Now we are ready to build and train our GPT model! To put everything together, we
need to pass our input text through the token and position embedding layer, then
through our Transformer block. The final output of the network is a simple Dense
layer with softmax activation over the number of words in the vocabulary.

In [None]:
from tensorflow.keras import models, losses

MAX_LEN = 80
VOCAB_SIZE = 10000
EMBEDDING_DIM = 256
N_HEADS = 2
KEY_DIM = 256
FEED_FORWARD_DIM = 256

inputs = layers.Input(shape=(None,), dtype=tf.int32)
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x, attention_scores = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM
)(x)
outputs = layers.Dense(VOCAB_SIZE, activation='softmax')(x)

gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
gpt.compile(optimizer="adam", loss=[losses.SparseCategoricalCrossentropy(from_logits=True), None])

# Prepare the dataset for training
BATCH_SIZE = 32
BUFFER_SIZE = 5000

# Create TensorFlow Dataset from the processed inputs and outputs
train_inputs, train_outputs = create_training_data(vectorized_wine_descriptions)

# Ensure label values are within the vocabulary range
train_inputs = np.clip(train_inputs, 0, VOCAB_SIZE - 1)
train_outputs = np.clip(train_outputs, 0, VOCAB_SIZE - 1)

train_ds = tf.data.Dataset.from_tensor_slices((train_inputs, train_outputs))
train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Use callbacks for early stopping and model checkpointing
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint('gpt_model.keras', save_best_only=True)
]

# Train the model
gpt.fit(train_ds, epochs=5, callbacks=callbacks)
