In [1]:
import pandas as pd

In [2]:
reviews = pd.read_csv("data/general_data/amazon_reviews/Reviews.csv")

In [3]:
reviews.shape

(568454, 10)

In [4]:
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
reviews.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [6]:
# Remove null values and unneeded features
reviews = reviews.dropna()
reviews = reviews.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator','Score','Time'],axis=1)
reviews = reviews.reset_index(drop=True)

In [7]:
reviews.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [8]:
reviews.to_csv('data/general_data/amazon_reviews/reviews.txt', sep='\t', index=False,header=False)

In [5]:
from models.data.data_utils import contractions        # for contractions (link will be added here)

ImportError: attempted relative import with no known parent package

In [9]:
import re
from nltk.corpus import stopwords

def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [10]:
print("Uncleaned Text 1 : ", reviews.Text[0], "\nTexts -> length : ",len(reviews.Text))
print("Uncleaned Summary 1 : ", reviews.Summary[0], "\nSummaries -> length : ",len(reviews.Summary))

Uncleaned Text 1 :  I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. 
Texts -> length :  568401
Uncleaned Summary 1 :  Good Quality Dog Food 
Summaries -> length :  568401


In [11]:
import nltk
nltk.download('stopwords')
  
# Clean the summaries and texts
clean_summaries = []
for summary in reviews.Summary:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print("Summaries are complete.")

clean_texts = []
for text in reviews.Text:
    clean_texts.append(clean_text(text))
print("Texts are complete.")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Summaries are complete.
Texts are complete.


In [12]:
print("Clean Text 1 : ", clean_texts[0], "\nTexts -> length : ",len(clean_texts))
print("Clean Summary 1 : ", clean_summaries[0], "\nSummaries -> length : ",len(clean_summaries))

Clean Text 1 :  bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better 
Texts -> length :  568401
Clean Summary 1 :  good quality dog food 
Summaries -> length :  568401


In [13]:
def count_words(count_dict, text):
    '''Count the number of occurrences of each word in a set of text'''
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

In [14]:
# Find the number of times each word was used and the size of the vocabulary
word_counts = {}

count_words(word_counts, clean_summaries)
count_words(word_counts, clean_texts)
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 132883


In [15]:
# Load Conceptnet Numberbatch's (CN) embeddings, similar to GloVe, but probably better 
# (https://github.com/commonsense/conceptnet-numberbatch)
# for faster download refer to kaggle link -> https://www.kaggle.com/datasets/joeskimo/conceptnet -> ThankYou Kaggle -> 300 dim encoding
import numpy as np

embeddings_index = {}
with open('data/numberbatch/numberbatch-en-19.08.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

Word embeddings: 516783


In [16]:
# Find the number of words that are missing from CN, and are used more than our threshold.
missing_words = 0
threshold = 20

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from CN: 3845
Percent of words that are missing from vocabulary: 2.8899999999999997%


In [17]:
# Limit the vocab that we will use to words that appear ≥ threshold or are in GloVe

#dictionary to convert words to integers
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 132883
Number of words we will use: 60433
Percent of words we will use: 45.48%


In [18]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

60433


In [19]:
res = 0                                                                     # Nothing interesting (-_-)
for i in embeddings_index['king'] - embeddings_index['man'] + embeddings_index["woman"] - embeddings_index["queen"] :
    res+=i**2
res

0.8310931989857823

In [20]:
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use UNK's integer.
       Total the number of words and UNKs.
       Add EOS (EndOfSequence) token to the end of texts''' 
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

In [21]:
# Apply convert_to_ints to clean_summaries and clean_texts
word_count = 0
unk_count = 0

int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in review + summary:", word_count)
print("Total number of UNKs in review + summary:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in review + summary: 25679738
Total number of UNKs in review + summary: 189827
Percent of words that are UNK: 0.74%


In [22]:
def create_lengths(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [23]:
lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)

print("Summaries:")
print(lengths_summaries.describe())
print()
print("Texts:")
print(lengths_texts.describe())

Summaries:
              counts
count  568401.000000
mean        4.181645
std         2.657886
min         0.000000
25%         2.000000
50%         4.000000
75%         5.000000
max        48.000000

Texts:
              counts
count  568401.000000
mean       41.997266
std        42.521111
min         1.000000
25%        18.000000
50%        29.000000
75%        50.000000
max      2085.000000


In [24]:
def unk_counter(sentence):
    '''Counts the number of time UNK appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

In [25]:

# Sort the summaries and texts by the length of the texts, shortest to longest using ********** WORLD'S GREATEST SORTING TECHNIQUE ***********
# Limit the length of summaries and texts based on the min and max ranges.
# Remove reviews that include too many UNKs

sorted_summaries = []
sorted_texts = []
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0

for length in range(min(lengths_texts.counts), max_text_length): 
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= min_length and
            len(int_summaries[count]) <= max_summary_length and
            len(int_texts[count]) >= min_length and
            unk_counter(int_summaries[count]) <= unk_summary_limit and
            unk_counter(int_texts[count]) <= unk_text_limit and
            length == len(int_texts[count])                                     # cries for optimization
           ):
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])
        
# Compare lengths to ensure they match
print(len(sorted_summaries))
print(len(sorted_texts))

425965
425965


In [26]:
import tensorflow as tf


def model_inputs():
    '''Create input layers for the model'''
    
    input_data = tf.keras.Input(shape=(None,), dtype=tf.int32, name='input')
    targets = tf.keras.Input(shape=(None,), dtype=tf.int32, name='targets')
    lr = tf.keras.Input(shape=(), dtype=tf.float32, name='learning_rate')
    keep_prob = tf.keras.Input(shape=(), dtype=tf.float32, name='keep_prob')
    summary_length = tf.keras.Input(shape=(None,), dtype=tf.int32, name='summary_length')
    text_length = tf.keras.Input(shape=(None,), dtype=tf.int32, name='text_length')
    
    # Calculate max_summary_length using Keras backend
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    
    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

In [27]:

def process_encoding_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the beginning of each batch'''
    
    ending = target_data[:, :-1]  # Remove the last word ID from each batch
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], axis=1)

    return dec_input


In [28]:

def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    '''Create the encoding layer'''
    enc_state = []
    
    for layer in range(num_layers):
        # Define forward and backward LSTM cells with dropout
        cell_fw = tf.keras.layers.LSTMCell(rnn_size,
                                           kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2),
                                           dropout=1 - keep_prob)
        cell_bw = tf.keras.layers.LSTMCell(rnn_size,
                                           kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2),
                                           dropout=1 - keep_prob)

        # Create bidirectional RNN layer
        enc_output, enc_state_fw, enc_state_bw = tf.keras.layers.Bidirectional(
            tf.keras.layers.RNN([cell_fw, cell_bw], return_sequences=True, return_state=True)
        )(rnn_inputs, mask=tf.sequence_mask(sequence_length), training=True)

        # Concatenate forward and backward outputs
        enc_output = tf.concat(enc_output, axis=-1)
        enc_state.extend([tf.concat([enc_state_fw[i], enc_state_bw[i]], axis=-1) for i in range(2)])
        
    return enc_output, enc_state


In [29]:
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
                            vocab_size, max_summary_length):
    '''Create the training logits'''
    
    # Create TrainingHelper
    training_helper = tf.keras.layers.Masking(mask_value=0.0)(
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=dec_embed_input.shape[-1], mask_zero=True)(dec_embed_input),
        sequence_length=summary_length
    )

    # Create BasicDecoder
    training_decoder = tf.keras.layers.RNN(dec_cell, return_sequences=True, return_state=True)(
        training_helper, initial_state=initial_state
    )
    
    # Apply output layer
    training_logits = output_layer(training_decoder[0])
    
    return training_logits


In [30]:

def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    '''Create the inference logits'''
    
    # Tile start tokens
    start_tokens = tf.tile(tf.expand_dims([start_token], 0), [batch_size], name='start_tokens')
    
    # Create GreedyEmbeddingHelper
    inference_helper = tf.keras.layers.GreedyEmbeddingHelper(embeddings,
                                                             start_tokens,
                                                             end_token)
    
    # Create BasicDecoder
    inference_decoder = tf.keras.layers.RNN(dec_cell, return_sequences=True, return_state=True)(
        inference_helper.initialize(), initial_state=initial_state
    )
    
    # Apply output layer
    inference_logits = output_layer(inference_decoder[0])
    
    return inference_logits


In [31]:
from tensorflow_addons.seq2seq import BahdanauAttention, AttentionWrapper
from tensorflow.keras.layers import LSTMCell, Dense, Masking, Embedding, Dropout

def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, 
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    '''Create the decoding cell and attention for the training and inference decoding layers'''
    
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = LSTMCell(rnn_size,
                            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = Dropout(1 - keep_prob)(lstm)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    
    attn_mech = BahdanauAttention(rnn_size,
                                  enc_output,
                                  text_length,
                                  normalize=False,
                                  name='BahdanauAttention')

    dec_cell = AttentionWrapper(dec_cell,
                                attn_mech,
                                rnn_size)
            
    initial_state = dec_cell.zero_state(batch_size=batch_size, dtype=tf.float32).clone(cell_state=enc_state[0])

    with tf.variable_scope("decode"):
        training_decoder = training_decoding_layer(dec_embed_input, 
                                                  summary_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_summary_length)
        
        training_logits,_ ,_ = tf.compat.v1.estimator.seq2seq.dynamic_decode(training_decoder,
                                  output_time_major=False,
                                  impute_finished=True,
                                  maximum_iterations=max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_decoder = inference_decoding_layer(embeddings,  
                                                    vocab_to_int['<GO>'], 
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)
        
        inference_logits,_ ,_ = tf.compat.v1.estimator.seq2seq.dynamic_decode(inference_decoder,
                                  output_time_major=False,
                                  impute_finished=True,
                                  maximum_iterations=max_summary_length)

    return training_logits, inference_logits



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [32]:
import tensorflow as tf

def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    '''Use the previous functions to create the training and inference logits'''
    
    # Use Numberbatch's embeddings and the newly created ones as our embeddings
    embeddings = tf.constant(word_embedding_matrix)
    # embeddings = word_embedding_matrix
    
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    
    # Define output layer
    output_layer = tf.keras.layers.Dense(vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    
    training_logits, inference_logits = decoding_layer(dec_embed_input, 
                                                       embeddings,
                                                       enc_output,
                                                       enc_state, 
                                                       vocab_size, 
                                                       text_length, 
                                                       summary_length, 
                                                       max_summary_length,
                                                       rnn_size, 
                                                       vocab_to_int, 
                                                       keep_prob, 
                                                       batch_size,
                                                       num_layers,
                                                       output_layer)
    
    return training_logits, inference_logits


In [33]:
def pad_sentence_batch(sentence_batch):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [34]:
def get_batches(summaries, texts, batch_size):
    """Batch summaries, texts, and the lengths of their sentences together"""
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        
        # Need the lengths for the _lengths parameters
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

In [35]:
# Set the Hyperparameters
epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

In [36]:
# Load the model inputs    
input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

# Create the training and inference logits
training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                  targets, 
                                                  keep_prob,   
                                                  text_length,
                                                  summary_length,
                                                  max_summary_length,
                                                  len(vocab_to_int) + 1,
                                                  rnn_size, 
                                                  num_layers, 
                                                  vocab_to_int,
                                                  batch_size)

# Create the weights for sequence_loss
masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

# Loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss = loss_object(targets, training_logits, sample_weight=masks)

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate)

# Gradient Clipping
gradients = optimizer.get_gradients(loss, [training_logits])
clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_norm)
train_op = optimizer.apply_gradients(zip(clipped_gradients, [training_logits]))

# Define logits and predictions as identity tensors
training_logits = tf.identity(training_logits, name='logits')
inference_logits = tf.identity(inference_logits, name='predictions')

print("Graph is built.")


TypeError: Keras symbolic inputs/outputs do not implement `__len__`. You may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model. This error will also get raised if you try asserting a symbolic input/output directly.