In [1]:
from collections import Counter
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk
import pandas as pd
import re
import string
import tensorflow as tf

2023-01-10 17:05:34.261131: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# download the stop words
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sergani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# load reviews into dataframe
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

df.shape


(1000, 2)

In [4]:
df.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
print((df.Liked == 0).sum())  # bad
print((df.Liked == 1).sum())  # good


500
500


In [6]:
# Text preprocessing
def remove_urls(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

# check the string module here
# https://www.digitalocean.com/community/tutorials/python-string-module


def remove_punct(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)


def remove_mention_hashtag(text):
    mention_hashtag = re.compile(r'(@|#)[a-zA-Z0-9_]*')
    return mention_hashtag.sub(r'', text)


def remove_spaces(text):
    leading_spaces = re.compile(r'^\s+')
    trailing_spaces = re.compile(r'\s+$')
    text = leading_spaces.sub('', text)

    return trailing_spaces.sub('', text)


def remove_stopwords(text):
    cleaned_words = [word.lower()
                     for word in text.split() if word.lower() not in stop_words]
    return ' '.join(cleaned_words)


string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# clean up all the reviews
df['Review'] = df.Review.map(remove_urls)
df['Review'] = df.Review.map(remove_mention_hashtag)
df['Review'] = df.Review.map(remove_punct)
df['Review'] = df.Review.map(remove_spaces)
df['Review'] = df.Review.map(remove_stopwords)


In [8]:
df.head()


Unnamed: 0,Review,Liked
0,wow loved place,1
1,crust good,0
2,tasty texture nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great prices,1


In [9]:
# count all the unique words
counter = Counter()

for text in df['Review'].values:
    for word in text.split():
        counter[word] += 1

counter


Counter({'wow': 3,
         'loved': 10,
         'place': 106,
         'crust': 2,
         'good': 95,
         'tasty': 11,
         'texture': 3,
         'nasty': 3,
         'stopped': 2,
         'late': 2,
         'may': 5,
         'bank': 1,
         'holiday': 1,
         'rick': 1,
         'steve': 1,
         'recommendation': 3,
         'selection': 10,
         'menu': 15,
         'great': 70,
         'prices': 10,
         'getting': 8,
         'angry': 1,
         'want': 13,
         'damn': 4,
         'pho': 6,
         'honeslty': 1,
         'didnt': 13,
         'taste': 13,
         'fresh': 14,
         'potatoes': 3,
         'like': 46,
         'rubber': 1,
         'could': 16,
         'tell': 6,
         'made': 17,
         'ahead': 1,
         'time': 42,
         'kept': 5,
         'warmer': 1,
         'fries': 9,
         'touch': 2,
         'service': 83,
         'prompt': 1,
         'would': 28,
         'go': 43,
         'back': 61,
  

In [10]:
# number of unique words
len(counter)


1965

In [11]:
# top common words
counter.most_common(10)


[('food', 124),
 ('place', 106),
 ('good', 95),
 ('service', 83),
 ('great', 70),
 ('back', 61),
 ('like', 46),
 ('go', 43),
 ('time', 42),
 ('really', 36)]

In [12]:
num_uniq_words = len(counter)


In [13]:
# split dataset to train and validation datasets
train_size = int(df.shape[0] * 0.8)  # 80%

train_df = df[:train_size]
val_df = df[train_size:]

# split text and labels
train_sentences = train_df['Review'].values
train_labels = train_df['Liked'].values

val_sentences = val_df['Review'].values
val_labels = val_df['Liked'].values


In [14]:
train_sentences, train_labels


(array(['wow loved place', 'crust good', 'tasty texture nasty',
        'stopped late may bank holiday rick steve recommendation loved',
        'selection menu great prices', 'getting angry want damn pho',
        'honeslty didnt taste fresh',
        'potatoes like rubber could tell made ahead time kept warmer',
        'fries great', 'great touch', 'service prompt', 'would go back',
        'cashier care ever say still ended wayyy overpriced',
        'tried cape cod ravoli chicken cranberrymmmm',
        'disgusted pretty sure human hair', 'shocked signs indicate cash',
        'highly recommended', 'waitress little slow service',
        'place worth time let alone vegas', 'like', 'burrittos blah',
        'food amazing', 'service also cute',
        'could care less interior beautiful', 'performed',
        'thats rightthe red velvet cakeohhh stuff good',
        'never brought salad asked',
        'hole wall great mexican street tacos friendly staff',
        'took hour get foo

In [15]:
train_sentences.shape, val_sentences.shape


((800,), (200,))

In [16]:
# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)


In [17]:
# word indices
word_index = tokenizer.word_index


In [18]:
word_index


{'food': 1,
 'good': 2,
 'place': 3,
 'service': 4,
 'great': 5,
 'back': 6,
 'like': 7,
 'time': 8,
 'go': 9,
 'really': 10,
 'never': 11,
 'best': 12,
 'also': 13,
 'friendly': 14,
 'one': 15,
 'delicious': 16,
 'ever': 17,
 'restaurant': 18,
 'nice': 19,
 'dont': 20,
 'amazing': 21,
 'vegas': 22,
 'would': 23,
 'staff': 24,
 'minutes': 25,
 'definitely': 26,
 'chicken': 27,
 'ive': 28,
 'im': 29,
 'disappointed': 30,
 'get': 31,
 'first': 32,
 'well': 33,
 'going': 34,
 'even': 35,
 'made': 36,
 'pretty': 37,
 'bad': 38,
 'got': 39,
 'came': 40,
 'us': 41,
 'much': 42,
 'always': 43,
 'experience': 44,
 'steak': 45,
 'menu': 46,
 'taste': 47,
 'could': 48,
 'salad': 49,
 'quality': 50,
 'ordered': 51,
 'stars': 52,
 'times': 53,
 'love': 54,
 'fantastic': 55,
 'wont': 56,
 'pizza': 57,
 'tasty': 58,
 'didnt': 59,
 'fresh': 60,
 'say': 61,
 'burger': 62,
 'think': 63,
 'way': 64,
 'night': 65,
 'come': 66,
 'loved': 67,
 'selection': 68,
 'prices': 69,
 'worst': 70,
 'enough': 71,
 '

In [19]:
# convert sentences to sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)


In [20]:
train_sentences[:6], train_sequences[:6]


(array(['wow loved place', 'crust good', 'tasty texture nasty',
        'stopped late may bank holiday rick steve recommendation loved',
        'selection menu great prices', 'getting angry want damn pho'],
       dtype=object),
 [[267, 67, 3],
  [377, 2],
  [58, 378, 379],
  [380, 381, 268, 648, 649, 650, 651, 269, 67],
  [68, 46, 5, 69],
  [164, 652, 79, 208, 130]])

In [21]:
# pad all sequences to ensure they all have the same length for the training model
max_length = 0
for sequence in train_sequences:
    # print(sequence)
    if len(sequence) > max_length:
        print(sequence)
        max_length = len(sequence)

max_length

[267, 67, 3]
[380, 381, 268, 648, 649, 650, 651, 269, 67]
[382, 7, 654, 48, 209, 36, 655, 8, 270, 656]
[133, 165, 31, 1, 396, 211, 18, 1, 680, 212, 681, 277, 111, 7, 278, 397]
[991, 151, 523, 523, 81, 85, 151, 992, 81, 524, 85, 151, 123, 993, 346, 300, 994]


17

In [22]:
# increase the max length a bit
max_length = 20

train_sequences_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding='post', truncating='post')
val_sequences_padded = pad_sequences(
    val_sequences, maxlen=max_length, padding='post', truncating='post')

train_sequences_padded.shape, val_sequences_padded.shape


((800, 20), (200, 20))

In [23]:
print(train_sentences[5])
print(train_sequences[5])
print(train_sequences_padded[5])

getting angry want damn pho
[164, 652, 79, 208, 130]
[164 652  79 208 130   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [24]:
# reverse the word index for lookups later on
reverse_word_index = dict((v, k) for (k, v) in word_index.items())
reverse_word_index

{1: 'food',
 2: 'good',
 3: 'place',
 4: 'service',
 5: 'great',
 6: 'back',
 7: 'like',
 8: 'time',
 9: 'go',
 10: 'really',
 11: 'never',
 12: 'best',
 13: 'also',
 14: 'friendly',
 15: 'one',
 16: 'delicious',
 17: 'ever',
 18: 'restaurant',
 19: 'nice',
 20: 'dont',
 21: 'amazing',
 22: 'vegas',
 23: 'would',
 24: 'staff',
 25: 'minutes',
 26: 'definitely',
 27: 'chicken',
 28: 'ive',
 29: 'im',
 30: 'disappointed',
 31: 'get',
 32: 'first',
 33: 'well',
 34: 'going',
 35: 'even',
 36: 'made',
 37: 'pretty',
 38: 'bad',
 39: 'got',
 40: 'came',
 41: 'us',
 42: 'much',
 43: 'always',
 44: 'experience',
 45: 'steak',
 46: 'menu',
 47: 'taste',
 48: 'could',
 49: 'salad',
 50: 'quality',
 51: 'ordered',
 52: 'stars',
 53: 'times',
 54: 'love',
 55: 'fantastic',
 56: 'wont',
 57: 'pizza',
 58: 'tasty',
 59: 'didnt',
 60: 'fresh',
 61: 'say',
 62: 'burger',
 63: 'think',
 64: 'way',
 65: 'night',
 66: 'come',
 67: 'loved',
 68: 'selection',
 69: 'prices',
 70: 'worst',
 71: 'enough',
 7

In [25]:
def decode_sequence(seq):
    return ' '.join(reverse_word_index.get(idx, '?') for idx in seq)

In [26]:
# let's give this a test
print(train_sentences[5])
print(train_sequences[5])
print(decode_sequence(train_sequences[5]))

getting angry want damn pho
[164, 652, 79, 208, 130]
getting angry want damn pho


In [119]:
# create the RNN model
# reset all previous sessions
keras.backend.clear_session()
model = keras.models.Sequential()

model.add(layers.Embedding(num_uniq_words, 32, input_length=max_length))
model.add(layers.LSTM(32, dropout=0.1, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.LSTM(64, dropout=0.1, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.LSTM(128, dropout=0.1, return_sequences=True))
model.add(layers.BatchNormalization())
model.add(layers.LSTM(256, dropout=0.1))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            62880     
                                                                 
 lstm (LSTM)                 (None, 20, 32)            8320      
                                                                 
 batch_normalization (BatchN  (None, 20, 32)           128       
 ormalization)                                                   
                                                                 
 lstm_1 (LSTM)               (None, 20, 64)            24832     
                                                                 
 batch_normalization_1 (Batc  (None, 20, 64)           256       
 hNormalization)                                                 
                                                                 
 lstm_2 (LSTM)               (None, 20, 128)           9

In [120]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
metrics = ['accuracy']

model.compile(loss=loss, optimizer=optimizer, metrics=metrics)


In [121]:
history = model.fit(train_sequences_padded, train_labels, epochs=20,
          validation_data=(val_sequences_padded, val_labels), verbose=1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [122]:
pred = model.predict(val_sequences_padded)
pred = [0 if p < 0.5 else 1 for p in pred]



In [123]:
print(pred)
print(val_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]
[0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1
 0 0 1 0 1 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 0 0 1 1
 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0