# Modeling a Multi-Class Prediction using TensorFlow LSTM With More Balanced Classes

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

# standard sklearn imports
from sklearn.datasets import make_classification, make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# tensorflow imports for Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Conv2D, MaxPooling2D, GRU, LSTM, Embedding
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

# Import regularizers
from tensorflow.keras.regularizers import l2
# Import Dropout
from tensorflow.keras.layers import Dropout
# Import Early Stopping
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.utils import to_categorical, plot_model

# CNN imports 
import os
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# GridSearch imports 
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# RNN imports 
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator




# imports for reports on classification
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

plt.style.use(style='seaborn')
%matplotlib inline

In [2]:
df = pd.read_csv('../../Data/reviews_stemmed_balanced.csv')
df.head()

Unnamed: 0,business_id,name,review_id,review_stars,text,amb_casual,amb_classy,amb_target,text_length,clean_text,clean_text_length,clean_text_stem,clean_text_stem_length
0,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,wve8w6gIuPpCfo5J--AHjg,3,"The menu sounded promising, with over fifty di...",0.0,0.0,0,121,menu sounded promising fifty different dishes ...,68,menu sound promis fifti differ dish differ sty...,66
1,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,5rFuHGGbimVxPHxgM0sNSA,3,This wasn't the worst Chinese food but it wasn...,0.0,0.0,0,78,wasn' worst chinese food wasn' best egg foo yo...,41,worst chines food best egg foo young dri overc...,39
2,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,2iD3Rdbw0DUzjZSqBq3hXQ,1,I have been coming to this restaurant for over...,0.0,0.0,0,52,coming restaurant 20 years purchased shrimp fr...,27,come restaur 20 year purchas shrimp fri rice g...,26
3,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,e61y5ZlNwg04mAGtcD3vbQ,5,My husband and I love this place.\nGreat price...,0.0,0.0,0,23,husband love place great price lot food make s...,13,husband love place great price lot food make s...,12
4,kZFTi8FKjs30EuzurZ3v3g,Donerick's Pub,38lN2ONaypsfBDLwhGxcSg,5,Great place for beverages with your friends wh...,0.0,0.0,0,61,great place beverages friends watch game lots ...,43,great place beverag friend watch game lot tv g...,43


## 1) Modeling with stemmed text

In [5]:
# Sets up X and y
X = df['clean_text_stem']
y = df['amb_target']

In [6]:
# One-hot encoding for the categorical y response variable 
y = to_categorical(y)

In [7]:
# Splits the data into training and test sets from sample
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify=y, 
                                                    random_state=42)

In [8]:
X_train

8210     look breakfast place town tri stumbl upon mcca...
12563    love mohawk atmospher food outstand bad thing ...
55374    tri sever chines place powel worth mention twi...
34856    boyfriend stop dinner first night drive across...
17500    stop ice cream treat dinner main street love i...
                               ...                        
47234    great food comfort atmospher food great gorgeo...
49006    bang buck price got awesom mediterrenean food ...
62754    bibibop nice wide select fresh ingredi broad s...
25465    best servic small oper much staff partli accou...
26940    chicken avocado sandwich favorit mine none foo...
Name: clean_text_stem, Length: 48650, dtype: object

### Basic NLP

In [9]:
# import the Counter function
from collections import Counter

# import the tokenizer from keras preprocessing 
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
# Creates a function that counts unique words
def counter_word(text):
    count = Counter()
    for doc in text.values:
        for word in doc.split():
            count[word] += 1
    return count

In [11]:
X_train

8210     look breakfast place town tri stumbl upon mcca...
12563    love mohawk atmospher food outstand bad thing ...
55374    tri sever chines place powel worth mention twi...
34856    boyfriend stop dinner first night drive across...
17500    stop ice cream treat dinner main street love i...
                               ...                        
47234    great food comfort atmospher food great gorgeo...
49006    bang buck price got awesom mediterrenean food ...
62754    bibibop nice wide select fresh ingredi broad s...
25465    best servic small oper much staff partli accou...
26940    chicken avocado sandwich favorit mine none foo...
Name: clean_text_stem, Length: 48650, dtype: object

In [12]:
X_train.values[0]

'look breakfast place town tri stumbl upon mccarthi smaller size resteraunt think make due space busi saturday afternoon expect got tabl pretti quickli order skillet scrambler pancak signific got egg benedict skillet good howev call skillet like scrambl egg minc meat side scallop potato potato melt mouth good egg cook excel toast brought delici pancak superb size plate nice circumfer also thicker pancak great tast great said egg benedict alright egg slightli undercook hollandais alright much flavor also would like ham thicker piec ham instead shave use overal price reason food good servic nice throughout visit definit go back'

In [13]:
# Counts the number of times a unique word appears
counter = counter_word(X_train)

In [14]:
# Finds the length or the number of unique words
len(counter)

28152

In [15]:
counter

Counter({'look': 7820,
         'breakfast': 3070,
         'place': 27596,
         'town': 2618,
         'tri': 14430,
         'stumbl': 176,
         'upon': 665,
         'mccarthi': 4,
         'smaller': 545,
         'size': 2286,
         'resteraunt': 6,
         'think': 5441,
         'make': 8633,
         'due': 936,
         'space': 1567,
         'busi': 4994,
         'saturday': 1543,
         'afternoon': 637,
         'expect': 3589,
         'got': 11134,
         'tabl': 8251,
         'pretti': 6886,
         'quickli': 1435,
         'order': 25399,
         'skillet': 205,
         'scrambler': 45,
         'pancak': 832,
         'signific': 64,
         'egg': 3230,
         'benedict': 149,
         'good': 30628,
         'howev': 2895,
         'call': 3175,
         'like': 19968,
         'scrambl': 181,
         'minc': 25,
         'meat': 4288,
         'side': 5949,
         'scallop': 614,
         'potato': 3020,
         'melt': 777,
         'm

#### Define max number of words in a sequence 
* Setting this max number is important because we need to define a maximum sequence length that we can set to a number we pick 
* Note: Depending on the text, it is better to set this number high
* (ex: Tweet - it is better to set this number to a high number between 50-70) 
* (ex: bigger text - you can set it to 200 or more) 
* In our trial, we're just going to start off small with 50 words

The reason we need to define the sequence length is because when we use it with Tensorflow, we're going to need the same number of words/sequence length for each sequence. 

We won't be able to have sequences of different lengths. We need to map them to the same sequence size. 

In [35]:
num_words = len(counter)

# Max number of words in a sequence
max_length = 50

#### Use the Tokenizer Class

The next thing we need to use is the Tokenizer class from keras to tokenize the train sentences

In [17]:
# import the tokenizer from keras preprocessing 
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
# Fit the tokenizer onto the train sentences 
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

In [19]:
# Pulls the word index from the tokenizer 
word_index = tokenizer.word_index

# This is an attribute of the tokenizer
# The attribute is a dictionary where
# key = actual word 
# value = the number that will now represent that word

In [20]:
word_index

{'food': 1,
 'good': 2,
 'place': 3,
 'order': 4,
 'great': 5,
 'time': 6,
 'like': 7,
 'get': 8,
 'go': 9,
 'servic': 10,
 'one': 11,
 'back': 12,
 'tri': 13,
 'would': 14,
 'realli': 15,
 'chicken': 16,
 'love': 17,
 'restaur': 18,
 'also': 19,
 'got': 20,
 'come': 21,
 'drink': 22,
 'menu': 23,
 'even': 24,
 'wait': 25,
 'nice': 26,
 'delici': 27,
 'us': 28,
 'well': 29,
 'pizza': 30,
 'best': 31,
 'alway': 32,
 'make': 33,
 'bar': 34,
 'fri': 35,
 'tabl': 36,
 'want': 37,
 'sauc': 38,
 'eat': 39,
 'price': 40,
 'littl': 41,
 'look': 42,
 'tast': 43,
 'chees': 44,
 'first': 45,
 'came': 46,
 'flavor': 47,
 'definit': 48,
 'staff': 49,
 'meal': 50,
 'salad': 51,
 'friendli': 52,
 'pretti': 53,
 'columbu': 54,
 'amaz': 55,
 'never': 56,
 'went': 57,
 'experi': 58,
 'could': 59,
 'much': 60,
 'ask': 61,
 'made': 62,
 'locat': 63,
 'thing': 64,
 'recommend': 65,
 'peopl': 66,
 'night': 67,
 'take': 68,
 'server': 69,
 'side': 70,
 'fresh': 71,
 'sandwich': 72,
 'say': 73,
 'friend': 74,

In [21]:
# Creates the sequences from our tokenizer, based on the indices from the word_index
train_sequences = tokenizer.texts_to_sequences(X_train)

In [22]:
X_train.values[0]

'look breakfast place town tri stumbl upon mccarthi smaller size resteraunt think make due space busi saturday afternoon expect got tabl pretti quickli order skillet scrambler pancak signific got egg benedict skillet good howev call skillet like scrambl egg minc meat side scallop potato potato melt mouth good egg cook excel toast brought delici pancak superb size plate nice circumfer also thicker pancak great tast great said egg benedict alright egg slightli undercook hollandais alright much flavor also would like ham thicker piec ham instead shave use overal price reason food good servic nice throughout visit definit go back'

In [23]:
train_sequences[0]  # This sequence has 24 words

[42,
 179,
 3,
 211,
 13,
 1754,
 744,
 10483,
 857,
 243,
 8880,
 82,
 33,
 577,
 354,
 95,
 361,
 774,
 151,
 20,
 36,
 53,
 392,
 4,
 1611,
 3632,
 626,
 3045,
 20,
 170,
 1950,
 1611,
 2,
 191,
 173,
 1611,
 7,
 1734,
 170,
 4798,
 116,
 70,
 794,
 181,
 181,
 662,
 619,
 2,
 170,
 123,
 135,
 450,
 318,
 27,
 626,
 1313,
 243,
 216,
 26,
 9572,
 19,
 2317,
 626,
 5,
 43,
 5,
 101,
 170,
 1950,
 1284,
 170,
 695,
 1448,
 3097,
 1284,
 60,
 47,
 19,
 14,
 7,
 1090,
 2317,
 344,
 1090,
 349,
 2584,
 143,
 180,
 40,
 185,
 1,
 2,
 10,
 26,
 1099,
 78,
 48,
 9,
 12]

In [24]:
len(train_sequences[0])

99

In [25]:
# Now adding padding
from tensorflow.keras.preprocessing.sequence import pad_sequences 

train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding='post', truncating='post'
)

Using TensorFlow backend.


In [26]:
train_padded[0]

array([   42,   179,     3,   211,    13,  1754,   744, 10483,   857,
         243,  8880,    82,    33,   577,   354,    95,   361,   774,
         151,    20,    36,    53,   392,     4,  1611,  3632,   626,
        3045,    20,   170,  1950,  1611,     2,   191,   173,  1611,
           7,  1734,   170,  4798,   116,    70,   794,   181,   181,
         662,   619,     2,   170,   123,   135,   450,   318,    27,
         626,  1313,   243,   216,    26,  9572,    19,  2317,   626,
           5,    43,     5,   101,   170,  1950,  1284,   170,   695,
        1448,  3097,  1284,    60,    47,    19,    14,     7,  1090,
        2317,   344,  1090,   349,  2584,   143,   180,    40,   185,
           1,     2,    10,    26,  1099,    78,    48,     9,    12,
           0], dtype=int32)

In [27]:
# Creates the test dataset sequences and padding
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding='post', truncating='post'
)

In [28]:
print(X_train.values[0])
print(train_sequences[0])

look breakfast place town tri stumbl upon mccarthi smaller size resteraunt think make due space busi saturday afternoon expect got tabl pretti quickli order skillet scrambler pancak signific got egg benedict skillet good howev call skillet like scrambl egg minc meat side scallop potato potato melt mouth good egg cook excel toast brought delici pancak superb size plate nice circumfer also thicker pancak great tast great said egg benedict alright egg slightli undercook hollandais alright much flavor also would like ham thicker piec ham instead shave use overal price reason food good servic nice throughout visit definit go back
[42, 179, 3, 211, 13, 1754, 744, 10483, 857, 243, 8880, 82, 33, 577, 354, 95, 361, 774, 151, 20, 36, 53, 392, 4, 1611, 3632, 626, 3045, 20, 170, 1950, 1611, 2, 191, 173, 1611, 7, 1734, 170, 4798, 116, 70, 794, 181, 181, 662, 619, 2, 170, 123, 135, 450, 318, 27, 626, 1313, 243, 216, 26, 9572, 19, 2317, 626, 5, 43, 5, 101, 170, 1950, 1284, 170, 695, 1448, 3097, 1284,

In [29]:
# Checks to make sure that you can decode in reverse 
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

decode(train_sequences[0])

'look breakfast place town tri stumbl upon mccarthi smaller size resteraunt think make due space busi saturday afternoon expect got tabl pretti quickli order skillet scrambler pancak signific got egg benedict skillet good howev call skillet like scrambl egg minc meat side scallop potato potato melt mouth good egg cook excel toast brought delici pancak superb size plate nice circumfer also thicker pancak great tast great said egg benedict alright egg slightli undercook hollandais alright much flavor also would like ham thicker piec ham instead shave use overal price reason food good servic nice throughout visit definit go back'

In [30]:
# Checks the shape of the train and the shape of the test
print(f'Shape of train {train_padded.shape}')
print(f'Shape of test {test_padded.shape}')

Shape of train (48650, 100)
Shape of test (16217, 100)


### Set up the Network topology

* We could've used One Hot Encoding (OHE) to convert these indices into vectors of 0s an 1s, but this would increase the dimensionality of our features 

**Instead...**
* The Embedding layer - maps each word to a vector of a fixed size with real value elements...
* In contrast to One Hot Encoding, we can use finite size vector to represent an infinite number of real numbers. 
* We're going to use dimensionality of this embedding layer (32) and the input length will be the max length

In [31]:
# Sets up network topology 
model = Sequential()

model.add(Embedding(num_words, 32, input_length=max_length))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(4, activation='softmax'))

optimizer = Adam(lr=3e-4)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

In [32]:
# model_l.add(Bidirectional(LSTM(24)))

In [33]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 32)           900864    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 4)                 260       
Total params: 925,956
Trainable params: 925,956
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Fit the model
history = model.fit(
    train_padded, y_train, epochs=25, batch_size=256, 
    validation_data=(test_padded, y_test)
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [37]:
df['amb_target'].value_counts(normalize=True)

1    0.328703
3    0.279911
2    0.207394
0    0.183992
Name: amb_target, dtype: float64

## 2) Modeling with original text with stopwords removed

In [38]:
# Sets up X and y
X2 = df['clean_text']
y2 = df['amb_target']

In [39]:
# One-hot encoding for the categorical y response variable 
y2 = to_categorical(y2)

In [40]:
# Splits the data into training and test sets from sample
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, 
                                                    stratify=y2, 
                                                    random_state=42)

In [41]:
X_train2

8210     looking breakfast place town try stumbled upon...
12563    love mohawk atmosphere food outstanding bad th...
55374    tried several chinese places powell worth ment...
34856    boyfriend stopped dinner first night driving a...
17500    stopped ice cream treat dinner main street lov...
                               ...                        
47234    great food comfortable atmosphere food great g...
49006    bang buck price got awesome mediterrenean food...
62754    bibibop nice wide selection fresh ingredients ...
25465    best service small operation much staff partly...
26940    chicken avocado sandwich favorite mine none fo...
Name: clean_text, Length: 48650, dtype: object

### Basic NLP

In [42]:
# import the Counter function
from collections import Counter

# import the tokenizer from keras preprocessing 
from tensorflow.keras.preprocessing.text import Tokenizer

In [43]:
# Creates a function that counts unique words
def counter_word(text):
    count = Counter()
    for doc in text.values:
        for word in doc.split():
            count[word] += 1
    return count

In [44]:
X_train2

8210     looking breakfast place town try stumbled upon...
12563    love mohawk atmosphere food outstanding bad th...
55374    tried several chinese places powell worth ment...
34856    boyfriend stopped dinner first night driving a...
17500    stopped ice cream treat dinner main street lov...
                               ...                        
47234    great food comfortable atmosphere food great g...
49006    bang buck price got awesome mediterrenean food...
62754    bibibop nice wide selection fresh ingredients ...
25465    best service small operation much staff partly...
26940    chicken avocado sandwich favorite mine none fo...
Name: clean_text, Length: 48650, dtype: object

In [45]:
X_train2.values[0]

"looking breakfast place town try stumbled upon mccarthy' it' smaller sized resteraunt think make due space busy saturday afternoon expected got table pretty quickly ordered skillet scrambler pancake significant got eggs benedict skillet good however wouldn' call skillet like scrambled eggs minced meats side scalloped potatoes potatoes melt mouth good eggs cooked excellently toast brought delicious pancake superb size plate nice circumference also thicker pancake great tasted great said eggs benedict alright eggs slightly undercooked hollandaise alright didn' much flavor also would liked ham thicker piece ham instead shaved use overall price reasonable food good service nice throughout visit definitely going back"

In [46]:
# Counts the number of times a unique word appears
counter = counter_word(X_train2)

In [47]:
# Finds the length or the number of unique words
len(counter)

42840

In [48]:
counter

Counter({'looking': 2886,
         'breakfast': 3024,
         'place': 24056,
         'town': 2545,
         'try': 8170,
         'stumbled': 138,
         'upon': 665,
         "mccarthy'": 4,
         "it'": 15735,
         'smaller': 545,
         'sized': 413,
         'resteraunt': 6,
         'think': 4894,
         'make': 5616,
         'due': 934,
         'space': 1459,
         'busy': 3029,
         'saturday': 1461,
         'afternoon': 614,
         'expected': 996,
         'got': 11132,
         'table': 5646,
         'pretty': 6884,
         'quickly': 1435,
         'ordered': 10953,
         'skillet': 199,
         'scrambler': 27,
         'pancake': 187,
         'significant': 63,
         'eggs': 1294,
         'benedict': 143,
         'good': 30013,
         'however': 2895,
         "wouldn'": 1310,
         'call': 1315,
         'like': 17618,
         'scrambled': 137,
         'minced': 24,
         'meats': 685,
         'side': 4614,
         'scal

#### Define max number of words in a sequence 
* Setting this max number is important because we need to define a maximum sequence length that we can set to a number we pick 
* Note: Depending on the text, it is better to set this number high
* (ex: Tweet - it is better to set this number to a high number between 50-70) 
* (ex: bigger text - you can set it to 200 or more) 
* In our trial, we're just going to start off small with 50 words

The reason we need to define the sequence length is because when we use it with Tensorflow, we're going to need the same number of words/sequence length for each sequence. 

We won't be able to have sequences of different lengths. We need to map them to the same sequence size. 

In [50]:
num_words = len(counter)

# Max number of words in a sequence
max_length = 50

#### Use the Tokenizer Class

The next thing we need to use is the Tokenizer class from keras to tokenize the train sentences

In [51]:
# import the tokenizer from keras preprocessing 
from tensorflow.keras.preprocessing.text import Tokenizer

In [52]:
# Fit the tokenizer onto the train sentences 
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train2)

In [53]:
# Pulls the word index from the tokenizer 
word_index = tokenizer.word_index

# This is an attribute of the tokenizer
# The attribute is a dictionary where
# key = actual word 
# value = the number that will now represent that word

In [54]:
word_index

{'food': 1,
 'good': 2,
 'place': 3,
 'great': 4,
 "i'": 5,
 'like': 6,
 'service': 7,
 'time': 8,
 "it'": 9,
 'get': 10,
 'one': 11,
 'back': 12,
 'go': 13,
 'really': 14,
 'would': 15,
 'chicken': 16,
 'also': 17,
 'order': 18,
 'got': 19,
 'ordered': 20,
 'restaurant': 21,
 'menu': 22,
 'us': 23,
 'delicious': 24,
 'well': 25,
 'best': 26,
 'nice': 27,
 'love': 28,
 'always': 29,
 "don'": 30,
 'pizza': 31,
 'try': 32,
 'even': 33,
 'bar': 34,
 'little': 35,
 'first': 36,
 'cheese': 37,
 'came': 38,
 'staff': 39,
 'definitely': 40,
 'sauce': 41,
 'pretty': 42,
 'friendly': 43,
 "didn'": 44,
 'never': 45,
 'went': 46,
 'columbus': 47,
 'amazing': 48,
 'much': 49,
 'could': 50,
 'made': 51,
 'come': 52,
 'experience': 53,
 'people': 54,
 'wait': 55,
 'salad': 56,
 'fresh': 57,
 'meal': 58,
 'table': 59,
 'night': 60,
 'better': 61,
 'make': 62,
 'eat': 63,
 'drinks': 64,
 'two': 65,
 'everything': 66,
 'dinner': 67,
 "you'": 68,
 'location': 69,
 '2': 70,
 'hot': 71,
 'minutes': 72,
 '

In [55]:
# Creates the sequences from our tokenizer, based on the indices from the word_index
train_sequences2 = tokenizer.texts_to_sequences(X_train2)

In [56]:
X_train2.values[0]

"looking breakfast place town try stumbled upon mccarthy' it' smaller sized resteraunt think make due space busy saturday afternoon expected got table pretty quickly ordered skillet scrambler pancake significant got eggs benedict skillet good however wouldn' call skillet like scrambled eggs minced meats side scalloped potatoes potatoes melt mouth good eggs cooked excellently toast brought delicious pancake superb size plate nice circumference also thicker pancake great tasted great said eggs benedict alright eggs slightly undercooked hollandaise alright didn' much flavor also would liked ham thicker piece ham instead shaved use overall price reasonable food good service nice throughout visit definitely going back"

In [57]:
train_sequences2[0]  # This sequence has 24 words

[169,
 155,
 3,
 194,
 32,
 2364,
 762,
 14982,
 9,
 904,
 1095,
 12552,
 76,
 62,
 586,
 382,
 154,
 380,
 816,
 547,
 19,
 59,
 42,
 390,
 20,
 1877,
 6031,
 1954,
 3800,
 19,
 431,
 2321,
 1877,
 2,
 166,
 428,
 426,
 1877,
 6,
 2374,
 431,
 6425,
 741,
 88,
 7868,
 321,
 321,
 1289,
 667,
 2,
 431,
 170,
 5401,
 578,
 298,
 24,
 1954,
 1483,
 351,
 359,
 27,
 13593,
 17,
 2765,
 1954,
 4,
 207,
 4,
 80,
 431,
 2321,
 1462,
 431,
 712,
 1646,
 3834,
 1462,
 44,
 49,
 110,
 17,
 15,
 354,
 1210,
 2765,
 659,
 1210,
 335,
 3835,
 365,
 153,
 126,
 514,
 1,
 2,
 7,
 27,
 1209,
 136,
 40,
 79,
 12]

In [58]:
len(train_sequences2[0])

102

In [59]:
# Now adding padding
from keras.preprocessing.sequence import pad_sequences 

train_padded2 = pad_sequences(
    train_sequences2, maxlen=max_length, padding='post', truncating='post'
)

In [60]:
train_padded2[0]

array([  169,   155,     3,   194,    32,  2364,   762, 14982,     9,
         904,  1095, 12552,    76,    62,   586,   382,   154,   380,
         816,   547,    19,    59,    42,   390,    20,  1877,  6031,
        1954,  3800,    19,   431,  2321,  1877,     2,   166,   428,
         426,  1877,     6,  2374,   431,  6425,   741,    88,  7868,
         321,   321,  1289,   667,     2], dtype=int32)

In [61]:
# Creates the test dataset sequences and padding
test_sequences2 = tokenizer.texts_to_sequences(X_test2)
test_padded2 = pad_sequences(
    test_sequences2, maxlen=max_length, padding='post', truncating='post'
)

In [62]:
print(X_train2.values[0])
print(train_sequences2[0])

looking breakfast place town try stumbled upon mccarthy' it' smaller sized resteraunt think make due space busy saturday afternoon expected got table pretty quickly ordered skillet scrambler pancake significant got eggs benedict skillet good however wouldn' call skillet like scrambled eggs minced meats side scalloped potatoes potatoes melt mouth good eggs cooked excellently toast brought delicious pancake superb size plate nice circumference also thicker pancake great tasted great said eggs benedict alright eggs slightly undercooked hollandaise alright didn' much flavor also would liked ham thicker piece ham instead shaved use overall price reasonable food good service nice throughout visit definitely going back
[169, 155, 3, 194, 32, 2364, 762, 14982, 9, 904, 1095, 12552, 76, 62, 586, 382, 154, 380, 816, 547, 19, 59, 42, 390, 20, 1877, 6031, 1954, 3800, 19, 431, 2321, 1877, 2, 166, 428, 426, 1877, 6, 2374, 431, 6425, 741, 88, 7868, 321, 321, 1289, 667, 2, 431, 170, 5401, 578, 298, 24,

In [63]:
# Checks to make sure that you can decode in reverse 
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

decode(train_sequences2[0])

"looking breakfast place town try stumbled upon mccarthy' it' smaller sized resteraunt think make due space busy saturday afternoon expected got table pretty quickly ordered skillet scrambler pancake significant got eggs benedict skillet good however wouldn' call skillet like scrambled eggs minced meats side scalloped potatoes potatoes melt mouth good eggs cooked excellently toast brought delicious pancake superb size plate nice circumference also thicker pancake great tasted great said eggs benedict alright eggs slightly undercooked hollandaise alright didn' much flavor also would liked ham thicker piece ham instead shaved use overall price reasonable food good service nice throughout visit definitely going back"

In [64]:
# Checks the shape of the train and the shape of the test
print(f'Shape of train {train_padded2.shape}')
print(f'Shape of test {test_padded2.shape}')

Shape of train (48650, 50)
Shape of test (16217, 50)


### Set up the Network topology

* We could've used One Hot Encoding (OHE) to convert these indices into vectors of 0s an 1s, but this would increase the dimensionality of our features 

**Instead...**
* The Embedding layer - maps each word to a vector of a fixed size with real value elements...
* In contrast to One Hot Encoding, we can use finite size vector to represent an infinite number of real numbers. 
* We're going to use dimensionality of this embedding layer (32) and the input length will be the max length

In [65]:
# Sets up network topology 
model2 = Sequential()

model2.add(Embedding(num_words, 32, input_length=max_length))
model2.add(LSTM(64, dropout=0.1))
model2.add(Dense(4, activation='softmax'))

optimizer = Adam(lr=3e-4)

model2.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

In [66]:
# model_l.add(Bidirectional(LSTM(24)))

In [67]:
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 32)            1370880   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 260       
Total params: 1,395,972
Trainable params: 1,395,972
Non-trainable params: 0
_________________________________________________________________


In [68]:
# Fit the model
history2 = model2.fit(
    train_padded2, y_train2, epochs=25, batch_size=256, 
    validation_data=(test_padded2, y_test2)
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
