# Modeling with Stemmed and Balanced Data

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

# standard sklearn imports
from sklearn.datasets import make_classification, make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# tensorflow imports for Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Conv2D, MaxPooling2D, GRU, LSTM, Embedding, Bidirectional
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

# Import regularizers
from tensorflow.keras.regularizers import l2
# Import Dropout
from tensorflow.keras.layers import Dropout
# Import Early Stopping
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.utils import to_categorical, plot_model

# CNN imports 
import os
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# GridSearch imports 
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# RNN imports 
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator




# imports for reports on classification
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# The bundled 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was removed later; prefer the new name when this Matplotlib provides it,
# otherwise keep the original one so older installations behave exactly as before.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

## Loading and Prepping the Data

In [3]:
# Read the stemmed, balanced review dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='../../Data/reviews_stemmed_balanced.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,business_id,name,review_id,review_stars,text,amb_casual,amb_classy,amb_target,text_length,clean_text,clean_text_length,clean_text_stem,clean_text_stem_length
0,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,wve8w6gIuPpCfo5J--AHjg,3,"The menu sounded promising, with over fifty di...",0.0,0.0,0,121,menu sounded promising fifty different dishes ...,68,menu sound promis fifti differ dish differ sty...,66
1,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,5rFuHGGbimVxPHxgM0sNSA,3,This wasn't the worst Chinese food but it wasn...,0.0,0.0,0,78,wasn' worst chinese food wasn' best egg foo yo...,41,worst chines food best egg foo young dri overc...,39
2,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,2iD3Rdbw0DUzjZSqBq3hXQ,1,I have been coming to this restaurant for over...,0.0,0.0,0,52,coming restaurant 20 years purchased shrimp fr...,27,come restaur 20 year purchas shrimp fri rice g...,26
3,0lCiLKpjrinltPFbBby4sw,The Great Wall Restaurant,e61y5ZlNwg04mAGtcD3vbQ,5,My husband and I love this place.\nGreat price...,0.0,0.0,0,23,husband love place great price lot food make s...,13,husband love place great price lot food make s...,12
4,kZFTi8FKjs30EuzurZ3v3g,Donerick's Pub,38lN2ONaypsfBDLwhGxcSg,5,Great place for beverages with your friends wh...,0.0,0.0,0,61,great place beverages friends watch game lots ...,43,great place beverag friend watch game lot tv g...,43


### Modeling with Tensorflow

In [3]:
# Feature = stemmed review text, target = the amb_target class label
X, y = df['clean_text_stem'], df['amb_target']

In [4]:
# One-hot encodes the target straight from the DataFrame column, so re-running
# this cell always produces the same 2-D array instead of encoding an
# already one-hot-encoded y a second time
y = to_categorical(df['amb_target'])

In [5]:
# Stratified train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [6]:
# Preview the training reviews (pandas abbreviates long Series when displaying them)
X_train

8210     well final got around tri place like got meatl...
12491    love mohawk atmospher food outstand bad thing ...
55375    tri sever chines place powel worth mention twi...
34871    boyfriend stop dinner first night drive across...
17449    stop ice cream treat dinner main street love i...
                               ...                        
47219    great food comfort atmospher food great gorgeo...
48987    bang buck price got awesom mediterrenean food ...
62754    two year want tri bibibop 5th avenu gv locat f...
25438    best servic small oper much staff partli accou...
26916    chicken avocado sandwich favorit mine none foo...
Name: clean_text_stem, Length: 48650, dtype: object

### Basic NLP

In [7]:
# import the Counter function
from collections import Counter

# import the tokenizer from keras preprocessing 
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
# Builds a word -> frequency table for a collection of documents
def counter_word(text):
    """Count how often each whitespace-separated word occurs across all documents.

    Parameters
    ----------
    text : object exposing ``.values``
        Iterated through ``text.values``; every element must be a string,
        because it is split with ``str.split()``.

    Returns
    -------
    collections.Counter
        Maps each word to its total number of occurrences.
    """
    return Counter(token for document in text.values for token in document.split())

In [9]:
# Training reviews again: these are the documents passed to counter_word
X_train

8210     well final got around tri place like got meatl...
12491    love mohawk atmospher food outstand bad thing ...
55375    tri sever chines place powel worth mention twi...
34871    boyfriend stop dinner first night drive across...
17449    stop ice cream treat dinner main street love i...
                               ...                        
47219    great food comfort atmospher food great gorgeo...
48987    bang buck price got awesom mediterrenean food ...
62754    two year want tri bibibop 5th avenu gv locat f...
25438    best servic small oper much staff partli accou...
26916    chicken avocado sandwich favorit mine none foo...
Name: clean_text_stem, Length: 48650, dtype: object

In [10]:
# Full text of the first training review
X_train.values[0]

'well final got around tri place like got meatloaf melt fri ok wish would got someth els honest want get anyth adventur diner first time meatloaf fri flat top grill end dri result fri averag hot tea great although littl thing impress special day got bite chicken homemad noodl veggi still littl crunchi dish well done usual like sort thing friend pasta must pretti good say jack shit gone kind picki eater oh one thing legal tender challeng friend cash'

In [11]:
# Builds the word -> frequency table from the training reviews only
counter = counter_word(X_train)

In [12]:
# Number of distinct words found in the training reviews
len(counter)

28152

In [13]:
# Displays the whole word -> count mapping (one entry per distinct word, so the output is long)
counter

Counter({'well': 9328,
         'final': 2158,
         'got': 11204,
         'around': 4551,
         'tri': 14488,
         'place': 27839,
         'like': 20121,
         'meatloaf': 224,
         'melt': 761,
         'fri': 8257,
         'ok': 2471,
         'wish': 1716,
         'would': 13236,
         'someth': 4259,
         'els': 2285,
         'honest': 313,
         'want': 8242,
         'get': 18966,
         'anyth': 2416,
         'adventur': 222,
         'diner': 932,
         'first': 7564,
         'time': 20971,
         'flat': 333,
         'top': 5639,
         'grill': 1831,
         'end': 2929,
         'dri': 1584,
         'result': 203,
         'averag': 1338,
         'hot': 5083,
         'tea': 2046,
         'great': 23526,
         'although': 1391,
         'littl': 7898,
         'thing': 6135,
         'impress': 2094,
         'special': 4174,
         'day': 5131,
         'bite': 1772,
         'chicken': 12839,
         'homemad': 824,
  

#### Define max number of words in a sequence 
* We need to choose a maximum sequence length: every review will be padded or truncated to this number of words
* Note: The right value depends on the text — longer texts need a larger number
* (ex: Tweet - a maximum somewhere between 50-70 works well) 
* (ex: bigger text - you can set it to 200 or more) 
* In our trial, we're just going to start off small with 50 words

We need to define the sequence length because TensorFlow requires every sequence to have the same number of words. 

We won't be able to have sequences of different lengths. We need to map them to the same sequence size. 

In [14]:
# Vocabulary size: the number of distinct words counted in the training reviews
num_words = len(counter)

# Max number of words in a sequence
max_length = 50

#### Use the Tokenizer Class

The next thing we need to use is the Tokenizer class from keras to tokenize the train sentences

In [15]:
# import the tokenizer from keras preprocessing 
from tensorflow.keras.preprocessing.text import Tokenizer

In [16]:
# Fit the tokenizer on the training reviews only;
# num_words limits the tokenizer to the most frequent words when it encodes text
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

In [17]:
# Pulls the word index from the fitted tokenizer
word_index = tokenizer.word_index

# word_index is an attribute of the tokenizer
# It is a dictionary where
# key = the word itself
# value = the integer that now represents that word

In [18]:
# Displays the whole word -> integer mapping
word_index

{'food': 1,
 'good': 2,
 'place': 3,
 'order': 4,
 'great': 5,
 'time': 6,
 'like': 7,
 'get': 8,
 'go': 9,
 'servic': 10,
 'one': 11,
 'back': 12,
 'tri': 13,
 'would': 14,
 'realli': 15,
 'chicken': 16,
 'love': 17,
 'restaur': 18,
 'also': 19,
 'got': 20,
 'come': 21,
 'drink': 22,
 'menu': 23,
 'even': 24,
 'wait': 25,
 'nice': 26,
 'delici': 27,
 'us': 28,
 'well': 29,
 'pizza': 30,
 'best': 31,
 'make': 32,
 'alway': 33,
 'bar': 34,
 'tabl': 35,
 'fri': 36,
 'want': 37,
 'sauc': 38,
 'eat': 39,
 'price': 40,
 'look': 41,
 'littl': 42,
 'tast': 43,
 'chees': 44,
 'first': 45,
 'came': 46,
 'definit': 47,
 'flavor': 48,
 'staff': 49,
 'meal': 50,
 'salad': 51,
 'friendli': 52,
 'pretti': 53,
 'columbu': 54,
 'never': 55,
 'amaz': 56,
 'went': 57,
 'much': 58,
 'ask': 59,
 'could': 60,
 'experi': 61,
 'made': 62,
 'peopl': 63,
 'night': 64,
 'recommend': 65,
 'locat': 66,
 'thing': 67,
 'say': 68,
 'take': 69,
 'sandwich': 70,
 'fresh': 71,
 'server': 72,
 'side': 73,
 'friend': 74,

In [19]:
# Converts every training review into a list of integers using the tokenizer's word index
train_sequences = tokenizer.texts_to_sequences(X_train)

In [20]:
# First training review as text, to compare against its integer sequence
X_train.values[0]

'well final got around tri place like got meatloaf melt fri ok wish would got someth els honest want get anyth adventur diner first time meatloaf fri flat top grill end dri result fri averag hot tea great although littl thing impress special day got bite chicken homemad noodl veggi still littl crunchi dish well done usual like sort thing friend pasta must pretti good say jack shit gone kind picki eater oh one thing legal tender challeng friend cash'

In [21]:
train_sequences[0]  # This sequence has 80 word indices (one per word in the review)

[29,
 260,
 20,
 109,
 13,
 3,
 7,
 20,
 1515,
 671,
 36,
 227,
 330,
 14,
 20,
 118,
 245,
 1227,
 37,
 8,
 234,
 1531,
 580,
 45,
 6,
 1515,
 36,
 1170,
 76,
 314,
 190,
 357,
 1622,
 36,
 426,
 93,
 276,
 5,
 416,
 42,
 67,
 267,
 122,
 90,
 20,
 324,
 16,
 638,
 480,
 299,
 99,
 42,
 862,
 89,
 29,
 316,
 188,
 7,
 758,
 67,
 74,
 403,
 307,
 53,
 2,
 68,
 1796,
 1807,
 597,
 195,
 1364,
 1344,
 337,
 11,
 67,
 4879,
 427,
 1787,
 74,
 1052]

In [22]:
# Length of the first training sequence before any padding or truncation
len(train_sequences[0])

80

In [23]:
# Pads (at the end) or truncates (from the end) every training sequence to max_length
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_padded = pad_sequences(
    sequences=train_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [24]:
# First padded training sequence: exactly max_length entries
train_padded[0]

array([  29,  260,   20,  109,   13,    3,    7,   20, 1515,  671,   36,
        227,  330,   14,   20,  118,  245, 1227,   37,    8,  234, 1531,
        580,   45,    6, 1515,   36, 1170,   76,  314,  190,  357, 1622,
         36,  426,   93,  276,    5,  416,   42,   67,  267,  122,   90,
         20,  324,   16,  638,  480,  299], dtype=int32)

In [25]:
# Encodes the test reviews with the tokenizer fitted on the training reviews,
# then pads/truncates them the same way as the training sequences
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(
    sequences=test_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [26]:
# Shows the first training review followed by its integer-encoded form
print(X_train.values[0], train_sequences[0], sep='\n')

well final got around tri place like got meatloaf melt fri ok wish would got someth els honest want get anyth adventur diner first time meatloaf fri flat top grill end dri result fri averag hot tea great although littl thing impress special day got bite chicken homemad noodl veggi still littl crunchi dish well done usual like sort thing friend pasta must pretti good say jack shit gone kind picki eater oh one thing legal tender challeng friend cash
[29, 260, 20, 109, 13, 3, 7, 20, 1515, 671, 36, 227, 330, 14, 20, 118, 245, 1227, 37, 8, 234, 1531, 580, 45, 6, 1515, 36, 1170, 76, 314, 190, 357, 1622, 36, 426, 93, 276, 5, 416, 42, 67, 267, 122, 90, 20, 324, 16, 638, 480, 299, 99, 42, 862, 89, 29, 316, 188, 7, 758, 67, 74, 403, 307, 53, 2, 68, 1796, 1807, 597, 195, 1364, 1344, 337, 11, 67, 4879, 427, 1787, 74, 1052]


In [27]:
# Builds the inverse mapping (integer -> word) to check that sequences decode back to text
reverse_word_index = {index: word for word, index in word_index.items()}

def decode(text):
    """Translate a sequence of word indices back into a space-separated string.

    Indices that are missing from ``reverse_word_index`` are rendered as '?'.
    """
    words = (reverse_word_index.get(index, '?') for index in text)
    return ' '.join(words)

decode(train_sequences[0])

'well final got around tri place like got meatloaf melt fri ok wish would got someth els honest want get anyth adventur diner first time meatloaf fri flat top grill end dri result fri averag hot tea great although littl thing impress special day got bite chicken homemad noodl veggi still littl crunchi dish well done usual like sort thing friend pasta must pretti good say jack shit gone kind picki eater oh one thing legal tender challeng friend cash'

In [28]:
# Reports the dimensions of the padded train and test arrays
print(
    f'Shape of train {train_padded.shape}',
    f'Shape of test {test_padded.shape}',
    sep='\n',
)

Shape of train (48650, 50)
Shape of test (16217, 50)


### Set up the Network topology

* We could've used One Hot Encoding (OHE) to convert these indices into vectors of 0s and 1s, but this would increase the dimensionality of our features 

**Instead...**
* The Embedding layer - maps each word to a vector of a fixed size with real-valued elements
* In contrast to One Hot Encoding, a small fixed-size vector of real numbers can represent each word, because real values can encode far more information than 0s and 1s
* We're going to use an embedding dimensionality of 32, and the input length will be the max length

In [29]:
# Sets up network topology 
model = Sequential()

# Embedding layer: maps each word index to a learned 32-dimensional vector
model.add(Embedding(num_words, 32, input_length=max_length))

# LSTM layer
model.add(LSTM(24))

# Dense hidden layers with L2 weight penalties, each followed by dropout
model.add(Dense(64, activation='relu', kernel_regularizer=l2(.001)))
model.add(Dropout(0.5))

model.add(Dense(8, activation='relu', kernel_regularizer=l2(.001)))
model.add(Dropout(0.5))

# Output layer: one softmax unit per one-hot column of y_train (4 classes for this target)
model.add(Dense(y_train.shape[1], activation='softmax'))

# `learning_rate` is the supported keyword; the `lr` alias is deprecated
# and is rejected by newer Keras versions
optimizer = Adam(learning_rate=3e-4)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

In [30]:
# Layer-by-layer overview of output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 32)            900864    
_________________________________________________________________
lstm (LSTM)                  (None, 24)                5472      
_________________________________________________________________
dense (Dense)                (None, 64)                1600      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 520       
_________________________________________________________________
dropout_1 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 3

In [31]:
# Trains the network; the test split is passed as validation data,
# so Keras evaluates loss and accuracy on it after every epoch
history = model.fit(
    x=train_padded,
    y=y_train,
    batch_size=256,
    epochs=25,
    validation_data=(test_padded, y_test),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [35]:
# Share of reviews in each target class
df['amb_target'].value_counts(normalize=True)

1    0.328703
3    0.279911
2    0.207394
0    0.183992
Name: amb_target, dtype: float64