# Model Testing

## Libraries

In [1]:
import pandas as pd
import numpy as np

## Base Data Frame

In [2]:
# Load only the first 5,000 records of the line-delimited Yelp review file.
df = pd.read_json(
    '../raw_data/yelp_academic_dataset_review.json',
    lines=True,
    nrows=5000,
)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


In [4]:
# Features are the review texts; the target is the star rating.
X, y = df['text'], df['stars']

## Data Cleaning

In [5]:
import string
def clean_text(s):
    """Tokenise a review into lowercase words.

    ASCII punctuation (``string.punctuation``) is removed, the text is
    lowercased and then split on whitespace.

    Returns the list of resulting tokens (empty for an empty string).
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    return s.translate(punctuation_stripper).lower().split()

In [6]:
# Always tokenise from the raw text column rather than from X itself.
# `X = X.map(clean_text)` cannot be re-executed: on a second run X already
# holds token lists, and clean_text fails on them (lists have no .translate).
X = df.text.map(clean_text)

## Train / test split

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for testing. The shuffle seed is fixed so that
# re-running the notebook produces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [8]:
# Inspect the training split (one token list per review).
X_train

306     [my, favorite, bar, in, columbus, a, friend, r...
2289    [had, sleep, number, beds, which, was, interes...
872     [came, in, on, a, saturday, for, lunch, to, an...
629     [worth, a, visit, just, for, the, unparalleled...
2981    [ive, been, here, a, few, times, and, have, be...
                              ...                        
4593    [great, food, good, prices, wonderful, atmosph...
3432    [my, best, friend, treated, me, here, for, bre...
1856    [this, has, never, been, my, ideal, spot, to, ...
4081    [not, super, impressed, we, went, in, for, a, ...
3953    [disappointing, ran, out, of, pearls, and, my,...
Name: text, Length: 3500, dtype: object

## Word2Vec and Embedding

In [9]:
from gensim.models import Word2Vec

# Fit 60-dimensional word vectors on the training reviews only. Words seen
# fewer than 10 times are ignored; window=10 is the maximum context distance.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4) - confirm the installed version.
word2vec = Word2Vec(
    sentences=X_train,
    size=60,
    min_count=10,
    window=10,
)

In [10]:
def embed_sentence(word2vec, sentence):
    """Look up the vector of every in-vocabulary word of a sentence.

    Words that are not contained in ``word2vec.wv`` are skipped.

    Returns a NumPy array stacking the vectors that were found, in sentence
    order (an empty array when none of the words is known).
    """
    vectors = word2vec.wv
    known = [vectors[token] for token in sentence if token in vectors]
    return np.array(known)

def embedding(word2vec, sentences):
    """Embed every sentence with ``embed_sentence``.

    Returns a list holding one array per input sentence, in input order.
    """
    return [embed_sentence(word2vec, tokens) for tokens in sentences]

In [11]:
# Replace the token lists of both splits with their word-vector sequences.
# NOTE: X_train / X_test are overwritten, so re-running this cell would try to
# embed already-embedded data; re-run the train/test split first.
X_train, X_test = (
    embedding(word2vec, X_train),
    embedding(word2vec, X_test),
)

## Padding

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad the reviews of each split to that split's longest review, filling the
# missing timesteps with -1000.
# NOTE(review): the model's Masking layer has to use mask_value=-1000 for
# these filler timesteps to be skipped.
X_train_pad = pad_sequences(
    X_train,
    dtype='float32',
    value=-1000,
)
X_test_pad = pad_sequences(
    X_test,
    dtype='float32',
    value=-1000,
)

## Simplifying the target into binary reviews

In [13]:
def binary_review(x):
    """Turn a star rating into a binary label: 1 for 4 or more, otherwise 0."""
    return 1 if x >= 4 else 0

In [14]:
# Binarise the targets from the original star ratings in `y`, selected through
# each split's index. Mapping y_test / y_train onto themselves is not
# re-runnable: a second execution would turn every label into 0, because
# binary_review(0) and binary_review(1) are both 0.
y_test = y.loc[y_test.index].map(binary_review)
y_train = y.loc[y_train.index].map(binary_review)

## Base LSTM Model

In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop

# NOTE(review): this RMSprop instance is not used by default; init_model
# compiles with 'adam' unless told otherwise. Call init_model(optimizer=opt)
# to train with it.
opt = RMSprop(learning_rate=0.001)

def init_model(mask_value=-1000., optimizer='adam'):
    """Build and compile the baseline LSTM binary classifier.

    Architecture: Masking -> LSTM(20, tanh) -> Dense(15, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy and accuracy.

    Parameters
    ----------
    mask_value : float, default -1000.
        Value identifying padded timesteps. It must equal the ``value`` given
        to ``pad_sequences`` (-1000 in this notebook). The Keras default of
        0.0 would not match, so the filler timesteps would be fed to the LSTM
        as real inputs.
    optimizer : str or Keras optimizer, default 'adam'
        Optimizer handed to ``model.compile``.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # Skip the timesteps whose features all equal the padding value.
    model.add(layers.Masking(mask_value=mask_value))
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    return model

model = init_model()

In [16]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop training after 5 epochs without improvement of the monitored quantity
# (Keras monitors val_loss by default) and restore the best epoch's weights.
es = EarlyStopping(
    patience=5,
    restore_best_weights=True,
)

In [17]:
# Train a freshly initialised model, keeping 20% of the training data aside
# for validation so that early stopping has something to monitor.
model = init_model()
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Binary test labels, for comparison with the predictions below.
y_test

3135    1
2412    0
912     1
101     1
1996    1
       ..
712     1
4857    1
405     1
1509    0
3935    1
Name: stars, Length: 1500, dtype: int64

In [19]:
# Sigmoid outputs: the predicted probability of label 1 (4 stars or more)
# for each padded test review.
model.predict(X_test_pad)

array([[0.34222454],
       [0.7594547 ],
       [0.66431576],
       ...,
       [0.27653074],
       [0.25785708],
       [0.30469087]], dtype=float32)