In [1]:
import numpy as np
import pandas as pd
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.callbacks import EarlyStopping
from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
import string, nltk, os

os.chdir

yelp = pd.read_csv('yelp_reviews_sample.csv')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
stop_words = set(stopwords.words('english')) #'if', 'and', 'the', etc.

def preprocess(text):
    translation = str.maketrans('', '', string.punctuation)
    text = text.translate(translation)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

yelp['text'] = yelp.apply(lambda row: preprocess(row['text']), axis=1)

print(yelp.head())

Unnamed: 0               review_id                 user_id  \
0           0  Q1sbwvVQXV2734tPgoKj4Q  hG7b0MtEbXx5QzbzE6C_VA   
1           1  GJXCdrto3ASJOqKeVWPi6Q  yXQM5uF2jS6es16SJzNHfg   
2           2  2TzJjDVDEuAW6MR5Vuc1ug  n6-Gk65cPZL6Uz8qRm3NYw   
3           3  yi0R0Ugj_xUx_Nek0-_Qig  dacAIZ6fTM6mqwW5uxkskg   
4           4  11a8sVPMUFtaC7_ABRkmtw  ssoyf2_x0EQMed6fgHeMyQ   

              business_id  stars  useful  funny  cool  \
0  ujmEBvifdJM6h6RLv4wQIg    1.0       6      1     0   
1  NZnhc2sEQy3RmzKTZnqtwQ    5.0       0      0     0   
2  WTqjgwHlXbSFevF32_DJVw    5.0       3      0     0   
3  ikCg8xy5JIg_NGPx-MSIDA    5.0       0      0     0   
4  b1b1eb3uo-w561D0ZfCEiQ    1.0       7      0     0   

                                                text                 date  
0  total bill horrible service 8gs crooks actuall...  2013-05-07 04:34:36  
1  adore travis hard rocks new kelly cardenas sal...  2017-01-14 21:30:33  
2  say office really together organized f

In [3]:
No_of_Words = 5000
Max_Seq = 200
Embed_Dim = 100

tokenizer = text.Tokenizer(num_words = No_of_Words, filters = '"#&()*+,-./;:<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(yelp['text'].values)
word_index = tokenizer.word_index

print(len(word_index))

113738


In [4]:
x = tokenizer.texts_to_sequences(yelp['text'].values)
x = sequence.pad_sequences(x, maxlen = Max_Seq)

print(x.shape)

(100000, 200)


In [5]:
y = pd.get_dummies(yelp['stars'].values)

print(y.shape)

(100000, 5)


In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2)

In [7]:
from keras.callbacks import CSVLogger, ModelCheckpoint

words = Input(shape=(None,))
x = Embedding(No_of_Words, Embed_Dim)(words)
x = SpatialDropout1D(0.2)(x)
x = LSTM(64, dropout=0.2, recurrent_dropout=0.2, return_sequences = True, name='LSTM_1')(x)
hidden = concatenate([
    GlobalMaxPooling1D()(x),
    GlobalAveragePooling1D()(x),
])
hidden = add([hidden, Dense(128)(hidden)])
result = Dense(5, activation='softmax', name='soft_1')(hidden)

model = Model(words, result)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

try:
    model.load_weights('shooting_stars3.h5', by_name = True)
    print("Weights successfully loaded.")
except:
    print("No weights loaded. Proceeding to train.")

history = model.fit(x_train, y_train,
                    epochs=1,
                    batch_size=64,
                    validation_split=0.2,
                    callbacks=[EarlyStopping(monitor='val_loss', patience=5, min_delta=0.0001, restore_best_weights=True),
                    ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True),
                    CSVLogger('log.csv', append=True, separator=';')])

model.save_weights('shooting_stars3.h5')
model.save('SAmodel.h5')
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Weights successfully loaded.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 64000 samples, validate on 16000 samples
Epoch 1/1

Epoch 00001: val_loss improved from inf to 0.70135, saving model to weights.hdf5
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    500000      input_1[0][0]                    
_____________________

In [8]:
accuracy = model.evaluate(x_test, y_test)
print(f'Test set\n Loss: {accuracy[0]}\n Accuracy: {accuracy[1]}')

Test set
 Loss: 0.6925991180419921
 Accuracy: 0.70955


In [9]:
#Making a prediction on a tricky 3-star review from our data.
sample = "Tracy dessert had a big name in Hong Kong and the one in First Markham place has been here for many years now! Came in for some Chinese dessert, and I must say their selection has increased tremendously over the years. I might as well add that the price has also increased tremendously as well. The waitress gave us tea, which I could taste had red date in it. Fancy! A simple taro with coconut with tapioca pearls was like $5.25 or something. Basically all the desserts were more than $5. That's crazy! I can literally just make this dessert at home and for a bowl, it would probably cost like $0.50. A few years ago, I think I can still get it for like $3-$4, which is more reasonable, but wow, more than $5 is a little over the top for this dessert. Though I must say, it is Tracy Dessert, and they are a little more on the expensive side. I also saw other items on the menu like fish balls, chicken wings, shaved ice. My friend got a mango drink with fresh mango in it! I'm also surprised how many people come to Tracy Dessert after work. We came on a Sunday and the tables were always filled. I think the amount of tables they had were just perfect because no one really waited for seats for a long time, but the tables kept filling up once a table was finished."
sample = preprocess(sample)
tokenizer.fit_on_texts(sample)
sample = tokenizer.texts_to_sequences([sample])
sample = sequence.pad_sequences(sample, maxlen = Max_Seq)

pred = model.predict(sample)
print(pred)
print(np.argmax(pred)+1)
#The Sequential class has a predict_classes function, but for the Model class we have to return probability vectors and pick out the highest one.

[[0.00225064 0.02780737 0.17857498 0.6430721  0.14829487]]
4


In [11]:
def predict_review(text):
    text = preprocess(text)
    tokenizer.fit_on_texts(text)
    text = tokenizer.texts_to_sequences([text])
    text = sequence.pad_sequences(text, maxlen = Max_Seq)
    pred = model.predict(text)
    pred = np.argmax(pred)+1
    preds = str(pred)
    if pred > 3:
        print("Wow, such service. Great business, " + preds + " Stars!")
    elif pred < 3:
        print("Wow, awful business. Very poor, " + preds + " Stars.")
    else:
        print("Very business. Okay. " + preds + " Stars.")

predict_review(input("Tell the computer about your experience. "))

Very business. Okay. 3 Stars.
