In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve

from sklearn import metrics

import contractions

import scikitplot as skplt
import matplotlib.pyplot as plt

In [2]:
yelp_data_5k = pd.read_csv("/home/sebastian/Documents/bachelor-final/Dataset/PROCESSED_RESTAURANT_REVIEWS_5k.csv")

In [3]:
yelp_data_5k.head()

Unnamed: 0.1,Unnamed: 0,reviewContent,usefulCount,coolCount,funnyCount,rating,restaurantID,reviewCleanWithStopwords,reviewCleanNoStopwords,reviewCleanPorterStemmer,reviewCleanSnowballStemmer,reviewCleanLemmatized,reviewCleanLancaster
0,0,"""'Check, Please."" The bartender was unable to ...",18,11,25,1,VZHyAmdFDreQqL0BT-zdoA,check please the bartender was unable to recom...,check please bartender unable recommend beer t...,check pleas bartend unabl recommend beer tap t...,check pleas bartend unabl recommend beer tap t...,check please bartender unable recommend beer t...,check pleas bartend un recommend beer tap tri ...
1,1,"""2 stars for disappointing food, one star for ...",14,10,7,3,tFcmrGLZNEymSnijoTPmqw,stars for disappointing food one star for grea...,stars disappointing food one star great servic...,star disappoint food one star great servic rea...,star disappoint food one star great servic rea...,star disappoint food one star great service re...,star disappoint food on star gre serv read iai...
2,2,"""A Divine Dialogue"" God: ""Britton. Times up. T...",14,16,23,5,INvIaBFnAvGxzTXFWHzGvA,divine dialogue god britton times up the world...,divine dialogue god britton times world going ...,divin dialogu god britton time world go end to...,divin dialogu god britton time world go end to...,divine dialogue god britton time world go end ...,divin dialog god britton tim world going end t...
3,3,"""A Place To Go When You Have Time"" It was a mi...",13,10,10,5,FySId5SjNhkrtPA5qktdxg,place to go when you have time it was misty br...,place go time misty breezy summer night friend...,place go time misti breezi summer night friend...,place go time misti breezi summer night friend...,place go time misty breezy summer night friend...,plac go tim misty breezy sum night friend jere...
4,4,"""A Thaiphoon of Flavor"" My First: I never enli...",27,22,28,5,RgeMUiZncTs-VSHQLm0wNg,thaiphoon of flavor my first never enlisted in...,thaiphoon flavor first never enlisted air forc...,thaiphoon flavor first never enlist air forc a...,thaiphoon flavor first never enlist air forc a...,thaiphoon flavor first never enlist air force ...,thaiphoon flav first nev enl air forc allow jo...


In [4]:
yelp_data_5k.rating.value_counts()

2    1250
5    1179
1    1159
3    1130
4    1051
Name: rating, dtype: int64

In [5]:
yelp_data_5k.isnull().values.any()

False

# Plotting function

In [6]:
def plot_history(history):
    accuracy = history.history['acc']
    val_accuracy = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    
    epochs = range(1,len(accuracy) + 1)
    
    # Plot accuracy  
    plt.figure(1)
    plt.plot(epochs, accuracy, 'b', label='Training accuracy')
    plt.plot(epochs, val_accuracy, 'g', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Plot loss
    plt.figure(2)
    plt.plot(epochs, loss, 'b', label='Training loss')
    plt.plot(epochs, val_loss, 'g', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Remove data from table with rating = 2 or 4

In [7]:
# Get names of indexes for which column rating has value 2
indexNamesRating2 = yelp_data_5k[ yelp_data_5k['rating'] == 2 ].index
 
# Delete these row indexes from dataFrame
yelp_data_5k.drop(indexNamesRating2, inplace=True)

# Get names of indexes for which column rating has value 4
indexNamesRating4 = yelp_data_5k[ yelp_data_5k['rating'] == 4 ].index
 
# Delete these row indexes from dataFrame
yelp_data_5k.drop(indexNamesRating4, inplace=True)

In [8]:
yelp_data_5k.rating.value_counts()

5    1179
1    1159
3    1130
Name: rating, dtype: int64

# Allocating each text with its specific cleaning method to a variable

In [9]:
x_cleanWithStopwords = yelp_data_5k["reviewCleanWithStopwords"]
x_cleanNoStopwords = yelp_data_5k["reviewCleanNoStopwords"]
x_porterStemmer = yelp_data_5k["reviewCleanPorterStemmer"]
x_snowballStemmer = yelp_data_5k["reviewCleanSnowballStemmer"]
x_lemmatized = yelp_data_5k["reviewCleanLemmatized"]
x_lancaster = yelp_data_5k["reviewCleanLancaster"]

In [10]:
tf = TfidfVectorizer(ngram_range=(1,2), strip_accents='unicode', decode_error='replace', analyzer='word', min_df=5).fit(x_cleanWithStopwords)
x_cleanWithStopwords = tf.transform(x_cleanWithStopwords)

tf = TfidfVectorizer(ngram_range=(1,2), strip_accents='unicode', decode_error='replace', analyzer='word', min_df=5).fit(x_cleanNoStopwords)
x_cleanNoStopwords = tf.transform(x_cleanNoStopwords)

tf = TfidfVectorizer(ngram_range=(1,2), strip_accents='unicode', decode_error='replace', analyzer='word', min_df=5).fit(x_porterStemmer)
x_porterStemmer = tf.transform(x_porterStemmer)

tf = TfidfVectorizer(ngram_range=(1,2), strip_accents='unicode', decode_error='replace', analyzer='word', min_df=5).fit(x_snowballStemmer)
x_snowballStemmer = tf.transform(x_snowballStemmer)

tf = TfidfVectorizer(ngram_range=(1,2), strip_accents='unicode', decode_error='replace', analyzer='word', min_df=5).fit(x_lemmatized)
x_lemmatized = tf.transform(x_lemmatized)

tf = TfidfVectorizer(ngram_range=(1,2), strip_accents='unicode', decode_error='replace', analyzer='word', min_df=5).fit(x_lancaster)
x_lancaster = tf.transform(x_lancaster)

In [11]:
selector_cleanWithStopwords = SelectKBest(f_classif, k=min(19132, x_cleanWithStopwords.shape[1]))
selector_cleanNoStopwords = SelectKBest(f_classif, k=min(19132, x_cleanNoStopwords.shape[1]))
selector_porterStemmer = SelectKBest(f_classif, k=min(19132, x_porterStemmer.shape[1]))
selector_snowballStemmer = SelectKBest(f_classif, k=min(19132, x_snowballStemmer.shape[1]))
selector_lemmatized = SelectKBest(f_classif, k=min(19132, x_lemmatized.shape[1]))
selector_lancaster = SelectKBest(f_classif, k=min(19132, x_lancaster.shape[1]))

In [12]:
y = yelp_data_5k.rating.values

In [13]:
selector_cleanWithStopwords.fit(x_cleanWithStopwords, y)
selector_cleanNoStopwords.fit(x_cleanNoStopwords, y)
selector_porterStemmer.fit(x_porterStemmer, y)
selector_snowballStemmer.fit(x_snowballStemmer, y)
selector_lemmatized.fit(x_lemmatized, y)
selector_lancaster.fit(x_lancaster, y)

SelectKBest(k=19132, score_func=<function f_classif at 0x7f62f0f87b70>)

In [14]:
x_cleanWithStopwords = selector_cleanWithStopwords.transform(x_cleanWithStopwords).astype('float32')
x_cleanNoStopwords = selector_cleanNoStopwords.transform(x_cleanNoStopwords).astype('float32')
x_porterStemmer = selector_porterStemmer.transform(x_porterStemmer).astype('float32')
x_snowballStemmer = selector_snowballStemmer.transform(x_snowballStemmer).astype('float32')
x_lemmatized = selector_lemmatized.transform(x_lemmatized).astype('float32')
x_lancaster = selector_lancaster.transform(x_lancaster).astype('float32')

In [15]:
X_cleanWithStopwords = x_cleanWithStopwords.toarray()
X_cleanNoStopwords = x_cleanNoStopwords.toarray()
X_porterStemmer = x_porterStemmer.toarray()
X_snowballStemmer = x_snowballStemmer.toarray()
X_lemmatized = x_lemmatized.toarray()
X_lancaster = x_lancaster.toarray()

In [16]:
y = yelp_data_5k['rating'].values

In [17]:
X_train_cleanWithStopwords, X_test_cleanWithStopwords, y_train, y_test = train_test_split(X_cleanWithStopwords, y, test_size=0.2, random_state=50)

X_train_cleanNoStopwords, X_test_cleanNoStopwords, y_train, y_test = train_test_split(X_cleanNoStopwords, y, test_size=0.2, random_state=50)

X_train_porterStemmer, X_test_porterStemmer, y_train, y_test = train_test_split(X_porterStemmer, y, test_size=0.2, random_state=50)

X_train_snowballStemmer, X_test_snowballStemmer, y_train, y_test = train_test_split(X_snowballStemmer, y, test_size=0.2, random_state=50)

X_train_lemmatized, X_test_lemmatized, y_train, y_test = train_test_split(X_lemmatized, y, test_size=0.2, random_state=50)

X_train_lancaster, X_test_lancaster, y_train, y_test = train_test_split(X_lancaster, y, test_size=0.2, random_state=50)

In [18]:
from keras.models import Sequential
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

Using TensorFlow backend.


In [19]:
input_shape = X_train_cleanNoStopwords.shape[1:]

In [20]:
input_shape

(19132,)

In [21]:
model = Sequential()
model.add(Dropout(rate=0.2, input_shape = input_shape))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(units=3, activation='softmax'))

model.summary()

W0703 19:06:29.592908 140064191469376 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0703 19:06:29.602348 140064191469376 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0703 19:06:29.603866 140064191469376 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0703 19:06:29.608847 140064191469376 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a futur

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_1 (Dropout)          (None, 19132)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                1224512   
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 195       
Total params: 1,224,707
Trainable params: 1,224,707
Non-trainable params: 0
_________________________________________________________________


In [22]:
from keras.optimizers import Adam
optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])

W0703 19:06:33.457606 140064191469376 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0703 19:06:33.461236 140064191469376 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3341: The name tf.log is deprecated. Please use tf.math.log instead.



In [23]:
from keras.callbacks import EarlyStopping
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

In [24]:
history_cleanWithStopwords = model.fit(X_train_cleanWithStopwords, y_train, epochs=100, validation_data=(X_test_cleanWithStopwords, y_test), verbose=1, batch_size=16, callbacks=callbacks)

history_cleanNoStopwords = model.fit(X_train_cleanNoStopwords, y_train, epochs=100, validation_data=(X_test_cleanNoStopwords, y_test), verbose=1, batch_size=64, callbacks=callbacks)

history_porterStemmer = model.fit(X_train_porterStemmer, y_train, epochs=100, validation_data=(X_test_porterStemmer, y_test), verbose=1, batch_size=64, callbacks=callbacks)

history_snowballStemmer = model.fit(X_train_snowballStemmer, y_train, epochs=100, validation_data=(X_test_snowballStemmer, y_test), verbose=1, batch_size=64, callbacks=callbacks)

history_lemmatized = model.fit(X_train_lemmatized, y_train, epochs=100, validation_data=(X_test_lemmatized, y_test), verbose=1, batch_size=64, callbacks=callbacks)

history_lancaster = model.fit(X_train_lancaster, y_train, epochs=100, validation_data=(X_test_lancaster, y_test), verbose=1, batch_size=64, callbacks=callbacks)

W0703 19:06:35.763130 140064191469376 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 2774 samples, validate on 694 samples
Epoch 1/100
Epoch 2/100
Train on 2774 samples, validate on 694 samples
Epoch 1/100
Epoch 2/100
Train on 2774 samples, validate on 694 samples
Epoch 1/100
Epoch 2/100
Train on 2774 samples, validate on 694 samples
Epoch 1/100
Epoch 2/100
Train on 2774 samples, validate on 694 samples
Epoch 1/100
Epoch 2/100
Train on 2774 samples, validate on 694 samples
Epoch 1/100
Epoch 2/100


In [None]:
test_loss, test_acc = model.evaluate(X_test_cleanWithStopwords, y_test, verbose=1)
print('WITH STOPWORDS')
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

test_loss, test_acc = model.evaluate(X_test_cleanNoStopwords, y_test, verbose=1)
print('NO STOPWORDS')
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

test_loss, test_acc = model.evaluate(X_test_porterStemmer, y_test, verbose=1)
print('PORTERSTEMMER')
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

test_loss, test_acc = model.evaluate(X_test_snowballStemmer, y_test, verbose=1)
print('SNOWBALLSTEMMER')
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

test_loss, test_acc = model.evaluate(X_test_lemmatized, y_test, verbose=1)
print('LEMMATIZED')
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

test_loss, test_acc = model.evaluate(X_test_lancaster, y_test, verbose=1)
print('LANCASTER')
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

In [None]:
plot_history(history_cleanWithStopwords)
plot_history(history_cleanNoStopwords)
plot_history(history_porterStemmer)
plot_history(history_snowballStemmer)
plot_history(history_lemmatized)
plot_history(history_lancaster)