In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

### Importing Dataset

In [None]:
df=pd.read_csv('Restaurant reviews.csv', encoding = "ISO-8859-1")

In [None]:
df.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,


In [None]:
df.shape

(10000, 8)

### Cleaning & Preparing Data

In [None]:
df = df.drop(columns=["Restaurant","Reviewer","Metadata","Time","Pictures"])

In [None]:
y = df["Rating"]
X = df.drop(columns=["Rating"])
X.shape

(10000, 2)

In [None]:
y = y.replace({'Like':3})

In [None]:
y.isnull().sum()


38

In [None]:
y = pd.to_numeric(y)

In [None]:
y = y.fillna(y.median())

In [None]:
for i in range(0,len(y)):
    y.iloc[i] = round(y.iloc[i],0)

In [None]:
for i in range(0,len(y)):
    if (y[i]>=3):
        y[i] = "Positive"
    else:
        y[i] = "Negative"

In [None]:
#Apply this for LSTM then move to Stemming
for i in range(0,len(y)):
    if (y[i]>=3):
        y[i] = 1
    else:
        y[i] = 0

In [None]:
y.unique()

array([1., 0.])

In [None]:
y.shape

(10000,)

### Applying NLP Processes

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(X)):
    review = re.sub('[^a-zA-Z]',' ', str(X['Review'][i]))
    review = review.lower() #Lowering the words is very imporatant in avoiding classifying same words as different words
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] #Eleminating words that do not put much value in sentences.
    review = ' '.join(review) #Reconstructing sentences
    corpus.append(review)


### For implementing LSTM goto direct LSTM Section Else Continue


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=9000) #After experimenting with 7500, 5000, 2500 ...9000 worked best.
X = cv.fit_transform(corpus).toarray()

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

### Deciding Best Model:
#### Trying Out MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB
restaurant_review_model = MultinomialNB().fit(X_train, y_train)

In [None]:
y_pred = restaurant_review_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)

In [None]:
print(confusion_m)

[[ 457  127]
 [ 102 1814]]


In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9084


#### Trying out Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
randomclassifier=RandomForestClassifier(n_estimators=200,criterion='entropy')
randomclassifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = randomclassifier.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

[[ 386  198]
 [  70 1846]]


In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8928


In [None]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

[[   1  583]
 [   0 1916]]


In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.7668


#### Trying KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
confusion_m = confusion_matrix(y_test, y_pred)
print(confusion_m)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

[[ 308  276]
 [ 188 1728]]
0.8144


### Here, Among Four MultinomialNB has higher accuracy...

### Applying Bi-Directional LSTM with WordEmbedding

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
### Vocabulary size
voc_size=5000

In [None]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[1915,
  2614,
  2597,
  2336,
  2614,
  4299,
  3797,
  272,
  2341,
  2614,
  2752,
  3360,
  289,
  1530,
  1990,
  3807,
  1237,
  95,
  4461,
  496,
  1459,
  3120,
  4592,
  3308],
 [1915,
  2614,
  2943,
  4680,
  1197,
  340,
  2597,
  2614,
  2614,
  1923,
  496,
  1459,
  284,
  1197],
 [3227,
  1216,
  13,
  2597,
  13,
  1915,
  4803,
  1197,
  3458,
  2661,
  3215,
  4088,
  3222,
  3177,
  3100,
  1990,
  2080,
  184,
  3348],
 [496,
  1459,
  3799,
  13,
  1368,
  2977,
  2580,
  2614,
  2597,
  2382,
  4852,
  1549,
  655,
  2752],
 [2597,
  2614,
  4962,
  787,
  3567,
  296,
  4639,
  248,
  2614,
  736,
  3458,
  4085,
  2984,
  1256,
  1915,
  1990,
  2614],
 [4483,
  2614,
  1197,
  2614,
  2597,
  1602,
  858,
  3340,
  1197,
  2597,
  2614,
  947,
  2614,
  4758,
  1837,
  1664,
  2614,
  1511,
  273],
 [1020,
  2752,
  1915,
  899,
  2597,
  4962,
  3381,
  1197,
  1990,
  1537,
  3658,
  655,
  375,
  2984,
  3120,
  3227,
  655,
  2752],
 [2984,
  2367,
  206

In [None]:
sent_length=40
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 3120 4592 3308]
 [   0    0    0 ... 1459  284 1197]
 [   0    0    0 ... 2080  184 3348]
 ...
 [1968 1911 2716 ...  375 2577 4057]
 [   0    0 3215 ...  714 1197 3803]
 [  65 2000 1275 ... 2648 1073 1915]]


In [None]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0, 1915, 2614, 2597, 2336, 2614, 4299,
       3797,  272, 2341, 2614, 2752, 3360,  289, 1530, 1990, 3807, 1237,
         95, 4461,  496, 1459, 3120, 4592, 3308])

In [None]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [None]:
X_final.shape,y_final.shape

((10000, 40), (10000,))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.25, random_state=0)

In [None]:
from tensorflow.keras.layers import Bidirectional
import keras
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=tensorflow.keras.Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(150)))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=5,batch_size=64)

Train on 7500 samples, validate on 2500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x192f5f62048>

In [None]:
y_pred=model.predict_classes(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[ 415,  169],
       [  97, 1819]], dtype=int64)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8936

### Still MultinomialNB has higher Accuracy