### Restaurant Review classification

In [27]:
import pandas as pd

In [28]:
# Read the tab-separated review file into a DataFrame and preview the first rows.
data = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t")
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [29]:
# Count how many reviews fall into each value of the "Liked" target column.
data["Liked"].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [30]:
# It's a balanced dataset: 500 reviews with Liked = 1 and 500 with Liked = 0.

## Data Preprocessing

In [31]:
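# Text preprocessing steps:
#   1. Punctuation removal
#   2. Tokenization
#   3. Normalization (lower-casing)
#   4. Removal of stop words
#
# Feature extraction steps:
#   5. Creating the bag-of-words (BOW) representation
#   6. TF-IDF weighting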

In [32]:
from nltk.corpus import stopwords
import string

In [33]:
# writing a function to preprocess data

def preprocess_data(features):
    """Convert one review string into a list of cleaned tokens.

    Steps, in order: delete punctuation characters, lower-case the text,
    split it on whitespace, and drop English stop words.

    Parameters
    ----------
    features : str
        A single review. The punctuation filter iterates over `features`
        element by element, so it is only meaningful for a string; passing
        a collection of reviews would treat each whole review as one
        "character" and leave its punctuation untouched.

    Returns
    -------
    list of str
        The remaining lower-cased tokens, in their original order. Empty
        when the review contains nothing but stop words, punctuation or
        whitespace.
    """
    # Punctuation is deleted rather than replaced by a space, so words joined
    # only by punctuation (e.g. "good...bad") end up fused into one token.
    without_punctuation = "".join(
        char for char in features if char not in string.punctuation
    )

    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no empty-string tokens are produced.
    words = without_punctuation.lower().split()

    # Build the stop-word lookup once per call instead of once per word.
    stop_words = set(stopwords.words("english"))

    return [word for word in words if word not in stop_words]

In [34]:
# Model input is the raw review text; the target is the "Liked" column.
features, labels = data["Review"], data["Liked"]

In [35]:
# NOTE(review): `features` is the whole Review column, not a single string.
# preprocess_data iterates over it element by element, so each complete review
# is compared against string.punctuation as one unit, is kept unchanged, and
# all reviews are then joined together with no separator. That is why the
# output below still contains punctuation and fused tokens such as
# 'place.crust'. To preview the cleaning per review, call it on one string
# (e.g. features[0]) or use features.apply(preprocess_data).
preprocess_data(features)

['wow...',
 'loved',
 'place.crust',
 'good.not',
 'tasty',
 'texture',
 'nasty.stopped',
 'late',
 'may',
 'bank',
 'holiday',
 'rick',
 'steve',
 'recommendation',
 'loved',
 'it.the',
 'selection',
 'menu',
 'great',
 'prices.now',
 'getting',
 'angry',
 'want',
 'damn',
 'pho.honeslty',
 'taste',
 'fresh.)the',
 'potatoes',
 'like',
 'rubber',
 'could',
 'tell',
 'made',
 'ahead',
 'time',
 'kept',
 'warmer.the',
 'fries',
 'great',
 'too.a',
 'great',
 'touch.service',
 'prompt.would',
 'go',
 'back.the',
 'cashier',
 'care',
 'ever',
 'say',
 'still',
 'ended',
 'wayyy',
 'overpriced.i',
 'tried',
 'cape',
 'cod',
 'ravoli,',
 'chicken,',
 'cranberry...mmmm!i',
 'disgusted',
 'pretty',
 'sure',
 'human',
 'hair.i',
 'shocked',
 'signs',
 'indicate',
 'cash',
 'only.highly',
 'recommended.waitress',
 'little',
 'slow',
 'service.this',
 'place',
 'worth',
 'time,',
 'let',
 'alone',
 'vegas.did',
 'like',
 'all.the',
 'burrittos',
 'blah!the',
 'food,',
 'amazing.service',
 'also'

In [36]:
# Build the bag-of-words (BOW) model with scikit-learn.
#
# Pipeline: raw review ---> preprocess_data ---> BOW (vocabulary + count matrix)

from sklearn.feature_extraction.text import CountVectorizer

# preprocess_data is plugged in as the analyzer, so the vectorizer calls it on
# every review to obtain that review's tokens while learning the vocabulary.
wordVector = CountVectorizer(analyzer=preprocess_data)
wordVector.fit(features);  # trailing ';' keeps the cell output empty, as before

In [37]:
# Encode every review as a row of token counts over the fitted vocabulary.
bow = wordVector.transform(features)

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the count matrix, then re-weight that same matrix.
TfidfObject = TfidfTransformer()
TfidfObject.fit(bow)
finalFeature = TfidfObject.transform(bow)

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the vocabulary and IDF weights were fitted on all rows before
# this split, so the test rows have already influenced the feature space.
X_train, X_test, y_train, y_test = train_test_split(
    finalFeature,
    labels,
    test_size=0.2,
    random_state=0,
)

In [40]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier using the estimator's default hyper-parameters.
lore = LogisticRegression()
lore.fit(X_train, y_train);  # trailing ';' keeps the cell output empty, as before

In [41]:
# Mean accuracy on the training split (first line) and the test split (second line).
print(lore.score(X_train, y_train), lore.score(X_test, y_test), sep="\n")

0.96
0.755


In [42]:
from sklearn.metrics import confusion_matrix

In [43]:
# Confusion matrix for the held-out split; arguments are (true labels, predicted labels).
confusion_matrix(y_test, lore.predict(X_test))

array([[84, 13],
       [36, 67]], dtype=int64)

In [44]:
from sklearn.metrics import classification_report

# Text summary of per-class metrics for the baseline model on the held-out split.
print(classification_report(y_test, lore.predict(X_test)))

              precision    recall  f1-score   support

           0       0.70      0.87      0.77        97
           1       0.84      0.65      0.73       103

    accuracy                           0.76       200
   macro avg       0.77      0.76      0.75       200
weighted avg       0.77      0.76      0.75       200



import warnings

# Exploratory sweep: refit a default LogisticRegression on 100 different
# train/test splits and report any seed whose test accuracy exceeds both the
# training accuracy and 0.9.
#
# NOTE(review): choosing a random_state because it gives a high test score
# uses the test set for model selection; treat anything printed here as a
# curiosity, not as an estimate of real-world performance.
warnings.filterwarnings("ignore")
for i in range(1, 101):
    # Loop-local names: the random_state=0 split held in X_train / X_test /
    # y_train / y_test is used by later cells and must not be overwritten.
    X_tr, X_te, y_tr, y_te = train_test_split(finalFeature,
                                              labels,
                                              test_size=0.2,
                                              random_state=i)
    model = LogisticRegression()
    model.fit(X_tr, y_tr)

    trainScore = model.score(X_tr, y_tr)
    testScore = model.score(X_te, y_te)

    if testScore > trainScore and testScore > 0.9:
        print("Testing {} , Training {}, RS {}".format(testScore,trainScore,i))

In [45]:
#Deployment Check
# Classify one review typed in by the user with the fitted vectorizer,
# TF-IDF transformer and model.

smsInput = input("Enter Review: ")

#Preprocess
# Kept for inspection only. It must NOT be passed to the vectorizer: wordVector
# already applies preprocess_data (its analyzer) to every document it receives.
preProcessedFeature = preprocess_data(smsInput)

#BOW
# Wrap the raw text in a list so the whole review is vectorised as a single
# document. Passing the token list instead would turn each token into its own
# document, and the prediction below would be based on the first token alone
# (or fail outright when no tokens survive preprocessing).

bowFeature = wordVector.transform([smsInput])

#TFIDF

tfIDFFeature = TfidfObject.transform(bowFeature)

#Pred
# Exactly one row goes in, so index 0 is the label for the entire review.

predLabel = lore.predict(tfIDFFeature)[0]

print("Given SMS is {}".format(predLabel))

Enter Review: hate
Given SMS is 0


In [46]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Eight candidate values for C, log-spaced between 1e-2 and 1e2.
c_values = np.logspace(-2, 2, num=8)

# Search space: every C value combined with every listed solver.
param_grids = dict(
    C=c_values,
    solver=["newton-cg", "sag", "saga", "lbfgs"],
)

In [47]:
# 5-fold cross-validated grid search around the baseline estimator,
# using all available CPU cores.
grid = GridSearchCV(
    lore,
    param_grids,
    cv=5,
    n_jobs=-1,
)

In [48]:
# Run the cross-validated search over param_grids using the training split only.
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-02, 3.72759372e-02, 1.38949549e-01, 5.17947468e-01,
       1.93069773e+00, 7.19685673e+00, 2.68269580e+01, 1.00000000e+02]),
                         'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, ve

In [49]:
# Parameter combination selected by the grid search.
grid.best_params_

{'C': 1.9306977288832496, 'solver': 'newton-cg'}

In [50]:
# Take the winning settings straight from the search instead of copying them
# by hand, so this cell cannot go stale when the grid or the data changes.
param = dict(grid.best_params_)
lore.set_params(**param)

# Refit the shared `lore` estimator on the training split with those settings.
lore.fit(X_train, y_train)

LogisticRegression(C=1.9306977288832496, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
# Mean accuracy of the tuned model: training split first, then the test split.
print(lore.score(X_train, y_train), lore.score(X_test, y_test), sep="\n")

0.9725
0.755
