### Load Packages

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

### Load dataset

In [2]:
# The reviews file is tab-separated, so the delimiter must be given explicitly.
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter="\t")
# Preview the first ten rows.
df.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


### Data Preprocessing: Cleaning

In [3]:
def clean(x):
    """Normalise one raw review string before bag-of-words vectorisation.

    Steps, in order: strip HTML tags, expand a few negative contractions,
    drop 10-digit numbers, replace every non-letter with a space, collapse
    whitespace runs, and lower-case the result.

    Parameters
    ----------
    x : str
        Raw review text.

    Returns
    -------
    str
        Lower-case text containing only ASCII letters and single spaces.
        A leading and/or trailing space may remain; it is not stripped.
    """
    # to remove html tags
    x = re.sub(r'<.*?>', ' ', x)
    # Map the typographic apostrophe to the ASCII one so the contraction
    # rules below also fire on text pasted from word processors.
    x = x.replace('\u2019', "'")
    # to expand contractions; matching is case-insensitive because the text
    # is only lower-cased at the very end ("Can't", "DIDN'T", ...)
    x = re.sub(r"can't", 'can not', x, flags=re.IGNORECASE)
    x = re.sub(r"didn't", 'did not', x, flags=re.IGNORECASE)
    # accepts the correct spelling "haven't" as well as the typo "havn't"
    x = re.sub(r"have?n't", 'have not', x, flags=re.IGNORECASE)
    # to remove contact number
    x = re.sub(r'\d{10}', ' ', x)
    # to remove punctuations and numbers
    x = re.sub(r'[^A-Za-z]', ' ', x)
    # to replace more than 1 space with single space
    x = re.sub(r'\s+', ' ', x)
    # to convert data to lower case
    x = x.lower()
    return x
# Smoke-test the cleaner on a string mixing HTML tags, repeated spaces,
# a 10-digit number, a contraction and stray symbols.
clean("<h1>How are  ARE you? 9898989898 didn't (*&^) 987y &*^% pick call</h1>")

' how are are you did not y pick call '

In [4]:
# Overwrite the Review column with its cleaned form, one review at a time.
df["Review"] = df["Review"].map(clean)

In [5]:
# Show the first ten rows again to confirm the Review column is now cleaned.
df.head(10)

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1
5,now i am getting angry and i want my damn pho,0
6,honeslty it did not taste that fresh,0
7,the potatoes were like rubber and you could te...,0
8,the fries were great too,1
9,a great touch,1


In [6]:
# Extract the review texts (features) and the Liked labels (targets).
# Series.to_numpy() is the accessor pandas recommends over .values: it always
# yields a NumPy ndarray, whereas .values can return a pandas extension array
# depending on the column dtype.
X = df["Review"].to_numpy()
Y = df["Liked"].to_numpy()

In [7]:
# Sanity check: features and labels should have the same number of entries.
print(X.shape, Y.shape)

(1000,) (1000,)


In [8]:
# Count how many reviews carry each label to check the class balance.
pd.Series(Y).value_counts()

1    500
0    500
dtype: int64

<h2>Train Test Split</h2>

In [9]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=.20,
    random_state=10,
)

In [10]:
# Confirm the sizes of the training and test partitions.
xtrain.shape, xtest.shape

((800,), (200,))

In [11]:
# Show the first training review and its label, one per line.
print(xtrain[0], ytrain[0], sep="\n")

what i really like there is the crepe station 
1


### Tokenization using CountVectorizer

In [12]:
# Fit the vectoriser on the training reviews only; the test reviews are merely
# transformed, so nothing from the held-out split influences the fit.
cv = CountVectorizer()
# .toarray() converts each result to a dense array.
# NOTE(review): the dense conversion may be unnecessary for LogisticRegression
# and costs memory on larger corpora — confirm before scaling up.
cv_train = cv.fit_transform(xtrain).toarray()
cv_test = cv.transform(xtest).toarray()

In [13]:
# (number of training reviews, number of columns produced by the vectoriser)
cv_train.shape

(800, 1763)

In [14]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use its replacement and fall back only on older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Convert to plain Python strings so the printed list looks the same on every
# version (the newer API returns a NumPy array, which prints without commas).
feature_names = [str(name) for name in feature_names]
print(feature_names[-5:])
print(len(feature_names))

['yucky', 'yukon', 'yum', 'yummy', 'zero']
1763


### Train Model

In [15]:
# NOTE(review): C=1.2 is hard-coded and the notebook does not show how it was
# chosen — record the tuning procedure or move it to a named constant.
lg = LogisticRegression(C=1.2)
# Fit on the vectorised training reviews; the fitted estimator is the cell output.
lg.fit(cv_train, ytrain)


LogisticRegression(C=1.2)

### Evaluate

In [16]:
# Predicted labels for both partitions, using the fitted classifier.
tr_pred = lg.predict(cv_train)
tst_pred = lg.predict(cv_test)

In [17]:
# Confusion matrix of the held-out labels (first argument) against the
# model's predictions for the same reviews (second argument).
print(metrics.confusion_matrix(ytest, tst_pred))

[[82 18]
 [18 82]]


In [18]:
# Score the classifier on the data it was fitted on and on the held-out split.
# NOTE(review): in the recorded run the training score (0.99) is far above the
# testing score (0.82), which suggests the model overfits the training data.
print("Training Score: ", lg.score(cv_train, ytrain))
print("Testing Score: ", lg.score(cv_test, ytest))

Training Score:  0.99125
Testing Score:  0.82


### Let's check the model

In [19]:
# Hand-written sentences used to spot-check the trained model.
test = np.array([
    "Worst Experience Ever",
    "I must say it fabulus",
    "Horrible! Don't eat here",
    "I hate this",
    "I love this food",
    "amazing food",
    "hate it",
    "love it",
    "horrible experience",
])

In [20]:
# Run the sample sentences through the same clean() step the training reviews
# received before vectorising them; otherwise contractions and markup would be
# tokenised differently at inference time than during training.
cv_tst = cv.transform([clean(review) for review in test]).toarray()

In [21]:
# Classify the sample sentences and tabulate them beside readable labels.
pred = lg.predict(cv_tst)
result = pd.DataFrame({"Review": test, "Prediction": pred})
# Translate the numeric classes into words for display.
result["Prediction"] = result["Prediction"].map({0: "Negative", 1: "Positive"})
result

Unnamed: 0,Review,Prediction
0,Worst Experience Ever,Negative
1,I must say it fabulus,Positive
2,Horrible! Don't eat here,Negative
3,I hate this,Negative
4,I love this food,Positive
5,amazing food,Positive
6,hate it,Negative
7,love it,Positive
8,horrible experience,Negative


### Save Model / Export to pkl file

In [42]:
# Persist the trained classifier. The context manager closes the file handle
# even if pickling raises, so no handle is leaked.
with open("review_model.pkl", "wb") as fl:
    pickle.dump(lg, fl)

In [43]:
# Persist the fitted vectoriser. The classifier was trained on the columns this
# vectoriser produces, so the same vectoriser is needed to encode new text.
# The context manager closes the file handle even if pickling raises.
with open("cv.pkl", "wb") as f2:
    pickle.dump(cv, f2)