In [1]:
#import necessary libraries to run the program
import pandas as pd

#Tdidf Transformer is used to determine the frequency of words and their contribution to the information
from sklearn.feature_extraction.text import TfidfTransformer

#MultinomialNB is used to classify between liked and not liked
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
#load the dataset - restaurant reviews
dataset = pd.read_csv("Restaurant_Reviews.tsv", encoding='ANSI',delimiter='\t',quoting=3)

In [3]:
#print the first few items of the dataset
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
#Details of the dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


In [5]:
#Details of the dataset by the class - '1' or '0'
dataset.groupby('Liked').describe()

Unnamed: 0_level_0,Review,Review,Review,Review
Unnamed: 0_level_1,count,unique,top,freq
Liked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,500,497,I would not recommend this place.,2
1,500,499,I love this place.,2


In [6]:
#divide the dataset into feature/review and output variables
X=dataset['Review']
Y=dataset['Liked']

print(X.head())

#divide the dataset into training and testing set. Here X1 and X2 are the training and testing set respectively
X1, X2, Y1, Y2 = train_test_split(X,Y,random_state=0)


#fit the training data to the Vectorizer
cv1 = CountVectorizer(stop_words="english")
cv1.fit(X1)    

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: Review, dtype: object


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
#Transform the text and apply it to TD-IDF algorithm for further processing and then model it
X1_trans = cv1.transform(X1)
X1_Tdidf = TfidfTransformer().fit(X1_trans)
X1_final = X1_Tdidf.transform(X1_trans)
model = MultinomialNB().fit(X1_final,Y1)

#Transform the testing data and apply it to TD-IDF algorithm for further processing and then test it
X2_trans = cv1.transform(X2)
X2_Tdidf = TfidfTransformer().fit(X2_trans)
X2_final = X2_Tdidf.transform(X2_trans)

#print the predicted values
predict = model.predict(X2_final)
print (predict)

[0 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1 1 0 1 0 0
 0 1 1 1 1 1 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1
 0 0 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0
 1 0 1 1 0 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 0 1 1
 1 0 1 0 1 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0
 0 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 1 1 1 0 0
 0 0 0 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 0 0]


In [8]:
#print the efficiency of the model
print (roc_auc_score(Y2,predict))

0.736617183985605
