## Importing Packages

In [2]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score


## Loading the Dataset

In [3]:
# The file is tab-separated; quoting=3 is csv.QUOTE_NONE, so quote characters
# inside a review are kept as literal text instead of being parsed as field quotes.
data = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


The dataset contains two columns:
    1. Review : The text of the review written by the customer about the restaurant.
    2. Liked : The sentiment label, where 1 means 'Positive' and 0 means 'Negative'.

## Exploratory Data Analysis

In [4]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
# Count the missing values in each column.
data.isna().sum()

Review    0
Liked     0
dtype: int64

In [8]:
# Show the dtype of each column.
data.dtypes

Review    object
Liked      int64
dtype: object

In [10]:
# Number of reviews per label, to check whether the classes are balanced.
data.groupby('Liked').size()

Liked
0    500
1    500
dtype: int64

This shows that the data is balanced.

## NLP

Steps:
    1. Remove numbers and punctuation.
    2. Convert the text to lower case.
    3. Remove the stopwords.
    4. Stem each remaining word.
    5. Build a Bag of Words model with CountVectorizer.

In [13]:
# Download the NLTK stopword list (a no-op if it is already up to date).
# quiet=True suppresses the progress log, which otherwise writes the local
# nltk_data directory (an absolute, user-specific path) into the saved output.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rashi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Build the cleaned corpus: one normalised, space-joined string per review.

# The stemmer and the stopword set do not change between reviews, so they are
# created once here instead of once per review / once per word.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# NOTE(review): the stopword list contains negations such as "not", so
# "Crust is not good." is reduced to "crust good". Behaviour is left unchanged
# here; consider keeping negation words if model quality matters.

corpus = []
# Iterate over the column itself so the cell works for any number of rows
# rather than assuming exactly 1000.
for raw_review in data['Review']:

    # Replace everything that is not a letter (punctuation, digits) with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)

    # Lower-case and split into individual words.
    words = letters_only.lower().split()

    # Drop stopwords, then reduce each remaining word to its stem.
    stemmed_words = [ps.stem(word) for word in words if word not in stop_words]

    corpus.append(' '.join(stemmed_words))

In [16]:
# Converting the list of cleaned reviews to a single-column dataframe
# (the column gets the default label 0).
corpus_df = pd.DataFrame(corpus)
corpus_df.head()

Unnamed: 0,0
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price


In [17]:
# Give the default column 0 a descriptive name.
# rename() returns a new frame and silently ignores labels that are absent, so
# this cell can be re-run safely; the previous copy-then-drop approach raised a
# KeyError on a second run because column 0 no longer existed.
corpus_df = corpus_df.rename(columns={0: 'corpus'})
corpus_df.head()

Unnamed: 0,corpus
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price


In [18]:
# Creating Bag of Words model
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# Target vector: the 'Liked' label (second column of the dataset) as a NumPy array.
y = data['Liked'].values

## Building the model 

In [22]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Fit a Gaussian Naive Bayes classifier on the training split (fit returns the estimator).
model = GaussianNB().fit(X_train, y_train)

# Predict the held-out reviews and tabulate the results
# (rows are the true labels, columns are the predicted labels).
ypred = model.predict(X_test)
confusion_matrix(y_test, ypred)

array([[55, 42],
       [12, 91]], dtype=int64)

In [23]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(y_test,ypred)

0.73

In [24]:
Review = "nice service"

# Apply the same cleaning that was used to build `corpus` (letters only,
# lower case, stopwords removed, Porter stemming). The vectorizer's vocabulary
# holds stemmed tokens, so unstemmed words in a raw review would not match it
# and would be silently ignored.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
words = re.sub('[^a-zA-Z]', ' ', Review).lower().split()
input1 = [' '.join(ps.stem(word) for word in words if word not in stop_words)]

# Encode with the already-fitted vectorizer and classify.
input_data = cv.transform(input1).toarray()
input_pred = model.predict(input_data)

if input_pred[0] == 1:
    print("Review is Positive")
else:
    print("Review is Negative")

Review is Positive


In [25]:
Review = "Took too much time"

# Apply the same cleaning that was used to build `corpus` (letters only,
# lower case, stopwords removed, Porter stemming). The vectorizer's vocabulary
# holds stemmed tokens, so unstemmed words in a raw review would not match it
# and would be silently ignored.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
words = re.sub('[^a-zA-Z]', ' ', Review).lower().split()
input1 = [' '.join(ps.stem(word) for word in words if word not in stop_words)]

# Encode with the already-fitted vectorizer and classify.
input_data = cv.transform(input1).toarray()
input_pred = model.predict(input_data)

if input_pred[0] == 1:
    print("Review is Positive")
else:
    print("Review is Negative")

Review is Negative


In [26]:
Review = "Amazing food"

# Apply the same cleaning that was used to build `corpus` (letters only,
# lower case, stopwords removed, Porter stemming). The vectorizer's vocabulary
# holds stemmed tokens, so unstemmed words in a raw review would not match it
# and would be silently ignored.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
words = re.sub('[^a-zA-Z]', ' ', Review).lower().split()
input1 = [' '.join(ps.stem(word) for word in words if word not in stop_words)]

# Encode with the already-fitted vectorizer and classify.
input_data = cv.transform(input1).toarray()
input_pred = model.predict(input_data)

if input_pred[0] == 1:
    print("Review is Positive")
else:
    print("Review is Negative")

Review is Positive


## By : Rashi Saluja