### Importing libraries

In [1]:
import numpy as np
import pandas as pd



### Loading Dataset

In [2]:
# The file is tab-separated. quoting=3 is csv.QUOTE_NONE, i.e. quote characters
# inside the reviews are treated as ordinary text instead of field delimiters.
dataset = pd.read_csv("Restaurant_Reviews.tsv", sep="\t", quoting=3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Inspecting Data

In [3]:
# Summary of the frame: index range, column names, non-null counts, dtypes and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [4]:
# Data type of each column
dataset.dtypes

Review    object
Liked      int64
dtype: object

- The dataset has only 2 columns: one is textual and the other is a binary categorical label (0 or 1).

In [5]:
# Number of reviews in each class of the target column 'Liked'
dataset.groupby('Liked').size()

Liked
0    500
1    500
dtype: int64

- The dataset is balanced: 500 reviews in each class.
- It doesn't have any null values

In [6]:
# Count of missing values per column
dataset.isnull().sum()

Review    0
Liked     0
dtype: int64

#### Objective : 
- Clean and preprocess a single review, then use a for loop to clean all 1000 reviews.



#### First review 

In [7]:
# Text of the review in the row labelled 0
dataset.loc[0, 'Review']

'Wow... Loved this place.'

In [8]:
# Remove numbers and punctuation with a regular expression:
# every character that is not an ASCII letter is replaced by a space.

import re

first_review = dataset['Review'][0]
review = re.sub('[^a-zA-Z]', ' ', first_review)
print(review)

Wow    Loved this place 


- Convert every word to a single case (lower case is preferred).
- Otherwise the same word written in different cases (e.g. ‘LOVE’ and ‘LoVe’) would be treated as different words.

In [9]:
# Lower-case the cleaned review string

review = review.lower()
review

'wow    loved this place '

In [13]:
import nltk
nltk.download('stopwords')  # fetch the NLTK stop-word corpus used by stopwords.words(...) in later cells

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAJEEV\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
# stopwords.words('english') returns NLTK's list of English stop words; show how many it contains
len(stopwords.words('english'))

179

- NLTK's English stop-word list contains 179 words.

In [16]:
# Tokenize: split the string on whitespace into a list of words.
# Note: `review` is rebound from a str to a list here, so this cell cannot be
# re-run on its own (a list has no .split()).
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [17]:
# Drop English stop words from the tokenized review.
# The stop-word list is turned into a set once, before the comprehension,
# so it is not rebuilt for every word and each membership test is O(1).
english_stopwords = set(stopwords.words('english'))

review1 = [word for word in review if word not in english_stopwords]
review1

['wow', 'loved', 'place']

### Stemming:
- Convert word to its root word

Example: 
loved ----> love, stopped ---->stop

In [18]:
# Reduce every remaining word to its root form with NLTK's Porter stemmer

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

review1 = list(map(ps.stem, review1))
review1

['wow', 'love', 'place']

In [19]:
# Join the stemmed tokens back into one space-separated string

review2 = ' '.join(review1)
review2

'wow love place'

### Count-Vectorizer( )
- This will construct the vocabulary of the bag-of-words model and transform the sentences into sparse feature vectors

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# max_features=3 keeps at most the 3 most frequent terms of the corpus as columns
cv = CountVectorizer(max_features=3)

print(review2)

# A corpus is a list of documents; here it holds only the single cleaned review
corpus1 = [review2]
print(corpus1)

# Learn the vocabulary and count each term's occurrences per document
X = cv.fit_transform(corpus1)
print(X.toarray())

wow love place
['wow love place']
[[1 1 1]]


- Now the textual data is preprocessed and converted into numerical format, which we can use for ML model

#### Preprocessing all the rows :

In [20]:
# (rows, columns) of the raw dataset
dataset.shape

(1000, 2)

In [21]:
# Last five rows, to see where the index ends
dataset.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


- There are 1000 rows (from 0 to 999)

#### Preprocessing 1000 rows

In [21]:
# Clean every review with the steps demonstrated above on the first one:
# keep letters only, lower-case, tokenize, drop English stop words, stem, re-join.

# Build the stop-word set once; it does not change between words or reviews.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column itself rather than a hard-coded row count,
# so the loop works for any number of reviews.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [22]:
# After the loop, `review` holds the last cleaned review (a string)
# and `corpus` is the list of all cleaned reviews
print("Review Type: ",type(review))
print("Corpus Type: ",type(corpus))

Review Type:  <class 'str'>
Corpus Type:  <class 'list'>


#### Creating DataFrame for Preprocessed Reviews

In [23]:
# Wrap the list of cleaned reviews in a single-column DataFrame
# (built from a plain list, the column gets the default label 0)
corpus_dataset = pd.DataFrame(corpus)
corpus_dataset.head()

Unnamed: 0,0
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price


In [24]:
# Give the single column (default label 0) the descriptive name 'corpus'.
# rename() returns a new frame and ignores labels that are not present,
# so this cell can be re-run without raising a KeyError.
corpus_dataset = corpus_dataset.rename(columns={0: 'corpus'})
corpus_dataset.head()

Unnamed: 0,corpus
0,wow love place
1,crust good
2,tasti textur nasti
3,stop late may bank holiday rick steve recommen...
4,select menu great price


In [25]:
# Save the pre-processed reviews for future reference.
# to_csv is called with its defaults, so the row index is written as the first
# (unnamed) column of the file.
corpus_dataset.to_csv("corpus_dataset.csv")

### Bag of Words Model for whole data

In [26]:
# Create a Bag of Words model.
# max_features=1500 limits the vocabulary to the 1500 most frequent terms of the corpus.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)

In [27]:
# Learn the vocabulary from the cleaned corpus and build the term-count matrix;
# .toarray() converts the result into a dense NumPy array.
X = cv.fit_transform(corpus).toarray()
# Feature vector of the first review
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
# Whole feature matrix (NumPy abbreviates the display of large arrays)
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

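- A term-count matrix is created for the 1500 most frequent terms. Most of its entries are zero, but note that `.toarray()` has converted it into a dense NumPy array.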

In [29]:
# Number of terms kept in the vocabulary (capped by max_features=1500).
# `vocabulary_` maps each selected term to its column index in X, so its
# length is the number of features; it is available in every scikit-learn version.
# To list the feature names themselves, use cv.get_feature_names_out()
# (scikit-learn >= 1.0); the older cv.get_feature_names() was removed in 1.2.
len(cv.vocabulary_)

1500

In [30]:
# X is a NumPy array, so take the target column 'Liked' as a NumPy array as well
y = dataset['Liked'].to_numpy()

### Splitting Data into 80-20 ratio

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

### Naive Bayes

In [49]:
# Train a Bernoulli Naive Bayes classifier (default hyper-parameters) on the training split
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

BernoulliNB()

In [50]:
# Predicted labels for the held-out test set
y_pred = classifier.predict(X_test)

In [51]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [52]:
# Rows are the true labels and columns the predicted labels, in the order 0, 1
confusion_matrix(y_test,y_pred)

array([[73, 24],
       [22, 81]], dtype=int64)

In [53]:
# Fraction of test reviews whose label was predicted correctly
accuracy_score(y_test,y_pred)

0.77

- The model is moderately accurate: it classifies 77% of the test reviews correctly as positive or negative.

### Check it on Unseen Data

In [54]:
Review = "nice service"

# Apply the same preprocessing that was used to build the training corpus
# (letters only, lower-case, stop-word removal, Porter stemming). The vocabulary
# of `cv` was learned from stemmed text, so raw, unstemmed words may not match
# any feature and would be silently ignored.
english_stopwords = set(stopwords.words('english'))
words = re.sub('[^a-zA-Z]', ' ', Review).lower().split()
cleaned_review = ' '.join(ps.stem(word) for word in words if word not in english_stopwords)
input1 = [cleaned_review]

# transform (not fit_transform) reuses the vocabulary learned from the training corpus
input_data = cv.transform(input1).toarray()

input_pred = classifier.predict(input_data)

if input_pred[0]==1:
    print("Review is Positive")
else:
    print("Review is Negative")

Review is Positive


In [55]:
Review = "long waiting time"

# Apply the same preprocessing that was used to build the training corpus
# (letters only, lower-case, stop-word removal, Porter stemming). The vocabulary
# of `cv` was learned from stemmed text, so raw, unstemmed words may not match
# any feature and would be silently ignored.
english_stopwords = set(stopwords.words('english'))
words = re.sub('[^a-zA-Z]', ' ', Review).lower().split()
cleaned_review = ' '.join(ps.stem(word) for word in words if word not in english_stopwords)
input1 = [cleaned_review]

# transform (not fit_transform) reuses the vocabulary learned from the training corpus
input_data = cv.transform(input1).toarray()

input_pred = classifier.predict(input_data)

if input_pred[0]==1:
    print("Review is Positive")
else:
    print("Review is Negative")

Review is Negative
