In [1]:
# Natural Language Processing

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Load the reviews dataset.
# The file is tab-separated; quoting=3 (the value of csv.QUOTE_NONE) makes the
# parser treat double quotes as ordinary characters rather than field quoting.
data = pd.read_csv(
    'Reviews.tsv',
    sep='\t',
    quoting=3,
)
data.shape

(1000, 2)

In [4]:
# Preview the first five rows of the loaded reviews
data.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Importing the NLP libraries
import re
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later on
# (per the log output, nothing is re-downloaded when the package is up-to-date)
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Cleaning the texts
# The stemmer and the stop-word set do not depend on the review being processed,
# so build them once instead of once per review / once per word.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
# NOTE(review): the NLTK English stop-word list appears to include negations
# (the sample output turns "Crust is not good." into 'crust good'), which may
# hurt sentiment classification -- confirm whether that is intended.

# Corpus will contain all the cleaned reviews that can be used in the model
corpus = []
# Iterate over the column itself so the loop follows the actual number of rows
# and does not depend on the index labels being 0..N-1.
for raw_review in data['Review']:
    # Replace every character that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    # Creating a list of words from the sentence
    review = review.split()
    # Removing the stopwords and stemming the remaining words to their roots
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [7]:
# Peek at the first ten cleaned reviews (corpus is a plain list, so slice it;
# this plays the role that .head() plays for a DataFrame)
corpus[0:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [8]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Tokenizing the data; the vocabulary is capped at 1500 terms
cv = CountVectorizer(max_features = 1500)
# Dense document-term count matrix, one row per cleaned review
X = cv.fit_transform(corpus).toarray()
# Select the label column by name rather than by position so the target stays
# correct even if the column order of the input file changes
y = data['Liked'].values

In [9]:
# Hold out 20% of the samples as the Test set; random_state pins the split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [10]:
# Train a Gaussian Naive Bayes classifier on the Training set
# NOTE(review): the features are word counts; a multinomial Naive Bayes model
# may be a better fit for such data -- worth comparing.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
# Predict a label for every held-out review in the Test set
y_pred = classifier.predict(X=X_test)

In [12]:
# Build the Confusion Matrix from the true and the predicted Test set labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
)

In [13]:
# Display the confusion matrix (scikit-learn layout: rows are the true labels,
# columns are the predicted labels)
cm

array([[55, 42],
       [12, 91]], dtype=int64)

In [14]:
# Accuracy = correctly classified reviews (the diagonal of the confusion
# matrix) divided by all classified reviews. Computed from cm instead of
# hand-copied numbers so it stays correct when the data, split or model changes.
cm.trace() / cm.sum()

0.73