In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### For NLP processing of text files as below, use .tsv format of input file
    tsv is tab separated value file.
    tsv is used because commas may come in sentences or texts but not tabs, so tsv is used here.
    

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
# delimiter = '\t' is given since this is a tsv file.
# quoting = 3 will make the dataset to ignore all the double quotes in the input file text.

dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Cleaning texts:

* import re library which is used for cleaning texts
* lets beging by cleaning first row
* substitute characters other than alphabets with spaces and change UpperCase to LowerCase

In [3]:
import re

In [4]:
review = re.sub('[^a-zA-Z]', ' ',dataset['Review'][0])
review = review.lower()

### NLTK library to get the common words (this, that, is...) and remove from our dataset

In [1]:
import nltk

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rameshveer/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
#we have downloaded stopwords from above, now we need to import to use it
from nltk.corpus import stopwords

In [8]:
#to split the first row into words and put it into a list, so we can remove stopwords
review = review.split()
review

['wow', 'loved', 'this', 'place']

### Use a for_loop to loop thru the words in first row and match with stopwords. then remove the stopwords.
    * set() will make the search function faster.

In [9]:
review = [word for word in review if not word in set(stopwords.words('english'))]
review

['wow', 'loved', 'place']

### Stemming - the process of slicing the end or the beginning of words with the intention of removing affixes
    * use the below library to stem the word loved to love.
    * if we do stemming, then we can derive common words therby reducing no. of cols (Sparcity - More cols with zero value). So performance will be better.

In [10]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [11]:
review = [ps.stem(word) for word in review]
review

['wow', 'love', 'place']

In [12]:
# Now since we are done with cleaning, we need join the words back 
review = ' '.join(review)
review

'wow love place'

In [13]:
## A For_Loop for cleaning all rows as above

corpus = []

for i in range(0,1000):
    review = re.sub('[^a-zA-Z]', ' ',dataset['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [14]:
corpus[0:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

### Bag Of Words Model
* to create a sparce matrix for all the words in the above corpus list. Each column will be created for each unique word.

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [16]:
np.shape(X)
#we got 1565 cols by above text rows

(1000, 1565)

#to remove 1 time words like name(John, etc), place name.. we can use max func to remove those 
#thereby reducing the no. of cols

In [17]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
np.shape(X)

(1000, 1500)

In [19]:
#creating y - dependent variable.
y = dataset.iloc[:,1].values
# .values will create an array
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1,

### Apply any Classification algorithm now to predict y on sparced X

#### Naive Bayes

In [20]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state = 0)



In [21]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

In [22]:
classifier.fit(x_train, y_train)

GaussianNB(priors=None)

In [23]:
y_pred = classifier.predict(x_test)

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [25]:
cm = confusion_matrix(y_pred, y_test)
cm

array([[55, 12],
       [42, 91]])

In [26]:
accuracy_score(y_pred, y_test)

0.72999999999999998