# NLP
### Aim : build a classifier to classify restaurant reviews as positive or negative using bag of words model

In [27]:
# using sklearn==0.18.2 and scipy==1.2.3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import warnings 
warnings.filterwarnings('ignore')

df = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3) # quoting=3 means we're ignoring quotes
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## cleaning one review

In [28]:
import re
review = re.sub(r'[^a-zA-Z]',' ',df['Review'][0])
review = review.split()
review

['Wow', 'Loved', 'this', 'place']

## removing irrevalent words
using nltk lib

In [29]:
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skrstv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
review = [w for w in review if w not in eng_stopwords]

In [31]:
review

['Wow', 'Loved', 'place']

## STEMMING - reducing to simple present tense

In [32]:
from nltk.stem.porter import PorterStemmer
ps  = PorterStemmer()
review = [ps.stem(w) for w in review if w not in eng_stopwords]
# review
review = ' '.join(review)

In [33]:
review

'wow love place'

## cleaning all reviews

In [34]:
cleaned_reviews = []
for i in range(1000):
    review = re.sub(r'[^a-zA-Z]',' ',df['Review'][i])
    review = review.split()
    ps  = PorterStemmer()
    review = [ps.stem(w) for w in review if w not in eng_stopwords] 
    cleaned_reviews.append(' '.join(review))

## creating bag of words
one column for each word, reviews will be the count of those words in review

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
cv = CountVectorizer(max_features=  1500)
X = cv.fit_transform(cleaned_reviews).toarray()
y = df.iloc[:,-1].values

## training and testing the classifier

In [37]:
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state = 0)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix as confmat
cm = confmat(y_test,y_pred)
cm

array([[55, 42],
       [11, 92]])

In [38]:
# the count of true negatives is :math:`cm[0,0]`, 
# false negatives is :math:`cm[1,0]`,
# true positives is:math:`cm[1,1]` and
# false positives is :math:`cm[0,1]`.

# [[TN, FN],
#  [FP, TP]]
    
# so 11+42 is the count of wrong predictions