## Processing Data for NLP

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the tab-separated review dataset. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text instead of being parsed.
df = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)


In [30]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the cleaned corpus: one normalized, space-joined string per review.
# The stop-word set and the stemmer are the same for every review, so they are
# created once here rather than once per word / once per review.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []

for raw_review in df['Review']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words, then reduce each remaining word to its stem.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stems))


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shangeth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# The label is taken from the last column of the dataset.
labels = df.iloc[:, -1]
y = labels.values

## Using Machine learning models to train

In [32]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Use any classification model for training
# reg = DecisionTreeRegressor(random_state=0)
# reg = GaussianNB()
reg = MLPClassifier(hidden_layer_sizes=(1000,1000,1000),max_iter=20000,random_state=0)



reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)

from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred), the same order as confusion_matrix.
print('Accuracy = ',accuracy_score(y_test,y_pred))

Accuracy =  0.79


In [33]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the held-out split: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix = \n', cm)

Confusion Matrix = 
 [[81 16]
 [26 77]]


In [34]:
def predict_reaction(comment):
    """Classify a single free-text comment and print the verdict.

    The comment is cleaned the same way as the training reviews (letters only,
    lowercased, stop words removed, Porter-stemmed), encoded with the
    module-level vectorizer ``cv`` that was fitted on the training corpus, and
    passed to the module-level trained model ``reg``.

    Parameters
    ----------
    comment : str
        Raw comment text to classify.

    Returns
    -------
    None
        The result is printed; a truthy prediction is reported as positive,
        anything else as negative.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', comment).lower().split()

    ps = PorterStemmer()
    # Build the stop-word set once per call instead of once per word.
    stop_words = set(stopwords.words('english'))
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)

    # Encode with the already-fitted vectorizer. Re-fitting a new vectorizer on
    # the corpus plus this comment could change the vocabulary, so the feature
    # columns would no longer match what `reg` was trained on; appending to the
    # global `corpus` would also make every call permanently grow it.
    X_t = cv.transform([review]).toarray()

    if reg.predict(X_t)[0]:
        print(comment+ ' is a Postive comment!! :) :D\n')
    else:
        print(comment+ ' is a Negative Comment!! :(\n')

In [35]:
# comment = input('Enter you comment : ')
# Run the classifier on a few hand-written sample comments.
sample_comments = (
    ' worst food !!!',
    ' its really nice !!!',
    ' i hate it !!!',
)
for comment in sample_comments:
    predict_reaction(comment)

 worst food !!! is a Negative Comment!! :(

 its really nice !!! is a Postive comment!! :) :D

 i hate it !!! is a Negative Comment!! :(



In [36]:
# Prediction accuracy is limited because the dataset is small.