# Natural Language Processing (NLP)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 

In [3]:
warnings.filterwarnings('ignore')

In [26]:
# Tab-separated file. quoting=3 is csv.QUOTE_NONE, so quote characters that
# appear inside a review are read as ordinary text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [27]:
# Preview the loaded reviews (pandas abbreviates the display of long frames).
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [28]:
# Summarise the frame: column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


# Importing NLP packages

In [30]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shipr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
# Same summary as printed earlier; no visible cell modifies `dataset` in between.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [39]:
# Accumulator for the cleaned review strings; filled by the preprocessing loop.
corpus=[]

In [40]:
# The stemmer and the stop-word set do not depend on the review, so build them
# once instead of once per row (and, for the set, once per token).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep 'not' so negations survive stop-word removal
stopword_set = set(all_stopwords)

# Start from an empty list so that re-running this cell rebuilds the corpus
# instead of appending a second copy of every review.
corpus = []

# Iterate over the column itself rather than a hard-coded range(0, 1000), so the
# cell keeps working if the number of rows changes.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # letters only
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    corpus.append(" ".join(review))
    
    

# Creating the bag-of-words model

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser with default settings; it is fitted on `corpus` in the next cell.
cv = CountVectorizer()


In [45]:
# Target labels come from the last column of the frame.
y = dataset.iloc[:, -1].values

# Learn the vocabulary from the corpus and materialise the counts as a dense array.
X = cv.fit_transform(corpus).toarray()

In [48]:
# Number of feature columns (the size of the fitted vocabulary).
X.shape[1]

1566

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Training a model using the Naive Bayes classifier

In [65]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split.
nbc = GaussianNB()
nbc.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [66]:
# Naive Bayes predictions for the held-out reviews.
y_pred = nbc.predict(X=X_test)

In [67]:
from sklearn.metrics import accuracy_score

# Fraction of held-out reviews whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [85]:
# Display the accuracy stored in `score` by the previous cell.
score

0.73

In [92]:
def new_prediction(new_review):
    """Classify one review with the Naive Bayes model and print the verdict.

    The text goes through the same cleaning steps used to build the training
    corpus (letters only, lower-case, stop words removed except 'not', Porter
    stemming) and is then vectorised with the already-fitted `cv`.

    Parameters
    ----------
    new_review : str
        Raw review text.

    Returns
    -------
    None
        Prints 'positive review' when the predicted label is 1, otherwise
        'negative review'.
    """
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')  # keep 'not' so negations survive stop-word removal
    # Build the lookup set once instead of once per token.
    stopword_set = set(all_stopwords)

    tokens = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
    review = " ".join(ps.stem(word) for word in tokens if word not in stopword_set)

    # The vectoriser expects an iterable of documents, hence the one-element list.
    p = nbc.predict(cv.transform([review]).toarray())
    if p[0] == 1:
        print('positive review')
    else:
        print('negative review')

In [None]:
# Interactive: waits for a review typed by the user, then classifies it.
new_prediction(new_review=input('enter your reviews :'))

# Using a logistic regression classifier

In [112]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library defaults, trained on the same split as the other models.
logisticclassifier = LogisticRegression()
logisticclassifier.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [113]:
# Score the logistic-regression model on the held-out split. The predictions
# have to be computed here: reusing `y_pred` would score the Naive Bayes
# classifier's predictions a second time instead of this model's.
y_pred_logistic = logisticclassifier.predict(X_test)
score = accuracy_score(y_test, y_pred_logistic)

In [114]:
# Display the accuracy stored in `score` by the previous cell.
score

0.685

# Using a support vector classifier (SVC) with grid search

In [116]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [117]:
# Hyper-parameter candidates explored by the search:
# 2 values of C x 2 values of gamma x 3 kernels = 12 combinations.
param_grid = dict(
    C=[0.1, 1],
    gamma=[1, 0.1],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# refit=True retrains the best combination on the full training split once the
# search is done; verbose=3 logs every individual cross-validation fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

# Run the cross-validated search on the training split.
grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.504, total=   3.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.9s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.504, total=   3.3s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.2s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.504, total=   3.8s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.735, total=   3.9s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.744, total=   3.3s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ......... C=0.1, gamma=1, kernel=poly, score=0.729, total=   3.2s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.791, total=   3.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.756, total=   3.3s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ...... C=0.1, gamma=1, kernel=sigmoid, score=0.793, total=   3.5s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  2.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1], 'gamma': [1, 0.1],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [118]:
# Winning hyper-parameters, followed by the estimator refitted with them.
print(grid.best_params_, grid.best_estimator_, sep='\n')

{'C': 1, 'gamma': 0.1, 'kernel': 'sigmoid'}
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [119]:
# Mean cross-validated score of the best parameter combination.
print(grid.best_score_)

0.78375


In [120]:
def new_prediction(new_review):
    """Classify one review with the grid-searched SVC and print the verdict.

    Defining this function replaces any earlier `new_prediction` in the
    session; from here on predictions come from `grid` (the fitted
    GridSearchCV object). The text goes through the same cleaning steps used to
    build the training corpus (letters only, lower-case, stop words removed
    except 'not', Porter stemming) and is vectorised with the already-fitted
    `cv`.

    Parameters
    ----------
    new_review : str
        Raw review text.

    Returns
    -------
    None
        Prints 'positive review' when the predicted label is 1, otherwise
        'negative review'.
    """
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')  # keep 'not' so negations survive stop-word removal
    # Build the lookup set once instead of once per token.
    stopword_set = set(all_stopwords)

    tokens = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
    review = " ".join(ps.stem(word) for word in tokens if word not in stopword_set)

    # The vectoriser expects an iterable of documents, hence the one-element list.
    p = grid.predict(cv.transform([review]).toarray())
    if p[0] == 1:
        print('positive review')
    else:
        print('negative review')

In [127]:
# Interactive: waits for a review typed by the user, then classifies it.
new_prediction(new_review=input('enter your reviews :'))

enter your reviews :i love the food
positive review
