In [1]:
#https://www.geeksforgeeks.org/python-nlp-analysis-of-restaurant-reviews/

In [2]:
# Importing Libraries
import numpy as np 
import pandas as pd
 
# Load the tab-separated reviews file into a DataFrame
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

In [3]:
#Text Cleaning or Preprocessing 
#Remove Punctuations, Numbers: Punctuations, Numbers don’t help much in processing the given text, if included, 
#they will just increase the size of a bag of words that we will create as the last step 
#and decrease the efficiency of an algorithm.
#Stemming: Take roots of the word 
#Convert each word into its lower case:
#For example, it is useless to have some words in different cases (eg ‘good’ and ‘GOOD’).

In [4]:
# library to clean data
import re

# Natural Language Tool Kit
import nltk

# fetch the stop-word lists used below
nltk.download('stopwords')

# to remove stopwords
from nltk.corpus import stopwords

# for stemming purposes
from nltk.stem.porter import PorterStemmer

# Loop invariants, built once instead of once per word / once per review:
# the set gives constant-time membership tests and a single stemmer is
# reused for every review.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

# Cleaned reviews, one string per row of the dataset
corpus = []

# Clean every row of the "Review" column. Iterating over the column itself
# (rather than a hard-coded range(0, 1000)) keeps corpus the same length as
# the dataset, whatever its size.
for raw_review in dataset['Review']:

    # replace everything that is not a letter with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)

    # convert all cases to lower case
    review = review.lower()

    # split into a list of words (default delimiter is whitespace)
    review = review.split()

    # drop stop words and reduce each remaining word to its stem
    review = [ps.stem(word) for word in review
              if word not in english_stopwords]

    # rejoin the stems into a single string
    review = ' '.join(review)

    # collect the cleaned review
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Bag of Words model: turn each cleaned review into a vector of word counts
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1500 features; "max_features" is a
# parameter worth experimenting with to get better results
cv = CountVectorizer(max_features=1500)

# X is the feature matrix (independent variables): the vectorizer is
# fitted on the corpus and its output is converted to a dense array
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()

# y holds the labels, taken from the second column of the dataset,
# telling whether each review is positive or negative
y = dataset.iloc[:, 1].values

In [6]:
#Splitting Corpus into Training and Test set.
#For this, we need the train_test_split function from sklearn.model_selection (older tutorials import it from sklearn.cross_validation, which is the outdated location).
#Split can be made 70/30 or 80/20 or 85/15 or 75/25, here I choose 75/25 via “test_size”. 
#X is the bag of words, y is 0 or 1 (positive or negative).

In [8]:
from sklearn.model_selection import cross_validate

In [11]:
# Splitting the dataset into
# the Training set and Test set
from sklearn.model_selection import train_test_split

# experiment with "test_size" to get better results;
# random_state pins the shuffle so that re-running the notebook
# reproduces the same split (and therefore comparable scores)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 0)

In [12]:
#Fitting a Predictive Model (here random forest) 

#Since Random forest is an ensemble model (made of many trees) from sklearn.ensemble, import RandomForestClassifier class
#With 501 trees or “n_estimators” and criterion as ‘entropy’
#Fit the model via the .fit() method, passing X_train and y_train as arguments
 

In [13]:
# Fitting Random Forest Classification
# to the Training set
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the number of trees in the forest;
# experiment with it to get better results.
# random_state fixes the forest's internal randomness so that
# repeated runs on the same training data build the same model.
model = RandomForestClassifier(n_estimators = 501,
                               criterion = 'entropy',
                               random_state = 0)

model.fit(X_train, y_train)

In [14]:
#Predicting the final results using the .predict() method with X_test as its argument
 

# Predicting the Test set results: the fitted model is asked for a
# prediction for every row of X_test
y_pred = model.predict(X_test)
 
# Bare last expression, so the notebook displays the whole prediction array
y_pred

array([1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [15]:
#Accuracy with the random forest was reported as 72%, but the figure depends on the random train/test split and on the test size (here = 0.25); the confusion matrix recorded below gives (103 + 95) / 250 = 79.2%.
#Step 8: To know the accuracy, a confusion matrix is needed.
#Confusion Matrix is a 2X2 Matrix.
 

In [16]:
# Build the confusion matrix by comparing the true test labels
# with the labels predicted by the model
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# display the matrix as the cell output
cm

array([[103,  21],
       [ 31,  95]], dtype=int64)