In [1]:
import pandas as pd
import numpy as np

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

import pickle 
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Tab-separated review file. quoting=3 is csv.QUOTE_NONE: quote characters
# inside a review are kept as literal text rather than parsed as field quoting.
df = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [3]:
# Preview the first rows: free-text `Review` plus the binary `Liked` label.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Column dtypes and non-null counts, to check for missing reviews or labels.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


## Text Cleaning

In [5]:
# Build the cleaned corpus: one space-joined string of stemmed tokens per review.
#
# The helpers below do not depend on the individual review, so they are
# created once up front instead of once per loop iteration.
# 'not' is taken out of the stop-word list so that negations such as
# "not good" survive cleaning and remain visible to the model.
all_stopwords = set(stopwords.words('english'))
all_stopwords.discard('not')
ps = PorterStemmer()
non_letters = re.compile('[^A-Za-z]')

corpus = []
for review in df['Review']:
    # Replace every non-letter character with a space, lower-case, tokenize.
    words = non_letters.sub(' ', review).lower().split()
    # Drop stop words and reduce the remaining words to their Porter stems.
    stems = [ps.stem(word) for word in words if word not in all_stopwords]
    corpus.append(' '.join(stems))

In [6]:
# Spot-check the first cleaned reviews (lower-cased, stop words removed, stemmed).
corpus[:5]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

## Model Building

### Bag of Words

In [7]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus)

In [8]:
# Inspect the document-term matrix returned by the vectorizer (shape, dtype, sparsity).
X

<1000x1500 sparse matrix of type '<class 'numpy.int64'>'
	with 5418 stored elements in Compressed Sparse Row format>

In [9]:
# Convert the sparse document-term matrix into a dense ndarray.
# The guard keeps this cell idempotent: once X is an ndarray it no longer has
# a .toarray() method, so re-running the unguarded cell raised AttributeError.
if hasattr(X, 'toarray'):
    X = X.toarray()

In [10]:
# Dense view of the feature matrix after the conversion above.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Target vector: the `Liked` column of the review frame.
Y = df.loc[:, 'Liked']

### Train Test Split 

In [12]:

# 80/20 train/test partition; random_state is pinned so the split is
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

### Random Forest Classifier

In [13]:
# Baseline random forest (10 gini trees, seeded for reproducibility), fitted
# on the training split.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    random_state=0,
)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [14]:
# <<-- Hyper Parameter Tuning -->>
# Exhaustive search over 2 x 3 x 5 = 30 forest configurations, each scored by
# 10-fold cross-validated accuracy on the training split.

parameters = {
    'criterion': ['entropy', 'gini'],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [10, 15, 20, 25, 30],
}

grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1,
                           # verbose=10 logged every batch of fits; that log
                           # flooded the cell output and interleaved with the
                           # result lines printed below.
                           verbose=1)

grid_search.fit(x_train, y_train)

best_accuracy = grid_search.best_score_    # mean CV accuracy of the winning candidate
best_parameters = grid_search.best_params_  # its hyper-parameter dict

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

Best Accuracy: 80.62 %
Best Parameters: {'criterion': 'entropy', 'min_samples_split': 4, 'n_estimators': 25}


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   27.9s finished


In [15]:
# Make new Random Forest model with best parameters
#
# The hyper-parameters are taken from the fitted grid search rather than
# hand-copied from its printed output, so this cell cannot silently go stale
# when the data or the search grid changes. random_state stays pinned to 0
# because it is not part of the searched grid.

classifier = RandomForestClassifier(random_state=0, **grid_search.best_params_)
classifier.fit(x_train, y_train)
predicted_values = classifier.predict(x_test)

In [16]:
# Evaluation of model
# Held-out performance: overall accuracy, confusion matrix, and the
# per-class precision / recall / F1 report.

test_accuracy = accuracy_score(y_test, predicted_values) * 100
test_confusion = confusion_matrix(y_test, predicted_values)
test_report = classification_report(y_test, predicted_values)

print(f'Accuracy: {test_accuracy}%\n')
print(f'Confusion Matrix: \n{test_confusion}\n')
print(test_report)

Accuracy: 72.0%

Confusion Matrix: 
[[84 13]
 [43 60]]

              precision    recall  f1-score   support

           0       0.66      0.87      0.75        97
           1       0.82      0.58      0.68       103

   micro avg       0.72      0.72      0.72       200
   macro avg       0.74      0.72      0.72       200
weighted avg       0.74      0.72      0.71       200



In [17]:
# <<--- K fold cross validation --->>
# 10-fold cross-validation of the current classifier on the training split;
# reports the mean fold accuracy and its standard deviation.

accuracies = cross_val_score(
    estimator=classifier,
    X=x_train,
    y=y_train,
    cv=10,
)
mean_accuracy = accuracies.mean() * 100
accuracy_spread = accuracies.std() * 100

print("Average Accuracy: {:.2f} %".format(mean_accuracy))
print("Standard Deviation: {:.2f} %".format(accuracy_spread))

Average Accuracy: 80.65 %
Standard Deviation: 2.96 %


## Single Prediction

In [18]:
new_review = 'Nice place'

# Apply the same cleaning steps that produced the training corpus: strip
# non-letters, lower-case, tokenize, drop stop words (keeping 'not'), stem.
ps = PorterStemmer()

# Build the stop-word set once; the previous version rebuilt it with set(...)
# for every single word inside the comprehension.
all_stopwords = set(stopwords.words('english'))
all_stopwords.discard('not')

new_tokens = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in new_tokens if word not in all_stopwords)

new_corpus = [new_review]

# Encode with the already-fitted vectorizer so the columns line up with the
# features the classifier was trained on.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)

# predict() returns an array with one label per input row; read the single
# label explicitly instead of relying on the truthiness of a 1-element array.
if new_y_pred[0] == 1:
    print('Positive Review')
else:
    print('Negative Review')

Positive Review


## Save Model

In [19]:
# Persist the trained classifier. The payload is a binary pickle despite the
# .txt extension; the file name is kept so existing consumers still find it.
with open('Random_Forest_Model.txt', 'wb') as f:
    pickle.dump(classifier, f)

# The classifier alone cannot score raw text: new reviews must be encoded with
# the same fitted vocabulary, so the CountVectorizer is saved alongside it.
with open('Count_Vectorizer.pkl', 'wb') as f:
    pickle.dump(cv, f)