# Naive Bayes Classifier 

## Import Statement

In [67]:
from nltk.stem.porter import *
from nltk.corpus import stopwords
from gensim import corpora
from gensim import models
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix   
import nltk
import pandas as pd 
import numpy as np
import gensim
import random

## Load training file

In [2]:
data = pd.read_csv("./data/cleaned_hotelreviews.csv")

## Data Preprocessing

### Remove nan and Stopwords

In [3]:
data = data.dropna()
stop_list = stopwords.words('english')
data['reviews'] = data['reviews'].apply(lambda x: [word for word in x.split() if word not in stop_list])

### Remove single word

In [4]:
data['reviews'] = data['reviews'].apply(lambda x: x if len(x) > 1 else [])

In [5]:
# data[data['reviews'] != '[]'] has error has convert list to string
cleaned_data = data[data.astype(str)['reviews'] != '[]']

In [6]:
cleaned_data

Unnamed: 0,reviews,class
0,"[angry, made, post, available, via, possible, ...",negative
1,"[rooms, nice, elderly, bit, difficult, rooms, ...",negative
2,"[room, dirty, afraid, walk, barefoot, floor, l...",negative
3,"[booked, company, line, showed, pictures, room...",negative
4,"[backyard, hotel, total, mess, happen, hotel, ...",negative
...,...,...
834105,"[good, breakfast]",positive
834106,"[helpful, staff, allowed, check, early, arrive...",positive
834108,"[breakfast, ok, got, earlier, check]",positive
834109,"[rooms, enormous, really, comfortable, believe...",positive


### Stemming 

In [7]:
stemmer = PorterStemmer()
data_stem = cleaned_data
data_stem['reviews'] = data_stem['reviews'].apply(lambda x: [stemmer.stem(word) for word in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
data_stem

Unnamed: 0,reviews,class
0,"[angri, made, post, avail, via, possibl, site,...",negative
1,"[room, nice, elderli, bit, difficult, room, tw...",negative
2,"[room, dirti, afraid, walk, barefoot, floor, l...",negative
3,"[book, compani, line, show, pictur, room, thou...",negative
4,"[backyard, hotel, total, mess, happen, hotel, ...",negative
...,...,...
834105,"[good, breakfast]",positive
834106,"[help, staff, allow, check, earli, arriv, chec...",positive
834108,"[breakfast, ok, got, earlier, check]",positive
834109,"[room, enorm, realli, comfort, believ, famili,...",positive


### Lemetization

### TF-IDF

## Train the model

### Use Gensim to convert to dictionary and prepare data for training
1. Convert reviews to dictionary
2. Convert the reviews dictionary into TF vectors 
3. Tag a number to the word based on each review TF vector
4. Tag the dictionary with either positive or negative based on the data

In [9]:
dictionary = corpora.Dictionary(data_stem['reviews'])
data_stem['reviews'] = [dictionary.doc2bow(doc) for doc in data_stem['reviews']]

print(data_stem['reviews'])

0         [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1         [(4, 1), (40, 1), (57, 1), (81, 3), (120, 1), ...
2         [(1, 1), (17, 2), (29, 1), (39, 3), (51, 2), (...
3         [(3, 1), (14, 3), (43, 2), (60, 1), (81, 3), (...
4         [(51, 2), (241, 1), (242, 1), (243, 1), (244, ...
                                ...                        
834105                                 [(255, 1), (288, 1)]
834106    [(3, 1), (21, 2), (98, 1), (200, 1), (434, 1),...
834108      [(21, 1), (44, 1), (70, 1), (159, 1), (288, 1)]
834109    [(81, 1), (151, 1), (231, 1), (320, 1), (382, ...
834110                                 [(200, 1), (461, 1)]
Name: reviews, Length: 791439, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
data_stem['reviews'] = [{id:1 for (id, tf_value) in vec} for vec in data_stem['reviews']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Train test split the dataframe and conver into tuples (Not use)

In [16]:
train_test_data = data_stem
msk = np.random.rand(len(train_test_data)) < 0.8
train = train_test_data[msk]
test = train_test_data[~msk]

tuples_train_list =list(train.itertuples(index=False, name=None))
tuples_test_list = list(test.itertuples(index=False, name=None))

In [27]:
print(tuples_test_list[1])

({12: 1, 14: 1, 21: 1, 24: 1, 25: 1, 51: 1, 67: 1, 68: 1, 81: 1, 108: 1, 117: 1, 120: 1, 154: 1, 168: 1, 184: 1, 200: 1, 263: 1, 264: 1, 265: 1, 266: 1, 267: 1, 268: 1, 269: 1, 270: 1, 271: 1, 272: 1, 273: 1, 274: 1, 275: 1, 276: 1, 277: 1, 278: 1, 279: 1, 280: 1, 281: 1, 282: 1, 283: 1, 284: 1, 285: 1, 286: 1, 287: 1}, 'negative')


### Convert dataframe into a list of tuples

In [28]:
data = data_stem
tuples_list = list(data.itertuples(index=False, name=None))
print(tuples_list[0])

({0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 50: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 1, 83: 1, 84: 1, 85: 1, 86: 1, 87: 1, 88: 1, 89: 1, 90: 1, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 96: 1, 97: 1, 98: 1, 99: 1, 100: 1, 101: 1, 102: 1, 103: 1, 104: 1, 105: 1, 106: 1, 107: 1, 108: 1, 109: 1, 110: 1, 111: 1, 112: 1, 113: 1, 114: 1, 115: 1, 116: 1, 117: 1, 118: 1, 119: 1}, 'negative')


### Fit the train data into nltk classifier 

In [87]:
random.shuffle(tuples_list)
fold_count = 10
kf = KFold(n_splits = fold_count)
total = 0
count = 1

for train, test in kf.split(tuples_list):
    train_data = np.array(tuples_list)[train]
    test_data = np.array(tuples_list)[test]
    print("train size:", len(train_data), "test size:", len(test_data))
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    print("Fold", count, ":", nltk.classify.accuracy(classifier, test_data))
    total += nltk.classify.accuracy(classifier, test_data)
    count+=1
average_accuracy = total/fold_count
print("Average accuracy:", average_accuracy)

train size: 712295 test size: 79144
Fold 1 : 0.9124507227332457
train size: 712295 test size: 79144
Fold 2 : 0.9129940361872031
train size: 712295 test size: 79144
Fold 3 : 0.9128424138279592
train size: 712295 test size: 79144
Fold 4 : 0.9137268775902153
train size: 712295 test size: 79144
Fold 5 : 0.9120463964419286
train size: 712295 test size: 79144
Fold 6 : 0.9112377438592945
train size: 712295 test size: 79144
Fold 7 : 0.9129814009905994
train size: 712295 test size: 79144
Fold 8 : 0.9126276154856969
train size: 712295 test size: 79144
Fold 9 : 0.912071666835136
train size: 712296 test size: 79143
Fold 10 : 0.9113882465916127
Average accuracy: 0.9124367120542891


### Confusion Matrix

In [88]:
test_result = []
gold_result = []

for i in range(len(test_data)):
    test_result.append(classifier.classify(test_data[i][0]))
    gold_result.append(test_data[i][1])

print('\nClasification report:\n', classification_report(gold_result, test_result))
print('\nConfussion matrix:\n',confusion_matrix(gold_result, test_result))


Clasification report:
               precision    recall  f1-score   support

    negative       0.89      0.91      0.90     34602
    positive       0.93      0.92      0.92     44541

    accuracy                           0.91     79143
   macro avg       0.91      0.91      0.91     79143
weighted avg       0.91      0.91      0.91     79143


Confussion matrix:
 [[31318  3284]
 [ 3729 40812]]


### Test with unseen input

In [91]:
test = input("Enter reviews :")
test = test.split()
# test = test[word for word in test if word not in stop_list]
test = dictionary.doc2bow(test)
test = [{id:1 for (id, tf_value) in test}]
print(test)
print("Review outcome:", classifier.classify(test[0]))

Enter reviews :the waitress showed excellent service 10 out of 10 will come back
[{378: 1, 535: 1, 617: 1, 2445: 1, 6238: 1, 17213: 1, 28363: 1, 37319: 1}]
Review outcome: positive
