# Naive Bayes Classifier 

## Import Statement

In [1]:
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim import models
from gensim.models.phrases import Phrases, Phraser
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix   
import nltk
import pandas as pd 
import numpy as np
import gensim
import random
import pickle

## Load training file

In [2]:
data = pd.read_csv("../data/tagged_sentence_data.csv")

## Check Model Performance

### Remove NaN and covert to lowercase

In [None]:
data = data.dropna()
data_lower = [[word.lower() for word in d.split()] for d in data]

### Train the model and check the result with unprocessed data

In [None]:
# Covert to BOW
dictionary = corpora.Dictionary(data_lower['sentence'])
data_stem['review'] = [dictionary.doc2bow(doc) for doc in data_stem['review']]
data_stem['review'] = [{id:1 for (id, tf_value) in vec} for vec in data_stem['review']]

# Convert to tuple
tuples_list = list(data.itertuples(index=False, name=None))

# Fit into model
random.shuffle(tuples_list)
fold_count = 10
kf = KFold(n_splits = fold_count)
total = 0
count = 1

for train, test in kf.split(tuples_list):
    train_data = np.array(tuples_list)[train]
    test_data = np.array(tuples_list)[test]
    print("train size:", len(train_data), "test size:", len(test_data))
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    print("Fold", count, ":", nltk.classify.accuracy(classifier, test_data))
    total += nltk.classify.accuracy(classifier, test_data)
    count+=1
average_accuracy = total/fold_count
print("Average accuracy:", average_accuracy)

# Confusion Matrix
test_result = []
gold_result = []

for i in range(len(test_data)):
    test_result.append(classifier.classify(test_data[i][0]))
    gold_result.append(test_data[i][1])

print('\nClasification report:\n', classification_report(gold_result, test_result))
print('\nConfussion matrix:\n',confusion_matrix(gold_result, test_result))

## Data Preprocessing

### Remove nan and Stopwords

In [3]:
data = data.dropna()
stop_list = stopwords.words('english')
data['reviews'] = data['reviews'].apply(lambda x: [word for word in x.split() if word not in stop_list])

### Remove single word 

In [4]:
data['reviews'] = data['reviews'].apply(lambda x: x if len(x) > 1 else [])

In [5]:
# data[data['reviews'] != '[]'] has error has convert list to string
cleaned_data = data[data.astype(str)['reviews'] != '[]']
cleaned_data

Unnamed: 0,city,country,reviews,agg_polarity,sentiment
0,Vienna,Austria,"[Location, good, Have, stayed, I, e, 9yrs, ago...",0.476700,positive
1,Vienna,Austria,"[Breakfast, options, limited, Personally, woul...",0.421500,positive
2,City of London,United Kingdom,"[The, room, great, size, city, hotel, The, sho...",0.803000,positive
3,West End of London,United Kingdom,"[The, room, small, bathroom, minute, A, bedsid...",0.037075,negative
4,London,United Kingdom,"[The, bar, prices, rooms, small, side]",0.000000,negative
...,...,...,...,...,...
55049,Bresso,Italy,"[Nice, value, money, deal, The, hotel, nice, m...",0.451600,positive
55050,Poplar,United Kingdom,"[The, room, little, small, A, larger, room, sp...",0.210750,positive
55051,Malakoff,France,"[Lovely, decor, ambience, Super, location, gre...",0.915300,positive
55052,Barbican,United Kingdom,"[I, would, like, breakfast, incluyed, although...",0.648600,positive


### 1. Stemming 

In [6]:
stemmer = PorterStemmer()
# data_stem = cleaned_data.copy(deep=True)
cleaned_data['stem_reviews'] = cleaned_data['reviews'].apply(lambda x: [stemmer.stem(word) for word in x])
cleaned_data

Unnamed: 0,city,country,reviews,agg_polarity,sentiment,stem_reviews
0,Vienna,Austria,"[Location, good, Have, stayed, I, e, 9yrs, ago...",0.476700,positive,"[locat, good, have, stay, I, e, 9yr, ago, shop..."
1,Vienna,Austria,"[Breakfast, options, limited, Personally, woul...",0.421500,positive,"[breakfast, option, limit, person, would, choo..."
2,City of London,United Kingdom,"[The, room, great, size, city, hotel, The, sho...",0.803000,positive,"[the, room, great, size, citi, hotel, the, sho..."
3,West End of London,United Kingdom,"[The, room, small, bathroom, minute, A, bedsid...",0.037075,negative,"[the, room, small, bathroom, minut, A, bedsid,..."
4,London,United Kingdom,"[The, bar, prices, rooms, small, side]",0.000000,negative,"[the, bar, price, room, small, side]"
...,...,...,...,...,...,...
55049,Bresso,Italy,"[Nice, value, money, deal, The, hotel, nice, m...",0.451600,positive,"[nice, valu, money, deal, the, hotel, nice, mo..."
55050,Poplar,United Kingdom,"[The, room, little, small, A, larger, room, sp...",0.210750,positive,"[the, room, littl, small, A, larger, room, spa..."
55051,Malakoff,France,"[Lovely, decor, ambience, Super, location, gre...",0.915300,positive,"[love, decor, ambienc, super, locat, great, vi..."
55052,Barbican,United Kingdom,"[I, would, like, breakfast, incluyed, although...",0.648600,positive,"[I, would, like, breakfast, incluy, although, ..."


### 2. Lemmatization

In [7]:
lemmatizer = WordNetLemmatizer()
# data_lemmatize = cleaned_data.copy(deep=True)
cleaned_data['lemma_reviews'] = cleaned_data['reviews'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
cleaned_data

Unnamed: 0,city,country,reviews,agg_polarity,sentiment,stem_reviews,lemma_reviews
0,Vienna,Austria,"[Location, good, Have, stayed, I, e, 9yrs, ago...",0.476700,positive,"[locat, good, have, stay, I, e, 9yr, ago, shop...","[Location, good, Have, stayed, I, e, 9yrs, ago..."
1,Vienna,Austria,"[Breakfast, options, limited, Personally, woul...",0.421500,positive,"[breakfast, option, limit, person, would, choo...","[Breakfast, option, limited, Personally, would..."
2,City of London,United Kingdom,"[The, room, great, size, city, hotel, The, sho...",0.803000,positive,"[the, room, great, size, citi, hotel, the, sho...","[The, room, great, size, city, hotel, The, sho..."
3,West End of London,United Kingdom,"[The, room, small, bathroom, minute, A, bedsid...",0.037075,negative,"[the, room, small, bathroom, minut, A, bedsid,...","[The, room, small, bathroom, minute, A, bedsid..."
4,London,United Kingdom,"[The, bar, prices, rooms, small, side]",0.000000,negative,"[the, bar, price, room, small, side]","[The, bar, price, room, small, side]"
...,...,...,...,...,...,...,...
55049,Bresso,Italy,"[Nice, value, money, deal, The, hotel, nice, m...",0.451600,positive,"[nice, valu, money, deal, the, hotel, nice, mo...","[Nice, value, money, deal, The, hotel, nice, m..."
55050,Poplar,United Kingdom,"[The, room, little, small, A, larger, room, sp...",0.210750,positive,"[the, room, littl, small, A, larger, room, spa...","[The, room, little, small, A, larger, room, sp..."
55051,Malakoff,France,"[Lovely, decor, ambience, Super, location, gre...",0.915300,positive,"[love, decor, ambienc, super, locat, great, vi...","[Lovely, decor, ambience, Super, location, gre..."
55052,Barbican,United Kingdom,"[I, would, like, breakfast, incluyed, although...",0.648600,positive,"[I, would, like, breakfast, incluy, although, ...","[I, would, like, breakfast, incluyed, although..."


## Train the model

### Use Gensim to convert to dictionary and prepare data for training
1. Convert reviews to dictionary
2. Convert the reviews dictionary into TF vectors 
3. Tag a number to the word based on each review TF vector
4. Tag the dictionary with either positive or negative based on the data

In [8]:
# Stemming 
dictionary = corpora.Dictionary(cleaned_data['stem_reviews'])
cleaned_data['stem_reviews'] = [dictionary.doc2bow(doc) for doc in cleaned_data['stem_reviews']]

# Lemmatize
dictionary = corpora.Dictionary(cleaned_data['lemma_reviews'])
cleaned_data['lemma_reviews'] = [dictionary.doc2bow(doc) for doc in cleaned_data['lemma_reviews']]

cleaned_data

Unnamed: 0,city,country,reviews,agg_polarity,sentiment,stem_reviews,lemma_reviews
0,Vienna,Austria,"[Location, good, Have, stayed, I, e, 9yrs, ago...",0.476700,positive,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1..."
1,Vienna,Austria,"[Breakfast, options, limited, Personally, woul...",0.421500,positive,"[(10, 1), (13, 1), (15, 1), (16, 1), (17, 1), ...","[(10, 1), (15, 1), (16, 1), (17, 1), (18, 1), ..."
2,City of London,United Kingdom,"[The, room, great, size, city, hotel, The, sho...",0.803000,positive,"[(18, 1), (20, 1), (29, 1), (30, 1), (31, 1), ...","[(19, 1), (21, 1), (30, 1), (31, 1), (32, 1), ..."
3,West End of London,United Kingdom,"[The, room, small, bathroom, minute, A, bedsid...",0.037075,negative,"[(28, 1), (45, 2), (53, 1), (55, 1), (56, 1), ...","[(29, 1), (35, 1), (48, 2), (56, 1), (57, 1), ..."
4,London,United Kingdom,"[The, bar, prices, rooms, small, side]",0.000000,negative,"[(4, 1), (45, 1), (53, 1), (74, 1), (79, 1), (...","[(6, 1), (35, 1), (48, 1), (76, 1), (80, 1), (..."
...,...,...,...,...,...,...,...
55049,Bresso,Italy,"[Nice, value, money, deal, The, hotel, nice, m...",0.451600,positive,"[(11, 3), (20, 2), (26, 1), (45, 2), (53, 3), ...","[(11, 2), (21, 1), (26, 1), (35, 3), (48, 2), ..."
55050,Poplar,United Kingdom,"[The, room, little, small, A, larger, room, sp...",0.210750,positive,"[(3, 1), (11, 1), (28, 1), (45, 2), (46, 1), (...","[(11, 1), (29, 1), (35, 1), (48, 2), (49, 1), ..."
55051,Malakoff,France,"[Lovely, decor, ambience, Super, location, gre...",0.915300,positive,"[(9, 1), (37, 1), (41, 1), (186, 1), (525, 1),...","[(32, 1), (41, 1), (97, 1), (198, 1), (595, 1)..."
55052,Barbican,United Kingdom,"[I, would, like, breakfast, incluyed, although...",0.648600,positive,"[(1, 1), (12, 1), (16, 2), (28, 1), (33, 1), (...","[(2, 1), (12, 1), (29, 1), (38, 1), (44, 1), (..."


### 3. TF-IDF

In [9]:
cleaned_data_list = cleaned_data['stem_reviews'].values.T.tolist()
tfidf = models.TfidfModel(cleaned_data_list)
cleaned_data_with_tfidf = [tfidf[vec] for vec in cleaned_data_list]

#### Unigram

#### Bigram

In [None]:
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

#### Trigram

In [None]:
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)

In [11]:
data_stem['reviews'] = [{id:1 for (id, tf_value) in vec} for vec in data_stem['reviews']]

NameError: name 'data_stem' is not defined

### Train test split the dataframe and conver into tuples (Not use)

In [15]:
train_test_data = data_stem
msk = np.random.rand(len(train_test_data)) < 0.8
train = train_test_data[msk]
test = train_test_data[~msk]

tuples_train_list =list(train.itertuples(index=False, name=None))
tuples_test_list = list(test.itertuples(index=False, name=None))

In [16]:
print(tuples_test_list[1])

({46: 1, 60: 1, 67: 1, 68: 1, 199: 1, 225: 1, 260: 1}, 'negative')


### Convert dataframe into a list of tuples

In [17]:
data = data_stem
tuples_list = list(data.itertuples(index=False, name=None))
print(tuples_list[0])

({0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1, 21: 1, 22: 1, 23: 1, 24: 1, 25: 1, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 50: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 1, 56: 1, 57: 1, 58: 1, 59: 1, 60: 1, 61: 1, 62: 1, 63: 1, 64: 1, 65: 1, 66: 1, 67: 1, 68: 1, 69: 1, 70: 1, 71: 1, 72: 1, 73: 1, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1, 79: 1, 80: 1, 81: 1, 82: 1, 83: 1, 84: 1, 85: 1, 86: 1, 87: 1, 88: 1, 89: 1, 90: 1, 91: 1, 92: 1, 93: 1, 94: 1, 95: 1, 96: 1, 97: 1, 98: 1, 99: 1, 100: 1, 101: 1, 102: 1, 103: 1, 104: 1, 105: 1, 106: 1, 107: 1, 108: 1, 109: 1, 110: 1, 111: 1, 112: 1, 113: 1, 114: 1, 115: 1, 116: 1, 117: 1, 118: 1, 119: 1}, 'negative')


### Fit the train data into nltk classifier 

In [18]:
random.shuffle(tuples_list)
fold_count = 10
kf = KFold(n_splits = fold_count)
total = 0
count = 1

for train, test in kf.split(tuples_list):
    train_data = np.array(tuples_list)[train]
    test_data = np.array(tuples_list)[test]
    print("train size:", len(train_data), "test size:", len(test_data))
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    print("Fold", count, ":", nltk.classify.accuracy(classifier, test_data))
    total += nltk.classify.accuracy(classifier, test_data)
    count+=1
average_accuracy = total/fold_count
print("Average accuracy:", average_accuracy)

train size: 712295 test size: 79144
Fold 1 : 0.9111998382694835
train size: 712295 test size: 79144
Fold 2 : 0.913625796017386
train size: 712295 test size: 79144
Fold 3 : 0.9122611947841909
train size: 712295 test size: 79144
Fold 4 : 0.9107576063883555
train size: 712295 test size: 79144
Fold 5 : 0.911920044475892
train size: 712295 test size: 79144
Fold 6 : 0.9121222076215506
train size: 712295 test size: 79144
Fold 7 : 0.9123749115536238
train size: 712295 test size: 79144
Fold 8 : 0.9137142423936117
train size: 712295 test size: 79144
Fold 9 : 0.9127286970585262
train size: 712296 test size: 79143
Fold 10 : 0.9138521410611172
Average accuracy: 0.9124556679623737


### Confusion Matrix

In [19]:
test_result = []
gold_result = []

for i in range(len(test_data)):
    test_result.append(classifier.classify(test_data[i][0]))
    gold_result.append(test_data[i][1])

print('\nClasification report:\n', classification_report(gold_result, test_result))
print('\nConfussion matrix:\n',confusion_matrix(gold_result, test_result))


Clasification report:
               precision    recall  f1-score   support

    negative       0.90      0.91      0.90     34504
    positive       0.93      0.92      0.92     44639

    accuracy                           0.91     79143
   macro avg       0.91      0.91      0.91     79143
weighted avg       0.91      0.91      0.91     79143


Confussion matrix:
 [[31315  3189]
 [ 3629 41010]]


### Test with unseen input

In [20]:
test = input("Enter reviews :")
test = test.split()
# test = test[word for word in test if word not in stop_list]
test = dictionary.doc2bow(test)
test = [{id:1 for (id, tf_value) in test}]
print(test)
print("Review outcome:", classifier.classify(test[0]))

Enter reviews :awesome view and food
[{108: 1, 673: 1, 12951: 1}]
Review outcome: positive


## Export to pickle

In [None]:
save_classifier = open("nb_model.pickle","wb") #binary write
pickle.dump(model, classifier)
save_classifier.close()