## Import Required Libraries

In [104]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [66]:
# Notebook-wide configuration.
FILE_NAME = 'data_source/data.csv'  # path of the raw reviews file parsed in "Load Data"
SEED = 1234  # seed handed to every random_state argument for reproducibility

## Load Data

In [29]:
# Parse the raw file by hand. Every line is split on '||': the first piece is
# kept as the review text and the second piece (with trailing whitespace
# stripped) as the label. Any further pieces on the line are ignored.
review_texts = []
label_values = []
with open(FILE_NAME, 'r') as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            # A blank line holds no record; without this guard fields[1]
            # below would fail with an uninformative IndexError.
            continue
        fields = line.split('||')
        if len(fields) < 2:
            # Fail loudly and point at the offending line instead of raising
            # a bare IndexError.
            raise ValueError(
                "Line %d of %s has no '||' separator: %r"
                % (line_number, FILE_NAME, line)
            )
        review_texts.append(fields[0])
        label_values.append(fields[1].rstrip())

data = pd.DataFrame({'review': review_texts, 'label': label_values})
# Drop the first parsed row (presumably the file's header line - confirm
# against the raw file). The original index is kept, so rows start at 1.
data = data.iloc[1:]
   

In [41]:
# (rows, columns) of the parsed frame, before any cleaning is applied.
data.shape

(221160, 2)

## Remove Corrupted Labels

In [42]:
# Keep only rows whose label is one of the two expected classes. This
# whitelist replaces the earlier blacklist of three specific corrupted values
# ('|neg', '|pos' and a stray sentence fragment): it removes those same rows
# and also any other malformed label that may appear if the input file changes.
VALID_LABELS = ['pos', 'neg']
data = data[data['label'].isin(VALID_LABELS)]

In [43]:
# Number of distinct label values left after the cleanup (missing values,
# if any, are counted as a value of their own).
data['label'].nunique(dropna=False)

2

In [53]:
# Every occurrence of a review text after its first one counts as non-unique.
# NOTE(review): the saved output of 0 appears to come from running this cell
# after the de-duplication cell below; a fresh top-to-bottom run may print a
# non-zero value here.
duplicate_review_count = data['review'].duplicated().sum()
print("Number of non unique reviews: ", duplicate_review_count)

Number of non unique reviews:  0


## Remove Duplicates

In [52]:
# Keep the first occurrence of each review text and discard later repeats.
data = data.drop_duplicates(subset=['review'], keep='first')

In [54]:
# (rows, columns) of the frame at this point in the cleaning pipeline.
data.shape

(220192, 2)

## Data Exploration

In [63]:
# Peek at the first few reviews labelled 'pos'.
positive_mask = data['label'].eq('pos')
data.loc[positive_mask].head()

Unnamed: 0,label,review
1,pos,"Cozy hotel with excellent location (a quiet, b..."
2,pos,Friendly staff. Room very big and comfortable...
3,pos,Location is very convenient. Atmosphere of a f...
4,pos,"Everything was ok, main thing going for this h..."
5,pos,Excellent accommodation good location attentiv...


In [62]:
# Peek at the first few reviews labelled 'neg'.
negative_mask = data['label'].eq('neg')
data.loc[negative_mask].head()

Unnamed: 0,label,review
110318,neg,I'd have to make it up...
110319,neg,Staff were friendly but not particularly proac...
110320,neg,Only moan would be shower curtain really needs...
110321,neg,"Unable to drink water in room, had to get wate..."
110322,neg,They order a taxi transfer for you and add 20 ...


## Split the data

In [65]:
# Features are the raw review texts; the target is the sentiment label.
X, y = data['review'], data['label']

In [68]:
# Hold out 20% of the reviews for the final evaluation; the seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=SEED,
)



## Feature extraction

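For each training review, we count the number of occurrences of each *term* — a single word or a sequence of two or three consecutive words, since the vectorizer below uses `ngram_range=(1, 3)` — and use these counts to build a **document-term** matrix. Each row of this matrix corresponds to one training review and each column to one term; the entries are the frequencies of the terms that occur in the set of training reviews.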

In [133]:
# Terms are word n-grams of length 1 to 3. The vocabulary is learned from the
# training reviews only, and the same fitted vectorizer is reused later to
# transform new text.
NGRAM_RANGE = (1, 3)
count_vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE)
X_train_counts = count_vectorizer.fit_transform(X_train)

# Shape is (number of training reviews, number of distinct terms).
print(X_train_counts.shape)

(176153, 2472315)


## Modeling

In [134]:
# Candidate classifiers, compared on identical folds with the same metric.
models = [
    ('MultinomialNB', MultinomialNB()),
    ('Linear SVC', LinearSVC()),
]

# 10-fold cross-validation on consecutive, unshuffled folds.
# random_state is deliberately not passed: without shuffle=True it has no
# effect, and current scikit-learn rejects that combination with a ValueError.
# The splitter is also loop-invariant, so it is built once and shared by
# every model.
kfold = KFold(n_splits=10)

results = []  # per-model arrays of fold scores
names = []    # model names, aligned with `results`
for name, model in models:
    cv_results = cross_val_score(
        model,
        X_train_counts,
        y_train,
        cv=kfold,
        scoring="f1_weighted",
    )
    results.append(cv_results)
    names.append(name)
    msg = "Model Name: %s | Mean F1 Score: %f | SD: (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


Model Name: MultinomialNB | Mean F1 Score: 0.927518 | SD: (0.001937)
Model Name: Linear SVC | Mean F1 Score: 0.924787 | SD: (0.001469)


In [135]:
# Fit the final Naive Bayes classifier on the full training matrix.
clf = MultinomialNB()
clf = clf.fit(X_train_counts, y_train)

### Model Evaluation

In [147]:
# Hand-written example reviews used as a quick sanity check of the fitted
# classifier.
reviews = [
    'This is a bad hotel',
    'This hotel is amazing!!!',
    'The roof was leaking',
    'The a/c was broken but I loved the stay',
    'The hotel was well located',
    'The hotel was too noisy',
    'The service could be better',
    'The hotel was next to the city center',
    'The manager was rude',
    'nothing',
    'bad',
    'no comments',
    'Big dirty rooms',
    'amazingly dirty rooms',
    'They took very long to give us the room',
    'The room was beautiful and the hotel had an amazing gym',
    'The food didn\'t taste amazing',
    'It was like the AI hotel in Altered Carbon',
    'It looked like a castle from the mid 18th centuary',
    'The Waiters were lazy',
    'It was so amazing, that I will never go back there again!',
    'Nothing! Had Bed bugs and water leaking from the ceiling!!! Staying ther was beyond amazingly horrible.',
]

# Encode the examples with the vocabulary learned from the training reviews.
X_new_counts = count_vectorizer.transform(reviews)

predicted = clf.predict(X_new_counts)
probabilities = clf.predict_proba(X_new_counts)

# One line per review: text, predicted label, and the largest class
# probability expressed as a percentage.
line_template = '{0} => {1} {2:.2f}%'
for text, label, class_probs in zip(reviews, predicted, probabilities):
    confidence_pct = class_probs.max() * 100
    print(line_template.format(text, label, confidence_pct))

This is a bad hotel => neg 89.83%
This hotel is amazing!!! => pos 100.00%
The roof was leaking => neg 97.61%
The a/c was broken but I loved the stay => pos 76.04%
The hotel was well located => pos 100.00%
The hotel was too noisy => neg 99.88%
The service could be better => neg 100.00%
The hotel was next to the city center => pos 100.00%
The manager was rude => neg 99.99%
nothing => neg 61.96%
bad => neg 80.93%
no comments => neg 91.72%
Big dirty rooms => neg 98.93%
amazingly dirty rooms => neg 83.78%
They took very long to give us the room => neg 100.00%
The room was beautiful and the hotel had an amazing gym => pos 100.00%
The food didn't taste amazing => neg 50.06%
It was like the AI hotel in Altered Carbon => neg 99.61%
It looked like a castle from the mid 18th centuary => neg 95.41%
The Waiters were lazy => neg 94.15%
It was so amazing, that I will never go back there again! => neg 87.04%
Nothing! Had Bed bugs and water leaking from the ceiling!!! Staying ther was beyond amazingly 

In [149]:
# Score the held-out reviews: encode them with the fitted vectorizer, predict,
# and report the fraction of predictions that match the true labels.
X_test_counts = count_vectorizer.transform(X_test)
predicted = clf.predict(X_test_counts)
test_accuracy = (predicted == y_test).mean()
print("Test set accuracy is", test_accuracy)

Test set accuracy is 0.9298576261949636


In [139]:
# Per-class precision, recall and F1 on the held-out test set.
test_report = classification_report(y_test, predicted)
print(test_report)

             precision    recall  f1-score   support

        neg       0.91      0.95      0.93     21937
        pos       0.95      0.91      0.93     22102

avg / total       0.93      0.93      0.93     44039

