In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import datetime

import nltk
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2322 entries, 0 to 2321
Data columns (total 5 columns):
source          2322 non-null object
Rating          2170 non-null object
ReviewText      1622 non-null object
PUBLISH_TS      2322 non-null object
organisation    2322 non-null object
dtypes: object(5)
memory usage: 90.8+ KB


In [4]:
data.shape

(2322, 5)

In [5]:
data.head(10)

Unnamed: 0,source,Rating,ReviewText,PUBLISH_TS,organisation
0,google,3 stars,,2020-01-18 15:03:56.791524,ibis saint martin
1,google,5 stars,"Good hotel, not close to the sites of interest...",2020-01-18 15:03:56.791590,ibis saint martin
2,google,5 stars,Nice budget hotel for your stay in Paris..,2020-09-19 15:03:56.791618,ibis saint martin
3,google,5 stars,I was in Paris from 8 to 13 August at this fac...,2019-01-18 15:03:56.791656,ibis saint martin
4,google,1 stars,I stayed there for 3 nights. Besides the poor ...,2020-08-20 15:03:56.791677,ibis saint martin
5,google,1 stars,Attention half mouth. Very expensive room. Tin...,2020-01-18 15:03:56.791703,ibis saint martin
6,google,4 stars,,2020-01-18 15:03:56.791725,ibis saint martin
7,google,4 stars,"Very clean,good breakfast,comfort room,many re...",2020-01-18 15:03:56.791848,ibis saint martin
8,google,5 stars,,2020-01-18 15:03:56.791871,ibis saint martin
9,google,2 stars,,2019-01-18 15:03:56.791889,ibis saint martin


In [6]:
data.Rating.value_counts()

4 stars    818
5 stars    541
3 stars    529
2 stars    169
1 stars    113
Name: Rating, dtype: int64

In [7]:
data.isnull().sum(axis = 0)

source            0
Rating          152
ReviewText      700
PUBLISH_TS        0
organisation      0
dtype: int64

In [8]:
data.dropna(inplace=True)

In [9]:
data.shape

(1470, 5)

In [10]:
mod_data = data.iloc[:,1:3]

mod_data.head()

Unnamed: 0,Rating,ReviewText
1,5 stars,"Good hotel, not close to the sites of interest..."
2,5 stars,Nice budget hotel for your stay in Paris..
3,5 stars,I was in Paris from 8 to 13 August at this fac...
4,1 stars,I stayed there for 3 nights. Besides the poor ...
5,1 stars,Attention half mouth. Very expensive room. Tin...


In [11]:
# Remove Units from Value List
mod_data['Rating'] = mod_data['Rating'].map(lambda x: str(x)[:1:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
print(mod_data)

     Rating                                         ReviewText
1         5  Good hotel, not close to the sites of interest...
2         5         Nice budget hotel for your stay in Paris..
3         5  I was in Paris from 8 to 13 August at this fac...
4         1  I stayed there for 3 nights. Besides the poor ...
5         1  Attention half mouth. Very expensive room. Tin...
...     ...                                                ...
2317      4  We stayed for 4 nights, its a nice hotel, very...
2318      1  We were robbed $300 at this hotel and staff de...
2319      3  We chose this hotel for the comparatively reas...
2320      3  We travel often between France and England, an...
2321      1  Me and my girlfriend stayed at the hotel from ...

[1470 rows x 2 columns]


In [13]:
# create the label
mod_data["Label"] = mod_data["Rating"].apply(lambda x: 3 if x < str(3)
                                             else (6 if x < str(5) else 8))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
print(mod_data)
df = mod_data.iloc[:,1:3]

     Rating                                         ReviewText  Label
1         5  Good hotel, not close to the sites of interest...      8
2         5         Nice budget hotel for your stay in Paris..      8
3         5  I was in Paris from 8 to 13 August at this fac...      8
4         1  I stayed there for 3 nights. Besides the poor ...      3
5         1  Attention half mouth. Very expensive room. Tin...      3
...     ...                                                ...    ...
2317      4  We stayed for 4 nights, its a nice hotel, very...      6
2318      1  We were robbed $300 at this hotel and staff de...      3
2319      3  We chose this hotel for the comparatively reas...      6
2320      3  We travel often between France and England, an...      6
2321      1  Me and my girlfriend stayed at the hotel from ...      3

[1470 rows x 3 columns]


In [15]:
mod_data['Label'].unique()
df.head()

Unnamed: 0,ReviewText,Label
1,"Good hotel, not close to the sites of interest...",8
2,Nice budget hotel for your stay in Paris..,8
3,I was in Paris from 8 to 13 August at this fac...,8
4,I stayed there for 3 nights. Besides the poor ...,3
5,Attention half mouth. Very expensive room. Tin...,3


In [16]:
# Removing punctuations
df.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

In [17]:
train, test = train_test_split(df, test_size=0.3, random_state=42)
print(train.shape,test.shape)

(1029, 2) (441, 2)


In [18]:
train.shape

(1029, 2)

In [19]:
nltk.download('stopwords')
eng_stops = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
####################################

In [21]:
## BAG OF WORDS
countvector=CountVectorizer(ngram_range=(2,2))
traindataset=countvector.fit_transform(train['ReviewText'])

In [22]:
# print(countvector.get_feature_names())
traindataset.shape
print(type(traindataset))

<class 'scipy.sparse.csr.csr_matrix'>


In [23]:
# implement RandomForest Classifier.......

In [40]:
randomclassifier=RandomForestClassifier(n_estimators=200,criterion='entropy')
randomclassifier.fit(traindataset,train['Label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
## Predict for the Test Dataset
test_dataset = countvector.transform(test['ReviewText'])
predictions = randomclassifier.predict(test_dataset)

In [42]:
print(predictions)

[6 6 6 6 6 6 6 3 6 6 6 6 6 6 6 6 6 6 6 6 8 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 3 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 8 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 8 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 3 6 6 6 6 6 8 6 6 6 8 8 6 6 6 6 6 6 6 6 6 6 6 6 6 8
 6 6 8 6 6 6 8 6 6 6 6 6 6 6 6 8 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 3 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 3 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 8 6 6 6 6
 6 6 6 6 6 8 8 6 6 6 6 6 6 6 6 6 6 8 6 6 6 6 6 6 6 6 8 6 6 6 6 6 6 6 6 6 6
 6 6 3 6 6 6 6 6 6 6 3 6 6 6 6 8 6 6 6 6 6 8 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 8 6 6]


In [43]:
## Import library to check accuracy
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [61]:
matrix=confusion_matrix(test['Label'],predictions)
print(matrix)
score=accuracy_score(test['Label'],predictions)
print(score)
report=classification_report(test['Label'],predictions)
print(report)

[[  2  70   0]
 [  4 253  10]
 [  1  93   8]]
0.5963718820861678
              precision    recall  f1-score   support

           3       0.29      0.03      0.05        72
           6       0.61      0.95      0.74       267
           8       0.44      0.08      0.13       102

    accuracy                           0.60       441
   macro avg       0.45      0.35      0.31       441
weighted avg       0.52      0.60      0.49       441



In [29]:
from sklearn.naive_bayes import MultinomialNB

naive = MultinomialNB()
naive.fit(traindataset, train['Label'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
predictions = naive.predict(test_dataset)
predictions

array([6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 3, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6,
       8, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 8, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,

In [31]:
matrix = confusion_matrix(test['Label'], predictions)
print(matrix)
score = accuracy_score(test['Label'], predictions)
print(score)
report = classification_report(test['Label'], predictions)
print(report)

[[  7  65   0]
 [  1 259   7]
 [  0  96   6]]
0.6167800453514739
              precision    recall  f1-score   support

           3       0.88      0.10      0.18        72
           6       0.62      0.97      0.75       267
           8       0.46      0.06      0.10       102

    accuracy                           0.62       441
   macro avg       0.65      0.38      0.34       441
weighted avg       0.62      0.62      0.51       441

