# Hotel Sentiment Analysis of Trip Advisor

In [1]:
#Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Importing the dataset
df = pd.read_csv('C:/Users/Manoj/Desktop/Hotel sentiment Analysis/tripadvisor_hotel_reviews.csv')
df.head(5)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
df.shape

(20491, 2)

In [6]:
df.describe

<bound method NDFrame.describe of                                                   Review  Rating
0      nice hotel expensive parking got good deal sta...       4
1      ok nothing special charge diamond member hilto...       2
2      nice rooms not 4* experience hotel monaco seat...       3
3      unique, great stay, wonderful time hotel monac...       5
4      great stay great stay, went seahawk game aweso...       5
...                                                  ...     ...
20486  best kept secret 3rd time staying charm, not 5...       5
20487  great location price view hotel great quick pl...       4
20488  ok just looks nice modern outside, desk staff ...       2
20489  hotel theft ruined vacation hotel opened sept ...       1
20490  people talking, ca n't believe excellent ratin...       2

[20491 rows x 2 columns]>

In [7]:
df.info

<bound method DataFrame.info of                                                   Review  Rating
0      nice hotel expensive parking got good deal sta...       4
1      ok nothing special charge diamond member hilto...       2
2      nice rooms not 4* experience hotel monaco seat...       3
3      unique, great stay, wonderful time hotel monac...       5
4      great stay great stay, went seahawk game aweso...       5
...                                                  ...     ...
20486  best kept secret 3rd time staying charm, not 5...       5
20487  great location price view hotel great quick pl...       4
20488  ok just looks nice modern outside, desk staff ...       2
20489  hotel theft ruined vacation hotel opened sept ...       1
20490  people talking, ca n't believe excellent ratin...       2

[20491 rows x 2 columns]>

# Cleaning the data

In [8]:
df['Rating'].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [9]:
#Delete those entries with rating less than 3
df_neg = df.loc[df['Rating']<3]
df_neg = df_neg.reset_index(drop = True)

In [11]:
#Focussing only on data with 5 star rating
df_five = df.loc[df['Rating'] == 5]
df_five = df_five.reset_index(drop = True)

In [14]:
#To consider only those 5 star rating entries equal to length of ratings < 3
df_pos = df_five.loc[:len(df_neg)]
len(df_pos)                               #1421+1793 = 3214~3215.

3215

In [15]:
#Concatinating rows of ratings as less than 3 above and equal to 5 below {To provide sentiment attribute}
df_all = pd.concat([df_neg, df_pos], axis = 0)                 #axis = 0 ie access to all rows
df_all = df_all.reset_index(drop = True)

In [16]:
len (df_all)               #equal to all ratings < 3 and df_pos ~== 6400

6429

In [18]:
df_all.head(5)                  #All 1 and two star ratings hotel

Unnamed: 0,Review,Rating
0,ok nothing special charge diamond member hilto...,2
1,"poor value stayed monaco seattle july, nice ho...",2
2,horrible customer service hotel stay february ...,1
3,disappointed say anticipating stay hotel monac...,2
4,great location need internally upgrade advanta...,2


In [19]:
df_all.tail(5)                #All 5 star rating hotels

Unnamed: 0,Review,Rating
6424,perfect hotel hotel does not really need glowi...,5
6425,perfect hotel small hotel comfortable perfect ...,5
6426,ordinary location extraordinary hotel know lov...,5
6427,"classy indulgence awesome experience, staff n'...",5
6428,first-rate experience stay library hotel wife ...,5


# Create a Sentiments Column

In [20]:
df_all['Sentiment'] = np.where(df_all['Rating'] == 5, "Positive", "Negative")

In [21]:
df_all.head()                                     #These hotel get sentiment of negative

Unnamed: 0,Review,Rating,Sentiment
0,ok nothing special charge diamond member hilto...,2,Negative
1,"poor value stayed monaco seattle july, nice ho...",2,Negative
2,horrible customer service hotel stay february ...,1,Negative
3,disappointed say anticipating stay hotel monac...,2,Negative
4,great location need internally upgrade advanta...,2,Negative


In [22]:
df_all.tail()                                     #These hotel get sentiment as positive

Unnamed: 0,Review,Rating,Sentiment
6424,perfect hotel hotel does not really need glowi...,5,Positive
6425,perfect hotel small hotel comfortable perfect ...,5,Positive
6426,ordinary location extraordinary hotel know lov...,5,Positive
6427,"classy indulgence awesome experience, staff n'...",5,Positive
6428,first-rate experience stay library hotel wife ...,5,Positive


In [23]:
#All entries get randomised
df_all = df_all.sample(frac = 1)
df_all = df_all.reset_index(drop = True) 

In [24]:
df_all.head(10)

Unnamed: 0,Review,Rating,Sentiment
0,perfect hotel hotel does not really need glowi...,5,Positive
1,wedding paradise 32 friends family loved place...,5,Positive
2,fall waiting happen ultra-modernist sculpture ...,2,Negative
3,noise big problem booked hotel relaxing weeken...,2,Negative
4,dont high expectations just returned occidenta...,2,Negative
5,horrible hotel booked hotel minute descision w...,1,Negative
6,fantastic hotel stayed sofitel nights say that...,5,Positive
7,fantastic hotel coming knew pound strong compa...,5,Positive
8,wonderful hotel great staff decided stay readi...,5,Positive
9,african american sister went punta cana july 2...,2,Negative


In [25]:
df_all.tail(10)

Unnamed: 0,Review,Rating,Sentiment
6419,"overpriced overrated worn not recommend hotel,...",1,Negative
6420,great stay stayed large ny hotels boutique hot...,5,Positive
6421,"stay away, begin, rooms air conditioned damp, ...",1,Negative
6422,"disgusting stayed 1 night glad, room looked cu...",1,Negative
6423,"great vacation, group 16 traveled iowa chicago...",5,Positive
6424,"great family fun, just returned president week...",5,Positive
6425,"pay spent week august resort, not planned trip...",2,Negative
6426,overpriced poor service stayed krasnapolsky ni...,2,Negative
6427,miss hotel great location doubts recommending ...,2,Negative
6428,beautiful resort just got staying 4 days wyndh...,5,Positive


# Split data into training and testing sets

In [26]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_all.Review, df_all.Sentiment)         #X = Review , Y = Sentiment

In [29]:
#Using Count Vectorization since Review is a sentence and not a word...      Refer documentation for implementation of count vectorization
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
x_train_vec = v.fit_transform(x_train)                                  #Handles array of numbers in place of words of reviews
x_test_vec = v.transform(x_test)

# Classification Model

In [30]:
#Implement Support Vector Machine (SVM)

from sklearn import svm
clf_svm = svm.SVC(kernel = "linear")
clf_svm.fit(x_train_vec, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

# Testing Accuracy

In [32]:
clf_svm.score(x_test_vec, y_test)

0.9502487562189055

In [34]:
from sklearn.metrics import f1_score
f1_score(y_test, clf_svm.predict(x_test_vec), average = None)

array([0.95043371, 0.95006242])

In [40]:
rev = ["great stay stayed large ny hotels boutique ho"]
rev_vec = v.transform(rev)
clf_svm.predict(rev_vec)

array(['Positive'], dtype=object)

In [41]:
rev = ["Hopeless hotel"]
rev_vec = v.transform(rev)
clf_svm.predict(rev_vec)

array(['Negative'], dtype=object)