In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, GridSearchCV, cross_val_score
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import NMF

## Exploratory Data Analysis

In [None]:
# Data from: https://www.kaggle.com/jiashenliu/515k-hotel-reviews-data-in-europe
data = pd.read_csv('Hotel_Reviews.csv')

# remove rows with no reviews
data = data[(data['Negative_Review'] != 'No Negative') | (data['Positive_Review'] != 'No Positive')]
data.reset_index(drop=True, inplace=True)

In [None]:
data

In [None]:
data.describe()

In [None]:
#how many unqiue in each column
for i in data.columns:
    print(i, len(data[i].unique()))

In [None]:
#Testing if there was a difference between ratings from europeans and non-europeans
europe = pd.read_html('https://en.wikipedia.org/wiki/Ethnic_groups_in_Europe')

euro = list(europe[4]['Country'])
for idx, i in enumerate(euro):
    if not i.isalpha():
        euro[idx] = i[:-3]
    if i =='United King':
        euro[idx] = 'United Kingdom'
euro[6] = 'Bosnia and Herzegovina'

euro_or_no = list(data['Reviewer_Nationality'])
for idx, i in enumerate(euro_or_no):
    euro_or_no[idx] = i[1:-1]
for idx,i in enumerate(euro_or_no):

    if i in list(euro):
        euro_or_no[idx] = 'European'
    else:
        euro_or_no[idx] = 'Not European'

data['Reviewer_Nationality'] = euro_or_no
euros = data.copy()
euros.drop(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
        'Hotel_Name',
       'Negative_Review', 
       'Total_Number_of_Reviews', 'Positive_Review',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Tags',
       'days_since_review', 'lat', 'lng'], axis=1, inplace=True)
euros.Reviewer_Nationality.unique()
euros.groupby('Reviewer_Nationality').mean()

In [None]:
def city_address(address, cities):
    for city in cities:
        if city in address:
            return city
cities = ['Amsterdam', 'Vienna', 'Milan', 'Barcelona', 'Paris', 'London']
data['city'] = data['Hotel_Address'].apply(lambda x: city_address(x, cities))

In [None]:
# Checking monthly trend, consistent scoring
monthdf = data[['Review_Date','Average_Score','Reviewer_Score']]
monthdf['Review_Date'] = pd.to_datetime(monthdf['Review_Date'])
monthdf['Review_Date'] = monthdf['Review_Date'].apply(lambda x: x.strftime('%m'))
monthdf['City'] = data['city']
monthdf = monthdf.groupby(['Review_Date', 'City']).agg({'Average_Score':'mean', 'Reviewer_Score':'mean'})
monthdf.reset_index(inplace=True)
monthdf.sort_values('City')

In [None]:
for i in cities:
    tempdf = monthdf[monthdf['City']==i]
    plt.plot(tempdf['Review_Date'], tempdf['Reviewer_Score'], label=i)
    plt.xlabel('Month')
    plt.ylabel('Average Rating')
    plt.legend(loc='upper right')

In [None]:
data.head()

In [None]:
plt.scatter(data['Total_Number_of_Reviews_Reviewer_Has_Given'],data['Reviewer_Score'])

In [None]:
plt.scatter(data['Average_Score'],data['Reviewer_Score'])

In [None]:
plt.hist(data['Reviewer_Score'], bins=50);
plt.title('Distribution of Reviewer Scores')
plt.xlabel('Reviewer Score')
plt.ylabel('Occurrences')

## Cleaning the Data

In [None]:
#Drop unwanted columns
data.drop(['Hotel_Name','Hotel_Address','Review_Date','days_since_review', 'lat', 'lng'],  axis=1, inplace=True)
data['Average_Score'].astype(float)

In [None]:
#correcting tags from 55,000 to a few - started as a list of strings that looked like lists of strings
tags = data.Tags

import ast
new_tags = []
for i in tags:
    new_tags.append(ast.literal_eval(i))

In [None]:
# check most common tags
from collections import Counter
list_for_counting = []
for i in new_tags:
    for t in i:
        list_for_counting.append(t)
    
c = Counter(list_for_counting)
c.most_common(20)      

In [None]:
#feature engineering tags to reduce columns
for index, i in enumerate(new_tags):
    for idx, t in enumerate(i):
        if t not in ['Stayed 1-2 nights','Stayed 3-4 nights', 'Stayed 5+ nights', 'Fancy', 'Budget', 'Medium', 'High', ' Business trip ', ' Solo traveler ', ' Leisure trip ', ' Couple ', ' Group ', ' Family with young children ', ' Family with older children ']:
            if t in [' Stayed 1 night ',' Stayed 2 nights ']:
                new_tags[index][idx] = 'Stayed 1-2 nights'
            if t in [' Stayed 3 nights ',' Stayed 4 nights ']:
                new_tags[index][idx] = 'Stayed 3-4 nights'
            if t in [' Stayed 5 nights ',' Stayed 6 nights ', ' Stayed 7 nights ', ' Stayed 8 nights ', ' Stayed 9 nights ', ' Stayed 10 nights ',  ' Stayed 11 nights ',
                 ' Stayed 12 nights ', ' Stayed 13 nights ', ' Stayed 14 nights ', ' Stayed 15 nights ', ' Stayed 16 nights ', ' Stayed 17 nights ',' Stayed 18 nights ', ' Stayed 19 nights ', ' Stayed 20 nights ',
                 ' Stayed 21 nights ', ' Stayed 22 nights ', ' Stayed 23 nights ', ' Stayed 24 nights ', ' Stayed 25 nights ', ' Stayed 26 nights ',
                 ' Stayed 27 nights ', ' Stayed 28 nights ', ' Stayed 29 nights ', ' Stayed 30 nights ', ' Stayed 31 nights ',]:
                new_tags[index][idx] = 'Stayed 5+ nights'
            if 'Luxury' in t or 'VIP' in t or 'Executive' in t or 'Ambassador' in t or 'Royal' in t or 'Penthouse' in t or 'Suite' in t or 'Duplex' in t or 'Presidential' in t or 'Apartment' in t or 'Apartement' in t:
                new_tags[index][idx] = 'Fancy'
            if 'Superior' in t or 'Premium' in t or 'Prestige' in t or 'Premiere' in t or 'Privilege' in t or 'Deluxe' in t or 'Premier' in t or 'Club' in t or 'View' in t or 'Art' in t or 'Fabulous' in t or 'Wonderful' in t or 'Loft' in t or 'Eiffel' in t or 'Spa' in t or 'King' in t:
                new_tags[index][idx] = 'High'
            if 'Standard' in t or 'Budget' in t or 'Small' in t or 'Economy' in t or 'Basic' in t or 'Bunk Bed' in t or 'Interior' in t or 'Special Offer' in t or 'Triple' in t or 'Quadruple' in t or 'Quintuple' in t or 'Sextuple' in t or 'Junior' in t or 'Twin' in t or 'Mini' in t or 'Check In' in t or 'Check in' in t or'Solo' in t or 'Camper' in t or 'Rooms' in t or 'Interconnecting' in t or 'FAMILY' in t or 'Atrium' in t or 'rooms' in t:
                new_tags[index][idx] = 'Budget'
            if 'Comfort' in t or 'Family' in t or 'Classic' in t or 'Large' in t or 'Double' in t or 'Cosy' in t or 'Single' in t or 'Connecting' in t or 'Queen' in t or 'Cozy' in t or 'Studio' in t or 'Adjacent' in t or 'Two' in t:
                new_tags[index][idx] = 'Medium'
            
for index, i in enumerate(new_tags):
    for idx, t in enumerate(i):            
            if t not in ['Stayed 1-2 nights','Stayed 3-4 nights', 'Stayed 5+ nights', 'Fancy', 'Budget', 'Medium', 'High', ' Business trip ', ' Solo traveler ', ' Leisure trip ', ' Couple ', ' Group ', ' Family with young children ', ' Family with older children ']:
                   new_tags[index][idx] = 'High'

In [None]:
unique_tags = []
for i in new_tags:
    for t in i:
        if t not in unique_tags:
            unique_tags.append(t)

In [None]:
#Replace tags with feature engineered tags
data['New_Tags'] = new_tags

In [None]:
#Onehotencode tags and drop the old columns
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

tagdf = pd.DataFrame(mlb.fit_transform(data.New_Tags),columns=mlb.classes_, index=data.index)


data = data.join(tagdf)
data = data.drop(['New_Tags'], axis=1)
data.drop('Tags', axis=1, inplace=True)

In [None]:
data = data.drop(['Reviewer_Nationality'], axis=1)

In [None]:
#clean = clean.to_csv('clean.csv')

In [None]:
data["Negative_Review"] = data["Negative_Review"].apply(lambda x: str(x).replace("No Negative", " "))
data["Positive_Review"] = data["Positive_Review"].apply(lambda x: str(x).replace("No Positive", " "))

In [None]:
neg = data["Negative_Review"]
pos = data["Positive_Review"]

In [None]:
from textblob import TextBlob
neg_review_sentiment = []
for i in neg:
    review = TextBlob(i)
    neg_review_sentiment.append(review.sentiment)


In [None]:
neg_review_sentiments = [i.polarity for i in neg_review_sentiment]
data['neg_review_sentiment'] = neg_review_sentiments

In [None]:
pos_review_sentiment = []
for i in pos:
    review = TextBlob(i)
    pos_review_sentiment.append(review.sentiment)

In [None]:
pos_review_sentiments = [i.polarity for i in pos_review_sentiment]
data['pos_review_sentiment'] = pos_review_sentiments

In [None]:
#remove stopwords and lemmatize reviews for nmf and most frequent word comparisons
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

data['Neg_Review_Clean'] = data['Negative_Review']

lem = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

filtered = []
for i in data['Neg_Review_Clean']:
    i = i.split()
    filtered_sentence = [lem.lemmatize(w.lower()) for w in i if w not in stop_words]
    filtered.append(' '.join(filtered_sentence))
    
data['Neg_Review_Clean'] = filtered

In [None]:
#remove stopwords and lemmatize reviews
data['Pos_Review_Clean'] = data['Positive_Review']

filteredpos = []
for i in data['Pos_Review_Clean']:
    i = i.split()
    filtered_sentence = [lem.lemmatize(w) for w in i if w not in stop_words]
    filteredpos.append(' '.join(filtered_sentence))
    
data['Pos_Review_Clean'] = filteredpos

In [None]:
data

In [None]:
data.to_csv('cleanest.csv')

In [None]:
#Making same dataset using vader sentiment analysis for comparison

In [None]:
cleanest = pd.read_csv('cleanest.csv')
#cleanest.fillna(' ', inplace=True)
#cleanest.drop(['Unnamed: 0','Neg_Review_Clean', 'Pos_Review_Clean'], axis=1, inplace=True)

In [None]:
cleanest

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

analyser = SentimentIntensityAnalyzer()

pos = cleanest['Positive_Review']
pos_review_sentiment = []
for i in pos:
    snt = analyser.polarity_scores(i)
    pos_review_sentiment.append(snt['compound'])


In [None]:
neg = cleanest['Negative_Review']
neg_review_sentiment = []
for i in neg:
    snt = analyser.polarity_scores(i)
    neg_review_sentiment.append(snt['compound'])

In [None]:
cleanest['vader_pos_sent'] = pos_review_sentiment
cleanest['vader_neg_sent'] = neg_review_sentiment

In [None]:
cleanest.drop(['neg_review_sentiment', 'pos_review_sentiment'], axis=1, inplace=True)

In [None]:
cleanest.drop(['Positive_Review', 'Negative_Review'], axis=1, inplace=True)

In [None]:
cleanest.to_csv('vader.csv')

## Modeling

In [None]:
# has textblob sentiment
cleanest = pd.read_csv('cleanest.csv')
cleanest.fillna(' ', inplace=True)
cleanest.drop(['Unnamed: 0', 'Negative_Review', 'Positive_Review', 'Neg_Review_Clean', 'Pos_Review_Clean'], axis=1, inplace=True)

In [None]:
#Drop due to colinearity with total reviews
cleanest.drop('Additional_Number_of_Scoring', axis=1, inplace=True)

### Modeling with Textblob sentiment analysis

In [None]:
# Benchmark - error if you always guessed the average (1.18)
mean_absolute_error(cleanest['Average_Score'], cleanest['Reviewer_Score'])

In [None]:
y = cleanest.pop('Reviewer_Score')
X = cleanest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [None]:
rf = RandomForestRegressor(min_samples_leaf = 30,
                          max_depth=25,
                          max_features=10,
                          n_estimators=500)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
#Out of the box random forest with textblob sentiment analysis: 0.8860822932892579

In [None]:
gdbr = GradientBoostingRegressor(learning_rate=0.05,
                                  loss='ls',
                                 max_depth=15,
                                  n_estimators=500,
                                 min_samples_leaf=120,
                                 max_features=15,
                                  random_state=1)

In [None]:
gdbr.fit(X_train, y_train)

In [None]:
y_hat = gdbr.predict(X_test)
mean_absolute_error(y_test, y_hat)

In [None]:
#Out of the box gradient boost with textblob analysis: 0.8735888232428165 

In [None]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
param = {}
# I used gamma regression 
param['objective'] = 'reg:gamma'
param['eta'] = 0.05
param['max_depth'] =6
param['silent'] = 0
param['nthread'] = 4
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 250
bst = xgb.train(param, xg_train, num_round, watchlist)
pred = bst.predict(xg_test)
mean_absolute_error(y_test,pred)

In [None]:
#Out of the box XGboost with textblob sentiment analysis: 0.8773096008942032

### Modeling with original data

In [None]:
cleanest.drop(['neg_review_sentiment', 'pos_review_sentiment'], axis=1, inplace=True)

In [None]:
X = cleanest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
#Out of the box random forest no sentiment analysis: 0.9172898922210799

In [None]:
gdbr.fit(X_train, y_train)
y_hat = gdbr.predict(X_test)
mean_absolute_error(y_test, y_hat)

In [None]:
#Out of the box gradient boost: 0.9180026868600099 

In [None]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
param = {}
# I used gamma regression 
param['objective'] = 'reg:gamma'
param['eta'] = 0.05
param['max_depth'] =6
param['silent'] = 0
param['nthread'] = 4
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 250


In [None]:
bst = xgb.train(param, xg_train, num_round, watchlist)
pred = bst.predict(xg_test)
mean_absolute_error(y_test,pred)

In [None]:
#Out of the box XGboost: 0.9150395736506789 

### Modeling with Vader Sentiment Analysis

In [None]:
vader = pd.read_csv('vader.csv')
vader.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
vader

In [None]:
vader.drop('Additional_Number_of_Scoring', axis=1, inplace=True)

In [None]:
y = vader.pop('Reviewer_Score')
X = vader

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
##Out of the box random forest with vader sentiment analysis:0.864468939011441

In [None]:
gdbr.fit(X_train, y_train)
y_hat = rf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
#Out of the box gradient boost with vader sentiment analysis: 0.864468939011441

In [None]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
param = {}
# I used gamma regression 
param['objective'] = 'reg:gamma'
param['eta'] = 0.05
param['max_depth'] =6
param['silent'] = 0
param['nthread'] = 4
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 250
bst = xgb.train(param, xg_train, num_round, watchlist)
pred = bst.predict(xg_test)
mean_absolute_error(y_test,pred)

In [None]:
mean_squared_error(y_test, pred)**0.5

In [None]:
mean_squared_error(y, vader['Average_Score'])**0.5

In [None]:
#Out of the box XGboost with vader sentiment analysis:  0.8603997258088616

In [None]:
# Vader slightly better, will proceed with vader sentiment analysis

### Modeling Vader with count vector of top 300 words from each positive and negative reviews

In [None]:
cleanest = pd.read_csv('cleanest.csv')
count = CountVectorizer(max_features=300, stop_words='english')


In [None]:
fitted_pos = count.fit_transform(cleanest['Pos_Review_Clean'].values.astype('U'))
pos_col_names = count.get_feature_names()
fitted_pos = fitted_pos.todense()
fitted_pos = pd.DataFrame(fitted_pos, columns = pos_col_names)

In [None]:
vader = vader.join(fitted_pos)

In [None]:
fitted_neg = count.fit_transform(cleanest['Neg_Review_Clean'].values.astype('U'))
neg_col_names = count.get_feature_names()
fitted_neg = fitted_neg.todense()
fitted_neg = pd.DataFrame(fitted_neg, columns = neg_col_names)
fitted_neg = fitted_neg.add_suffix('_neg')
vader = vader.join(fitted_neg)

In [None]:
vader.to_csv('vadercountfinal')

In [None]:
vadercountfinal = pd.read_csv('vadercountfinal.csv')

In [None]:
vadercountfinal

In [None]:
y = vadercountfinal.pop('Reviewer_Score')
X = vadercountfinal

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
rf = RandomForestRegressor(min_samples_leaf = 30,
                          max_depth=25,
                          max_features=10,
                          n_estimators=500)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
#Out of the box random forest with vader and count vectorize: 0.9482405860580165

In [None]:
gdbr = GradientBoostingRegressor(learning_rate=0.05,
                                  loss='ls',
                                 max_depth=15,
                                  n_estimators=500,
                                 min_samples_leaf=120,
                                 max_features=10,
                                  random_state=1)
gdbr.fit(X_train, y_train)
y_hat = gdbr.predict(X_test)
mean_absolute_error(y_test, y_hat)

In [None]:
#Out of the box gradient boost with vader and count vectorizer: 0.7937548145846729

In [None]:
vadercountfinal = pd.read_csv('vadercountfinal.csv')
X = vadercountfinal
y = y
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
param = {}
# I used gamma regression 
param['objective'] = 'reg:gamma'
param['eta'] = 0.02
param['max_depth'] = 10
param['silent'] = 0
param['nthread'] = 4
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 2000

In [None]:
pred = bst2.predict(xg_test)
mean_absolute_error(y_test,pred)

In [None]:
# Best tuned XGBoost: 0.7745135257798303

In [None]:
rf = RandomForestRegressor(min_samples_leaf = 5,
                          max_depth=30,
                          max_features=60,
                          n_estimators=1000)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mean_absolute_error(y_test, y_pred)

In [None]:
# Best tuned random forest: .8199

In [None]:
gdbr = GradientBoostingRegressor(learning_rate=0.01,
                                  loss='ls',
                                 max_depth=35,
                                  n_estimators=1250,
                                 min_samples_leaf=80,
                                 max_features=70,
                                  random_state=1)

gdbr.fit(X_train, y_train)
y_hat = gdbr.predict(X_test)
mean_absolute_error(y_test, y_hat)

In [None]:
#Best tuned gradient boosting regressor: 0.764065461073389

### Feature Importances

In [None]:
feats_dict = bst.get_score(importance_type='gain')

In [None]:
items = feats_dict.items()

In [None]:
sorted(feats_dict.values())

### Graphing Model comparison

In [None]:
original = [0.918, 0.920, 0.916]
textblob = [0.885, 0.874, 0.879]
vadergraph = [0.864, 0.864, 0.860]
vadercountvectorizer = [0.812, 0.764, 0.774]



N = 3
ind = np.array((0,1,2))
width = 0.25
fig= plt.figure(figsize=(12,6))
rects1 = plt.bar(ind, original, 4/5*width, label='Original Dataset')
rects2 = plt.bar(ind + width, textblob, 4/5*width,
    label='With Textblob')
rects3 = plt.bar(ind + 2*width, vadergraph, 4/5*width,
    label='With Vader')
rects4 = plt.bar(ind + 3*width, vadercountvectorizer, 4/5*width,
    label='Vader CountVectorizer')



plt.ylabel('Mean Absolute Error')
plt.title('Scores by dataset and model')
plt.axis([-.25,3,0.7,0.95])
plt.xticks(ind + 1.5*width, ('Random Forest', 'Gradient Boost', 'XGBoost'))
plt.legend(loc='best')


def autolabel(rects):
    for rect in rects:
        h = rect.get_height()
        plt.text(rect.get_x()+rect.get_width()/2., 1*h, h,
                ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)
autolabel(rects4)

plt.show()
fig.savefig('Hotel-Reviews/images/modelcomparison.png')

In [None]:
#Using a dictionary to find word occurrences

In [None]:
negative_words = {}
for i in negative_reviews:
    try:
        for w in i.split():
            if w not in negative_words.keys():
                negative_words[w] = 0
            negative_words[w] +=1
    except AttributeError:
        continue
        

In [None]:
top_list = sorted(negative_words.values())[::-1][:30]
for k,v in negative_words.items():
    if v in top_list:
        print(k)

In [None]:
positive_words = {}
for i in positive_reviews:
    try:
        for w in i.split():
            if w not in positive_words.keys():
                positive_words[w] = 0
            positive_words[w] +=1
    except AttributeError:
        continue

In [None]:
top_list = sorted(negative_words.values())[::-1][:30]
for k,v in negative_words.items():
    if v in top_list:
        print(k)

### NMF

In [None]:
count = CountVectorizer(max_features=1000, stop_words='english')

In [None]:
words = count.get_feature_names()

In [None]:
fitted_neg = count.fit_transform(neg_content.values.astype('U')) 
fitted_pos = count.fit_transform(pos_content.values.astype('U'))

In [None]:
model = NMF(n_components=3, init='random', random_state=0)
W = model.fit_transform(fitted_neg)
H = model.components_

In [None]:
associated_pos = []
for i in H2:
    associated_pos.append((np.argsort(i)[::-1]))

In [None]:
topics_pos = []
for num, i in enumerate(associated_pos):
    lst = []
    for idx in i:
        lst.append(words[idx])
    topics_pos.append(lst)
    print(f'topic{num+1} :', lst)

## Word Clouds

In [None]:
from wordcloud import WordCloud
wc = WordCloud(background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)


In [None]:
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [40, 10]

# Create subplots for each Topic
for index, topic in enumerate(topics_pos):
    wc = WordCloud(width = 1000, height = 500).generate(' '.join(topic))
    
    plt.subplot(2, 5, index+1)
    plt.imshow(wc, interpolation="quadric")
    plt.axis("off")
    #plt.title(hand_labels[index])
    
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 25,
        max_font_size = 30, 
        scale = 3,
    ).generate(str(data))

    fig = plt.figure(1, figsize = (10, 10))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.tight_layout()
    plt.show()

    return fig

fig = show_wordcloud(topics_pos[0])
fig.savefig('Hotel-Reviews/images/topic1.jpg')


In [None]:
fig2 = show_wordcloud(topics_pos[1])
fig2.savefig('Hotel-Reviews/images/topic2.jpg')


In [None]:
fig3 = show_wordcloud(topics_pos[2])
fig3.savefig('Hotel-Reviews/images/topic3.jpg')

In [None]:
show_wordcloud(fitted_neg)

## Mapping

In [None]:
mapdf = data[['Hotel_Name', 'Average_Score', 'lat', 'lng']]
mapdf = mapdf.groupby(['Hotel_Name']).agg({'Average_Score':'mean', 'lat':'mean', 'lng':'mean'})
mapdf = mapdf.dropna()
mapdf.reset_index(inplace=True)

In [None]:
mapdf['color'] = pd.cut(mapdf['Average_Score'], bins=[0,8.1,8.5,8.9,10], 
                              labels=['red', 'orange', 'blue', 'green'])

In [None]:
import folium
m = folium.Map(location=[52.360576, 4.915968])

image_file='legend.png'

FloatImage(image_file,bottom=5,left=5).add_to(m)

for i in range(mapdf.shape[0]):
    folium.CircleMarker([mapdf['lat'][i],mapdf['lng'][i]], radius=2,tooltip=f'{mapdf["Hotel_Name"][i]} : {mapdf["Average_Score"][i]:0.2}', color=mapdf['color'][i]).add_to(m)
m

In [None]:
from folium.plugins import FloatImage


import folium
b = folium.Map(location=[52.360576, 4.915968])
# convert to (n, 2) nd-array format for heatmap
import folium.plugins as plugins
lat = np.array(mapdf['lat'])
lng = np.array(mapdf['lng'])
coords = []
for i,j in zip(lat,lng):
    coords.append((i,j))
hotels = np.array(coords)

image_file='legend.png'

FloatImage(image_file,bottom=5,left=5).add_to(b)

# plot heatmap
for i in range(mapdf.shape[0]):
    folium.CircleMarker([mapdf['lat'][i],mapdf['lng'][i]], radius=2, tooltip=f'{mapdf["Hotel_Name"][i]} : {mapdf["Average_Score"][i]:0.2}', color=mapdf['color'][i]).add_to(b)


b.add_children(plugins.HeatMap(hotels, radius=30))
b



In [None]:
mapdata = np.array(mapdf[['lat', 'lng','Average_Score']])

### Update Vader Lexicon

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

analyser = SentimentIntensityAnalyzer()

In [None]:
a = analyser.lexicon.keys()
b = analyser.lexicon.values()

In [None]:
hotel_lexicon = {k:v for k,v in zip(a,b)}

In [None]:
neg = data['Negative_Review']
neg_review_sentiment = []
for i in neg:
    snt = analyser.polarity_scores(i)
    neg_review_sentiment.append(snt['compound'])


In [None]:
pos = data['Positive_Review']
pos_review_sentiment = []
for i in pos:
    snt = analyser.polarity_scores(i)
    pos_review_sentiment.append(snt['compound'])


In [None]:
vader

In [None]:
#vader.drop(['neg_review_sentiment', 'pos_review_sentiment'], axis=1, inplace=True)

vader['pos_sent'] = pos_review_sentiment
vader['neg_sent'] = neg_review_sentiment

In [None]:
vader.drop(['Unnamed: 0', 'Unnamed: 0.1','Neg_Review_Clean', 'Pos_Review_Clean'], axis=1, inplace=True)

In [None]:
vader.to_csv('vader.csv')

In [None]:
vader.drop('Additional_Number_of_Scoring', axis=1, inplace=True)

In [None]:
neg_dict = {}
for idx, i in enumerate(cleanest['Neg_Review_Clean']):
    try:
        for w in word_tokenize(i):
            if w.isalpha():
                w=w.lower()
                if w not in neg_dict.keys():
                    neg_dict[w] = 0
                neg_dict[w] -=1
    except TypeError:
        continue

In [None]:
sorted(neg_dict.values())

In [None]:
pos_dict = {}
for i in cleanest['Pos_Review_Clean']:
    try:
        for w in word_tokenize(i):
            w=w.lower()
            if w.isalpha():
                if w not in pos_dict.keys():
                    pos_dict[w] = 0

                pos_dict[w]+=1
    except TypeError:
        continue            


In [None]:
sorted(pos_dict.values())[::-1]

In [None]:
len(pos_dict.keys())

In [None]:
len(neg_dict.keys())

In [None]:
merged_dict = neg_dict.copy()

for i in pos_dict.keys():
    if i not in merged_dict.keys():
        merged_dict[i]=0
    merged_dict[i] += pos_dict[i]

In [None]:
top_neg = sorted(merged_dict.items(), key=(lambda item: item[1]))[:55]

In [None]:
top_pos = sorted(merged_dict.items(), key=(lambda item: item[1]))[73480:]

In [None]:
#top_pos[::-1]

In [None]:
a = analyser.lexicon.keys()
b = analyser.lexicon.values()

In [None]:
hotel_lexicon = {k:v for k,v in zip(a,b)}

In [None]:
in_lexicon = []
for i in top_pos:
    if i[0] in hotel_lexicon.keys():
        in_lexicon.append((i[0], hotel_lexicon[i[0]]))
    else:
        in_lexicon.append(i)

In [None]:
pos_in_lexicon = [i for i in in_lexicon if i[1]>4]

In [None]:
nin_lexicon = []
for i in top_neg:
    if i[0] in hotel_lexicon.keys():
        nin_lexicon.append((i[0], hotel_lexicon[i[0]]))
    else:
        nin_lexicon.append(i)

In [None]:
neg_in_lexicon = [i for i in nin_lexicon if i[1]<-4]

In [None]:
neg_in_lexicon 

In [None]:
w_pos_in_lexicon = [i[0] for i in pos_in_lexicon]
w_neg_in_lexicon = [i[0] for i in neg_in_lexicon]

In [None]:
#w_neg_in_lexicon

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
#vader update
pos_words = {'large':1.5,
 'minute':1,
 'convenient':1.5,
 'walking':1,
 'near':1,
 'central':1,
 'view':1,
 'walk':1,
 'spacious':2,
 'modern':2,
 'everything':1,
 'quiet':1,
 'comfy':1,
 'close':1,
 'location':1.5}

neg_words = {
 'i':-.05
 'small': -2,
 'little':-2,
 'expensive': -2,
 'work':-1,
 'not':-1,
 'air':-1,
 'noise': -2,
 'told':-0.5,
 'bathroom':-1,
 'water':-1,
 'booking': -1,
 'hot':-1,
 'shower':-2,
 'cold': -3,
 'price':-1,
 'slow': -2,
 'booked':-1,
 'old':-1,
 'paid':-0.5,
 'toilet':-1,
 'tiny':-2,
 'working':-1}
analyser = SentimentIntensityAnalyzer()

analyser.lexicon.update(pos_words)
analyser.lexicon.update(neg_words)

In [None]:
vader = pd.read_csv('vader.csv')

In [None]:
data["Negative_Review"] = data["Negative_Review"].apply(lambda x: str(x).replace("No Negative", ""))
data["Positive_Review"] = data["Positive_Review"].apply(lambda x: str(x).replace("No Positive", ""))

In [None]:
pos = data['Positive_Review']
pos_review_sentiment = []
for i in pos:
    snt = analyser.polarity_scores(i)
    pos_review_sentiment.append(snt['compound'])


In [None]:
neg = data['Negative_Review']
neg_review_sentiment = []
for i in neg:
    snt = analyser.polarity_scores(i)
    neg_review_sentiment.append(snt['compound'])

In [None]:
vader3 = vader2.copy()
vader3['new_pos_sent'] = pos_review_sentiment
vader3['New_neg_sent'] = neg_review_sentiment


In [None]:
vader3.to_csv('vader3.csv')

In [None]:
vader2.head()

In [None]:
vader2.to_csv('vader2.csv')

In [None]:
vader2 = pd.read_csv('vader2.csv')

In [None]:
vader2.drop(['pos_sent', 'neg_sent'],axis=1, inplace=True)

In [None]:
len(edited_dict.items())

In [None]:
sorted(edited_dict.items(), key=(lambda item: item[1]))

In [None]:
sent_dict = {key:val for key, val in merged_dict.items() if key in edited_dict.keys()}

In [None]:
lst = [('Location', 182356),
 ('Staff', 156235),
 ('Good', 91507),
 ('Friendly', 80715),
 ('Helpful', 71799),
 ('Excellent', 59915),
 ('Nice', 59561),
 ('Clean', 58436),
 ('Comfortable', 54418),
 ('Hotel', 49731)]
words = []
vals = []
for i in lst:
    words.append(i[0])
    vals.append(i[1])


In [None]:
pos_words = pd.DataFrame([words,vals])

In [None]:
pos_words=pos_words.T

In [None]:
#plot positive word occurrences
fig, ax = plt.subplots(figsize=(12,6))
ax.bar(pos_words[0], pos_words[1], color='green')
ax.xaxis.set_tick_params(labelsize=16, rotation=45, )
ax.set_xlabel('Positive Words', fontsize=18)
ax.set_ylabel('Relative Occurrences', fontsize=18)
fig.suptitle('Top 10 Positive Words', fontsize=20);
plt.savefig('Hotel-Reviews/images/poswords.jpg')

In [None]:
neglst = [ 
('I', 54831),
 ('Small', 38226),
 ('Nothing', 32126),
 ('Room', 29869),
 ('Bit', 23675),
 ('Could', 22721),
 ('Poor', 15300),
 ('Little', 15107),
 ('Expensive', 14225),
 ('Noisy', 12803)
]
words = []
vals = []
for i in neglst:
    words.append(i[0])
    vals.append(i[1])

In [None]:
neg_words = pd.DataFrame([words,vals])
neg_words=neg_words.T

In [None]:
#plot negative word occurrences
fig, ax = plt.subplots(figsize=(12,6))
ax.bar(neg_words[0], neg_words[1], color='red')
ax.xaxis.set_tick_params(labelsize=16, rotation=45)
ax.set_xlabel('Negative Words', fontsize=18)
ax.set_ylabel('Relative Occurrences', fontsize=18)
fig.suptitle('Top 10 Negative Words', fontsize=20);
plt.savefig('Hotel-Reviews/images/negwords.jpg')

In [None]:
featurelist = [('fabulous', 0.29964397592170217), ('room', 0.30194291249199645), ('Review_Total_Positive_Word_Counts', 0.3052570176688559), ('view_neg', 0.3065943580036194), ('customer_neg', 0.309148504285848), ('like', 0.3097000750242623), ('cleaned_neg', 0.3138296421587767), ('far_neg', 0.31415213928598024), ('experience_neg', 0.3160898314851383), ('upgraded', 0.31808503933448273), ('uncomfortable_neg', 0.3181067243519525), ('amazing', 0.32032346693149794), ('positive', 0.3268533664683133), ('friendly_neg', 0.3279204781542303), ('star', 0.3316088215976614), ('money', 0.33323600792235114), ('furniture_neg', 0.33556941761333337), ('thing_neg', 0.3359991773269183), ('basic_neg', 0.3371390389734257), ('worth_neg', 0.34183704961622896), ('bad_neg', 0.35738071688622886), ('excellent', 0.3594808207557529), ('service', 0.359927338350747), ('good', 0.36450212979652796), ('value_neg', 0.36939321877731096), ('Average_Score', 0.37155553164158), ('maybe_neg', 0.3742655712045161), ('slightly_neg', 0.3756614262933333), ('problem_neg', 0.37852534112012903), ('negative_neg', 0.3795259726073531), ('liked_neg', 0.3798875420290196), ('loved', 0.38373564215679484), (' Leisure trip ', 0.3845988736897397), ('thing', 0.3854006269861976), ('tiny_neg', 0.3855309579764056), ('smell_neg', 0.39104875876111), ('comfort', 0.396663971459846), ('service_neg', 0.40567563202398277), ('poor_neg', 0.40716188873969195), ('location', 0.4366442383734017), ('exceptional', 0.44605379501967746), ('fantastic', 0.4506617635248613), ('manager_neg', 0.4539769613296106), ('dated_neg', 0.4552046506772193), ('little_neg', 0.4669997007938732), ('bed_neg', 0.48724146197141455), ('location_neg', 0.5196409792334291), ('terrible_neg', 0.5253355840173783), ('ok', 0.533507276280509), ('old_neg', 0.5588221246639997), ('rude_neg', 0.5628752955510673), ('bit_neg', 0.5665008095155175), ('overpriced_neg', 0.5972159823682486), ('staff', 0.6016395407762328), ('hotel_neg', 0.6126645258793607), ('clean_neg', 0.6272994880442819), ('rooms_neg', 0.6394214532205174), ('Review_Total_Negative_Word_Counts', 0.643662427137203), ('star_neg', 0.6624058946175011), ('money_neg', 0.6654582424585267), ('cleanliness', 0.702056108507394), ('perfect_neg', 0.7507870704583097), ('fault_neg', 0.7786412114249133), ('dirty_neg', 0.9736228654579733), ('staff_neg', 1.1175715739400547), ('new_pos_sent', 1.197688207777768), ('New_neg_sent', 1.3588873831895074), ('room_neg', 3.213848362698663)][::-1]

In [None]:
featurelist

## Best and Worst Reviews

In [None]:
vader = pd.read_csv('vader.csv')

In [None]:
data['Positive_Review'][vader.sort_values('vader_pos_sent', ascending=False).index[0]]

In [None]:
data['Negative_Review'][vader.sort_values('vader_neg_sent').index[0]]