In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import datetime
import re

# Read the hotel review CSV into the working frame and preview the first rows.
DATA_PATH = "../input/Hotel_Reviews.csv"
df = pd.read_csv(DATA_PATH)
df.head()

## General Overview of the Score

In [None]:
# Histogram of the individual reviewer scores (raw counts, 15 bins, no KDE).
# `distplot` has been deprecated since seaborn 0.11 in favour of `histplot`;
# use the replacement when the installed seaborn provides it and keep the
# legacy call as a fallback for older environments.
if hasattr(sns, "histplot"):
    sns.histplot(df["Reviewer_Score"], bins=15)
else:
    sns.distplot(df["Reviewer_Score"], kde=False, bins=15)

## Highest and Lowest Scoring Countries of Origin
Identifying whether there is a relationship between a reviewer's country of origin and the score they gave. As some countries have only one or two reviewers, we only consider countries of origin with more than 100 reviews. Otherwise, a country with a single reviewer awarding a 10 would skew the results.

In [None]:
# Keep only nationalities with more than 100 reviews so that a handful of
# reviewers cannot dominate a country's average.
nationality_counts = df["Reviewer_Nationality"].value_counts()
countries = nationality_counts[nationality_counts > 100]

# Mean of every numeric column per nationality. numeric_only=True is required
# because the frame also holds text columns (e.g. addresses, tags): pandas >= 2.0
# raises a TypeError on them instead of silently dropping them.
g = df.groupby("Reviewer_Nationality").mean(numeric_only=True)

# Ten highest average scores among the retained nationalities.
g.loc[countries.index.tolist(), "Reviewer_Score"].sort_values(ascending=False).head(10).plot(kind="bar", ylim=(8, 9), title="Top Reviewing Countries of Origin")

In [None]:
# Ten lowest average scores among the nationalities retained in `countries`.
eligible_countries = countries.index.tolist()
lowest_scores = g.loc[eligible_countries, "Reviewer_Score"].sort_values().head(10)
lowest_scores.plot(kind="bar", ylim=(7, 8), title="Bottom Reviewing Countries of Origin")

## Hotel Location
Identifying if there are differences in review scores between the countries in which the hotels are located.

Technical note: This can be done via the Geocode (lat,lng) or via some hacky parsing of the address. As the countries in the dataset are limited and geocode lookup can be resource intensive, I went for the hacky address parsing.

In [None]:
def country_ident(st):
    """Return the country name taken from the end of an address string.

    The last whitespace-separated word of ``st`` is used as the country.
    The only multi-word case handled is an address ending in "Kingdom",
    which is reported as "United Kingdom".
    """
    final_word = st.split()[-1]
    return "United Kingdom" if final_word == "Kingdom" else final_word
    
# Derive the hotel's country from the tail of its address.
df["Hotel_Country"] = df["Hotel_Address"].apply(country_ident)

# Average reviewer score per hotel country, best first. The score column is
# selected before aggregating: averaging the whole frame raises a TypeError on
# its text columns under pandas >= 2.0 (and wastes work on older versions).
df.groupby("Hotel_Country")["Reviewer_Score"].mean().sort_values(ascending=False)

In [None]:
# Reviewer score spread per hotel country; outlier points are not drawn.
sns.boxplot(x="Hotel_Country", y="Reviewer_Score", data=df, showfliers=False)

## Score by Review Date
Identifying if there is a trend in the review scores, either over time or by month of the year (seasonality).

Note: Looking at the time and month trends, one can see a slight difference. This may be due to differences in price, caused by the on/off-season rates a hotel charges. In turn, guests may expect more (more money spent may mean higher expectations), or the different rates may attract a different category of guests that is pickier. Linking this dataset with the prices would be very interesting.

In [None]:
# Normalise Review_Date to ISO "YYYY-MM-DD" strings and derive a zero-padded
# two-character month column from it.
#
# The incoming values are "month/day/year". The second parse accepts values
# that are already ISO-formatted, so re-running this cell (after Review_Date
# has been overwritten below) no longer fails.
raw_review_dates = df["Review_Date"]
review_dates = pd.to_datetime(raw_review_dates, format="%m/%d/%Y", errors="coerce")
review_dates = review_dates.fillna(
    pd.to_datetime(raw_review_dates, format="%Y-%m-%d", errors="coerce")
)
# Keep failing loudly on unparseable dates rather than plotting with gaps.
if review_dates.isna().any():
    raise ValueError("Review_Date contains values that match neither %m/%d/%Y nor %Y-%m-%d")

df["Review_Date"] = review_dates.dt.strftime("%Y-%m-%d")
df["Review_Date_Month"] = review_dates.dt.strftime("%m")

# Mean reviewer score per calendar day (ISO strings sort chronologically).
df[["Review_Date", "Reviewer_Score"]].groupby("Review_Date").mean().plot(figsize=(15, 7))

In [None]:
# Reviewer score spread per calendar month; outlier points are not drawn.
# The month labels are strings, which seaborn lays out in order of first
# appearance in the data, so pass an explicit sorted order to get a
# January-to-December axis ("01" .. "12" sort correctly as text).
month_order = sorted(df["Review_Date_Month"].unique())
sns.boxplot(y="Reviewer_Score", x="Review_Date_Month", data=df, order=month_order, showfliers=False)

### Best Hotels by Rating

In [None]:
# Per-hotel means of the numeric columns, ranked by the hotel's average score.
# numeric_only=True is required because the frame holds text columns and
# pandas >= 2.0 raises a TypeError on them instead of silently dropping them.
# NOTE: this rebinds `g`, which earlier cells use for the per-nationality means;
# re-run those cells top to bottom rather than out of order.
g = df.groupby(["Hotel_Name", "Hotel_Country"]).mean(numeric_only=True).sort_values("Average_Score", ascending=False)
g["Average_Score"].head(20)

### Score by Nights

Identifying whether the number of nights a guest stays is associated with a higher or lower review score.

In [None]:
def splitString(string):
    """Extract the number of nights stayed from a review's tag string.

    The tags arrive as a single string shaped like a printed list, e.g.
    ``"[' Leisure trip ', ' Couple ', ' Double Room ', ' Stayed 2 nights ']"``.
    The "Stayed N night(s)" tag is located by pattern rather than by list
    position: optional tags shift the positions, so a fixed index either
    misses the tag or reads a different one.

    Parameters
    ----------
    string : str
        The raw tag string of one review.

    Returns
    -------
    float or None
        The number of nights as a float, or ``None`` when ``string`` is not a
        string or contains no "Stayed N night(s)" tag.
    """
    # Missing values (e.g. NaN) are not strings; treat them as "no information".
    if not isinstance(string, str):
        return None
    stayed = re.search(r"\bStayed\s+(\d+)\s+nights?\b", string)
    if stayed is None:
        return None
    return float(stayed.group(1))

# Parse the nights stayed out of each review's tags and store them on the frame.
nights = df["Tags"].apply(splitString)
df["Nights"] = nights

# Joint distribution of nights stayed vs. reviewer score with a regression fit.
sns.jointplot(x="Nights", y="Reviewer_Score", data=df, kind="reg")

### Potential To-Do:

- Investigate if type of stay (business/leisure) affects the review score.
- Investigate if family/guest type affects the review score.