In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the data and reading it

In [None]:
df = pd.read_csv('/kaggle/input/trip-advisor-hotels-data/TA hotels scraper.csv')
dfc = df.copy() #Making a copy of the original dataframe is always a good practice

In [None]:
dfc.head() #Taking a look at the first 5 rows of our dataframe

# Data cleaning

In [None]:
#The symbols on the country_name and city_name columns are going to be a problem
#We need to get rid of these unwanted symbols to proceed with our analysis

In [None]:
#Getting rid of the symbols
#
#The unwanted symbols are always the last 2 characters of country_name / city_name,
#so a vectorised string slice removes them in one step per column.
#Compared with an element-by-element loop this:
#  - avoids chained assignment (dfc[col][i] = ...), which triggers SettingWithCopyWarning
#    and is not guaranteed to write back into dfc,
#  - does not depend on dfc having a default 0..n-1 index,
#  - leaves missing values as NaN instead of failing on them.
#NOTE: like the original loop, re-running this cell strips another 2 characters.

for col in ['country_name', 'city_name']:
    dfc[col] = dfc[col].str[:-2]

In [None]:
#Checking for Nans
dfc.isna().any().any() #We have null values

In [None]:
# Share of missing values per column, in percent (2 decimals).
# Author's observation: 78% of the prices are missing.
# Since the price is a core variable of the analysis, rows with a price get their own frame later.
tot = len(dfc)
(dfc.isna().sum() / tot * 100).round(2)

In [None]:
# dfn: the rows of dfc that have a Price (rows with a missing Price removed).
# It is used later for every price-based analysis.

dfn = dfc[dfc['Price'].notna()]
print(len(dfc) - len(dfn), 'rows were droped')

In [None]:
#Check for duplicates
#duplicated() returns one boolean per row, so a single .any() already gives the answer;
#a second .any() would be called on that scalar and is not needed.
dfc.duplicated().any()   # Author's observation: no dups

In [None]:
#Checking the data types
dfc.dtypes  # All good 

#  EDA

In [None]:
dfc.describe().T #Descriptive statistics of our data

In [None]:
# We're going to start the analysis by continents
# We will analyse every continent on our data set and try to get as much info as possible from it

In [None]:
#First we will create a few functions to help us visualize our data better
#The functions will highlight the highest and lowest values when plotting 

#M_M (maximum and minimum value)√
#MI (minimum value)√
#MA (maximum value)√

def M_M(series, max_color = '#ff8282', min_color = '#d5f4f5' ,other_color = '#dedede'):
    """Build a colour list that highlights the largest and the smallest values.

    Parameters
    ----------
    series : iterable exposing ``.max()`` and ``.min()`` (e.g. a pandas Series)
        The values being plotted, one per bar.
    max_color, min_color, other_color : str
        Colours for the maximum value(s), the minimum value(s) and everything else.

    Returns
    -------
    list of str
        One colour per element of ``series``, in the same order. Every element equal
        to the maximum gets ``max_color``; the maximum check comes first, so when the
        maximum and minimum coincide ``max_color`` wins.
    """
    highest, lowest = series.max(), series.min()
    return [
        max_color if value == highest
        else min_color if value == lowest
        else other_color
        for value in series
    ]


def MI(series, min_color = '#9ed9db' ,other_color = '#dedede'):
    """Build a colour list that highlights only the smallest value(s).

    Parameters
    ----------
    series : iterable exposing ``.min()`` (e.g. a pandas Series)
        The values being plotted, one per bar.
    min_color, other_color : str
        Colour for the minimum value(s) and for every other element.

    Returns
    -------
    list of str
        One colour per element of ``series``, in the same order.
    """
    lowest = series.min()
    return [min_color if value == lowest else other_color for value in series]


def MA(series, max_color = '#ff8282' ,other_color = '#dedede'):
    """Build a colour list that highlights only the largest value(s).

    Parameters
    ----------
    series : iterable exposing ``.max()`` (e.g. a pandas Series)
        The values being plotted, one per bar.
    max_color, other_color : str
        Colour for the maximum value(s) and for every other element.

    Returns
    -------
    list of str
        One colour per element of ``series``, in the same order.
    """
    highest = series.max()
    return [max_color if value == highest else other_color for value in series]

In [None]:
#Continents with the most expensive hotels
#dfn is used here because it only holds rows with a known Price
#C_prices: average hotel price for each continent

C_prices = dfn.groupby('continent_name')[['Price']].mean()

plt.figure(figsize = (15, 6))
sns.barplot(data = C_prices, x = C_prices.index, y = 'Price',
            palette = M_M(C_prices['Price']),   # highest / lowest average highlighted
            edgecolor = 'black')

sns.despine(top = True, right = True, left = True, bottom = False)

plt.title('Hotel prices by continent', fontsize = 25)
plt.xlabel('')
plt.ylabel('')

In [None]:
print('We can see that Africa has the highest average price per night for Hotels even if it is one of the lowest continents by GDP. \n \nThis possibly implies that there is no positive coorelation between how rich a continent is, and the Hotel prices (even a negative relationship is possible).')

In [None]:
#One subset of dfn (rows with a Price) per continent, used by the distribution plots below

Africa = dfn.loc[dfn['continent_name'].eq('Africa')]
Asia = dfn.loc[dfn['continent_name'].eq('Asia')]
Europe = dfn.loc[dfn['continent_name'].eq('Europe')]
South_Pacific = dfn.loc[dfn['continent_name'].eq('South Pacific')]

In [None]:
#We will use a boxplot to have a look at the data distribution
#One panel per continent, laid out on a 2x2 grid (row by row).

fig, axes = plt.subplots(2, 2, figsize = (25, 15))

continent_prices = [('Africa', Africa.Price),
                    ('Asia', Asia.Price),
                    ('Europe', Europe.Price),
                    ('South Pacific', South_Pacific.Price)]

for ax, (continent, prices) in zip(axes.flat, continent_prices):
    #The prices are passed explicitly as x=: seaborn versions disagree on what the first
    #positional argument of boxplot means, so the keyword keeps the box horizontal everywhere
    sns.boxplot(x = prices, ax = ax, color = '#ff8282')
    ax.set(xlabel='', ylabel='')
    ax.set_title('Price distibution for ' + continent, fontsize = 20)

fig.suptitle('Price Distribution For All 4 Continents', fontsize = 30)

In [None]:
print("We can see that we're dealing with lots of outliers across the board, this is not \nsurprising since we're analyzing by continent.")

In [None]:
#Let's use distplot to have a different look at the data distribution
#Same 2x2 layout as the boxplots: one panel per continent, filled row by row.
#NOTE: distplot is deprecated in recent seaborn releases; it is kept here so the figure is unchanged.

fig, axes = plt.subplots(2, 2, figsize = (25, 15))

panels = zip(axes.flat,
             ['Africa', 'Asia', 'Europe', 'South Pacific'],
             [Africa.Price, Asia.Price, Europe.Price, South_Pacific.Price])

for ax, continent, prices in panels:
    sns.distplot(prices, ax = ax, color = '#f2b40a')
    ax.set(xlabel='', ylabel='')
    ax.set_title('Price distibution for ' + continent, fontsize = 20)

fig.suptitle('Price Distribution For All 4 Continents', fontsize = 30)

In [None]:
print("Again, we can see that we're dealing with outliers for all 4 continents, the most extreme \nbeing Asia. \n\nIt is normal to spot outliers when it comes to Hotel prices, so there is no need to \neliminate them.")

In [None]:
#Reviews by continent

In [None]:
#We will figure out the avg rating score for each continent and compare that to the number of reviews received 
#This will give us an idea on where the best reviewed hotels are situated.

In [None]:
#Mean rating and total number of reviews for every continent
continent_reviews = dfc.groupby('continent_name').agg({'Rating': 'mean',
                                                      'reviews count': 'sum'})

fig, axes = plt.subplots(1, 2, figsize = (20, 6))
rating_ax, reviews_ax = axes

#Left panel: mean rating per continent, best-rated continent highlighted
sns.barplot(ax = rating_ax, data = continent_reviews,
            x = 'Rating', y = continent_reviews.index, orient = 'h',
            color = '#128a71', palette = MA(continent_reviews['Rating']), edgecolor = 'black')
rating_ax.set_title('Ratings by continent', fontsize = 20)
rating_ax.set(xlabel = '', ylabel = '')
for side in ('right', 'top', 'bottom'):
    rating_ax.spines[side].set_visible(False)

#Right panel: share of all reviews per continent, drawn as a donut
#(a pie chart with a white circle over its centre)
C = ['#d1d0cb', '#adaaaa', '#ff8282', '#8a8484']
reviews_ax.pie(continent_reviews['reviews count'], labels = continent_reviews.index,
               startangle = 90, colors = C, autopct = '%1.1f%%')
reviews_ax.set_title('Reviews count for each continent', fontsize = 20)
my_circle = plt.Circle((0, 0), 0.7, color = 'white')
reviews_ax.add_artist(my_circle)

plt.show()

In [None]:
print("◘ We can see that Europe is the best reviewed continent in terms of Hotels, and it's also \nthe continent that received the most reviews. \n\n◘ This indicate that European Hotels get the best customer engagement... But why? \n\n◘ A quick Google search shows that Europe has received the highest number of international \nvisitors in 2019, by 745  million visitors much larger than all other continents.\n\n◘ It seems that Europe is the favorite destination for tourists possibly due to quality \nservices, safety and reasonable prices.")

In [None]:
#Let's analyse the countries now

In [None]:
#Which are the most expensive countries to rent in? and which are the most affordable

In [None]:
# Ten countries with the most hotel listings.
hotels_count = (dfc.groupby('country_name')
                   .agg({'Hotel name': 'count'})
                   .sort_values(by = 'Hotel name', ascending = False)
                   .head(10))
hotels_count0 = hotels_count.rename(columns={'Hotel name': 'Count'})

# HP: mean price per country (dfn only holds rows with a Price).
# HD: number of listings per country, as columns ['country_name', 'Count'].
# The needed column is selected directly instead of dropping all the others with
# DataFrame.drop(labels, 1, inplace=True): the positional axis argument is no longer
# accepted by pandas 2.x.
HP = dfn.groupby('country_name').agg({'Price': 'mean'})
HD = (dfc.groupby('country_name')['Hotel name']
         .count()
         .reset_index()
         .rename(columns={'Hotel name': 'Count'}))

# ISO 3166 country codes, read from the first table of the Wikipedia page (needs network access).
iso_alpha = pd.read_html('https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes')
iso_alpha0 = pd.DataFrame(iso_alpha[0].to_records())   # to_records() turns the index into column 0

# NOTE(review): the country name and the country code are taken by position (columns 1 and 5
# after to_records()); this presumably matches the current page layout - verify if the page changes.
# Country names are cleaned by cutting everything from the first '[' (footnote markers),
# spelling 'Viet Nam' as 'Vietnam', and trimming surrounding whitespace.
iso_alpha0 = pd.DataFrame({
    'Country Code': iso_alpha0.iloc[:, 5],
    'country_name': (iso_alpha0.iloc[:, 1]
                     .str.split('[').str[0]
                     .str.replace('Viet Nam', 'Vietnam', regex = False)
                     .str.strip()),
})

# Mean price per country together with its country code (inner join on the country name).
main = pd.merge(HP, iso_alpha0, on="country_name")

In [None]:
#World map coloured by the mean hotel price per night of each country (from `main`)

price_map = go.Choropleth(
    locations = main['Country Code'],
    z = main['Price'],
    text = main['country_name'],
    colorscale = 'Blues',
    autocolorscale = False,
    reversescale = True,
    marker_line_color = 'lightgray',
    marker_line_width = 0.5,
    colorbar_tickprefix = '$',
    colorbar_title = 'Price<br>US$',
)

map_layout = dict(
    title_text = 'Hotel prices heatmap',
    geo = {'showframe': False,
           'showcoastlines': False,
           'projection_type': 'equirectangular'},
    #Text-less annotation kept exactly as configured in the original figure
    annotations = [{'x': 0.55, 'y': 0.1,
                    'xref': 'paper', 'yref': 'paper',
                    'showarrow': False}],
)

fig = go.Figure(data = price_map)
fig.update_layout(**map_layout)

fig.show()

In [None]:
#Which are the most expensive countries to rent a hotel in? And which are the cheapest ones?

In [None]:
#Per-country mean price, total reviews and mean rating (priced rows only),
#then the 10 most expensive and the 10 cheapest countries by mean price
country_stats = dfn.groupby('country_name').agg({'Price': 'mean',
                                                'reviews count': 'sum',
                                                'Rating': 'mean'})
T_10_high = country_stats.sort_values(by = 'Price', ascending = False).head(10)
T_10_low = country_stats.sort_values(by = 'Price', ascending = True).head(10)

fig, axes = plt.subplots(1, 2, figsize = (20, 10))

#(axis, data, highlighted-bar palette, title) for the two panels
price_panels = [
    (axes[0], T_10_high, MA(T_10_high['Price']), 'Top 10 Countries With The Most Expensive Hotels'),
    (axes[1], T_10_low, MI(T_10_low['Price']), 'Top 10 Countries With The Cheapest Hotels'),
]

for ax, top_10, colors, title in price_panels:
    sns.barplot(ax = ax, data = top_10, x = 'Price', y = top_10.index, orient = 'h',
                palette = colors, edgecolor = 'black')
    ax.set(xlabel = '', ylabel = '')
    ax.set_title(title, fontsize = 20)
    for side in ('right', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

In [None]:
#T_10_high / T_10_low are sorted by Price, so their first row is the most expensive / cheapest country.
#The country is read from the index and the price from the 'Price' column by name, instead of
#integer lookups on a row Series (row[0], row[1]), which pandas has deprecated for labelled Series.
print("The country with the most expensive hotels is {}, with an average price of {}$ \nper night.\n".format(T_10_high.index[0], round(float(T_10_high['Price'].iloc[0]), 2)))
print("The country with the cheapest hotels is {}, with an average price of {}$ per night.".format(T_10_low.index[0], round(float(T_10_low['Price'].iloc[0]), 2)))

In [None]:
#We will take a look at the customer engagement for the most expensive countries and the cheapest countries

In [None]:
#Reviews analysis for countries with the most expensive/cheapest hotels 

In [None]:
fig, axes = plt.subplots(1, 2, figsize = (20, 7))
rating_ax, reviews_ax = axes

#Left panel: mean rating of the 10 most expensive countries, best rating highlighted
sns.barplot(ax = rating_ax, data = T_10_high, x = T_10_high.index, y = 'Rating', orient = 'v',
            color = '#128a71', palette = MA(T_10_high['Rating']), edgecolor = 'black')
rating_ax.set(xlabel = '', ylabel = '')
rating_ax.set_xticklabels(rating_ax.get_xticklabels(), rotation = 45)
for side, shown in {'right': False, 'top': False, 'bottom': True, 'left': False}.items():
    rating_ax.spines[side].set_visible(shown)
rating_ax.set_title('Rating By Country', fontsize = 20)

#Right panel: lollipop chart of the total reviews per country
#Line colours: blue everywhere except the line at position 7, which is drawn in red
C = ['#3996d4'] * 10
C[7] = '#f24e71'

reviews_ax.hlines(T_10_high.index, xmin = 0, xmax = T_10_high['reviews count'], color = C)
reviews_ax.set_title('Reviews Count For Each Country', fontsize = 20)
for side, shown in {'right': False, 'top': False, 'bottom': False, 'left': True}.items():
    reviews_ax.spines[side].set_visible(shown)

# drawing the markers (circle)
reviews_ax.plot(T_10_high['reviews count'], T_10_high.index, "o", color = '#3996d4')
reviews_ax.set_xlim(0)

fig.suptitle('Countries With The Most Expensive Hotels', fontsize = 30)
fig.tight_layout(rect=[0, 0.05, 1, 0.98])

plt.show()

In [None]:
print("The ratings for the top 10 most expensive countries are consistent with only one country \nbelow 4.\n\nAs for the reviews, we have huge differences between countries. This could be because \ncertain countries have less visitors, but keep a premium for the hotels.")

In [None]:
fig, axes = plt.subplots(1, 2, figsize = (20, 7))
rating_ax, reviews_ax = axes

#Left panel: mean rating of the 10 cheapest countries, best rating highlighted
sns.barplot(ax = rating_ax, data = T_10_low, x = T_10_low.index, y = 'Rating', orient = 'v',
            color = '#128a71', palette = MA(T_10_low['Rating']), edgecolor = 'black')
rating_ax.set(xlabel = '', ylabel = '')
rating_ax.set_xticklabels(rating_ax.get_xticklabels(), rotation = 45)
for side, shown in {'right': False, 'top': False, 'bottom': True, 'left': False}.items():
    rating_ax.spines[side].set_visible(shown)
rating_ax.set_title('Rating By Country', fontsize = 20)

#Right panel: lollipop chart of the total reviews per country
#Line colours: blue everywhere except the last line (position 9), which is drawn in red
C = ['#3996d4'] * 10
C[9] = '#f24e71'

reviews_ax.hlines(T_10_low.index, xmin = 0, xmax = T_10_low['reviews count'], color = C)
reviews_ax.set_title('Reviews Count For Each Country', fontsize = 20)
for side, shown in {'right': False, 'top': False, 'bottom': False, 'left': True}.items():
    reviews_ax.spines[side].set_visible(shown)

# drawing the markers (circle)
reviews_ax.plot(T_10_low['reviews count'], T_10_low.index, "o", color = '#3996d4')
reviews_ax.set_xlim(0)

fig.suptitle('Countries With The Cheapest Hotels', fontsize = 30)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.show()

In [None]:
print("The ratings are not very consistent with certain countries scoring much lower than others \nwith a similar price point.\n\nThe reviews volume is much lower in comparaison to the more expensive countries, indicating that the cheaper countries are dealing with less visitors (there are a few exceptions).")

In [None]:
print("Conclusion:\n\nThe countries with the more expensive hotels seem to be more consistent with their ratings \nand reviews pointing towards a healthier hospitality industry.")

print("\nAlso, due to the number of visitors and the ratings value, it seems that the countries with the more expensive hotels are offering better services.")

In [None]:
#Which are the countries with the best and worst hotel ratings

In [None]:
#Mean rating per country, then the 10 best and the 10 worst rated countries
country_rating = dfc.groupby('country_name').agg({'Rating': 'mean'})
T_R_high = country_rating.sort_values(by = 'Rating', ascending = False).head(10).reset_index()
T_R_low = country_rating.sort_values(by = 'Rating', ascending = True).head(10).reset_index()

#Values are read by column name rather than with integer lookups on a row Series (row[0], row[1]),
#which pandas has deprecated for labelled Series.
#Both ratings are rounded to 2 decimals (the worst rating used to be printed unrounded).
print("The country with the highest Hotel rating is {} with an average rating of {}, \nand the county with the worst reviewed hotels is {} with an \naverage rating of {}".format(
    T_R_high['country_name'].iloc[0], round(float(T_R_high['Rating'].iloc[0]), 2),
    T_R_low['country_name'].iloc[0], round(float(T_R_low['Rating'].iloc[0]), 2)))

In [None]:
#The 1 - 5 rating scale is narrow, so even small deviations are treated as meaningful.
#Every country whose mean rating is below 4 is considered low-tier
#(i.e. countries where you might not want to book a hotel).
l = dfc.groupby('country_name')[['Rating']].mean()
l.loc[l['Rating'] < 4]

In [None]:
#Which are the cities with the best and worst hotel ratings

In [None]:
#Mean rating per city, then the 10 best and the 10 worst rated cities
city_rating = dfc.groupby('city_name').agg({'Rating': 'mean'})
C_R_high = city_rating.sort_values(by = 'Rating', ascending = False).head(10).reset_index()
C_R_low = city_rating.sort_values(by = 'Rating', ascending = True).head(10).reset_index()

#Values are read by column name rather than with integer lookups on a row Series (row[0], row[1]),
#which pandas has deprecated for labelled Series.
print("The city with the highest Hotel rating is {} with an average rating of {}, and the city with the worst reviewed hotels is {} with an average rating of {}".format(
    C_R_high['city_name'].iloc[0], round(float(C_R_high['Rating'].iloc[0]), 2),
    C_R_low['city_name'].iloc[0], round(float(C_R_low['Rating'].iloc[0]), 2)))

In [None]:
#Which countries have the most hotel listings on TripAdvisor?

In [None]:
#World map coloured by the number of hotel listings of each country in our data set

#Listing count per country (HD) joined with the country codes (iso_alpha0)
main0 = pd.merge(HD, iso_alpha0, on="country_name")

density_map = go.Choropleth(
    locations = main0['Country Code'],
    z = main0['Count'],
    text = main0['country_name'],
    colorscale = 'Reds',
    autocolorscale = False,
    reversescale = False,
    marker_line_color = 'darkgray',
    marker_line_width = 0.5,
    colorbar_tickprefix = '',
    colorbar_title = 'Count',
)

density_layout = dict(
    title_text = 'Hotels density',
    geo = {'showframe': False,
           'showcoastlines': False,
           'projection_type': 'equirectangular'},
    #Text-less annotation kept exactly as configured in the original figure
    annotations = [{'x': 0.55, 'y': 0.1,
                    'xref': 'paper', 'yref': 'paper',
                    'showarrow': False}],
)

fig = go.Figure(data = density_map)
fig.update_layout(**density_layout)

fig.show()

In [None]:
print("The country with the most hotel listings is Italy, followed by China and few other \nEuropian countries")

# Card analysis

In [None]:
# NOTE: this rebinds the name `Image`, which the first cell imported from PIL, to
# IPython.display.Image; the image cells below call this IPython version.
# `os` was already imported in the first cell, so the import here is a repeat.
from IPython.display import Image
import os
# IPython shell escape: lists the contents of the images input folder
!ls ../input/images

In [None]:
Image("../input/images/clean img.png")

In [None]:
print("A card is the block of info that summarizes what a hotel has to offer without having to \nclick through\n")
print("We will analyze these cards to figure out things like which amneties are often displayed, \nhow many of these blocks (the amenities section) are used etc...\n")

In [None]:
#Before we get to the analysis we need to understand our data attributes
#The below image shows where the Info columns came from

In [None]:
Image("../input/images/guide img.png")

In [None]:
#Please note that the block that I skipped and crossed off in red is not included in our analysis
#At the time when this data was scraped the block that says "Taking Safety measures" did not exist, so we will leave it out of our analysis

In [None]:
#Word clouds of the amenities shown in each of the 5 info fields
#(word size follows how often the amenity appears in that field)

fig, axes = plt.subplots(2, 3, figsize = (20, 15), facecolor = None)
axes[1][2].set_visible(False)   # only 5 clouds: the bottom-right panel stays empty

#Grid position (row, column) of the cloud of each info field
panel_of_field = {1: (0, 0), 2: (0, 1), 3: (1, 0), 4: (1, 1), 5: (0, 2)}

for x, position in panel_of_field.items():
    data = dfc['info.' + str(x)].value_counts().to_dict()
    wc = WordCloud(width = 1000, height = 1000,
                    background_color ='#e8e8e8',
                    min_font_size = 25).generate_from_frequencies(data)

    axes[position].imshow(wc)
    axes[position].axis('off')

In [None]:
print("This illustration gives us an idea on what's going on but we need more details to be \nable to draw conclusions.")

In [None]:
#The info section on the cards has a limit of 5 fields
#Each field is used to showcase a certain amenity

In [None]:
#The first field

In [None]:
Image("../input/images/Card1.png")

In [None]:
#How often each amenity appears in the first info field, most frequent first.
#rename_axis / reset_index(name=...) name the two columns explicitly, so the frame is always
#['amenity', 'count']; renaming the labels produced by value_counts().reset_index() is not
#portable, because pandas 2.0 changed those labels.
F1 = dfc['info.1'].value_counts().rename_axis('amenity').reset_index(name = 'count')

plt.figure(figsize = (20, 8))
sns.barplot(x = 'amenity', y = 'count', data = F1, palette = MA(F1['count']), edgecolor='black')

plt.ylabel("")
plt.xlabel("")
sns.despine(left = True)
plt.title("Info 1", fontsize = 30)

In [None]:
print("The most dislpayed amenity by far is 'free wifi'\n")
print("This shows that customers value internet connection a lot, so free, reliable and fast \nwifi needs to be on every Hotel's priority list")

In [None]:
#Second field

In [None]:
Image("../input/images/Card1 copy.png")

In [None]:
#How often each amenity appears in the second info field, most frequent first.
#rename_axis / reset_index(name=...) name the two columns explicitly, so the frame is always
#['amenity', 'count']; renaming the labels produced by value_counts().reset_index() is not
#portable, because pandas 2.0 changed those labels.
F2 = dfc['info.2'].value_counts().rename_axis('amenity').reset_index(name = 'count')

plt.figure(figsize = (20, 8))
sns.barplot(x = 'amenity', y = 'count', data = F2, palette = MA(F2['count']), edgecolor='black')

plt.ylabel("")
plt.xlabel("")
sns.despine(left = True)
plt.title("Info 2", fontsize = 30)

In [None]:
print("Free parking takes the cake as the most listed amenity on the second field\n")
print("Even if most hotels list free parking on the second field, Restaurant and pool were \nmentioned over 5000 times")

In [None]:
#Third field

In [None]:
Image("../input/images/Card1 copy 3.png")

In [None]:
#How often each amenity appears in the third info field, most frequent first.
#rename_axis / reset_index(name=...) name the two columns explicitly, so the frame is always
#['amenity', 'count']; renaming the labels produced by value_counts().reset_index() is not
#portable, because pandas 2.0 changed those labels.
F3 = dfc['info.3'].value_counts().rename_axis('amenity').reset_index(name = 'count')

plt.figure(figsize = (20, 8))
sns.barplot(x = 'amenity', y = 'count', data = F3, palette = MA(F3['count']), edgecolor='black')

plt.ylabel("")
plt.xlabel("")
sns.despine(left = True)
plt.title("Info 3", fontsize = 30)

In [None]:
print("For the third place, 'Special Offer' is used just a bit over 'Visit Hotel Website'\n")
print("Usually 'Visit Hotel Website' is in the last field, and since 'Special Offer' and \n'visit Hotel Website' are almost neck to neck, it means that the 3rd slot is not often used, therefore it skips to the next one")

In [None]:
#Fourth field

In [None]:
Image("../input/images/Card1 copy 4.png")

In [None]:
#How often each amenity appears in the fourth info field, most frequent first.
#rename_axis / reset_index(name=...) name the two columns explicitly, so the frame is always
#['amenity', 'count']; renaming the labels produced by value_counts().reset_index() is not
#portable, because pandas 2.0 changed those labels.
F4 = dfc['info.4'].value_counts().rename_axis('amenity').reset_index(name = 'count')

plt.figure(figsize = (20, 8))
sns.barplot(x = 'amenity', y = 'count', data = F4, palette = MA(F4['count']), edgecolor='black')

plt.ylabel("")
plt.xlabel("")
sns.despine(left = True)
plt.title("Info 4", fontsize = 30)

In [None]:
print("The forth field mentions 'Visit Hotel Website' almost every single time.")

In [None]:
#Fifth field

In [None]:
Image("../input/images/card1 copy 5.png")

In [None]:
#How often each amenity appears in the fifth info field, most frequent first.
#rename_axis / reset_index(name=...) name the two columns explicitly, so the frame is always
#['amenity', 'count']; renaming the labels produced by value_counts().reset_index() is not
#portable, because pandas 2.0 changed those labels.
F5 = dfc['info.5'].value_counts().rename_axis('amenity').reset_index(name = 'count')

plt.figure(figsize = (20, 8))
sns.barplot(x = 'amenity', y = 'count', data = F5, palette = MA(F5['count']), edgecolor='black')

plt.ylabel("")
plt.xlabel("")
sns.despine(left = True)
plt.title("Info 5", fontsize = 30)

In [None]:
print("For the last field on the left side, 'free cancellation' is mentioned the most.")

In [None]:
#What is the top amenity showcased on the cards 

In [None]:
#Most frequent amenity of each of the first four info fields.
#For every field: count the hotels per amenity, then keep the single most frequent row
#as a one-row frame with the columns ['info', 'Count'].
top_per_field = [
    (dfc.groupby(field).count()['Hotel name']
        .reset_index()
        .rename(columns = {'Hotel name': 'Count', field: 'info'})
        .sort_values(by = 'Count', ascending = False)
        .head(1))
    for field in ('info.1', 'info.2', 'info.3', 'info.4')
]
I1, I2, I3, I4 = top_per_field

#pd.concat stacks the four rows; DataFrame.append was removed in pandas 2.0.
#NOTE: `main` was previously the price/country-code merge; it is rebound here.
main = pd.concat(top_per_field)

plt.figure(figsize = (15, 8))
sns.barplot(y = 'info', x = 'Count', data = main, color = '#2b658a', orient = 'h')

plt.ylabel("")
plt.xlabel("")
sns.despine(left = False, top = True, bottom = True, right = True)
plt.title("Top amenities", fontsize = 30)

In [None]:
print("By far Wifi is the most important amenity as it has been mentioned on almost every single \nhotel listing")

In [None]:
print("By looking at the above findings, we can have a clear idea on what customers value when \nlooking to book a hotel ")

In [None]:
#How often all the fields given by TA get used?
#We will figure that out by looking at the Nans as an indicator of non-used fields

In [None]:
#We will construct binary variables to be able to answer our question

In [None]:
#Binary "field used" flags: for info.1 .. info.5, 1 when the field holds a value and 0 when it is NaN.
#This lets us count how many fields each listing uses.
#
#The flags are computed with one vectorised notna() per column. This replaces
#Series.replace(..., inplace=True) on a selected column and element-by-element chained
#assignment (info[col][i] = ...), neither of which is guaranteed to write back into `info`,
#and it avoids a Python-level loop over every row.
#The columns now hold integers directly instead of a mix of 0 and the string '1'.
#NOTE(review): assumes the info columns never contain a literal 0 - the old loop left such a value at 0.
info = dfc.drop(columns = ['info.6', 'info.7'])

for n in range(1, 6):
    col = 'info.' + str(n)
    info[col] = info[col].notna().astype(int)

#The loop has purpose to change nans to 0 and non nulls to 1 
#This will enable us to figure out how often each slot has been used in order

In [None]:
#Make sure the five usage-flag columns are numeric so they can be summed
for field in ['info.1', 'info.2', 'info.3', 'info.4', 'info.5']:
    info[field] = pd.to_numeric(info[field])

In [None]:
#Number of fields used per listing = sum of the usage flags of info.1 .. info.4
#NOTE(review): info.5 is converted to a flag above but is not part of this sum - confirm that is intended
info['# of slots used'] = info[['info.1', 'info.2', 'info.3', 'info.4']].sum(axis = 1)

In [None]:
info  #The results

In [None]:
#How many listings use 0, 1, 2, ... fields.
#rename_axis / reset_index(name=...) name the columns explicitly (['number of slots', 'count']);
#renaming the labels produced by value_counts().reset_index() is not portable, because
#pandas 2.0 changed those labels.
#Rows are sorted by the number of slots so they are in the same order as the bars:
#seaborn draws numeric categories in ascending order and applies the palette in that order.
slots_used = (info['# of slots used']
              .value_counts()
              .rename_axis('number of slots')
              .reset_index(name = 'count')
              .sort_values('number of slots'))

plt.figure(figsize = (20, 8))
#The palette is built from the counts, so the most / least common number of fields is highlighted
sns.barplot(x = 'number of slots', y = 'count', data = slots_used, palette = M_M(slots_used['count']), edgecolor='black')

plt.ylabel("")
plt.xlabel("")
sns.despine(left = True)
plt.title("Number of fields used", fontsize = 30)

In [None]:
print("We can clearly see that 2 fields get used the most and 4 gets used the least. That means \nthat very few use up all the fields to showecase important amenities and other important info.")

In [None]:
#Top Hotels in Europe

In [None]:
# To determine which are the best hotels we need customer input. In this case we don't have the actual reviews
# But we do have ratings and how many reviews a hotel got. We can use this to get an estimate of the best hotels in Europe 

In [None]:
#European hotels with a perfect rating of 5 ...
is_top_rated = dfc['Rating'].eq(5)
in_europe = dfc['continent_name'].eq('Europe')
top_R = dfc.loc[is_top_rated & in_europe]

#... ranked by how many reviews they received (most reviewed first)
top_H = top_R.sort_values(by = 'reviews count', ascending = False)

print("\nTop 5 hotels in Europe\n")
top_H.head(5)[['Hotel name', 'Price', 'country_name']]