Importing the required libraries and reading the dataset

In [None]:
import csv
import datetime
import warnings

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots
import requests
import seaborn as sns
# Notebook-wide pandas options: turn off the chained-assignment check (pandas' default is 'warn')
# and lift the limit on how many columns are displayed.
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)

# Load the TripAdvisor European restaurants dataset from the Kaggle input directory.
restaurants_csv_path = '/kaggle/input/tripadvisor-european-restaurants/tripadvisor_european_restaurants.csv'
restaurants_df = pd.read_csv(restaurants_csv_path, encoding='utf8', low_memory=False)

Reading the dataframe with TripAdvisor restaurants from the main European countries

In [None]:
# Country name -> three-letter country code; England, Northern Ireland, Scotland and Wales share GBR.
countries_dict = {
    'Austria': 'AUT', 'Belgium': 'BEL', 'Bulgaria': 'BGR', 'Croatia': 'HRV',
    'Czech Republic': 'CZE', 'Denmark': 'DNK', 'England': 'GBR', 'Finland': 'FIN',
    'France': 'FRA', 'Germany': 'DEU', 'Greece': 'GRC', 'Hungary': 'HUN',
    'Ireland': 'IRL', 'Italy': 'ITA', 'Northern Ireland': 'GBR', 'Poland': 'POL',
    'Portugal': 'PRT', 'Romania': 'ROU', 'Scotland': 'GBR', 'Slovakia': 'SVK',
    'Spain': 'ESP', 'Sweden': 'SWE', 'The Netherlands': 'NLD', 'Wales': 'GBR',
}
# Countries missing from the mapping keep their original name as the code.
mapped_codes = restaurants_df['country'].map(countries_dict)
restaurants_df['country_code'] = mapped_codes.fillna(restaurants_df['country'])

# Average price in euro: split price_range on '-', strip the euro sign and thousands separators
# from each side, and take the midpoint. Unparseable values are coerced to NaN.
price_bounds = restaurants_df['price_range'].str.split('-')
lowest_price, highest_price = (
    pd.to_numeric(price_bounds.str[position].str.replace('€', '').str.replace(',', ''), errors='coerce')
    for position in (0, 1)
)
restaurants_df['average_price'] = (lowest_price + highest_price) / 2

restaurants_df.head()

Starting the analysis of the entire dataframe

In [None]:
# Number of distinct restaurant links and distinct countries in the dataframe
unique_restaurants = restaurants_df['restaurant_link'].nunique()
unique_countries = restaurants_df['country'].nunique()
print(f'There are {unique_restaurants} unique restaurants in the dataframe')
print(f'There are {unique_countries} unique countries in the dataframe')

In [None]:
# Horizontal bar chart of the number of restaurants per country, sorted by ascending count
restaurants_df['country'].value_counts().sort_values().plot.barh()

In [None]:
# missingno matrix plot over all columns of the restaurants dataframe, used to spot fields with missing data
msno.matrix(restaurants_df, fontsize=20)

The most notable fields with missing data are price_range, cuisines, original_open_hours, and the additional scores for food, service, value, and atmosphere

In [None]:
# Point plot of avg_rating for each vegetarian_friendly value, one series per `claimed` status.
# sns.factorplot was renamed to sns.catplot, whose default kind is 'strip', so kind='point' is
# requested explicitly to keep the same chart; x and y are passed by keyword because recent
# seaborn releases reject positional data arguments. With catplot there is no deprecation warning
# to hide, so the blanket warnings.filterwarnings('ignore') is no longer needed.
res = sns.catplot(x='vegetarian_friendly', y='avg_rating', hue='claimed', kind='point',
                  data=restaurants_df.sort_values(by='claimed'), legend_out=False)
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.xlabel('Vegetarian Friendly', fontsize=12)
plt.ylabel('Average Rating', fontsize=12)
plt.title('Factorplot of Vegetarian Friendly and Average Rating - Grouped by Claimed', fontsize=14)
plt.legend(loc='upper right', title='Claimed')
plt.show()

In [None]:
# Point plot of avg_rating for each vegan_options value, one series per `claimed` status.
# sns.factorplot was renamed to sns.catplot, whose default kind is 'strip', so kind='point' is
# requested explicitly to keep the same chart; x and y are passed by keyword because recent
# seaborn releases reject positional data arguments. With catplot there is no deprecation warning
# to hide, so the blanket warnings.filterwarnings('ignore') is no longer needed.
res = sns.catplot(x='vegan_options', y='avg_rating', hue='claimed', kind='point',
                  data=restaurants_df.sort_values(by='claimed'), legend_out=False)
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.xlabel('Vegan Options', fontsize=12)
plt.ylabel('Average Rating', fontsize=12)
plt.title('Factorplot of Vegan Options and Average Rating - Grouped by Claimed', fontsize=14)
plt.legend(loc='upper right', title='Claimed')
plt.show()

In [None]:
# Point plot of avg_rating for each gluten_free value, one series per `claimed` status.
# sns.factorplot was renamed to sns.catplot, whose default kind is 'strip', so kind='point' is
# requested explicitly to keep the same chart; x and y are passed by keyword because recent
# seaborn releases reject positional data arguments. With catplot there is no deprecation warning
# to hide, so the blanket warnings.filterwarnings('ignore') is no longer needed.
res = sns.catplot(x='gluten_free', y='avg_rating', hue='claimed', kind='point',
                  data=restaurants_df.sort_values(by='claimed'), legend_out=False)
fig = plt.gcf()
fig.set_size_inches(10, 6)
plt.xlabel('Gluten-free', fontsize=12)
plt.ylabel('Average Rating', fontsize=12)
plt.title('Factorplot of Gluten-free and Average Rating - Grouped by Claimed', fontsize=14)
plt.legend(loc='upper right', title='Claimed')
plt.show()

The impact of the claimed field on the average rating is much greater for vegan_options and gluten_free than for vegetarian_friendly

In [None]:
# Violin plot of the food rating per country code, with countries in alphabetical code order
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(x='country_code', y='food', data=restaurants_df.sort_values(by='country_code'), ax=ax)
ax.set_xlabel('Country Code', fontsize=12)
ax.set_ylabel('Restaurant Food Rating', fontsize=12)
ax.set_title('Distribution of Restaurant Food Rating by Country Code', fontsize=14)
plt.show()

Greece has a relatively higher % of food ratings equal to 5 and 4.5 and a relatively lower % of food ratings equal to 4

Larger countries such as France, Germany, Italy, and Spain have the opposite distribution of food ratings: ratings of 4 and 4.5 are almost equally frequent, while ratings of 5 are very rare

In [None]:
# Violin plot of the service rating per country code, with countries in alphabetical code order
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(x='country_code', y='service', data=restaurants_df.sort_values(by='country_code'), ax=ax)
ax.set_xlabel('Country Code', fontsize=12)
ax.set_ylabel('Restaurant Service Rating', fontsize=12)
ax.set_title('Distribution of Restaurant Service Rating by Country Code', fontsize=14)
plt.show()

Greece has a higher % of service ratings equal to 4.5 than any other country and lower % of service ratings equal to 4

Larger countries such as Germany and Italy have a larger % of service ratings of 4 rather than 4.5

In [None]:
# Violin plot of the value rating per country code, with countries in alphabetical code order
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(x='country_code', y='value', data=restaurants_df.sort_values(by='country_code'), ax=ax)
ax.set_xlabel('Country Code', fontsize=12)
ax.set_ylabel('Restaurant Value Rating', fontsize=12)
ax.set_title('Distribution of Restaurant Value Rating by Country Code', fontsize=14)
plt.show()

Greece is again a positive anomaly, with a higher % of value ratings equal to 5 and 4.5, and a lower % of value ratings equal to 4

Larger countries such as Germany, Italy, and Spain have an opposite distribution of value ratings

Analysing the European countries data

In [None]:
# Per-country aggregates keyed by country_code (England, Scotland, Wales and Northern Ireland
# are all coded GBR, so they collapse into a single row). String aggregation names are used
# instead of numpy callables, which newer pandas versions deprecate inside agg().
countries_df = restaurants_df.groupby('country_code').agg(
    restaurants_count=pd.NamedAgg(column='restaurant_link', aggfunc='size'),
    open_days_per_week=pd.NamedAgg(column='open_days_per_week', aggfunc='mean'),
    open_hours_per_week=pd.NamedAgg(column='open_hours_per_week', aggfunc='mean'),
    working_shifts_per_week=pd.NamedAgg(column='working_shifts_per_week', aggfunc='mean'),
    avg_rating=pd.NamedAgg(column='avg_rating', aggfunc='mean'),
    reviews_count=pd.NamedAgg(column='total_reviews_count', aggfunc='sum'),
    median_price=pd.NamedAgg(column='average_price', aggfunc='median')).reset_index()

# Display name for each country code; codes without an entry fall back to the code itself.
country_names_dict = {'AUT': 'Austria', 'BEL': 'Belgium', 'BGR': 'Bulgaria', 'CZE': 'Czech Republic', 'DEU': 'Germany',
                      'DNK': 'Denmark', 'ESP': 'Spain', 'FIN': 'Finland', 'FRA': 'France', 'GBR': 'Great Britain',
                      'GRC': 'Greece', 'HRV': 'Croatia', 'HUN': 'Hungary', 'IRL': 'Ireland', 'ITA': 'Italy',
                      'NLD': 'Netherlands', 'POL': 'Poland', 'PRT': 'Portugal', 'ROU': 'Romania', 'SVK': 'Slovakia',
                      'SWE': 'Sweden'}
# Insert as the first column so the layout is: country_name, country_code, aggregates...
countries_df.insert(0, 'country_name',
                    countries_df['country_code'].map(country_names_dict).fillna(countries_df['country_code']))

countries_df['reviews_per_restaurant'] = countries_df['reviews_count'] / countries_df['restaurants_count']

# Share of restaurants flagged 'Y' for each dietary option.
# The counts are grouped by country_code - the same key countries_df uses. Grouping by the raw
# `country` and inner-merging on country_name never matched 'Great Britain' or 'Netherlands'
# (the raw names are England/Scotland/Wales/Northern Ireland and 'The Netherlands'), which
# silently removed those countries from the table. Countries with no flagged restaurant get 0.
for flag_column, perc_column in (('vegetarian_friendly', 'vegetarian_count_perc'),
                                 ('vegan_options', 'vegan_count_perc'),
                                 ('gluten_free', 'gluten_free_count_perc')):
    flagged_count = restaurants_df[restaurants_df[flag_column] == 'Y'].groupby('country_code').size()
    countries_df[perc_column] = (countries_df['country_code'].map(flagged_count).fillna(0)
                                 / countries_df['restaurants_count'])

countries_df

In [None]:
# Bubble chart: one marker per country, x = mean open hours per week, y = mean rating,
# marker size proportional to the number of restaurants (divided by 2000 to get a usable size).
fig = go.Figure(data=go.Scatter(x=countries_df['open_hours_per_week'], y=countries_df['avg_rating'],
                                marker=dict(size=countries_df['restaurants_count']/2000,
                                            color=countries_df['avg_rating']),
                                mode='markers+text',
                                text=countries_df['country_name'], textposition='top center', textfont=dict(size=9),
                                hoverlabel=dict(namelength=0), # removes the trace number off to the side of the tooltip box
                                # the tooltip units must match the axes: x is open hours per week, y is the rating
                                hovertemplate='%{text}:<br>%{x:.2f} open hours per week<br>%{y:.2f} average rating'))
fig.update_layout(title='Average rating based on open hours per week (size by restaurants count)',
                  title_x=0.5, legend=dict(yanchor='bottom', y=-0.15, xanchor='left', x=0,
                                           font=dict(size=10), orientation='h'),
                  xaxis_title='Open hours per week', yaxis_title='Average rating')
fig.show()

There is a direct relationship between open hours per week and average rating

In [None]:
# Three side-by-side bubble charts: share of vegetarian / vegan / gluten-free restaurants (x)
# against reviews per restaurant (y), one marker per country sized by its restaurant count.
dietary_panels = (('vegetarian_count_perc', 'Vegetarian'),
                  ('vegan_count_perc', 'Vegan'),
                  ('gluten_free_count_perc', 'Gluten-free'))
fig = plotly.subplots.make_subplots(rows=1, cols=len(dietary_panels),
                                    subplot_titles=[panel_label for _, panel_label in dietary_panels],
                                    specs=[[{'type': 'scatter'}] * len(dietary_panels)])
for panel_col, (perc_column, panel_label) in enumerate(dietary_panels, start=1):
    fig.add_trace(go.Scatter(x=countries_df[perc_column], y=countries_df['reviews_per_restaurant'],
                             marker=dict(size=countries_df['restaurants_count']/5000,
                                         color=countries_df['reviews_per_restaurant']),
                             mode='markers+text', showlegend=False,
                             text=countries_df['country_name'], textposition='top center', textfont=dict(size=9),
                             hoverlabel=dict(namelength=0), # removes the trace number off to the side of the tooltip box
                             # x is a fraction of restaurants (rendered as a percentage), y is reviews per restaurant
                             hovertemplate='%{text}:<br>%{x:.1%} of restaurants<br>%{y:.1f} reviews per restaurant'),
                  row=1, col=panel_col)
    fig.update_xaxes(title_text=f'{panel_label} %', row=1, col=panel_col)
fig.update_yaxes(title_text='Reviews per restaurant', row=1, col=1)
fig.update_layout(title='Reviews per Restaurant based on Vegetarian %, Vegan %, and Gluten-free %', title_x=0.5,
                  legend=dict(yanchor='bottom', y=-0.15, xanchor='left', x=0,
                              font=dict(size=8), orientation='h'))
fig.show()

No real insight here; it is just worth noticing that vegetarian-friendly restaurants are far more common than restaurants with vegan options and, especially, gluten-free restaurants

In [None]:
# Map of Europe coloured by the mean number of open hours per week in each country
fig = px.choropleth(
    countries_df,
    locations='country_code',
    color='open_hours_per_week',
    hover_name='country_name',
    scope='europe',
    projection='natural earth',
    color_continuous_scale='Viridis',
)
fig.update_layout(title='Open Hours per week', title_x=0.5, margin=dict(r=0, l=0, b=0, pad=0))
fig.show()

In [None]:
# Map of Europe coloured by the mean number of working shifts per week in each country
fig = px.choropleth(
    countries_df,
    locations='country_code',
    color='working_shifts_per_week',
    hover_name='country_name',
    scope='europe',
    projection='natural earth',
    color_continuous_scale='Viridis',
)
fig.update_layout(title='Working Shifts per week', title_x=0.5, margin=dict(r=0, l=0, b=0, pad=0))
fig.show()

In European countries where restaurants are open for more hours per week, shifts are longer and less frequent than in countries where restaurants are open for fewer hours

In [None]:
# Number of restaurants per Italian region: print how many regions there are, then chart the counts
italy_region_counts = restaurants_df.loc[restaurants_df['country'] == 'Italy', 'region'].value_counts()
print(len(italy_region_counts))
italy_region_counts.sort_values().plot.barh()

Analysing the Italian regions data

In [None]:
# Restrict to Italy once, so that every figure below (including the award counts) only
# considers Italian restaurants rather than any restaurant whose region name happens to match.
italy_df = restaurants_df[restaurants_df['country'] == 'Italy']

# Per-region aggregates. String aggregation names are used instead of numpy callables,
# which newer pandas versions deprecate inside agg().
regions_df = italy_df.groupby('region').agg(
    restaurants_count=pd.NamedAgg(column='restaurant_link', aggfunc='size'),
    open_days_per_week=pd.NamedAgg(column='open_days_per_week', aggfunc='mean'),
    open_hours_per_week=pd.NamedAgg(column='open_hours_per_week', aggfunc='mean'),
    working_shifts_per_week=pd.NamedAgg(column='working_shifts_per_week', aggfunc='mean'),
    avg_rating=pd.NamedAgg(column='avg_rating', aggfunc='mean'),
    reviews_count=pd.NamedAgg(column='total_reviews_count', aggfunc='sum'),
    median_price=pd.NamedAgg(column='average_price', aggfunc='median')).reset_index()

# English -> Italian region names; regions without an entry keep their original name.
region_names_dict = {'Piedmont': 'Piemonte', 'Lombardy': 'Lombardia', 'Sardinia': 'Sardegna',
                     'Sicily': 'Sicilia', 'Tuscany': 'Toscana'}
regions_df['region_name'] = regions_df['region'].map(region_names_dict).fillna(regions_df['region'])

# Share of restaurants with at least one award, and with at least one Michelin award.
# na=False makes restaurants without any award count as "no Michelin award" instead of NaN.
has_award = italy_df['awards'].notna()
has_michelin_award = italy_df['awards'].str.contains('Michelin', na=False)
for perc_column, award_mask in (('restaurants_with_award_perc', has_award),
                                ('restaurants_with_michelin_award_perc', has_michelin_award)):
    awarded_count = italy_df[award_mask].groupby('region')['restaurant_link'].count()
    # Map the counts back instead of inner-merging: a region with no awarded restaurant
    # stays in the table with a share of 0 rather than disappearing from it.
    regions_df[perc_column] = regions_df['region'].map(awarded_count).fillna(0) / regions_df['restaurants_count']

regions_df

In [None]:
# Italian region names used as the join key with the GeoJSON features in the map cells
regions = ['Piemonte', 'Trentino-Alto Adige', 'Lombardia', 'Puglia', 'Basilicata',
           'Friuli Venezia Giulia', 'Liguria', "Valle d'Aosta", 'Emilia-Romagna',
           'Molise', 'Lazio', 'Veneto', 'Sardegna', 'Sicilia', 'Abruzzo',
           'Calabria', 'Toscana', 'Umbria', 'Campania', 'Marche']

# Create a dataframe with the region names
df = pd.DataFrame(regions, columns=['NOME_REG'])
# Merge it with the restaurants data grouped by Italian region; regions absent from regions_df are dropped
df = pd.merge(df, regions_df[['region_name', 'restaurants_with_award_perc', 'median_price']],
              how='inner', left_on='NOME_REG', right_on='region_name')

# Download the GeoJSON data with Italy's regional borders from GitHub.
# A timeout keeps the cell from hanging forever on a stalled connection, and raise_for_status()
# turns an HTTP error into an exception instead of trying to parse an error page as JSON.
repo_url = 'https://gist.githubusercontent.com/datajournalism-it/48e29e7c87dca7eb1d29/raw/2636aeef92ba0770a073424853f37690064eb0ea/regioni.geojson'
geojson_response = requests.get(repo_url, timeout=30)
geojson_response.raise_for_status()
italy_regions_geo = geojson_response.json()

In [None]:
# Choropleth representing the restaurants with any award
fig = px.choropleth(data_frame=df, geojson=italy_regions_geo, 
                    locations='NOME_REG', # name of dataframe column
                    featureidkey='properties.NOME_REG',  # path to field in GeoJSON feature object with which to match the values passed in to locations
                    color='restaurants_with_award_perc', color_continuous_scale='Magma', scope='europe')
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds='locations')
fig.update_layout(title='Italian Regions - Restaurants with at least one award (% to Total)', title_x=0.5, margin={'r':0, 'l':0, 'b':0, 'pad':0})
fig.show()

In [None]:
# Choropleth representing the median restaurant prices
fig = px.choropleth(data_frame=df, geojson=italy_regions_geo, 
                    locations='NOME_REG', # name of dataframe column
                    featureidkey='properties.NOME_REG',  # path to field in GeoJSON feature object with which to match the values passed in to locations
                    color='median_price', color_continuous_scale='Magma', scope='europe')
fig.update_geos(showcountries=False, showcoastlines=False, showland=False, fitbounds='locations')
fig.update_layout(title='Italian Regions - Median restaurant prices', title_x=0.5, margin={'r':0, 'l':0, 'b':0, 'pad':0})
fig.show()

Restaurants in the North-East area of Italy are the ones with a higher percentage of awards compared to their median prices