# Sprint 6: Final Report

## Colab Setup:

In [None]:
import numpy as np
import pandas as pd
import ast
import textwrap
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import folium

import kagglehub
# Download the Kaggle hotel-reviews dataset and report the location that
# kagglehub returns for the downloaded files.
dataset_handle = "jiashenliu/515k-hotel-reviews-data-in-europe"
path = kagglehub.dataset_download(dataset_handle)
print(f"Path to dataset files: {path}")

Using Colab cache for faster access to the '515k-hotel-reviews-data-in-europe' dataset.
Path to dataset files: /kaggle/input/515k-hotel-reviews-data-in-europe


In [16]:
# Load the review table from the CSV file inside the downloaded dataset folder.
reviews_csv = f"{path}/Hotel_Reviews.csv"
df = pd.read_csv(reviews_csv)

## A: Data Card (updated)

**Source & Link:** 515K Hotel Reviews Data in Europe (Kaggle) - https://www.kaggle.com/datasets/jiashenliu/515k-hotel-reviews-data-in-europe/data

**Shape:** 515,738 rows x 18 columns

**Time Coverage:** August 4, 2015 - August 3, 2017



## Column Dictionary:
- **Hotel_Address:** address of the hotel
- **Additional_Number_of_Scoring:** number of scores given without an accompanying review
- **Review_Date:** date when the review was posted
- **Average_Score:** average score of the hotel
- **Hotel_Name:** name of the hotel
- **Reviewer_Nationality:** nationality of the reviewer
- **Negative_Review:** text content of the negative review; if the reviewer did not give a negative review, it says "No Negative"
- **Review_Total_Negative_Word_Counts:** number of words in negative review
- **Total_Number_of_Reviews:** the total number of reviews
- **Positive_Review:** text content of the positive review; if the reviewer did not give a positive review, it says "No Positive"
- **Review_Total_Positive_Word_Counts:** number of words in positive review
- **Total_Number_of_Reviews_Reviewer_Has_Given:** number of reviews given by the reviewer
- **Reviewer_Score:** score given by the reviewer
- **Tags:** tags given to hotel by the reviewer                         
- **days_since_review:** days between date of review and the date it was scraped        
- **lat:** latitude of hotel
- **lng:** longitude of hotel

New Columns:
- **Hotel_Country:** country of the reviewed hotel
- **Year:** year when review was posted
- **Review_Total_Word_Counts:** total word count of the review, computed as the sum of the positive-review and negative-review word counts




**Key ID Columns:** 'Hotel_Name', 'Review_Date'


## Missingness Snapshot:
N/A


## Quirks:
- **Hotel_Country** only includes 6 countries: Netherlands, UK, France, Spain, Italy, Austria


## B: Transformation

In [17]:
# Derive Hotel_Country from the last space-separated token of Hotel_Address and
# remove that token from the address. 'United Kingdom' is collapsed to 'UK'
# first so that the country is always the final single token.
# Vectorized string methods are used instead of a row-wise df.apply(axis=1),
# which would call a Python lambda once per row.
normalized_address = df['Hotel_Address'].str.replace('United Kingdom', 'UK', regex=False)

# rpartition(' ') splits at the LAST space into three columns:
# 0 = text before the space, 1 = the separator, 2 = text after it.
# With no space present, column 0 is '' and column 2 is the whole string,
# which matches split(' ')[:-1] / split(' ')[-1].
address_parts = normalized_address.str.rpartition(' ')
hotel_country = address_parts[2]

df['Hotel_Address'] = address_parts[0]
df['Hotel_Country'] = hotel_country

In [18]:
# Display color for each hotel country; used for the map markers and for the
# strip-plot traces.
colors = dict(
    Netherlands='red',
    UK='grey',
    France='blue',
    Spain='yellow',
    Italy='green',
    Austria='orange',
)

In [19]:
def _format_tags(raw_tags):
  """Turn the string form of a tag list into wrapped hover text.

  The Tags column holds the string representation of a list. The tags are
  parsed with ast.literal_eval, stripped, joined with ', ', and then wrapped
  at 40 characters with '<br>' as the line separator.
  """
  joined = ', '.join(tag.strip() for tag in ast.literal_eval(raw_tags))
  return '<br>'.join(textwrap.wrap(joined, width=40))


# For every country, keep the reviews of the hotel with the highest mean
# Reviewer_Score in that country.
countries = df['Hotel_Country'].unique()
top_hotels = {}
for country in countries:
  country_reviews = df[df['Hotel_Country'] == country]
  popular_hotel = country_reviews.groupby('Hotel_Name').Reviewer_Score.mean().idxmax()
  # Select the hotel's reviews from the country subset, not from the whole
  # frame: matching on Hotel_Name alone would also pull in reviews of any
  # same-named hotel located in another country.
  popular_hotel_reviews = country_reviews[country_reviews['Hotel_Name'] == popular_hotel].copy()
  # Format Tags once per value (for hover text) instead of two row-wise passes.
  popular_hotel_reviews['Tags'] = popular_hotel_reviews['Tags'].map(_format_tags)
  top_hotels[country] = popular_hotel_reviews

# Disabled debugging snippet kept as a bare string; as the cell's last
# expression, its repr is what the cell echoes as output.
'''
for a in top_hotels:
  print(a, top_hotels[a]['Hotel_Name'].unique(), top_hotels[a].Reviewer_Score.count())
'''

"\nfor a in top_hotels:\n  print(a, top_hotels[a]['Hotel_Name'].unique(), top_hotels[a].Reviewer_Score.count())\n"

## C: Visuals

In [20]:
# One group per hotel, keyed by coordinates, name and country.
hotels = df.groupby(['lat', 'lng', 'Hotel_Name', 'Hotel_Country'])

# Draw one circle marker per hotel; hovering shows the five most frequent
# reviewer nationalities for that hotel with their review counts.
m = folium.Map(location=[54.526, 15.255], zoom_start=4)
for (lat, lng, hotel_name, hotel_country), group in hotels:
    top_nationalities = group['Reviewer_Nationality'].value_counts().head(5)
    tooltip_html = "<br>".join(
        f"{nationality}: {n_reviews}"
        for nationality, n_reviews in top_nationalities.items()
    )
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=7,
        # colors.get yields None for a country missing from the color mapping.
        color=colors.get(hotel_country),
        fill=True,
        fill_opacity=0.8,
        tooltip=tooltip_html,
    )
    marker.add_to(m)

# Inject a centered heading into the map's HTML, then write the page to disk.
map_title = "Number of Reviews for Hotels by Reviewer Nationality"
title_html = f'<h3 align="center" style="font-size:16px"><b>{map_title}</b></h3>'
title_element = folium.Element(title_html)
m.get_root().html.add_child(title_element)
m.save('map-with-title.html')
# Last expression of the cell: renders the map inline in the notebook.
m

In [21]:
# Strip plot of the individual reviewer scores for the selected hotel of each
# country, combined into a single figure.
fig = go.Figure()

for country, hotel_reviews in top_hotels.items():
  # Plotly Express builds the colored strip trace(s) for this country's hotel;
  # the pre-wrapped Tags text is attached as custom data for the hover box.
  px_fig = px.strip(
    hotel_reviews,
    x='Hotel_Name',
    y='Reviewer_Score',
    color='Hotel_Country',
    color_discrete_map=colors,
    custom_data=['Tags'],
  )
  # Only the first fragment is an f-string: %{x} and %{customdata[0]} are
  # Plotly placeholders and must not be interpolated by Python.
  # The line break after the country is '<br>' (the original had the malformed
  # tag '</br>').
  hover_template = (
      f'<b>{country}</b><br>'
      '<b>%{x}</b><br><br>'
      'Tags: %{customdata[0]}'
      '<extra></extra>'
  )
  px_fig.update_traces(hovertemplate=hover_template)

  # Copy all traces of the Plotly Express figure into the combined figure.
  fig.add_traces(px_fig.data)

fig.update_layout(
    title='Review Scores for Top Hotel by Country',
    xaxis_title='Hotel Name',
    yaxis_title='Reviewer Score',
    width=1000,
    height=600
)

fig.show()