# Summary Findings

### Identifying Entities  
- There are 2 entities with `ENTITY_TYPE` == 'PARK'; these are the 2 primary parks for this study.
- Each park's attractions are recorded with `ENTITY_TYPE` == 'ATTR' to differentiate attraction names from park names.
- They are linked via the  **link_attraction_park** dataframe.

### Entity Schedule
- In the **entity_schedule** dataframe, when `REF_CLOSING_DESCRIPTION` is not null, the row represents a closure and its reason.
- There are 2 reasons for closure:
1. Fermeture Réhab
2. Fermeture Opérationnelle

In [2]:
import pandas as pd
import glob
import os

# Load every raw parquet file into a dict keyed by the file name
# (with the ".parquet" suffix stripped).
dfs = {
    os.path.basename(parquet_path).replace(".parquet", ""): pd.read_parquet(parquet_path)
    for parquet_path in glob.glob("../data/raw_data/*.parquet")
}

# EDA - Waiting Times

It appears that every `ENTITY_DESCRIPTION_SHORT` is sampled at 15-minute intervals every day.

In [None]:
waiting_times = dfs["waiting_times"]

# preview the first rows, followed by the raw (unconverted) column dtypes
display(waiting_times.head(), waiting_times.dtypes)

Unnamed: 0,WORK_DATE,DEB_TIME,DEB_TIME_HOUR,FIN_TIME,ENTITY_DESCRIPTION_SHORT,WAIT_TIME_MAX,NB_UNITS,GUEST_CARRIED,CAPACITY,ADJUST_CAPACITY,OPEN_TIME,UP_TIME,DOWNTIME,NB_MAX_UNIT
0,2018-01-01,2018-01-01 21:00:00.000,21,2018-01-01 21:15:00.000,Roller Coaster,0,2.0,0.0,0.0,0.0,0,0,0,2.0
1,2018-01-01,2018-01-01 19:30:00.000,19,2018-01-01 19:45:00.000,Bumper Cars,5,18.0,148.0,254.749,254.75,15,15,0,18.0
2,2018-01-01,2018-01-01 22:30:00.000,22,2018-01-01 22:45:00.000,Rapids Ride,0,1.0,0.0,0.0,0.0,0,0,0,2.0
3,2018-01-01,2018-01-01 12:45:00.000,12,2018-01-01 13:00:00.000,Crazy Dance,5,1.0,46.0,250.001,250.0,15,15,0,1.0
4,2018-01-01,2018-01-01 17:00:00.000,17,2018-01-01 17:15:00.000,Skyway,5,15.0,92.0,211.5,198.25,15,15,0,16.0


WORK_DATE                       str
DEB_TIME                        str
DEB_TIME_HOUR                 int64
FIN_TIME                        str
ENTITY_DESCRIPTION_SHORT        str
WAIT_TIME_MAX                 int64
NB_UNITS                    float64
GUEST_CARRIED               float64
CAPACITY                    float64
ADJUST_CAPACITY             float64
OPEN_TIME                     int64
UP_TIME                       int64
DOWNTIME                      int64
NB_MAX_UNIT                 float64
dtype: object

# EDA - Entity Schedule
1. When `REF_CLOSING_DESCRIPTION` is not null, there is a reason for closure

In [None]:
entity_schedule = dfs["entity_schedule"]

# preview the first rows, followed by the raw (unconverted) column dtypes
display(entity_schedule.head(), entity_schedule.dtypes)

# the bare expression renders the frame itself as the cell result
entity_schedule

Unnamed: 0,REF_CLOSING_DESCRIPTION,ENTITY_DESCRIPTION_SHORT,ENTITY_TYPE,DEB_TIME,FIN_TIME,UPDATE_TIME,WORK_DATE
0,,Tivoli Gardens,PARK,2018-11-19 10:00:00.000,2018-11-19 18:00:00.000,2018-11-20 08:24:32.000,2018-11-19
1,,Dizzy Dropper,ATTR,2022-04-07 08:30:00.000,2022-04-07 22:04:00.000,2022-04-08 08:00:30.000,2022-04-07
2,,Sling Shot,ATTR,2018-03-28 08:37:00.000,2018-03-28 18:12:00.000,2018-03-29 08:24:37.000,2018-03-28
3,,Gondola,ATTR,2019-04-11 09:55:00.000,2019-04-11 20:19:00.000,2019-04-12 08:59:29.000,2019-04-11
4,,Monorail,ATTR,2019-06-29 08:30:00.000,2019-06-29 20:35:00.000,2019-06-30 08:14:16.000,2019-06-29


REF_CLOSING_DESCRIPTION     str
ENTITY_DESCRIPTION_SHORT    str
ENTITY_TYPE                 str
DEB_TIME                    str
FIN_TIME                    str
UPDATE_TIME                 str
WORK_DATE                   str
dtype: object

In [25]:
"""
Check whether any recorded closure lasts longer than one day.

Rows with a non-null `REF_CLOSING_DESCRIPTION` are closures. If none of them
spans more than a day, every entry in `REF_CLOSING_DESCRIPTION` is a closure
for a single day (we assume...).
"""

# convert datetime columns to datetime (done in place on the shared frame, as before)
entity_schedule['DEB_TIME'] = pd.to_datetime(entity_schedule['DEB_TIME'])
entity_schedule['FIN_TIME'] = pd.to_datetime(entity_schedule['FIN_TIME'])

# keep only the rows that describe a closure and compute how long each one lasts
closure_rows = entity_schedule[entity_schedule['REF_CLOSING_DESCRIPTION'].notna()]
closure_duration = closure_rows['FIN_TIME'] - closure_rows['DEB_TIME']

# The threshold must be one full day: comparing against Timedelta(days=0)
# flagged every closure with any positive duration, contradicting the message.
# Rows with a missing start or end time yield NaT and are never flagged.
long_closures = closure_rows[closure_duration > pd.Timedelta(days=1)]

# one line per offending row, so each one can be investigated by its work date
for work_date in long_closures['WORK_DATE']:
    print(f"Recorded closure for more than a day: {work_date}")

# EDA - Link Attraction Park
link_attraction_park

In [5]:
link_attraction_park = dfs["link_attraction_park"]

# preview the first rows, followed by the raw column dtypes
display(link_attraction_park.head(), link_attraction_park.dtypes)

Unnamed: 0,ATTRACTION;PARK
0,Aeroplane Ride;Tivoli Gardens
1,Bumper Cars;PortAventura World
2,Bungee Jump;PortAventura World
3,Circus Train;PortAventura World
4,Crazy Bus;Tivoli Gardens


ATTRACTION;PARK    str
dtype: object

In [None]:
# The raw frame has a single 'ATTRACTION;PARK' column. Once it is gone this
# cell has already been run, so the split is skipped and re-running is safe.
combined_column = 'ATTRACTION;PARK'
if combined_column in link_attraction_park.columns:
    attraction_and_park = link_attraction_park[combined_column].str.split(';', expand=True)
    link_attraction_park[['ATTRACTION', 'PARK']] = attraction_and_park
    link_attraction_park.drop(columns=[combined_column], inplace=True)

display(link_attraction_park.head())

# persist the two-column version for easier merging later on
link_attraction_park.to_parquet("../data/processed_data/link_attraction_park.parquet", index=False)

Unnamed: 0,ATTRACTION,PARK
0,Aeroplane Ride,Tivoli Gardens
1,Bumper Cars,PortAventura World
2,Bungee Jump,PortAventura World
3,Circus Train,PortAventura World
4,Crazy Bus,Tivoli Gardens


# EDA - Attendance

In [6]:
attendance = dfs['attendance']

# preview the first rows, followed by the raw column dtypes
display(attendance.head(), attendance.dtypes)

Unnamed: 0,USAGE_DATE,FACILITY_NAME,attendance
0,2018-06-01,PortAventura World,46804
1,2018-06-01,Tivoli Gardens,20420
2,2018-06-02,PortAventura World,57940
3,2018-06-02,Tivoli Gardens,29110
4,2018-06-03,PortAventura World,44365


USAGE_DATE         str
FACILITY_NAME      str
attendance       int64
dtype: object

# EDA - Weather Data
- PortAventura World - 41.087° N, 1.157° E
- Tivoli Gardens - 55.6737° N, 12.5681° E


In [None]:
weather_data = dfs['weather_data']

# preview the first rows, followed by the raw column dtypes
display(weather_data.head(), weather_data.dtypes)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,915148800,1999-01-01 00:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.33,,3.39,5.28,...,,,,,,8,800,Clear,sky is clear,01n
1,915152400,1999-01-01 01:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.08,,3.54,5.18,...,,,,,,6,800,Clear,sky is clear,01n
2,915156000,1999-01-01 02:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,8.08,,4.11,5.38,...,,,,,,14,801,Clouds,few clouds,02n
3,915159600,1999-01-01 03:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,7.31,,3.73,4.42,...,,,,,,39,802,Clouds,scattered clouds,03n
4,915163200,1999-01-01 04:00:00 +0000 UTC,3600,Custom location,48.873492,2.295104,6.91,,3.53,4.0,...,,,,,,52,803,Clouds,broken clouds,04n


dt                       int64
dt_iso                     str
timezone                 int64
city_name                  str
lat                    float64
lon                    float64
temp                   float64
visibility             float64
dew_point              float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
sea_level              float64
grnd_level             float64
humidity                 int64
wind_speed             float64
wind_deg                 int64
wind_gust              float64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main               str
weather_description        str
weather_icon               str
dtype: object

In [None]:
import plotly.express as px
from geopy.distance import geodesic

# Is the weather report location equidistant from both parks?
# Every coordinate pair is stored as (latitude, longitude).
report = (48.873492, 2.295104)
tivoli_gardens = (55.6737, 12.5681)         # looked up on Google
portaventura_world = (41.087, 1.157)

# geopy's `geodesic` measures distance along an ellipsoidal Earth model,
# which is more precise than a flat (Pythagorean) approximation
dist_to_tivoli = geodesic(report, tivoli_gardens).km
dist_to_portaventura = geodesic(report, portaventura_world).km

for park_label, distance_km in (
    ("Tivoli Gardens", dist_to_tivoli),
    ("PortAventura World", dist_to_portaventura),
):
    print(f"Distance to {park_label}: {distance_km:.2f} km")

# one row per point so the report location can be plotted next to the parks
locations = pd.DataFrame(
    [
        (*tivoli_gardens, 'Tivoli Gardens'),
        (*portaventura_world, 'PortAventura World'),
        (*report, 'Weather Report'),
    ],
    columns=['lat', 'lon', 'park'],
)

px.scatter_geo(locations, lat='lat', lon='lon', hover_name='park')

Distance to Tivoli Gardens: 1029.55 km
Distance to PortAventura World: 869.94 km


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed