# Combining the raw dataframes into a single master dataframe

In [None]:
"""
Build the master dataframe, starting from waiting_times.

waiting_times is used as the base for master because it contains
all relevant information for attraction-specific wait times.
"""
import pandas as pd

# base table that the later sections enrich through successive merges
master = pd.read_parquet("../data/raw_data/waiting_times.parquet")


# ===================================
# |  link attraction park merging   |
# ===================================

# attach the attraction -> park link table so every wait-time row is labelled
# with the park its attraction belongs to
link_attraction_park = pd.read_parquet("../data/processed_data/link_attraction_park.parquet")
master_merged_1 = pd.merge(
    master,
    link_attraction_park,
    how="left",
    left_on="ENTITY_DESCRIPTION_SHORT",
    right_on="ATTRACTION",
)

# ATTRACTION replaces ENTITY_DESCRIPTION_SHORT for clarity
# NOTE(review): with a left join, rows whose attraction is missing from the link
# table end up with ATTRACTION = NaN once the original key is dropped - confirm
# the link table covers every attraction in waiting_times
master_merged_1 = master_merged_1.drop(columns="ENTITY_DESCRIPTION_SHORT")


# ===================================
# |           attendance            |
# ===================================

# attach attendance per park and date, so each wait-time row carries the
# attendance figure of its park for that day
attendance = pd.read_parquet("../data/raw_data/attendance.parquet")

# both date keys must be real datetimes before they can be compared in the merge
attendance["USAGE_DATE"] = pd.to_datetime(attendance["USAGE_DATE"])
master_merged_1["WORK_DATE"] = pd.to_datetime(master_merged_1["WORK_DATE"])

master_merged_2 = pd.merge(
    master_merged_1,
    attendance,
    how="left",
    left_on=["PARK", "WORK_DATE"],
    right_on=["FACILITY_NAME", "USAGE_DATE"],
)
#! a number of dates have no attendance data; those rows are kept for now

# FACILITY_NAME and USAGE_DATE only duplicate PARK and WORK_DATE after the merge
master_merged_2 = master_merged_2.drop(columns=["FACILITY_NAME", "USAGE_DATE"])
# bare expression: only renders the frame when it is the last statement of a cell
master_merged_2


# ===================================
# |         weather data            |
# ===================================

# attach weather observations by date, so wait-time rows carry weather information
weather_data = pd.read_parquet("../data/raw_data/weather_data.parquet")

# 'dt' is parsed as seconds since the epoch and shifted by the 'timezone' offset
# (also in seconds), giving a naive timestamp stored in 'dt_iso'
weather_data["dt_iso"] = (
    pd.to_datetime(weather_data["dt"], unit="s")
    + pd.to_timedelta(weather_data["timezone"], unit="s")
)

# NOTE(review): this is an exact timestamp match - only weather rows whose dt_iso
# equals a WORK_DATE value are joined; confirm this is the intended "daily" weather
master_merged_3 = pd.merge(
    master_merged_2,
    weather_data,
    how="left",
    left_on="WORK_DATE",
    right_on="dt_iso",
)

# the datetime / timezone / location columns are not needed after the merge
master_merged_3 = master_merged_3.drop(
    columns=["dt", "dt_iso", "timezone", "city_name", "lat", "lon"]
)


# ===================================
# |        entity schedule          |
# ===================================

# link entity schedule to indicate when there are closures for whichever attraction
entity_schedule = pd.read_parquet("../data/raw_data/entity_schedule.parquet")

# only the closure description and the two join keys are needed; merging the
# whole schedule would drag in redundant columns
entity_schedule_subset = entity_schedule[
    ['REF_CLOSING_DESCRIPTION', 'ENTITY_DESCRIPTION_SHORT', 'WORK_DATE']
].copy()

# the date key must be a real datetime to match master's WORK_DATE
entity_schedule_subset['WORK_DATE'] = pd.to_datetime(entity_schedule_subset['WORK_DATE'])

# after projecting to three columns several schedule rows can become identical;
# remove exact duplicates so the left join below cannot multiply master rows
# NOTE(review): distinct closure descriptions for the same attraction and date
# would still produce more than one row per wait-time record
entity_schedule_subset = entity_schedule_subset.drop_duplicates()

master_merged_4 = master_merged_3.merge(
    entity_schedule_subset,
    left_on=['ATTRACTION', 'WORK_DATE'],
    right_on=['ENTITY_DESCRIPTION_SHORT', 'WORK_DATE'],
    how='left',
)

# ENTITY_DESCRIPTION_SHORT duplicates ATTRACTION after the merge, so drop it.
# WORK_DATE is kept: it is the shared key, so the merge yields a single
# WORK_DATE column and dropping it would remove the date from master entirely.
master_merged_4.drop(columns=['ENTITY_DESCRIPTION_SHORT'], inplace=True)

Unnamed: 0,WORK_DATE,DEB_TIME_x,DEB_TIME_HOUR,FIN_TIME_x,WAIT_TIME_MAX,NB_UNITS,GUEST_CARRIED,CAPACITY,ADJUST_CAPACITY,OPEN_TIME,...,weather_id,weather_main,weather_description,weather_icon,REF_CLOSING_DESCRIPTION,ENTITY_DESCRIPTION_SHORT,ENTITY_TYPE,DEB_TIME_y,FIN_TIME_y,UPDATE_TIME
0,2018-01-01,2018-01-01 21:00:00.000,21,2018-01-01 21:15:00.000,0,2.0,0.0,0.000,0.00,0,...,802,Clouds,scattered clouds,03n,,Roller Coaster,ATTR,2018-01-01 10:00:00.000,2018-01-01 17:44:00.000,2018-01-02 07:46:01.000
1,2018-01-01,2018-01-01 19:30:00.000,19,2018-01-01 19:45:00.000,5,18.0,148.0,254.749,254.75,15,...,802,Clouds,scattered clouds,03n,,Bumper Cars,ATTR,2018-01-01 08:30:00.000,2018-01-01 21:05:00.000,2018-01-02 07:51:31.000
2,2018-01-01,2018-01-01 22:30:00.000,22,2018-01-01 22:45:00.000,0,1.0,0.0,0.000,0.00,0,...,802,Clouds,scattered clouds,03n,,Rapids Ride,ATTR,2018-01-01 09:29:00.000,2018-01-01 17:08:00.000,2018-01-02 07:51:31.000
3,2018-01-01,2018-01-01 12:45:00.000,12,2018-01-01 13:00:00.000,5,1.0,46.0,250.001,250.00,15,...,802,Clouds,scattered clouds,03n,,Crazy Dance,ATTR,2018-01-01 09:12:00.000,2018-01-01 22:00:00.000,2018-01-02 07:43:47.000
4,2018-01-01,2018-01-01 17:00:00.000,17,2018-01-01 17:15:00.000,5,15.0,92.0,211.500,198.25,15,...,802,Clouds,scattered clouds,03n,,Skyway,ATTR,2018-01-01 10:00:00.000,2018-01-01 19:00:00.000,2018-01-02 07:55:16.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509319,2022-08-18,2022-08-18 18:45:00.000,18,2022-08-18 19:00:00.000,0,0.0,0.0,0.000,0.00,0,...,804,Clouds,overcast clouds,04n,Fermeture Réhab,Himalaya Ride,ATTR,2022-08-18 23:59:00.000,2022-08-18 23:59:00.000,2022-08-19 07:56:33.000
3509320,2022-08-18,2022-08-18 10:15:00.000,10,2022-08-18 10:30:00.000,0,0.0,0.0,0.000,0.00,0,...,804,Clouds,overcast clouds,04n,Fermeture Réhab,Crazy Dance,ATTR,2022-08-18 23:59:00.000,2022-08-18 23:59:00.000,2022-08-19 07:56:29.000
3509321,2022-08-18,2022-08-18 09:15:00.000,9,2022-08-18 09:30:00.000,0,0.0,0.0,0.000,0.00,0,...,804,Clouds,overcast clouds,04n,Fermeture Réhab,Crazy Dance,ATTR,2022-08-18 23:59:00.000,2022-08-18 23:59:00.000,2022-08-19 07:56:29.000
3509322,2022-08-18,2022-08-18 20:30:00.000,20,2022-08-18 20:45:00.000,0,0.0,0.0,0.000,0.00,0,...,804,Clouds,overcast clouds,04n,Fermeture Réhab,Giga Coaster,ATTR,2022-08-18 23:59:00.000,2022-08-18 23:59:00.000,2022-08-19 07:56:35.000
