# Package Imports

In [43]:
# Data Processing
import importlib
import os


import data_processing_code.processing_functions as pf
importlib.reload(pf) 

from datetime import datetime as dt
# import json
import numpy as np
import pandas as pd
# import urllib.request

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from statsmodels.tsa.seasonal import STL


import warnings
warnings.filterwarnings("ignore")

# Data Acquisition 

Our data is sourced through Open-Meteo's Historical Weather API.

## Selections for Data Download 

**General Details:**

- Data Source: https://open-meteo.com/en/docs/historical-weather-api

- API Call (weather variables only): https://archive-api.open-meteo.com/v1/era5?latitude=43.70011&longitude=-79.4163&start_date=1984-01-01&end_date=2023-12-31&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,cloud_cover,wind_speed_10m,sunshine_duration&timezone=America%2FNew_York

- API Call (with `shortwave_radiation` added; this is the call used in the download cell below): https://archive-api.open-meteo.com/v1/era5?latitude=43.70011&longitude=-79.4163&start_date=1984-01-01&end_date=2023-12-31&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,cloud_cover,wind_speed_10m,sunshine_duration,shortwave_radiation&timezone=America%2FNew_York

Per the documentation, we specify `era5` in the API call to ensure data consistency and to prevent unintentional alterations that may have arisen from the adoption of different weather model upgrades.


**Toronto Details:**

- Latitude: 43.70011  
- Longitude: -79.4163  
- Timezone: America/New_York  
- Start Date: 1984-01-01  
- End Date: 2023-12-31  

We will primarily focus our analyses on data between 1994 and 2023; however, we are downloading an additional decade's worth, in case it becomes necessary. 


Although Daily Weather Variables are available for selection, we opt for Hourly Variables instead, so that we can include Cloud Cover information in our analyses. 


**Hourly Weather Variables:**
- Temperature (2 m)
- Relative Humidity (2 m)
- Dew Point (2 m)
- Precipitation (rain + snow)
- Cloud Cover Total
- Wind Speed (10 m)

**Additional Variables:**
- Sunshine Duration

**Solar Radiation Variables:**
- Shortwave Solar Radiation GHI

Although this variable will not be used in the experiments, we download it to assess the strength of its relationship to sunshine duration. 

**Settings:**

- Temperature Unit: Celsius  
- Wind Speed Unit: Km/h  
- Precipitation Unit: Millimeter  
- Timeformat: ISO 8601 (e.g. 2022-12-31)  


In [2]:
# Download data from Open Meteo (skipped when the raw file is already on disk)
file_path = '../../data/raw_data/'
file_name = 'era5_data_toronto.json'
api_call = 'https://archive-api.open-meteo.com/v1/era5?latitude=43.70011&longitude=-79.4163&start_date=1984-01-01&end_date=2023-12-31&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,cloud_cover,wind_speed_10m,sunshine_duration,shortwave_radiation&timezone=America%2FNew_York'

# The request covers a fixed historical date range, so a previously saved
# response can be reused as-is. Fetch only when the file is missing, so that
# Restart & Run All works on a fresh checkout without uncommenting anything.
# NOTE(review): assumes pf.download_data writes to file_path + file_name,
# i.e. the same path the next cell reads from -- confirm.
if not os.path.isfile(f'{file_path}{file_name}'):
    pf.download_data(api_call, file_path, file_name)

In [9]:
# Load the raw JSON download into a DataFrame.
file = file_path + file_name
weather_data = pf.df_from_json(file)

# Tag the frame with a label before passing it to the summary helper.
weather_data.name = 'weather_data'

pf.generate_df_summary(weather_data)

Dataframe: weather_data

------ Head: ------


Unnamed: 0,time,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,cloud_cover,wind_speed_10m,sunshine_duration,shortwave_radiation
0,1984-01-01T00:00,-9.3,75,-13.0,0.0,100,8.0,0.0,0.0
1,1984-01-01T01:00,-9.6,76,-13.0,0.0,85,7.6,0.0,0.0
2,1984-01-01T02:00,-10.1,77,-13.3,0.0,60,6.6,0.0,0.0
3,1984-01-01T03:00,-9.9,79,-12.8,0.0,51,6.0,0.0,0.0
4,1984-01-01T04:00,-10.8,81,-13.4,0.0,32,6.0,0.0,0.0




------ Tail: ------


Unnamed: 0,time,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,cloud_cover,wind_speed_10m,sunshine_duration,shortwave_radiation
350635,2023-12-31T19:00,0.4,82,-2.3,0.1,100,12.7,0.0,0.0
350636,2023-12-31T20:00,0.3,84,-2.1,0.3,100,11.9,0.0,0.0
350637,2023-12-31T21:00,0.2,88,-1.5,0.5,100,13.6,0.0,0.0
350638,2023-12-31T22:00,0.2,89,-1.5,0.4,100,13.0,0.0,0.0
350639,2023-12-31T23:00,0.1,89,-1.5,0.3,100,13.2,0.0,0.0




------ Column Summaries: ------


Unnamed: 0,count,unique,top,freq
time,350640,350640,1984-01-01T00:00,1


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
temperature_2m,350640.0,8.385759,10.55907,-27.1,0.4,8.4,17.4,34.2
relative_humidity_2m,350640.0,71.907723,14.841568,18.0,61.0,73.0,84.0,100.0
dew_point_2m,350640.0,3.302916,10.535977,-31.7,-4.1,3.6,12.2,24.9
precipitation,350640.0,0.092773,0.39593,0.0,0.0,0.0,0.0,19.4
cloud_cover,350640.0,49.286633,38.425978,0.0,11.0,43.0,90.0,100.0
wind_speed_10m,350640.0,13.56912,6.646731,0.0,8.6,12.6,17.7,50.8
sunshine_duration,350640.0,1238.381057,1649.720954,0.0,0.0,0.0,3600.0,3600.0
shortwave_radiation,350640.0,162.148574,239.368387,0.0,0.0,7.0,271.0,1008.0




------ Counts: ------

Rows: 350,640
Columns: 9
Duplicate Rows = 0 | % of Total Rows = 0.0%


------ Info: ------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350640 entries, 0 to 350639
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   time                  350640 non-null  object 
 1   temperature_2m        350640 non-null  float64
 2   relative_humidity_2m  350640 non-null  int64  
 3   dew_point_2m          350640 non-null  float64
 4   precipitation         350640 non-null  float64
 5   cloud_cover           350640 non-null  int64  
 6   wind_speed_10m        350640 non-null  float64
 7   sunshine_duration     350640 non-null  float64
 8   shortwave_radiation   350640 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 24.1+ MB


None



------ Missing Data Percentage: ------


time                    0.0
temperature_2m          0.0
relative_humidity_2m    0.0
dew_point_2m            0.0
precipitation           0.0
cloud_cover             0.0
wind_speed_10m          0.0
sunshine_duration       0.0
shortwave_radiation     0.0
dtype: float64

There is no missing data, and there do not appear to be any immediately obvious outliers. We will formally assess outliers later in the analysis.

In [44]:
# Shorter column names for the hourly variables.
mapper = dict(
    temperature_2m='temp',
    relative_humidity_2m='humidity',
    dew_point_2m='dew_point',
    precipitation='precipitation',
    cloud_cover='cloud_cover',
    wind_speed_10m='wind_speed',
    sunshine_duration='sunshine_s',
)

# Rename on the existing object (not a copy), parse the timestamps, and
# hand the hourly frame to the daily aggregation helper.
weather_data.rename(mapper, axis='columns', inplace=True)
weather_data['time'] = pd.to_datetime(weather_data['time'])
df_daily = pf.daily_aggregations(weather_data)

# Keep rows from 1995 onward.
df_daily = df_daily.loc['1995':]
print(f'The aggregated daily dataset has {df_daily.shape[0]} rows and {df_daily.shape[1]} columns.')

The aggregated daily dataset has 10592 rows and 18 columns.


In [23]:
# REORD

In [45]:
# Preview the first five rows of the daily aggregates.
df_daily.head(n=5)

Unnamed: 0_level_0,sunshine_hr,shortwave_radiation,precipitation,min_temp,mean_temp,max_temp,min_humidity,mean_humidity,max_humidity,min_dew_point,mean_dew_point,max_dew_point,min_cloud_cover,mean_cloud_cover,max_cloud_cover,min_wind_speed,mean_wind_speed,max_wind_speed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1995-01-01,1.966,870.0,3.7,-0.3,0.825,2.7,78,90.083,97,-2.4,-0.654,-0.0,69,97.5,100,3.9,7.471,12.6
1995-01-02,7.805,1745.0,0.0,-5.0,-3.35,-0.7,54,61.917,85,-12.0,-9.708,-2.8,1,50.417,100,13.7,26.333,35.6
1995-01-03,4.189,1365.0,0.0,-6.4,-4.408,-1.9,47,62.083,69,-12.2,-10.658,-9.9,0,37.875,100,18.1,22.992,27.3
1995-01-04,7.547,1977.0,0.0,-10.4,-8.871,-6.8,40,58.208,67,-18.9,-15.779,-12.6,0,46.875,100,20.2,25.179,32.4
1995-01-05,7.944,1770.0,0.0,-11.0,-8.508,-5.7,46,60.208,71,-16.1,-15.025,-14.2,1,32.25,100,21.8,28.812,33.9
