# Data Handling
The main notebook containing all relevant data handling and pre-processing steps except interaction with the weather API. The code for this can be found in the following files: 
- `weather_data_pull.py` -> API call
- `weather_data_process.py` -> Processing the JSON into a useable format, feature engineering

Feature explanations for the historical datasets used here can be found at: https://www.nationalgrideso.com/data-portal/historic-demand-data/historic_demand_data_2023

In [15]:
import pandas as pd
import numpy as np

### Combine Demand Data

In [16]:
df_2019 = pd.read_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/external/demanddata_2019.csv')
df_2020 = pd.read_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/external/demanddata_2020.csv')
df_2021 = pd.read_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/external/demanddata_2021.csv')
df_2022 = pd.read_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/external/demanddata_2022.csv')
df_2023 = pd.read_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/external/demanddata_2023.csv')


frames_1 = [df_2019, df_2020, df_2021, df_2022]

for i in frames_1: 
    i['SETTLEMENT_DATE'] = pd.to_datetime(i['SETTLEMENT_DATE'], format='%d-%b-%Y')


frames_2 = [df_2019, df_2020, df_2021, df_2022, df_2023]

for i in frames_2: 
    i = i[['SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND', 'TSD',
       'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION',
       'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION',
       'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING',
       'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW',
       'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW']]
    i['SETTLEMENT_DATE'] = pd.to_datetime(i['SETTLEMENT_DATE'], format='%Y-%m-%d')


df = pd.concat([df_2019, df_2020, df_2021, df_2022, df_2023])
df['SETTLEMENT_DATE'] = pd.to_datetime(df['SETTLEMENT_DATE']).dt.date
df = df.reset_index()


### Seasonal feature engineering

In [17]:
df['SETTLEMENT_DATE'] = pd.to_datetime(df['SETTLEMENT_DATE'])

df['month'] = df['SETTLEMENT_DATE'].dt.month
df['day_of_week'] = df['SETTLEMENT_DATE'].dt.dayofweek


# apply sinusoidal transformations
# why? -> removes ordinal relationships (months 1-12, days 1-7) while not creating many new features such as via one hot encoding
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week']/7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week']/7)

df['settlement_period_sin'] = np.sin(2 * np.pi * df['SETTLEMENT_PERIOD']/7)
df['settlement_period_cos'] = np.cos(2 * np.pi * df['SETTLEMENT_PERIOD']/7)

df

Unnamed: 0,index,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,...,NSL_FLOW,ELECLINK_FLOW,month,day_of_week,month_sin,month_cos,day_of_week_sin,day_of_week_cos,settlement_period_sin,settlement_period_cos
0,0,2019-01-01,1,23808,25291,22393,2548,5918,0,13052,...,0,0,1,1,0.5,0.866025,0.781831,0.623490,0.781831,0.623490
1,1,2019-01-01,2,24402,25720,22962,2475,5918,0,13052,...,0,0,1,1,0.5,0.866025,0.781831,0.623490,0.974928,-0.222521
2,2,2019-01-01,3,24147,25495,22689,2396,5918,0,13052,...,0,0,1,1,0.5,0.866025,0.781831,0.623490,0.433884,-0.900969
3,3,2019-01-01,4,23197,24590,21849,2317,5918,0,13052,...,0,0,1,1,0.5,0.866025,0.781831,0.623490,-0.433884,-0.900969
4,4,2019-01-01,5,22316,24346,20979,2236,5918,0,13052,...,0,0,1,1,0.5,0.866025,0.781831,0.623490,-0.974928,-0.222521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85435,15307,2023-11-15,44,31455,33043,28328,476,6488,0,15595,...,1140,996,11,2,-0.5,0.866025,0.974928,-0.222521,0.974928,-0.222521
85436,15308,2023-11-15,45,29899,31488,26916,447,6488,0,15595,...,1368,952,11,2,-0.5,0.866025,0.974928,-0.222521,0.433884,-0.900969
85437,15309,2023-11-15,46,28415,30005,25563,417,6488,0,15595,...,1396,950,11,2,-0.5,0.866025,0.974928,-0.222521,-0.433884,-0.900969
85438,15310,2023-11-15,47,26811,28401,24118,385,6488,0,15595,...,1396,995,11,2,-0.5,0.866025,0.974928,-0.222521,-0.974928,-0.222521


### Lag Features by 1 Day

In [18]:
df['SETTLEMENT_DATE'] = df['SETTLEMENT_DATE'] + pd.Timedelta(days=1)

### Reconstruct Datetime Column

In [19]:
from datetime import timedelta

# Adjust the calculation for hours and minutes
def calculate_datetime(row):
    minute_of_day = (row['SETTLEMENT_PERIOD'] - 1) * 30
    hour = minute_of_day // 60
    minute = minute_of_day % 60
    datetime = row['SETTLEMENT_DATE']

    if hour >= 24:
        hour -= 24
        datetime += timedelta(days=1)

    return datetime.replace(hour=hour, minute=minute)

# Apply the function to each row
df['DATETIME'] = df.apply(calculate_datetime, axis=1)

### Weather Data

In [20]:
df_weather = pd.read_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/external/historical_weather_bulk.csv')

df_w_london = df_weather[df_weather['lat'] == 51.503655][['dt_iso', 'temp', 'temp_min', 'temp_max', 'humidity']]
df_w_bath = df_weather[df_weather['lat'] == 51.387535][['dt_iso', 'temp', 'temp_min', 'temp_max', 'humidity']]
df_w_liverpool = df_weather[df_weather['lat'] == 53.402859][['dt_iso', 'temp', 'temp_min', 'temp_max', 'humidity']]

df_w_london.columns = ['dt_iso', 'temp_london', 'temp_min_london', 'temp_max_london', 'humidity_london']
df_w_bath.columns = ['dt_iso', 'temp_bath', 'temp_min_bath', 'temp_max_bath', 'humidity_bath']
df_w_liverpool.columns = ['dt_iso', 'temp_liverpool', 'temp_min_liverpool', 'temp_max_liverpool', 'humidity_liverpool']

combined_df = df_w_london.merge(df_w_bath, on='dt_iso', how='outer')
combined_df = combined_df.merge(df_w_liverpool, on='dt_iso', how='outer')

combined_df = combined_df[['dt_iso', 'temp_london', 'temp_bath', 'temp_liverpool', 'temp_min_london', 'temp_min_bath', 'temp_min_liverpool', 
                           'temp_max_london','temp_max_bath', 'temp_max_liverpool', 'humidity_london', 'humidity_bath', 'humidity_liverpool']]

Convert ISO time to the timestamp format used above

In [21]:
date_format = "%Y-%m-%d %H:%M:%S +0000 UTC"
combined_df['dt_iso'] = pd.to_datetime(combined_df['dt_iso'], format=date_format).dt.strftime('%Y-%m-%d %H:%M:%S')
combined_df['DATETIME'] = combined_df['dt_iso']
combined_df = combined_df.drop(columns=['dt_iso'])

Merge with weather data, keep in mind that this reduces frequency to hourly due to the weather dataset

In [22]:
combined_df['DATETIME'] = pd.to_datetime(combined_df['DATETIME'])
df['DATETIME'] = pd.to_datetime(df['DATETIME'])
merged_df = pd.merge(df, combined_df, on='DATETIME', how='inner')
merged_df = merged_df.drop(columns=['index'])

merged_df

Unnamed: 0,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,...,temp_liverpool,temp_min_london,temp_min_bath,temp_min_liverpool,temp_max_london,temp_max_bath,temp_max_liverpool,humidity_london,humidity_bath,humidity_liverpool
0,2019-01-02,1,23808,25291,22393,2548,5918,0,13052,0,...,274.72,277.17,276.83,271.85,279.12,278.89,276.30,71,87,79
1,2019-01-02,3,24147,25495,22689,2396,5918,0,13052,0,...,274.15,276.06,275.57,271.29,279.12,278.89,275.75,74,85,79
2,2019-01-02,5,22316,24346,20979,2236,5918,0,13052,0,...,273.73,277.17,274.98,271.29,279.32,278.33,275.01,72,84,80
3,2019-01-02,7,20958,23695,19433,2081,5918,0,13052,0,...,273.08,277.17,273.87,270.74,279.21,277.77,274.59,69,85,82
4,2019-01-02,9,20331,23080,18687,1958,5918,0,13052,0,...,272.77,277.17,273.78,270.74,279.12,277.22,274.03,68,86,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43826,2023-11-16,39,37489,39082,33728,768,6488,0,15595,0,...,280.29,279.95,277.03,278.94,281.66,281.04,281.51,88,89,90
43827,2023-11-16,41,35214,36802,31696,625,6488,0,15595,0,...,280.23,279.95,277.03,279.12,281.66,281.01,280.70,87,91,90
43828,2023-11-16,43,32697,34287,29445,520,6488,0,15595,0,...,280.09,279.39,277.03,278.83,281.17,281.01,280.60,87,92,91
43829,2023-11-16,45,29899,31488,26916,447,6488,0,15595,0,...,279.95,277.72,278.12,279.12,282.78,281.14,280.59,84,93,93


### Create Unlagged Target Variable

In [23]:
# Create an unlagged target variable and a lagged feature (yesterday's national demand)
merged_df['ND_TARGET'] = merged_df['ND'].shift(-1)
merged_df = merged_df.drop(merged_df.tail(1).index)
merged_df['ND_TARGET'] = merged_df['ND_TARGET'].astype(int)
merged_df['ND_PREV'] = merged_df['ND']
merged_df = merged_df.drop(columns=['ND'])

### Export

In [25]:
df_full = merged_df[['DATETIME', 'ND_TARGET', 'SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'ND_PREV', 'TSD',
       'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION',
       'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION',
       'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING',
       'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW',
       'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW', 'month', 'day_of_week',
       'month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos',
       'settlement_period_sin', 'settlement_period_cos',  'temp_london', 
       'temp_bath', 'temp_liverpool', 'temp_min_london',
       'temp_min_bath', 'temp_min_liverpool', 'temp_max_london',
       'temp_max_bath', 'temp_max_liverpool', 'humidity_london',
       'humidity_bath', 'humidity_liverpool']]

df_train = merged_df[['DATETIME', 'ND_TARGET','ND_PREV', 'TSD',
       'ENGLAND_WALES_DEMAND', 'EMBEDDED_WIND_GENERATION',
       'EMBEDDED_WIND_CAPACITY', 'EMBEDDED_SOLAR_GENERATION',
       'EMBEDDED_SOLAR_CAPACITY', 'NON_BM_STOR', 'PUMP_STORAGE_PUMPING',
       'IFA_FLOW', 'IFA2_FLOW', 'BRITNED_FLOW', 'MOYLE_FLOW', 'EAST_WEST_FLOW',
       'NEMO_FLOW', 'NSL_FLOW', 'ELECLINK_FLOW',
       'month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos',
       'settlement_period_sin', 'settlement_period_cos', 'temp_london', 
       'temp_bath', 'temp_liverpool', 'temp_min_london',
       'temp_min_bath', 'temp_min_liverpool', 'temp_max_london',
       'temp_max_bath', 'temp_max_liverpool', 'humidity_london',
       'humidity_bath', 'humidity_liverpool']]

df_full.to_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/custom/df_full.csv', index=False)
df_train.to_csv('/Users/laurenzschneeberger/Downloads/EnergyDemand-main/00_Data/custom/df_train.csv', index=False)