In [1]:
# import modules
# basic
import pandas as pd
import numpy as np
import datetime as dt
from functools import reduce

# SQL
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine

# sklearn
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.datasets import fetch_dataset
from sklearn.ensemble import RandomForestRegressor


# Stationary Tests
from statsmodels.tsa.stattools import adfuller,kpss

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Fixed seed value; presumably meant to be passed to random/stochastic components
# for reproducibility -- TODO confirm, it is not referenced in the cells shown here.
NSEED = 42

In [2]:
# set graphic parameters to have unified plot appearance
sns.set_style(
    style='darkgrid', 
    rc={'axes.facecolor': 'white', 'grid.color': '.8'}
)
NF_ORANGE = '#ff5a36'
NF_BLUE = '#163251'
cmaps_hex = ['#193251','#FF5A36','#696969', '#7589A2','#FF5A36', '#DB6668']
sns.set_palette(palette=cmaps_hex)
sns_c = sns.color_palette(palette=cmaps_hex)
%matplotlib inline
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 300

In [3]:
# Load environment variables from .env file
load_dotenv()

# Get database connection parameters from environment variables
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')

# Create the database URL
db_url = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"

# Create an engine
engine = create_engine(db_url)

# Define your query -> set your table name here
query = 'SELECT * FROM "03_gold"."fact_electricity_market_germany"'

# Execute the query and load the data into a pandas DataFrame.
# The query has no ORDER BY, so the database may return rows in any order.
# The following cells interpolate and shift() by row position, which is only
# meaningful on chronologically ordered rows -> sort by timestamp here and
# rebuild a clean 0..n-1 index.
df = (
    pd.read_sql(query, engine)
    .sort_values('timestamp')
    .reset_index(drop=True)
)

In [4]:
################ OUTLIER TREATMENT #######################
# Values of price_eur_mwh below -200 are treated as outliers: they are blanked
# out (NaN) and the resulting gaps are then filled by interpolating between
# the neighbouring rows.
raw_price = df['price_eur_mwh']
df['price_eur_mwh'] = raw_price.mask(raw_price < -200).interpolate()

In [5]:
# Lagged price columns: the price 24, 48 and 72 rows earlier. With one row per
# hour (assumed, not checked here) this is the same hour on the previous three days.
for lag_hours in (24, 48, 72):
    df[f'price_lag{lag_hours}'] = df['price_eur_mwh'].shift(lag_hours)

In [6]:
# Look-ahead weather columns: for every weather variable, add the value found
# 24, 48 and 72 rows later (negative shift pulls future rows up).
# Note: df_with_forecast is the same object as df (no copy), so df receives
# these columns as well.
df_with_forecast = df

weathers = ['temperature_2m', 'relative_humidity_2m', 'apparent_temperature', 'precipitation', 'cloud_cover', 'wind_speed_10m', 'wind_direction_10m',
            'direct_radiation', 'diffuse_radiation', 'sunshine_duration']

for weather in weathers:
    for horizon in (24, 48, 72):
        df_with_forecast[f'{weather}_{horizon}'] = df_with_forecast[weather].shift(-horizon)

In [7]:
# Read the complete weather table (observations and forecasts) from the
# database and order it chronologically.
query3 = 'SELECT * FROM "02_silver"."fact_full_weather"'

forecasts = pd.read_sql(sql=query3, con=engine)
forecasts = forecasts.sort_values(by='timestamp')

In [8]:
# Collapse the weather table to one row per timestamp by averaging all
# remaining columns over the rows that share a timestamp (i.e. across stations),
# after removing the identifier/bookkeeping columns.
dropped_cols = ['station_id', 'is_forecast', 'source_table']
aggregated_forecasts = (
    forecasts
    .drop(columns=dropped_cols)
    .groupby('timestamp')
    .mean()
    .reset_index()
)

# Keep only the rows that lie after the last timestamp of the market data
max_timestamp = df_with_forecast['timestamp'].max()
is_future = aggregated_forecasts['timestamp'] > max_timestamp
aggregated_forecasts = aggregated_forecasts[is_future]



In [9]:
# Create shifted dataframes from aggregated_forecasts with the timestamp shifted
# back by 24, 48 and 72 hours. After the shift, a row stamped t carries the
# weather values of t + horizon, and its weather columns get the matching
# `_<horizon>` suffix so they line up with the look-ahead columns of the
# market dataframe.
def _shifted_forecast(hours):
    """Return a copy of aggregated_forecasts moved `hours` hours back in time.

    Parameters
    ----------
    hours : int
        Forecast horizon in hours; also used as the column suffix.

    Returns
    -------
    pandas.DataFrame
        Copy of aggregated_forecasts whose 'timestamp' is reduced by `hours`
        and whose weather columns (those listed in `weathers`) are renamed to
        '<name>_<hours>'. Columns not listed in `weathers` keep their name.
    """
    shifted = aggregated_forecasts.copy()
    shifted['timestamp'] -= pd.DateOffset(hours=hours)
    return shifted.rename(
        columns={weather: f'{weather}_{hours}' for weather in weathers}
    )


# Fix: the 24-hour frame was previously shifted by 72 hours, which made the
# `_24` columns a duplicate of the 72-hour horizon.
df24 = _shifted_forecast(24)
df48 = _shifted_forecast(48)
df72 = _shifted_forecast(72)

In [10]:
# Combine the market timestamps with the three shifted forecast frames.
# Each step is an outer join on 'timestamp', so timestamps that appear in only
# one of the frames are kept as well.
df1 = df[['timestamp']]
dfs = [df1, df24, df48, df72]

merged_df = dfs[0]
for shifted_frame in dfs[1:]:
    merged_df = pd.merge(merged_df, shifted_frame, on='timestamp', how='outer')

In [11]:
# Fill NAs from merged df
# DataFrame.fillna(<DataFrame>) matches cells by index label and column name,
# not by timestamp. merged_df comes out of an outer merge (keys re-sorted,
# possibly extra rows), so its row numbers are not guaranteed to correspond to
# the rows of df_with_forecast. Look the forecast values up by timestamp first
# (left join keeps the row order and count of df_with_forecast), give the
# result the same index, and only then fill.
forecast_fill = df_with_forecast[['timestamp']].merge(
    merged_df.drop_duplicates(subset='timestamp'),  # one candidate row per timestamp
    on='timestamp',
    how='left',
)
forecast_fill.index = df_with_forecast.index
df_with_forecast = df_with_forecast.fillna(forecast_fill)

In [12]:
# Optional export: write the enriched dataframe (including its index) to CSV
output_path = '../data/data_with_forecasts.csv'
df_with_forecast.to_csv(output_path)