In [None]:
import os

import torch
# Bare expression so the notebook displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
import optuna
import os
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import partial
from pytorch_lightning.callbacks import EarlyStopping

from darts import TimeSeries
from darts.models import *
from darts.metrics import rmse, coefficient_of_variation
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries

from time_series_model.data.weather.weather_dataloader import MeteostatDataLoader
from time_series_model.data.data_loading import SMARDDataLoader
from time_series_model import evaluation

import warnings
import logging

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

# The project logger reports at INFO; the listed third-party loggers are set
# to NOTSET (numeric level 0).
logging.getLogger("time_series_model").setLevel(logging.INFO)
for _logger_name in ("lightning", "pytorch_lightning", "darts"):
    logging.getLogger(_logger_name).setLevel(logging.NOTSET)

# Root handler format: timestamp - logger name - level - message.
logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

## Load data

### Load the weather data

In [None]:
# Location of the raw weather export, resolved relative to the current working directory.
weather_csv_path = os.path.join(
    os.getcwd(), os.pardir, 'data', 'raw', 'weather_data_solar_stations.csv'
)

meteostat_data_loader = MeteostatDataLoader(
    file_paths=[weather_csv_path],
    solar=True,
    wind=True,
)
meteostat_data_loader.load_data()
weather_data = meteostat_data_loader.data

# Report the share of missing values per column before anything is filled.
print("Missing values")
for column_name in weather_data.columns:
    missing_pct = weather_data[column_name].isna().mean() * 100
    print(f"  Column {column_name} has {missing_pct:0.2f}% missing values")

# Index the frame by parsed timestamps.
weather_data['time'] = pd.to_datetime(weather_data['time'])
weather_data = weather_data.set_index('time')
first_timestamp, last_timestamp = weather_data.index.min(), weather_data.index.max()
print(f"Weather data index: {first_timestamp} - {last_timestamp}")

# Convert to a darts TimeSeries on a '1H' grid: missing dates are inserted,
# missing values are replaced by 0, and the result is cast to float32.
weather_data = TimeSeries.from_dataframe(
    weather_data,
    value_cols=list(weather_data.columns),
    fill_missing_dates=True,
    fillna_value=0,
    freq='1H',
).astype(np.float32)

### Load SMARD data

In [None]:
# All SMARD exports live in the same raw-data directory, resolved relative to the cwd.
raw_data_dir = os.path.join(os.getcwd(), os.pardir, 'data', 'raw')
smard_file_names = [
    '2015_2016.csv',
    '2017_2018.csv',
    '2019_2020.csv',
    '2021_2022.csv',
    '2022_2023.csv',
]

smard_dataloader = SMARDDataLoader(
    file_paths=[os.path.join(raw_data_dir, file_name) for file_name in smard_file_names]
)
# Loader pipeline: load, then preprocess, then validate.
smard_dataloader.load_data()
smard_dataloader.preprocess_data()
smard_dataloader.validate_data()

# Index the frame by parsed timestamps.
smard_data = smard_dataloader.data
smard_data['timestamp'] = pd.to_datetime(smard_data['timestamp'])
smard_data = smard_data.set_index('timestamp')

# Convert to a darts TimeSeries on a '1H' grid: missing dates are inserted and
# missing values are replaced by 0.
# NOTE(review): the weather series in this notebook is cast to float32 but this
# one is not — confirm the dtypes are compatible for the models trained below.
smard_series_kwargs = dict(
    value_cols=list(smard_data.columns),
    fill_missing_dates=True,
    fillna_value=0,
    freq='1H',
)
smard_data = TimeSeries.from_dataframe(smard_data, **smard_series_kwargs)

In [None]:
# Plot the last 365 samples of the weather data.
# NOTE(review): the series is built with freq='1H', so 365 samples cover roughly
# 15 days rather than a year — confirm the intended window.
recent_weather = weather_data[-365:]
recent_weather.plot()

### Create synthetic date-time covariates

In [None]:
# Calendar attributes derived from the weather series, each as float32.
weekday, month, hour = (
    datetime_attribute_timeseries(weather_data, attribute=attribute_name, dtype=np.float32)
    for attribute_name in ("weekday", "month", "hour")
)

# Component order of the stacked covariates: weekday, hour, month.
calendar_covariates = weekday.stack(hour).stack(month)

# Fit the scaler on the stacked calendar covariates and keep the scaled result.
scaler_covariates = Scaler()
covariates_time = scaler_covariates.fit_transform(calendar_covariates)

In [None]:
# Show the scaled calendar covariates over the last 5 * 7 * 24 samples.
n_samples_shown = 5 * 7 * 24
plt.figure(figsize=(12, 2))
covariates_time[-n_samples_shown:].plot()
plt.title("Covariates Time")

### Train/Val/Test split for SMARD data

In [None]:
# Chronological split: the last 15% of samples is the test set, the 15% before
# that is the validation set, and the remainder is the training set.
test_split = 0.15
val_split = 0.15
train_end_index = (1 - val_split - test_split) * len(smard_data)
val_end_index = (1 - test_split) * len(smard_data)
train_end_index, val_end_index = int(train_end_index), int(val_end_index)

print(train_end_index, val_end_index)

# Fit the scaler on the training window only (cut to the start of the weather
# data, exactly like `train` below). Fitting on the full series would leak
# validation/test statistics into the scaling used for training and evaluation.
smard_data_scaler = Scaler()
smard_data_scaler.fit(smard_data[:train_end_index][weather_data.start_time():])

# Apply the train-fitted scaling to the whole series, then slice it.
scaled_smard_data = smard_data_scaler.transform(smard_data)

train, val, test = (
    scaled_smard_data[:train_end_index],
    scaled_smard_data[train_end_index:val_end_index],
    scaled_smard_data[val_end_index:],
)

# Cut train to start of weather data
train = train[weather_data.start_time():]

print("Train size: ", len(train))
print("Val size: ", len(val))
print("Test size: ", len(test))

# Third value is the split length divided by 24.
print("Train time boundaries: ", train.start_time(), train.end_time(), len(train) / 24)
print("Val time boundaries: ", val.start_time(), val.end_time(), len(val) / 24)
print("Test time boundaries: ", test.start_time(), test.end_time(), len(test) / 24)

In [None]:
# Show the last 24 * 7 samples of the scaled training series.
n_samples_shown = 24 * 7
plt.figure(figsize=(12, 2))
train[-n_samples_shown:].plot()
plt.title("Data")

## Model training

### Simple LSTM

In [None]:
def fit_model(model):
    """Fit ``model`` on the notebook-level ``train`` series, validating on ``val``.

    The covariate keyword arguments passed to ``model.fit`` are the first
    element of what ``evaluation.get_covariate_args`` returns for the stacked
    calendar and weather covariates.

    Args:
        model: Forecasting model exposing ``fit(series=..., val_series=..., **kwargs)``.
    """
    all_covariates = covariates_time.stack(weather_data)
    covariate_arg_sets = evaluation.get_covariate_args(
        model=model,
        covariates=all_covariates,
    )
    model.fit(series=train, val_series=val, **covariate_arg_sets[0])

Train the model

In [None]:
# Small single-layer LSTM trained for one epoch.
simple_lstm_config = dict(
    model="LSTM",
    input_chunk_length=24,
    training_length=24 * 2,
    hidden_dim=16,
    n_rnn_layers=1,
    n_epochs=1,
    force_reset=True,
)
simple_lstm = RNNModel(**simple_lstm_config)

fit_model(simple_lstm)

Evaluate the model

In [None]:
# Reload the local evaluation module so that edits made to it are picked up.
importlib.reload(evaluation)

# Metrics are looked up after the reload so they come from the refreshed module.
validation_metrics = {
    'rmse': rmse,
    'rmse_test': partial(evaluation.co2_rmse, disable_weights=True),
    'co2_rmse': evaluation.co2_rmse,
}
all_covariates = covariates_time.stack(weather_data)

# Score the simple LSTM on the validation split, with the training split as prefix.
eval_result = evaluation.cross_validation_without_refit(
    model=simple_lstm,
    prefix_series=train,
    test_series=val,
    metrics=validation_metrics,
    data_scaler=smard_data_scaler,
    covariates=all_covariates,
    max_n_split=5,
    forecast_horizon=24,
)
eval_result

### LSTM

In [None]:
def objective(trial):
    """Optuna objective for the LSTM: train one candidate and return its CO2 RMSE.

    Samples ``input_chunk_length``, ``hidden_dim``, ``dropout`` and
    ``n_rnn_layers`` from ``trial``, fits an ``RNNModel`` via ``fit_model``
    and scores it on ``val`` (with ``train`` as prefix series).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``'co2_rmse'`` entry of the evaluation result.
    """
    # Dict literals evaluate in insertion order, so the suggestion order is fixed.
    sampled = {
        'input_chunk_length': trial.suggest_int('input_chunk_length', 24, 24 * 7 * 4),
        'hidden_dim': trial.suggest_categorical('hidden_dim', [32, 64, 128, 256]),
        'dropout': trial.suggest_float('dropout', 0.0, 0.5),
        'n_rnn_layers': trial.suggest_int('n_rnn_layers', 1, 3),
    }

    print(f"Trialing with {trial.params}")

    # Early stopping on the validation loss.
    early_stopping = EarlyStopping(
        monitor="val_loss",
        patience=5,
        min_delta=0.05,
        mode='min',
    )

    lstm_model = RNNModel(
        model="LSTM",
        # The training length is twice the sampled input chunk length.
        training_length=sampled['input_chunk_length'] * 2,
        n_epochs=20,
        pl_trainer_kwargs={"callbacks": [early_stopping]},
        force_reset=True,
        **sampled,
    )

    fit_model(lstm_model)

    # Evaluate
    scoring_metrics = {
        'rmse': rmse,
        'co2_rmse': evaluation.co2_rmse,
    }
    scores = evaluation.cross_validation_without_refit2(
        model=lstm_model,
        prefix_series=train,
        test_series=val,
        metrics=scoring_metrics,
        data_scaler=smard_data_scaler,
        covariates=covariates_time.stack(weather_data),
        forecast_horizon=7 * 24,
    )

    co2_score = scores['co2_rmse']
    print(f"Eval CO2 RMSE: {co2_score}")

    return co2_score

# Create a study object and optimize the objective function.
# The sampler is seeded so that the same candidate configurations are proposed
# on every run (TPESampler is also what Optuna uses by default for a
# single-objective study).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

In [None]:
# Retrain an LSTM with the best hyperparameters found by the study.
best_params = study.best_params
print(f"Best params: {best_params}")

# Early stopping on the validation loss.
my_stopper = EarlyStopping(
    monitor="val_loss",
    patience=5,
    min_delta=0.05,
    mode='min',
)
pl_trainer_kwargs = {"callbacks": [my_stopper]}

lstm_model = RNNModel(
    model="LSTM", 
    **best_params,
    n_epochs=20,
    # The training length is twice the selected input chunk length.
    training_length=best_params["input_chunk_length"] * 2,
    pl_trainer_kwargs=pl_trainer_kwargs,
    force_reset=True
)

fit_model(lstm_model)

# Evaluate on the test split, using the validation split as the prefix series.
eval_result = evaluation.cross_validation_without_refit2(
    model=lstm_model,
    prefix_series=val,
    test_series=test,
    metrics={
        'rmse': rmse, 
        'co2_rmse': evaluation.co2_rmse, 
    },
    data_scaler=smard_data_scaler,
    covariates=covariates_time.stack(weather_data),
    forecast_horizon=7*24,
) 

eval_co2_rmse = eval_result['co2_rmse']
print(f"Eval CO2 RMSE: {eval_co2_rmse}")

# Ensure the output directory exists so the save cannot fail on a fresh checkout.
os.makedirs("models", exist_ok=True)
lstm_model.save("models/best_model.pkl")

### XGBoost

In [None]:
def fit_model(model):
    """Fit ``model`` on the notebook-level ``train`` series, validating on ``val``.

    The covariate keyword arguments passed to ``model.fit`` are the first
    element of what ``evaluation.get_covariate_args`` returns for the stacked
    calendar and weather covariates.

    Args:
        model: Forecasting model exposing ``fit(series=..., val_series=..., **kwargs)``.
    """
    # `get_covariate_args` is only reachable through the imported `evaluation`
    # module; the bare name is never imported and raised NameError.
    covariate_args = evaluation.get_covariate_args(
        model=model,
        covariates=covariates_time.stack(weather_data),
    )[0]
    model.fit(series=train, val_series=val, **covariate_args)


# Lags shared by the target and the past covariates of the XGBoost models:
# a few short lags, then multiples of a day (24) and of a week (24 * 7).
hours_per_day = 24
hours_per_week = hours_per_day * 7
lags = [-1, -2, -3, -4, -8, -16]
lags += [-hours_per_day, -hours_per_day * 2]
lags += [-hours_per_week * n_weeks for n_weeks in (1, 2, 4, 8)]


def objective(trial):
    """Optuna objective for XGBoost: train one candidate and return its CO2 RMSE.

    Samples ``max_depth`` and ``n_estimators`` from ``trial``, fits an
    ``XGBModel`` via ``fit_model`` and scores it on ``val`` (with ``train``
    as prefix series).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``'co2_rmse'`` entry of the evaluation result.
    """
    # Dict literals evaluate in insertion order, so the suggestion order is fixed.
    tree_params = {
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    }

    print(f"Trialing with {trial.params}")

    # The same lag list is used for the target and for the past covariates.
    xgboost_model = XGBModel(
        lags=lags,
        lags_past_covariates=lags,
        verbosity=0,
        **tree_params,
    )

    fit_model(xgboost_model)

    # Evaluate
    scoring_metrics = {
        'rmse': rmse,
        'co2_rmse': evaluation.co2_rmse,
    }
    scores = evaluation.cross_validation_without_refit2(
        model=xgboost_model,
        prefix_series=train,
        test_series=val,
        metrics=scoring_metrics,
        data_scaler=smard_data_scaler,
        covariates=covariates_time.stack(weather_data),
        forecast_horizon=7 * 24,
    )

    co2_score = scores['co2_rmse']
    print(f"Eval CO2 RMSE: {co2_score}")

    return co2_score


# Hyperparameter search for the XGBoost model.
# The sampler is seeded so that the same candidate configurations are proposed
# on every run (TPESampler is also what Optuna uses by default for a
# single-objective study).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

In [None]:
# Bare expression so the notebook displays the best hyperparameters found by the study.
study.best_params

In [None]:
# Hyperparameters are pinned here rather than read from `study.best_params`,
# so this cell does not depend on the outcome of the search above.
best_params = {'max_depth': 9, 'n_estimators': 696} # study.best_params
print(f"Best params: {best_params}")

# The same lag list is used for the target and for the past covariates.
xgboost_model = XGBModel(**best_params, lags=lags, lags_past_covariates=lags, verbosity=0)

fit_model(xgboost_model)

# Evaluate on the test split, using the validation split as the prefix series.
eval_result = evaluation.cross_validation_without_refit2(
    model=xgboost_model,
    prefix_series=val,
    test_series=test,
    metrics={
        'rmse': rmse, 
        'co2_rmse': evaluation.co2_rmse, 
    },
    data_scaler=smard_data_scaler,
    covariates=covariates_time.stack(weather_data),
    forecast_horizon=7*24,
) 

eval_co2_rmse = eval_result['co2_rmse']

# Ensure the output directory exists so the save cannot fail on a fresh checkout.
os.makedirs("models", exist_ok=True)
xgboost_model.save("models/best_model_xgboost.pkl")

In [None]:
# Summarise the hyperparameters in use and the resulting CO2 RMSE.
summary_lines = (
    f"Best params: {best_params}",
    f"Eval CO2 RMSE: {eval_co2_rmse}",
)
for summary_line in summary_lines:
    print(summary_line)