In [7]:
db_user

In [12]:
# Load environment variables from .env file
load_dotenv()

# Get database connection parameters from environment variables
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')

# Create the database URL
db_url = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"

# Create an engine
engine = create_engine(db_url)

# Define your query -> set your table name here
query = 'SELECT * FROM "03_gold"."fact_electricity_market_germany"'

# Execute the query and load the data into a pandas DataFrame
df = pd.read_sql(query, engine)

In [13]:
# Get forecast data
# Define your query
query3 = 'SELECT * FROM "02_silver"."fact_full_weather"'

# Execute the query and load the data into a pandas DataFrame
forecasts = pd.read_sql(query3, engine).sort_values('timestamp')

In [14]:
# Add time variables
df['hour'] = df['timestamp'].dt.hour
df['dayofweek'] = df['timestamp'].dt.dayofweek
df['quarter'] = df['timestamp'].dt.quarter
df['month'] = df['timestamp'].dt.month

In [26]:
# create splits for expanding window crossvalidation
# Parameters
n_splits = 23 # our data has 23 quarters = (len(X) / 2190)
train_increment = 2190  # Number of hours in a quarter year
test_size = 72 # we currently want to predict for 72 hours. Can be adjusted.

class CustomTimeSeriesSplit:
    def __init__(self, n_splits, train_increment, test_size):
        self.n_splits = n_splits
        self.train_increment = train_increment
        self.test_size = test_size

    def split(self, X):
        n_samples = len(X)
        indices = np.arange(n_samples)
        splits = []
        
        for i in range(self.n_splits):
            train_end = (i + 1) * self.train_increment
            test_end = train_end + self.test_size
            
            if test_end > n_samples:
                break

            train_index = indices[:train_end]
            test_index = indices[train_end:test_end]
            splits.append((train_index, test_index))
        
        return splits

In [None]:
# Get the custom splits
custom_splitter = CustomTimeSeriesSplit(n_splits=n_splits, train_increment=train_increment, test_size=test_size)
splits = custom_splitter.split(X)

In [15]:
def create_lagged_features(df, target_column, lagged_vars):
    for lag in lagged_vars:
        df[f'{target_column}_{lag}_lag'] = df[target_column].shift(lag)
        df[f'{target_column}_{lag}_mean'] = df[target_column].shift(1).rolling(lag).mean()
        df[f'{target_column}_{lag}_std'] = df[target_column].shift(1).rolling(lag).std()
        df[f'{target_column}_{lag}_max'] = df[target_column].shift(1).rolling(lag).max()
        df[f'{target_column}_{lag}_min'] = df[target_column].shift(1).rolling(lag).min()
    return df

In [64]:
def evaluate_time_series_model_sliding(df, known_vars, target_column, model_class, model_params=None, train_size=2190, test_size=72):
    if model_params is None:
        model_params = {}

    mae_scores = []
    plot_data = []
    mse_score_sum = []

    lagged_vars = [6, 12, 24, 48, 72]

    # Define sliding window split
    for split_start in range(0, len(df) - train_size - test_size + 1, train_size):
        train_start = split_start
        train_end = split_start + train_size
        test_start = train_end
        test_end = train_end + test_size

        train_data = df.iloc[train_start:train_end].copy()
        test_data = df.iloc[test_start:test_end].copy()

        # Combine last 72 hours of training data with test data for lagged feature creation
        combined_data = pd.concat([train_data.iloc[-72:], test_data])
        
        # Create lagged features for training data
        train_data = create_lagged_features(train_data, target_column, lagged_vars).dropna()
                
        known_features_train = train_data[known_vars + [f'{target_column}_{lag}_lag' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_mean' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_std' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_max' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_min' for lag in lagged_vars]].dropna()
        target_train = train_data[target_column]
        
        # Train the model
        model = model_class(**model_params)
        model.fit(known_features_train, target_train)

        # Initialize the dataset with known values of y
        known_data = combined_data.iloc[:72].copy()

        # Predict the test data
        predictions = []
        
        
        # Iterate over the rows where we need to predict y (from 73rd to 144th row)
        for i in range(72, len(combined_data)):
            # Append the predicted value of y from the previous step to known_data
            if i > 72:
                for var in known_vars:
                    known_data[var] = pd.concat([known_data[var], pd.Series([predictions[-1]])], ignore_index=True)

            # Create lagged features based on the current known_data
            known_data_with_lags = create_lagged_features(combined_data, target_column, lagged_vars).dropna()
            # Check if there are enough rows to predict
            if i >= 144:
                break
            # Prepare predictors for the current row
            predictors = known_data_with_lags.iloc[-1][known_vars + \
                                               [f'{target_column}_{lag}_lag' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_mean' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_std' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_max' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_min' for lag in lagged_vars]].values.reshape(1, -1)
            print(predictors)
            prediction = model.predict(predictors)[0]
            predictions.append(prediction)




            # # Create lagged features for the test data up to the current point
            # if i == 0:
            #     test_data_with_lags = create_lagged_features(test_data.copy(), target_column, lagged_vars)
            # else:
            #     test_data_with_lags.loc[test_data.index[i], target_column] = predictions[-1]
            #     test_data_with_lags = create_lagged_features(test_data_with_lags, target_column, lagged_vars)

            # # Ensure no NaN values exist after creating lagged features
            # test_data_with_lags.dropna(inplace=True)
            # # Check if there are enough rows to predict
            # if i >= len(test_data_with_lags):
            #     break
            
            # known_features_test = test_data_with_lags[known_vars + [f'{target_column}_{lag}_lag' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_mean' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_std' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_max' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_min' for lag in lagged_vars]].iloc[i].values.reshape(1, -1)
            # predictions.append(model.predict(known_features_test)[0])

        # Calculate MAE for the current split
        actual = test_data[target_column][:test_size].values
        mae = mean_absolute_error(actual, predictions)
        mae_scores.append(mae)
        print(f"Split {split_start // test_size + 1}: MAE = {mae}")

        # Calculate MSE for plotting
        mse_train = mean_squared_error(target_train, model.predict(known_features_train))
        mse_test = mean_squared_error(actual, predictions)

        mse_score_sum.append({'split': split_start // test_size + 1, 'Type': 'Train', 'MSE': mse_train})
        mse_score_sum.append({'split': split_start // test_size + 1, 'Type': 'Test', 'MSE': mse_test})

        plot_data.append({
            'split': split_start // test_size + 1,
            'y_train': target_train,
            'y_pred_train': model.predict(known_features_train),
            'y_test': actual,
            'y_pred_test': predictions
        })

    average_mae = np.mean(mae_scores)
    print(f"Average MAE across all splits: {average_mae}")

    mse_score_sum = pd.DataFrame(mse_score_sum)

    return average_mae, plot_data, mse_score_sum

In [65]:
average_mae, plot_data, mse_score_sum = evaluate_time_series_model_sliding(
    df, known_vars, target_column, ExtraTreesRegressor)

UnboundLocalError: cannot access local variable 'known_data_with_lags' where it is not associated with a value

In [None]:
def evaluate_time_series_model_sliding(df, known_vars, target_column, model_class, model_params=None, train_size=2190, test_size=72):
    if model_params is None:
        model_params = {}

    mae_scores = []
    plot_data = []
    mse_score_sum = []

    lagged_vars = [6, 12, 24, 48, 72]

    # Define sliding window split
    for split_start in range(0, len(df) - train_size - test_size + 1, train_size):
        train_start = split_start
        train_end = split_start + train_size
        test_start = train_end
        test_end = train_end + test_size

        train_data = df.iloc[train_start:train_end].copy()
        test_data = df.iloc[test_start:test_end].copy()

        # Combine last 72 hours of training data with test data for lagged feature creation
        combined_data = pd.concat([train_data.iloc[-72:], test_data])
        
        # Create lagged features for training data
        train_data = create_lagged_features(train_data, target_column, lagged_vars).dropna()
                
        known_features_train = train_data[known_vars + [f'{target_column}_{lag}_lag' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_mean' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_std' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_max' for lag in lagged_vars] + \
                                            [f'{target_column}_{lag}_min' for lag in lagged_vars]].dropna()
        target_train = train_data[target_column]
        
        # Train the model
        model = model_class(**model_params)
        model.fit(known_features_train, target_train)

        # Initialize the dataset with known values of y
        known_data = combined_data.iloc[:72].copy()

        
        
        # Predict the test data
        predictions = []
                
        # Iterate over the rows where we need to predict y (from 73rd to 144th row)
        for i in range(72, len(combined_data)):
            # Append the predicted value of y from the previous step to known_data
            if i > 72:
                for var in known_vars:
                    known_data[var] = pd.concat([known_data[var], pd.Series([predictions[-1]])], ignore_index=True)

            # Create lagged features based on the current known_data
            known_data_with_lags = create_lagged_features(combined_data, target_column, lagged_vars).dropna()
            # Check if there are enough rows to predict
            if i >= 144:
                break
            # Prepare predictors for the current row
            predictors = known_data_with_lags.iloc[-1][known_vars + \
                                               [f'{target_column}_{lag}_lag' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_mean' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_std' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_max' for lag in lagged_vars] + \
                                               [f'{target_column}_{lag}_min' for lag in lagged_vars]].values.reshape(1, -1)
            print(predictors)
            prediction = model.predict(predictors)[0]
            predictions.append(prediction)




            # # Create lagged features for the test data up to the current point
            # if i == 0:
            #     test_data_with_lags = create_lagged_features(test_data.copy(), target_column, lagged_vars)
            # else:
            #     test_data_with_lags.loc[test_data.index[i], target_column] = predictions[-1]
            #     test_data_with_lags = create_lagged_features(test_data_with_lags, target_column, lagged_vars)

            # # Ensure no NaN values exist after creating lagged features
            # test_data_with_lags.dropna(inplace=True)
            # # Check if there are enough rows to predict
            # if i >= len(test_data_with_lags):
            #     break
            
            # known_features_test = test_data_with_lags[known_vars + [f'{target_column}_{lag}_lag' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_mean' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_std' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_max' for lag in lagged_vars] + \
            #                                 [f'{target_column}_{lag}_min' for lag in lagged_vars]].iloc[i].values.reshape(1, -1)
            # predictions.append(model.predict(known_features_test)[0])

        # Calculate MAE for the current split
        actual = test_data[target_column][:test_size].values
        mae = mean_absolute_error(actual, predictions)
        mae_scores.append(mae)
        print(f"Split {split_start // test_size + 1}: MAE = {mae}")

        # Calculate MSE for plotting
        mse_train = mean_squared_error(target_train, model.predict(known_features_train))
        mse_test = mean_squared_error(actual, predictions)

        mse_score_sum.append({'split': split_start // test_size + 1, 'Type': 'Train', 'MSE': mse_train})
        mse_score_sum.append({'split': split_start // test_size + 1, 'Type': 'Test', 'MSE': mse_test})

        plot_data.append({
            'split': split_start // test_size + 1,
            'y_train': target_train,
            'y_pred_train': model.predict(known_features_train),
            'y_test': actual,
            'y_pred_test': predictions
        })

    average_mae = np.mean(mae_scores)
    print(f"Average MAE across all splits: {average_mae}")

    mse_score_sum = pd.DataFrame(mse_score_sum)

    return average_mae, plot_data, mse_score_sum

In [17]:
def plot_time_series(plot_data, mse_score_sum, model_name='', only_bar=True):
    """
    Plot the time series data stored in plot_data.

    Parameters:
    - plot_data: list of dictionaries containing data for plotting
    - mse_score_sum: DataFrame containing MSE scores for each split and type (Train/Test)
    - only_bar: if False, Time Series Plots will created for each split. Default = True.
    """

    if not only_bar:
        for data in plot_data:
            split = data['split']
            y_train = data['y_train']
            y_pred_train = data['y_pred_train']
            y_test = data['y_test']
            y_pred_test = data['y_pred_test']

            # Plotting Full Time Series
            plt.figure()
            sns.lineplot(x=y_train.index, y=y_train, label='Actual train')
            sns.lineplot(x=y_train.index, y=y_pred_train, label='Predicted train', linestyle='--')
            sns.lineplot(x=y_test.index, y=y_test, label='Actual test')
            sns.lineplot(x=y_test.index, y=y_pred_test, label='Predicted test', linestyle='--')
            plt.title(f'Actual vs Predicted for Split {split}')
            plt.xlabel('Time')
            plt.ylabel('Target')
            plt.legend()
            plt.show()

            # Plotting Zoomed Time Series
            plt.figure()
            sns.lineplot(x=y_train.index[-168:], y=y_train[-168:], label='Actual train')
            sns.lineplot(x=y_train.index[-168:], y=y_pred_train[-168:], label='Predicted train', linestyle='--')
            sns.lineplot(x=y_test.index, y=y_test, label='Actual test')
            sns.lineplot(x=y_test.index, y=y_pred_test, label='Predicted test', linestyle='--')
            plt.title(f'Zoomed: Actual vs Predicted for Split {split}')
            plt.xlabel('Time')
            plt.ylabel('Target')
            plt.legend()
            plt.show()

    # Plot using seaborn
    if model_name:
        model_name = f'{model_name}: '
    plt.figure()
    sns.barplot(x='split', y='MSE', hue='Type', data=mse_score_sum)
    plt.title(f'{model_name}Comparison of MSE for Training and Test Sets')
    plt.ylabel('Mean Squared Error')
    plt.xlabel('Split')
    plt.show()

In [18]:
known_vars = ['temperature_2m', 'relative_humidity_2m', 'apparent_temperature', 'precipitation', 'cloud_cover', 'wind_speed_10m', 'wind_direction_10m',
            'direct_radiation', 'diffuse_radiation', 'sunshine_duration', 'hour', 'dayofweek', 'quarter', 'month']

In [19]:
target_column = 'price_eur_mwh'

In [None]:
# Number of rows where y is known initially
initial_known_rows = 72

# Lagged variables for y
lagged_vars = [6, 12, 24, 48, 72]

# Function to create lagged features for y
def create_lagged_features(df, target_column, lagged_vars):
    for lag in lagged_vars:
        df[f'{target_column}_{lag}_lag'] = df[target_column].shift(lag)
        df[f'{target_column}_{lag}_mean'] = df[target_column].shift(1).rolling(lag).mean()
        df[f'{target_column}_{lag}_std'] = df[target_column].shift(1).rolling(lag).std()
        df[f'{target_column}_{lag}_max'] = df[target_column].shift(1).rolling(lag).max()
        df[f'{target_column}_{lag}_min'] = df[target_column].shift(1).rolling(lag).min()
    return df

# Initialize the dataset with known values of y
known_data = df.iloc[:initial_known_rows].copy()

# Initialize predictions list to store predicted values of y
predictions = []

# Fit the model initially with known data
known_data_with_lags = create_lagged_features(known_data.copy(), 'y', lagged_vars).dropna()
model = ExtraTreesRegressor()  # Example model
model.fit(known_data_with_lags[['known_var1', 'known_var2'] + \
                               [f'y_{lag}_lag' for lag in lagged_vars] + \
                               [f'y_{lag}_mean' for lag in lagged_vars] + \
                               [f'y_{lag}_std' for lag in lagged_vars] + \
                               [f'y_{lag}_max' for lag in lagged_vars] + \
                               [f'y_{lag}_min' for lag in lagged_vars]], 
          known_data_with_lags['y'])

# Iterate over the rows where we need to predict y (from 73rd to 144th row)
for i in range(initial_known_rows, len(df)):
    # For the first prediction, no previous predictions exist
    if i == initial_known_rows:
        predictors = known_data_with_lags.iloc[-1][['known_var1', 'known_var2'] + \
                                                   [f'y_{lag}_lag' for lag in lagged_vars] + \
                                                   [f'y_{lag}_mean' for lag in lagged_vars] + \
                                                   [f'y_{lag}_std' for lag in lagged_vars] + \
                                                   [f'y_{lag}_max' for lag in lagged_vars] + \
                                                   [f'y_{lag}_min' for lag in lagged_vars]].values.reshape(1, -1)
    else:
        # Append the predicted value of y from the previous step to known_data
        known_data = pd.concat([known_data, pd.DataFrame({var: [predictions[-1] if var == 'y' else df.iloc[i][var]] for var in df.columns})], ignore_index=True)

        # Create lagged features based on the current known_data
        known_data_with_lags = create_lagged_features(known_data.copy(), 'y', lagged_vars).dropna()

        # Prepare predictors for the current row
        predictors = known_data_with_lags.iloc[-1][['known_var1', 'known_var2'] + \
                                                   [f'y_{lag}_lag' for lag in lagged_vars] + \
                                                   [f'y_{lag}_mean' for lag in lagged_vars] + \
                                                   [f'y_{lag}_std' for lag in lagged_vars] + \
                                                   [f'y_{lag}_max' for lag in lagged_vars] + \
                                                   [f'y_{lag}_min' for lag in lagged_vars]].values.reshape(1, -1)
    
    # Example model prediction (you should replace this with your model fitting and prediction logic)
    prediction = model.predict(predictors)[0]
    predictions.append(prediction)

# Calculate MAE for the predicted values (from 73rd to 144th row)
actual = df.iloc[initial_known_rows:]['y'].values
mae = mean_absolute_error(actual, predictions)
print(f"Mean Absolute Error: {mae}")