# Baseline Model
---

The aim of this script is to provide a baseline model for our electricity prices project.

Idea:
The baseline model predicts, for each hour, the price observed at the same hour on the previous day (a 24-hour lag). This simple approach leverages the daily seasonal pattern often observed in electricity prices.

In [None]:
# TODO: decide whether to track this baseline with MLflow. No model is trained here, but the evaluation metrics (e.g. the MSE per split) could still be logged.

In [None]:
# load MLFlow package

In [None]:
####################################### Setup #######################################

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import pytz
from dotenv import load_dotenv
from sklearn.metrics import mean_squared_error
from sqlalchemy import create_engine

In [None]:
# Load data
# Read the database credentials from the .env file into the environment
load_dotenv()

DB_NAME = os.getenv('DB_NAME')
DB_USERNAME = os.getenv('DB_USERNAME')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')

DB_STRING = f'postgresql://{DB_USERNAME}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'

# Create SQLAlchemy engine (pandas runs the SELECT through this engine)
engine = create_engine(DB_STRING)

# Create a new connection using psycopg2 for non-pandas operations
conn = psycopg2.connect(
    database=DB_NAME,
    user=DB_USERNAME,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)

# Initialised up front so the `finally` clause can tell whether a cursor was
# ever opened; closing an unbound name would raise NameError and mask the
# original error.
cursor = None

try:
    # Sanity check: ask the server for its version to confirm the connection works
    cursor = conn.cursor()
    cursor.execute("SELECT version();")
    record = cursor.fetchone()
    print("You are connected to -", record, "\n")

    ####################################### EXTRACT #######################################

    # Load data from the database using SQLAlchemy engine
    # NOTE(review): "FILENAME_HERE" looks like a placeholder table name — replace
    # it with the actual table in the "03_gold" schema before running.
    print("Energy generation data loading!")
    query_string2 = 'SELECT * FROM "03_gold"."FILENAME_HERE"'
    df_baseline = pd.read_sql(query_string2, engine)
    print("Loading finished!")

except Exception as error:
    # The error is only reported, not re-raised: if loading failed,
    # `df_baseline` is left undefined and the following cells will fail.
    print("Error while connecting to PostgreSQL:", error)

finally:
    # Always release the psycopg2 resources, whether or not the load succeeded
    if cursor is not None:
        cursor.close()
    if conn:
        conn.close()
        print("PostgreSQL connection is closed")

In [None]:
# Lag feature: the price 24 rows earlier, i.e. the same hour on the previous day.
# NOTE(review): assumes rows are sorted by time with exactly one row per hour and no gaps — confirm.
df_baseline['price_lag24'] = df_baseline['price'].shift(periods=24)

In [None]:
# Remove rows with missing values (the lag leaves the first rows without a lagged price).
# NOTE(review): this drops a row if *any* column is NaN, not only the lag column — confirm that is intended.
df_baseline.dropna(axis=0, how='any', inplace=True)

In [None]:
# Target: the actual price. Features: every remaining column (including the lagged price).
y = df_baseline['price']
X = df_baseline.drop('price', axis=1)

In [None]:
# Expanding-window evaluation of the baseline.
# The data is cut into (n_splits + 1) equally sized chunks; split i "trains" on
# the first (i + 1) chunks and validates on the chunk that follows. The baseline
# needs no fitting, so the training part is only kept for reference/reporting.
n_splits = 5
test_size = len(X) // (n_splits + 1)

# Fail early with a clear message instead of letting the metric computation
# choke on an empty validation window.
if test_size == 0:
    raise ValueError(
        f"Not enough rows ({len(X)}) to build {n_splits} splits; "
        f"at least {n_splits + 1} rows are required."
    )

splits = []
mse_scores = []

for i in range(n_splits):
    train_end = (i + 1) * test_size
    # Ensure that test_end does not go beyond the length of the dataset
    test_end = min(train_end + test_size, len(X))

    X_train = X.iloc[:train_end]
    X_test = X.iloc[train_end:test_end]
    y_train = y.iloc[:train_end]
    y_test = y.iloc[train_end:test_end]

    splits.append((X_train, X_test, y_train, y_test))

    # Predict using baseline (prior day's values): the prediction is simply
    # the 24-hour lagged price column.
    y_pred = X_test['price_lag24']

    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

    # Plot actual vs. predicted prices for this validation window
    plt.figure(figsize=(10, 4))
    plt.plot(y_test.index, y_test, label='Actual', color='blue')
    plt.plot(y_test.index, y_pred, label='Predicted', color='red', linestyle='--')
    plt.title(f'Actual vs Predicted for Split {i+1}')
    plt.xlabel('Time')
    plt.ylabel('Target')
    plt.legend()
    plt.show()

    print(f"Split {i+1}: Training data: {X_train.shape}, Validation data: {X_test.shape}")
    print(f"Training target: {y_train.shape}, Validation target: {y_test.shape}")
    print(f"MSE for Split {i+1}: {mse}\n")

# Optional: Average MSE across all splits for overall evaluation
average_mse = sum(mse_scores) / len(mse_scores)
print(f"Average MSE across all splits: {average_mse}")