In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from utils.matrix_builder import create_feature_matrix

# Sliding-window linear regression over the loaded dataset.
#
# For every window start `i`, the script takes `sliding_window + 1` consecutive
# rows, builds a feature matrix from them, fits a LinearRegression on all but
# the last matrix row and predicts that last row. Predictions are clamped to
# the range [0, 1000] and compared against the actual values at the end.

# Debug parameter: set to True to print the dataset size and per-window progress.
DEBUG = False

if DEBUG:
    print("Debug mode is ON. Detailed output will be printed.")

# Load Database (relative path: resolved against the current working directory)
csv_hour_file = '../data/ta_metrics/final_price_ta_metrics.csv'
df = pd.read_csv(csv_hour_file, parse_dates=['Datetime'])

# Define the Sliding Windows for this run
sliding_window = 10  # of days to train on (matrix rows)
lag_price_window = 3  # Window of the number of previous days as features (matrix columns)

# The following +1 is used to include the next row as the test set for model validation
training_sliding_window = sliding_window + 1

# Calculate number of sliding window models to train in the dataset
num_sliding_windows = len(df) - training_sliding_window
if DEBUG:
    print(f"Number of rows in the dataset: {len(df)}")

print(f"Number of sliding windows to train: {num_sliding_windows}")

# Initialize lists to store predictions, actuals, and timestamps
predictions_list = []
actuals_list = []
timestamps_list = []

# Initialize the model (the same estimator object is re-fitted for every window)
model = LinearRegression()

# Row slices keep the parent's columns, so this check only needs to run once.
has_datetime_column = 'Datetime' in df.columns

for i in range(num_sliding_windows):
    if DEBUG:
        print(f"Processing sliding window {i + 1}/{num_sliding_windows}...")

    # The range limit guarantees i + training_sliding_window <= len(df) - 1,
    # so this slice is always fully inside the DataFrame.
    sliding_window_set = df.iloc[i : i + training_sliding_window]

    # Create feature matrix and target variable for training.
    # This uses the builder imported at the top of the cell; the previous call
    # targeted `create_expanded_feature_matrix`, which is not imported here.
    X_train, y_train = create_feature_matrix(sliding_window_set, lag_price_window)

    # Split into training matrix and prediction feature row:
    # everything but the last row is fitted, the last row is held out.
    X_train_fit = X_train.iloc[:-1]
    y_train_fit = y_train.iloc[:-1]

    X_to_predict = X_train.iloc[-1:]  # slice (not scalar index) keeps it 2-D for predict()
    y_to_predict = y_train.iloc[-1]

    # Train the model with the feature matrix
    model.fit(X_train_fit, y_train_fit)

    # Make prediction with the next row of features
    y_predicted = model.predict(X_to_predict)

    # Add bounds checking to catch extreme predictions
    if y_predicted[0] < 0:  # Avoid negative price predictions, set lower limit to 0
        y_predicted[0] = 0
    if y_predicted[0] > 1000:  # limit to 1000, avoiding unrealistic predictions
        y_predicted[0] = 1000

    # Store results for this sliding window
    predictions_list.append(y_predicted[0])
    actuals_list.append(y_to_predict)
    if has_datetime_column:
        timestamps_list.append(sliding_window_set.iloc[-1]['Datetime'])
    else:
        # Fall back to the positional index of the predicted row.
        timestamps_list.append(i + training_sliding_window - 1)

# Create prediction vs actual DataFrame
prediction_df = pd.DataFrame({
    'Timestamp': timestamps_list,
    'Predicted': predictions_list,
    'Actual': actuals_list
})

# Calculate overall metrics across all windows
mse = mean_squared_error(actuals_list, predictions_list)
mae = mean_absolute_error(actuals_list, predictions_list)
r2 = r2_score(actuals_list, predictions_list)

print(f"Overall Mean Squared Error: {mse}")
print(f"Overall Mean Absolute Error: {mae}")
print(f"Overall R^2 Score: {r2}")

In [None]:
import pandas as pd
import numpy as np
from model_library import preferred_model
from utils.matrix_builder import create_feature_matrix

# Load Database: read the CSV below into `df`, parsing the 'Datetime' column
# as timestamps. The path is relative, so it is resolved against the current
# working directory.
csv_hour_file = '../data/ta_metrics/final_price_ta_metrics.csv'
df = pd.read_csv(csv_hour_file, parse_dates=['Datetime'])

def generic_training(df, sliding_window, lag_price_window, DEBUG=False):
    """
    Train a machine learning model using a sliding window approach.

    For each window start, `sliding_window + 1` consecutive rows of `df` are
    turned into a feature matrix with `create_feature_matrix`. The model is
    fitted on every matrix row except the last and then predicts that last
    row. Negative predictions are replaced by 0.

    Parameters:
    - df: DataFrame containing the dataset with features and target variable.
      If it has a 'Datetime' column, that column's value on the predicted row
      is stored as the timestamp; otherwise the row's positional index is used.
    - sliding_window: Number of rows to use for training in each sliding window.
    - lag_price_window: Number of previous days to use as features
      (forwarded unchanged to `create_feature_matrix`).
    - DEBUG: When True, print detailed progress and intermediate shapes.
      Defaults to False.

    Returns:
    - prediction_df: DataFrame with columns 'Timestamp', 'Predicted' and
      'Actual', one row per trained window. It is empty when `df` has too few
      rows to form a single window.

    Raises:
    - ValueError: if `sliding_window` is not greater than `lag_price_window`.
    """
    if DEBUG:
        print("Debug mode is ON. Detailed output will be printed.")

    # Validate input parameters
    if sliding_window <= lag_price_window:
        raise ValueError("Sliding window must be greater than to the price feature window.")

    training_sliding_window = sliding_window + 1  # +1 to include the next row as the test set

    # Calculate number of sliding window models to train in the dataset.
    # Clamp at 0 so a dataset shorter than one window reports "0 models"
    # instead of a negative count.
    num_sliding_windows = max(0, len(df) - training_sliding_window)
    if DEBUG:
        # Training for sliding windows and price feature window
        print(f"Training sliding window size: {training_sliding_window}, Price feature window size: {lag_price_window}")
        print(f"Number of rows in the dataset: {len(df)}")

    print(f"Number of models to train: {num_sliding_windows}")

    # Initialize lists to store predictions, actuals, and timestamps
    predictions_list = []
    actuals_list = []
    timestamps_list = []

    # Only build a model when there is at least one window to train on.
    # The same model object is re-fitted for every window.
    model = preferred_model() if num_sliding_windows > 0 else None

    # Row slices keep the parent's columns, so this check only needs to run once.
    has_datetime_column = 'Datetime' in df.columns

    for i in range(num_sliding_windows):
        if DEBUG:
            print(f"Processing sliding window {i + 1}/{num_sliding_windows}...")

        # Extract the sliding window set, including the next row to use for prediction.
        # The range limit keeps i + training_sliding_window strictly below len(df),
        # so the slice never runs past the end of the DataFrame.
        sliding_window_set = df.iloc[i : i + training_sliding_window]

        if DEBUG:
            print(f"Sliding window should have {training_sliding_window} rows, got {len(sliding_window_set)} rows.")
            print(f"Sliding window set:\n{sliding_window_set}")

        # Create feature matrix and target variable for training
        X_train, y_train = create_feature_matrix(sliding_window_set, lag_price_window)

        if DEBUG:
            print(f"Feature matrix shape: {X_train.shape}, Target variable shape: {y_train.shape}")
            print(f"Feature matrix:\n{X_train.head()}")
            print(f"Target variable:\n{y_train.head()}")

        # Split for training and prediction: all rows but the last are fitted,
        # the last row is held out as the one to predict.
        X_train_fit = X_train.iloc[:-1]
        y_train_fit = y_train.iloc[:-1]

        X_to_predict = X_train.iloc[-1:]  # slice (not scalar index) keeps it 2-D
        y_to_predict = y_train.iloc[-1]

        if DEBUG:
            print(f"Training features shape: {X_train_fit.shape}, Training target shape: {y_train_fit.shape}")
            print(f"Features to predict shape: {X_to_predict.shape}, Target to predict: {y_to_predict}")

        model.fit(X_train_fit, y_train_fit)
        y_predicted = model.predict(X_to_predict)

        # Add a lower bounds to set extreme negative predictions to 0, assuming prices cannot be negative
        if y_predicted[0] < 0:
            y_predicted[0] = 0

        # Store results
        predictions_list.append(y_predicted[0])
        actuals_list.append(y_to_predict)
        if has_datetime_column:
            timestamps_list.append(sliding_window_set.iloc[-1]['Datetime'])
        else:
            # Fall back to the positional index of the predicted row.
            timestamps_list.append(i + training_sliding_window - 1)

    # Create final prediction DataFrame
    prediction_df = pd.DataFrame({
        'Timestamp': timestamps_list,
        'Predicted': predictions_list,
        'Actual': actuals_list
    })

    if DEBUG:
        print(f"Final prediction DataFrame:\n{prediction_df.head()}")

    return prediction_df