Pierre Nikitits
## Course Project: Electricity Price Explanation

Dataset:

- Consumption
- Exchange
- Net Export/Import
- Energy Sources
- Residual Load
- Weather Conditions
- Market Dynamics

Steps:

1. Preprocessing Data
2. Random Forest
3. Training and Validation
4. Hyperparameter Tuning
5. Evaluation
6. Interpretation


## Loading and Preprocessing the data

In [1]:
import pandas as pd
path = "/Users/pierre/Documents/GitHub/EnsembleLearningProject/Data/"

X_train = pd.read_csv(path + 'X_train.csv').set_index('ID')
y_train = pd.read_csv(path + 'y_train.csv').set_index('ID')
X_test = pd.read_csv(path + 'X_test.csv').set_index('ID')
y_test = pd.read_csv(path + 'y_test.csv').set_index('ID')

In [2]:
print("X_train :" , X_train.shape)
print("y_train :" , y_train.shape)

print("\nX_test  :" , X_test.shape)
print("y_test  :" , y_test.shape)

X_train : (1494, 34)
y_train : (1494, 1)

X_test  : (654, 34)
y_test  : (654, 1)


In [3]:
X_train.drop(columns=['COUNTRY' , 'DAY_ID'], inplace=True)
X_test.drop(columns=['COUNTRY' , 'DAY_ID'], inplace=True)

In [4]:
X_train.head()

Unnamed: 0_level_0,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,FR_NET_IMPORT,DE_GAS,FR_GAS,...,FR_RESIDUAL_LOAD,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1054,0.210099,-0.427458,-0.606523,0.606523,,0.69286,,-0.69286,0.441238,-0.213766,...,-0.444661,-0.17268,-0.556356,-0.790823,-0.28316,-1.06907,-0.063404,0.339041,0.124552,-0.002445
2049,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,1.130838,0.174773,0.42694,...,-1.183194,-1.2403,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365
1924,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,1.682587,2.351913,2.122241,...,1.947273,-0.4807,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952
297,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,-0.56323,0.487818,0.194659,...,-0.976974,-1.114838,-0.50757,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948
1101,0.143807,-0.617038,-0.92499,0.92499,,0.990324,,-0.990324,0.238693,-0.240862,...,-0.526267,-0.541465,-0.42455,-1.088158,-1.01156,0.614338,0.729495,0.245109,1.526606,2.614378


In [5]:
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]

X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

In [6]:
print(X_train.shape)
print(y_train.shape)

(1276, 32)
(1276, 1)


## Model definition and training

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

random_forest = RandomForestRegressor(n_estimators=500, random_state=42, max_depth=6 , min_samples_split=14)
random_forest.fit(X_train, y_train.values.ravel())
predictions = random_forest.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.1848723490624737


## Metrics

In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt
import numpy as np

rmse = sqrt(mean_squared_error(y_test, predictions))
print(f"Root Mean Squared Error (RMSE): {rmse}")

mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error (MAE): {mae}")

r2 = r2_score(y_test, predictions)
print(f"R-squared (R²): {r2}")

n = X_test.shape[0]
p = X_test.shape[1]
adjusted_r2 = 1 - (1-r2) * (n-1) / (n-p-1)
print(f"Adjusted R-squared: {adjusted_r2}")

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(y_test, predictions)
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")


Root Mean Squared Error (RMSE): 1.0885184192573287
Mean Absolute Error (MAE): 0.878871901860084
R-squared (R²): -0.1283011719434577
Adjusted R-squared: -0.19591472531834664
Mean Absolute Percentage Error (MAPE): 308.9077584391481%


## Grid search over Random Forest parameters

In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [300, 400, 500],
    'max_depth': [6, 8, 10],
    'min_samples_split': [10, 12, 14]
}


random_forest = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train.values.ravel())

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'max_depth': 6, 'min_samples_split': 14, 'n_estimators': 500}
