In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.metrics import balanced_accuracy_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_class_weight

# 5. Binary-classification-based forecasting models

### 5.1. Testing all Binary-classification-based forecasting models for the month of APRIL 2021 to select the best two models 

In [3]:
# ---------------------------------------------------------------------------
# 5.1 pipeline: train four classifiers on Jan 2019 - Mar 2021 and score them
# on April 2021 to choose candidates for the hybrid model.
# ---------------------------------------------------------------------------

# Read the CSV and index it by timestamp so it can be sliced with date strings
file_path = 'Database_1_capped.csv'
data = pd.read_csv(file_path)
data['Date and Time'] = pd.to_datetime(data['Date and Time'], format='%d/%m/%Y %H:%M')
data = data.set_index('Date and Time')

# Chronological train / test windows (both ends inclusive)
train_start, train_end = '2019-01-01', '2021-03-31'
test_start, test_end = '2021-04-01', '2021-04-30'

train_data = data.loc[train_start:train_end]
test_data = data.loc[test_start:test_end]

# Features are every column except the demand column, which is the target
X_train, y_train = train_data.drop(columns=['Demand_Capped']), train_data['Demand_Capped']
X_test, y_test = test_data.drop(columns=['Demand_Capped']), test_data['Demand_Capped']

# Binarise the target: 1 when demand is above the *training* median, else 0.
# The same training-derived threshold is applied to the test window.
threshold = y_train.median()
y_train = (y_train > threshold).astype(int)
y_test = (y_test > threshold).astype(int)

# One-hot encode the categorical columns; the encoder is fitted on training data only
categorical_features = ['REGION', 'Holiday', 'Season']
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_categorical = encoder.fit_transform(X_train[categorical_features])
encoded_categorical_test = encoder.transform(X_test[categorical_features])

# Remaining (non-categorical) columns go in first, encoded columns are appended
X_train_numerical = X_train.drop(columns=categorical_features).values
X_test_numerical = X_test.drop(columns=categorical_features).values
X_train_processed = np.concatenate([X_train_numerical, encoded_categorical], axis=1)
X_test_processed = np.concatenate([X_test_numerical, encoded_categorical_test], axis=1)

# Oversample the training set with SMOTE (the test set is never resampled)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

# Standardise features; the scaler is fitted on the resampled training matrix.
# NOTE(review): SMOTE runs on unscaled features here, so its neighbour search is
# influenced by raw feature magnitudes - confirm this ordering is intended.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test_processed)

# Candidate classifiers, keyed by the display name used in the report below
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'k-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Artificial Neural Network': MLPClassifier(hidden_layer_sizes=(20,), max_iter=2000, random_state=42)
}

# Fit every candidate on the same training matrix and score it on April 2021
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train_balanced)
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred, pos_label=1)  # recall on the positive class (PELDs)
    confusion = confusion_matrix(y_test, y_pred)

    results[name] = {
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Confusion Matrix': confusion
    }

# Report: one four-line section per model, in the order the models were defined
for model_name, metrics in results.items():
    print(
        f"\nModel: {model_name}",
        f"Accuracy: {metrics['Accuracy']}",
        f"Sensitivity: {metrics['Sensitivity']}",
        f"Confusion Matrix: \n{metrics['Confusion Matrix']}",
        sep="\n",
    )



Model: Logistic Regression
Accuracy: 0.8159722222222222
Sensitivity: 0.4845605700712589
Confusion Matrix: 
[[971  48]
 [217 204]]

Model: k-Nearest Neighbors
Accuracy: 0.7875
Sensitivity: 0.5130641330166271
Confusion Matrix: 
[[918 101]
 [205 216]]

Model: Random Forest
Accuracy: 0.7951388888888888
Sensitivity: 0.39429928741092635
Confusion Matrix: 
[[979  40]
 [255 166]]

Model: Artificial Neural Network
Accuracy: 0.8125
Sensitivity: 0.5320665083135392
Confusion Matrix: 
[[946  73]
 [197 224]]


#### RESULT:

Individual performances of four different binary classification models:

     Model: Logistic Regression
     Accuracy: 0.8159722222222222
     Sensitivity: 0.4845605700712589

     Model: k-Nearest Neighbors
     Accuracy: 0.7875
     Sensitivity: 0.5130641330166271


     Model: Random Forest
     Accuracy: 0.7951388888888888
     Sensitivity: 0.39429928741092635


     Model: Artificial Neural Network
     Accuracy: 0.8125
     Sensitivity: 0.5320665083135392
     


Based on these results, we have selected two models for our hybrid model: Logistic Regression (highest accuracy, 0.816) and Artificial Neural Network (highest sensitivity, 0.532).

### 5.2 Logistic Regression model (Tested for 2021 data)

In [67]:
# ---------------------------------------------------------------------------
# 5.2 pipeline: Logistic Regression trained on 2019-2020, evaluated on 2021.
# Half-hourly predictions are rolled up to one flag per day and saved to CSV.
# ---------------------------------------------------------------------------

# Read the CSV and index it by timestamp so it can be sliced with date strings
file_path = 'Database_1_capped.csv'
data = pd.read_csv(file_path)
data['Date and Time'] = pd.to_datetime(data['Date and Time'], format='%d/%m/%Y %H:%M')
data = data.set_index('Date and Time')

# Fixed training and test periods (both ends inclusive)
train_start, train_end = '2019-01-01', '2020-12-31'
test_start, test_end = '2021-01-01', '2021-12-31'

train_data = data.loc[train_start:train_end]
test_data = data.loc[test_start:test_end]

# Features are every column except the demand column, which is the target
X_train, y_train = train_data.drop(columns=['Demand_Capped']), train_data['Demand_Capped']
X_test, y_test = test_data.drop(columns=['Demand_Capped']), test_data['Demand_Capped']

# Training labels: 1 when demand exceeds the 95th percentile of training demand
threshold = train_data['Demand_Capped'].quantile(0.95)
y_train = (y_train > threshold).astype(int)

# Test labels: 1 when demand exceeds the 95th percentile of *test-period* demand.
# NOTE(review): the test threshold is derived from the test year itself rather
# than reusing the training threshold - confirm this labelling is intended.
test_threshold = test_data['Demand_Capped'].quantile(0.95)
y_test = (y_test > test_threshold).astype(int)

# One-hot encode categorical and temporal columns; encoder fitted on training data only
categorical_features = ['REGION', 'Holiday', 'Season', 'Month', 'Day', 'Hour']
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_categorical = encoder.fit_transform(X_train[categorical_features])
encoded_categorical_test = encoder.transform(X_test[categorical_features])

# Remaining (non-categorical) columns go in first, encoded columns are appended
numerical_features = X_train.drop(columns=categorical_features).values
X_test_numerical = X_test.drop(columns=categorical_features).values
X_train_processed = np.concatenate([numerical_features, encoded_categorical], axis=1)
X_test_processed = np.concatenate([X_test_numerical, encoded_categorical_test], axis=1)

# Oversample the training set with SMOTE (the test set is never resampled)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

# Standardise features; the scaler is fitted on the resampled training matrix
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test_processed)

# Fit the classifier and predict every half-hourly test row.
# NOTE(review): class_weight='balanced' is applied on top of SMOTE-resampled
# data - verify that both are wanted.
lr_model = LogisticRegression(max_iter=1000, random_state=42, penalty='l2', class_weight='balanced')
lr_model.fit(X_train_scaled, y_train_balanced)
y_pred_lr = lr_model.predict(X_test_scaled)

# Attach the predictions and the calendar date to a copy of the test rows
lr_prediction_df = test_data.assign(
    PLED_Prediction=y_pred_lr,
    Date=test_data.index.date,
)

# Daily roll-up: a day is flagged (1) if any of its rows is flagged
daily_y_test = y_test.groupby(y_test.index.date).max()
daily_lr_prediction = lr_prediction_df.groupby('Date')['PLED_Prediction'].max().reset_index()
daily_lr_prediction['PLED_Prediction'] = daily_lr_prediction['PLED_Prediction'].astype(int)

# Store the daily ground truth next to the forecast (aligned by position)
daily_lr_prediction['Actual_PLED'] = daily_y_test.values

# Persist the daily table; a later step reads this file back
lr_output_file = "Logistic_Regression_PLED_Classification_Daily999.csv"
daily_lr_prediction.to_csv(lr_output_file, index=False)

# Day-level evaluation metrics
balanced_accuracy_lr = balanced_accuracy_score(daily_y_test, daily_lr_prediction['PLED_Prediction'])
sensitivity_lr = recall_score(daily_y_test, daily_lr_prediction['PLED_Prediction'], pos_label=1)  # recall on PELDs
confusion_lr = confusion_matrix(daily_y_test, daily_lr_prediction['PLED_Prediction'])

# Report
print(
    "\nModel: Logistic Regression",
    f"Balanced Accuracy: {balanced_accuracy_lr}",
    f"Sensitivity: {sensitivity_lr}",
    f"Confusion Matrix: \n{confusion_lr}",
    sep="\n",
)

# Tally PLED (1) and Non-PLED (0) predictions stored in a results CSV
def count_pled_nonpled(csv_file_path):
    """Return how often each value occurs in a results file's 'PLED_Prediction' column.

    Parameters
    ----------
    csv_file_path : path of a CSV file that contains a 'PLED_Prediction' column.

    Returns
    -------
    pandas.Series produced by ``value_counts()``: index = distinct prediction
    values, values = number of rows carrying each one.
    """
    predictions = pd.read_csv(csv_file_path)['PLED_Prediction']
    return predictions.value_counts()

# Read the saved daily Logistic Regression results back and tally the forecast flags
lr_csv_path = 'Logistic_Regression_PLED_Classification_Daily999.csv'
lr_counts = count_pled_nonpled(lr_csv_path)

print("\nLogistic Regression (LR) PLED Counts:", lr_counts, sep="\n")

# Side-by-side table of daily ground truth and daily forecast
actual_vs_forecast_df = pd.DataFrame({
    'Date': daily_y_test.index,
    'Actual': daily_y_test.values,
    'Forecast': daily_lr_prediction['PLED_Prediction']
})

# Boolean masks for each outcome; summing a mask counts the matching days
is_actual_pled = actual_vs_forecast_df['Actual'] == 1
is_forecast_pled = actual_vs_forecast_df['Forecast'] == 1
is_actual_non_pled = actual_vs_forecast_df['Actual'] == 0
is_forecast_non_pled = actual_vs_forecast_df['Forecast'] == 0

actual_pled_count = int(is_actual_pled.sum())
forecast_pled_count = int(is_forecast_pled.sum())
matching_pled_count = int((is_actual_pled & is_forecast_pled).sum())  # PLED days correctly forecast
non_pled_count = int((is_actual_non_pled & is_forecast_non_pled).sum())  # Non-PLED days correctly forecast

print(
    f"\nActual PLED Count: {actual_pled_count}",
    f"Forecasted PLED Count: {forecast_pled_count}",
    f"Matching PLED Count: {matching_pled_count}",
    f"Matching Non-PLED Count: {non_pled_count}",
    sep="\n",
)


Model: Logistic Regression
Balanced Accuracy: 0.8443256090314915
Sensitivity: 0.9411764705882353
Confusion Matrix: 
[[222  75]
 [  4  64]]

Logistic Regression (LR) PLED Counts:
PLED_Prediction
0    226
1    139
Name: count, dtype: int64

Actual PLED Count: 68
Forecasted PLED Count: 139
Matching PLED Count: 64
Matching Non-PLED Count: 222


#### Result : 
     Model: Logistic Regression
            Balanced Accuracy: 0.8443256090314915
            Sensitivity: 0.9411764705882353

We will use the CSV file "Logistic_Regression_PLED_Classification_Daily999.csv" as input to the meta-classifier hybrid model. This CSV contains the PELDs forecasted by the Logistic Regression model.

### 5.3 Artificial Neural Network (ANN) model (Tested for 2021 data)

In [1]:
# ---------------------------------------------------------------------------
# 5.3 pipeline: ANN (MLPClassifier) trained on 2019-2020, evaluated on 2021.
# Half-hourly predictions are rolled up to one flag per day and saved to CSV.
# ---------------------------------------------------------------------------

# Loading the dataset
file_path = 'Database_1_capped.csv'
data = pd.read_csv(file_path)

# Converting 'Date and Time' to datetime and using it as the index so the
# frame can be sliced with date strings below
data['Date and Time'] = pd.to_datetime(data['Date and Time'], format='%d/%m/%Y %H:%M')
data.set_index('Date and Time', inplace=True)

# Defining the fixed training and test periods (both ends inclusive)
train_start = '2019-01-01'
train_end = '2020-12-31'
test_start = '2021-01-01'
test_end = '2021-12-31'

# Take explicit copies of the slices: a new column is added to each one below,
# and assigning into a bare slice of `data` raises pandas' SettingWithCopyWarning.
train_data = data.loc[train_start:train_end].copy()
test_data = data.loc[test_start:test_end].copy()

# Adding the first 'Demand_Capped' value of each calendar day as a feature.
# NOTE(review): this equals the 12 AM demand only if rows are in chronological
# order and every day starts at 00:00 - confirm against the CSV.
train_data['Demand_12AM'] = train_data.groupby(train_data.index.date)['Demand_Capped'].transform('first')
test_data['Demand_12AM'] = test_data.groupby(test_data.index.date)['Demand_Capped'].transform('first')

# Separating features and target variable
X_train = train_data.drop(columns=['Demand_Capped'])
y_train = train_data['Demand_Capped']
X_test = test_data.drop(columns=['Demand_Capped'])
y_test = test_data['Demand_Capped']

# Training labels: 1 when demand exceeds the 95th percentile of training demand
y_train_threshold = train_data['Demand_Capped'].quantile(0.95)
y_train = (y_train > y_train_threshold).astype(int)

# Test labels: 1 when demand exceeds the 95th percentile of *test-period* demand.
# NOTE(review): the test threshold is derived from the test year itself rather
# than reusing the training threshold - confirm this labelling is intended.
test_threshold = test_data['Demand_Capped'].quantile(0.95)
y_test = (y_test > test_threshold).astype(int)

# One-Hot Encode categorical features (including temporal features);
# the encoder is fitted on training data only
categorical_features = ['REGION', 'Holiday', 'Season', 'Month', 'Day', 'Hour']
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_categorical = encoder.fit_transform(X_train[categorical_features])

# Concatenate the remaining (non-categorical) columns with the encoded ones
numerical_features = X_train.drop(columns=categorical_features).values
X_train_processed = np.concatenate([numerical_features, encoded_categorical], axis=1)

# Applying the same transformation to test data
encoded_categorical_test = encoder.transform(X_test[categorical_features])
X_test_numerical = X_test.drop(columns=categorical_features).values
X_test_processed = np.concatenate([X_test_numerical, encoded_categorical_test], axis=1)

# Handling class imbalance using SMOTE (training set only)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

# Scaling the features after SMOTE; the scaler is fitted on the resampled training matrix
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test_processed)

# Training the Artificial Neural Network (ANN) model: two hidden layers (150, 100)
ann_model = MLPClassifier(hidden_layer_sizes=(150, 100), activation='relu', solver='adam', max_iter=3000, random_state=42, learning_rate='adaptive')
ann_model.fit(X_train_scaled, y_train_balanced)

# Predicting on the test set using ANN
y_pred_ann = ann_model.predict(X_test_scaled)

# Creating a DataFrame to save 30-minute predictions for ANN
ann_prediction_df = test_data.copy()
ann_prediction_df['PLED_Prediction'] = y_pred_ann

# Aggregating predictions to daily level: a day is flagged (1) if any of its rows is flagged
ann_prediction_df['Date'] = ann_prediction_df.index.date
daily_ann_prediction = ann_prediction_df.groupby('Date')['PLED_Prediction'].max().reset_index()
daily_ann_prediction['PLED_Prediction'] = daily_ann_prediction['PLED_Prediction'].astype(int)

# Aggregating the ground truth to daily level once; reused for the CSV and the metrics
daily_y_test = y_test.groupby(y_test.index.date).max()

# Including actual values in the output CSV (aligned by position)
daily_ann_prediction['Actual_PLED'] = daily_y_test.values

# Saving the daily results to CSV for ANN; a later step reads this file back
ann_output_file = "Artificial_Neural_Network_PLED_Classification_Daily999.csv"
daily_ann_prediction.to_csv(ann_output_file, index=False)

# Calculating day-level evaluation metrics for ANN
balanced_accuracy_ann = balanced_accuracy_score(daily_y_test, daily_ann_prediction['PLED_Prediction'])
sensitivity_ann = recall_score(daily_y_test, daily_ann_prediction['PLED_Prediction'], pos_label=1)  # Sensitivity (recall for PELDs)
confusion_ann = confusion_matrix(daily_y_test, daily_ann_prediction['PLED_Prediction'])

# Listing the actual peak days in the test set
actual_peak_days = daily_y_test[daily_y_test == 1].index
actual_pled_count = len(actual_peak_days)
print(f"\nActual Peak Days: {actual_peak_days}")
print(f"Actual PLED Count: {actual_pled_count}")

# Printing results for ANN
print("\nModel: Artificial Neural Network")
print(f"Balanced Accuracy: {balanced_accuracy_ann}")
print(f"Sensitivity: {sensitivity_ann}")
print(f"Confusion Matrix: \n{confusion_ann}")

# Tally PLED (1) and Non-PLED (0) predictions stored in a results CSV
def count_pled_nonpled(csv_file_path):
    """Return how often each value occurs in a results file's 'PLED_Prediction' column.

    Parameters
    ----------
    csv_file_path : path of a CSV file that contains a 'PLED_Prediction' column.

    Returns
    -------
    pandas.Series produced by ``value_counts()``: index = distinct prediction
    values, values = number of rows carrying each one.
    """
    predictions = pd.read_csv(csv_file_path)['PLED_Prediction']
    return predictions.value_counts()

# Read the saved daily ANN results back and tally the forecast flags
ann_csv_path = 'Artificial_Neural_Network_PLED_Classification_Daily999.csv'
ann_counts = count_pled_nonpled(ann_csv_path)

print("\nArtificial Neural Network (ANN) PLED Counts:", ann_counts, sep="\n")

# Side-by-side table of daily ground truth and daily forecast
actual_vs_forecast_df = pd.DataFrame({
    'Date': daily_y_test.index,
    'Actual': daily_y_test.values,
    'Forecast': daily_ann_prediction['PLED_Prediction']
})

# Boolean masks for each outcome; summing a mask counts the matching days
is_actual_pled = actual_vs_forecast_df['Actual'] == 1
is_forecast_pled = actual_vs_forecast_df['Forecast'] == 1
is_actual_non_pled = actual_vs_forecast_df['Actual'] == 0
is_forecast_non_pled = actual_vs_forecast_df['Forecast'] == 0

forecast_pled_count = int(is_forecast_pled.sum())
matching_pled_count = int((is_actual_pled & is_forecast_pled).sum())  # PLED days correctly forecast
non_pled_count = int((is_actual_non_pled & is_forecast_non_pled).sum())  # Non-PLED days correctly forecast

print(
    f"\nForecasted PLED Count: {forecast_pled_count}",
    f"Matching PLED Count: {matching_pled_count}",
    f"Matching Non-PLED Count: {non_pled_count}",
    sep="\n",
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Demand_12AM'] = train_data.groupby(train_data.index.date)['Demand_Capped'].transform('first')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Demand_12AM'] = test_data.groupby(test_data.index.date)['Demand_Capped'].transform('first')



Actual Peak Days: Index([2021-06-14, 2021-06-15, 2021-06-16, 2021-06-17, 2021-06-18, 2021-06-21,
       2021-06-27, 2021-06-28, 2021-06-30, 2021-07-01, 2021-07-02, 2021-07-06,
       2021-07-07, 2021-07-08, 2021-07-09, 2021-07-10, 2021-07-11, 2021-07-12,
       2021-07-13, 2021-07-14, 2021-07-15, 2021-07-16, 2021-07-17, 2021-07-18,
       2021-07-19, 2021-07-20, 2021-07-21, 2021-07-22, 2021-07-23, 2021-07-27,
       2021-07-28, 2021-07-29, 2021-07-30, 2021-07-31, 2021-08-01, 2021-08-02,
       2021-08-03, 2021-08-04, 2021-08-05, 2021-08-06, 2021-08-09, 2021-08-10,
       2021-08-11, 2021-08-12, 2021-08-13, 2021-08-14, 2021-08-15, 2021-08-16,
       2021-08-17, 2021-08-25, 2021-08-26, 2021-08-27, 2021-08-28, 2021-08-29,
       2021-08-30, 2021-09-04, 2021-09-05, 2021-09-06, 2021-09-07, 2021-09-08,
       2021-09-09, 2021-09-10, 2021-09-11, 2021-09-12, 2021-09-13, 2021-09-14,
       2021-09-21, 2021-09-22],
      dtype='object')
Actual PLED Count: 68

Model: Artificial Neural Network
Ba

#### RESULT: 

     Model: Artificial Neural Network
                Balanced Accuracy: 0.863834422657952
                Sensitivity: 0.7647058823529411


We will use the CSV file "Artificial_Neural_Network_PLED_Classification_Daily999.csv" as input to the meta-classifier hybrid model. This CSV contains the PELDs forecasted by the Artificial Neural Network model.