In [None]:
from energy_model.configs.columns import SystemColumns, ProcessColumns
from energy_model.pipelines.pipeline_utils import extract_x_y
import pandas as pd
from typing import Any
import numpy as np

from energy_model.energy_model_parameters import PROCESS_SYSTEM_DF_PATH, SYSTEM_ONLY_DF_PATH
from energy_model.pipelines.grid_search_pipeline_executor import GridSearchPipelineExecutor

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import Lars
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LarsCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LassoLars
from sklearn.linear_model import LassoLarsCV
from sklearn.linear_model import LassoLarsIC
from sklearn.svm import LinearSVR
from sklearn.linear_model import RidgeCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.linear_model import SGDRegressor


# Models

In [None]:
# Search space for GradientBoostingRegressor.
# NOTE(review): the "classifier__" prefix presumably targets a pipeline step
# named "classifier" built by GridSearchPipelineExecutor - confirm there.
GradientBoostingRegressorModel = {
    "classifier": [GradientBoostingRegressor()],
    "classifier__loss": ["squared_error", "huber"],
    "classifier__max_depth": [80, 110],
    "classifier__max_features": [3],
    "classifier__min_samples_leaf": [3, 5],
    "classifier__min_samples_split": [8, 12],
    "classifier__n_estimators": [100, 500, 1000],
}

# Search space for a single ExtraTreeRegressor (one randomized tree).
ExtraTreeRegressorModel = {
    "classifier": [ExtraTreeRegressor()],
    # Disabled entry, kept for reference:
    # "classifier__n_estimators": [10, 50, 100],
    "classifier__criterion": ["squared_error", "absolute_error"],
    "classifier__max_depth": [2, 16, 50],
    "classifier__min_samples_split": [2, 6],
    "classifier__min_samples_leaf": [1, 2],
    # Disabled entry, kept for reference:
    # "oob_score": [True, False],
    "classifier__max_features": ["sqrt"],
}
# ElasticNetModel = {"classifier": [ElasticNet()],
#                    "classifier__max_iter": [5, 50],
#                    "classifier__alpha": [0.001, 0.01, 0.1],
#                    "classifier__l1_ratio": np.arange(0.0, 1.0, 0.1)}

# Search space for HistGradientBoostingRegressor.
# NOTE(review): per the scikit-learn docs, "quantile" is only used when
# loss="quantile", so every squared_error candidate is evaluated once per
# quantile value - confirm whether that redundancy is acceptable.
HistGradientBoostingRegressorModel = {
    "classifier": [HistGradientBoostingRegressor()],
    "classifier__loss": ["squared_error", "quantile"],
    "classifier__quantile": [0.5, 0.6, 0.7],
    "classifier__max_iter": [400, 600, 800],
    "classifier__l2_regularization": [0.1, 0.3, 1.0, 3.0],
    "classifier__max_depth": [3, 4, 5, 6, 8],  # alternative: range(5, 16, 2)
    "classifier__min_samples_leaf": [20, 50, 100, 200],  # alternative: range(10, 100, 10)
}
# Search space for the ExtraTreesRegressor ensemble.
ExtraTreesRegressorModel = {
    "classifier": [ExtraTreesRegressor()],
    "classifier__max_depth": [3, 5, 7, 12],
    "classifier__min_samples_leaf": [3, 7],
    "classifier__min_weight_fraction_leaf": [0.1, 0.5],
    "classifier__max_features": ["sqrt"],
    "classifier__max_leaf_nodes": [10, 60, 90],
}

# Search space for RandomForestRegressor.
RandomForestRegressorModel = {
    "classifier": [RandomForestRegressor()],
    "classifier__n_estimators": [50, 100, 500, 1000],
    "classifier__max_features": ["sqrt"],
    "classifier__max_depth": [5, 7, 15, 60],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 4],
}

In [None]:
# Every candidate grid, keyed by the label that appears in the search log.
# Each key is spelled exactly like the variable holding the grid.
all_possible_models = dict(
    GradientBoostingRegressorModel=GradientBoostingRegressorModel,
    ExtraTreesRegressorModel=ExtraTreesRegressorModel,
    ExtraTreeRegressorModel=ExtraTreeRegressorModel,
    HistGradientBoostingRegressorModel=HistGradientBoostingRegressorModel,
    RandomForestRegressorModel=RandomForestRegressorModel,
)

# Grids that are searched again in the system fine-tuning run.
system_fine_tune_model = {
    label: all_possible_models[label]
    for label in ("HistGradientBoostingRegressorModel",)
}

In [None]:
# Dataset locations, taken from the project-level parameters module.
process_df_path, system_only_df_path = PROCESS_SYSTEM_DF_PATH, SYSTEM_ONLY_DF_PATH

# Target column for each of the two searches (system-level and process-level).
system_target, process_target = (
    SystemColumns.ENERGY_USAGE_SYSTEM_COL,
    ProcessColumns.ENERGY_USAGE_PROCESS_COL,
)

In [None]:
from typing import Callable


def run_grid_search(target_col: str, dataset_path: str, possible_models: list[dict], scoring_methods: dict[str, str | Callable]) -> dict[str, Any]:
    """Run a grid search over the given model grids on one CSV dataset.

    Args:
        target_col: Column handed to ``extract_x_y`` as the target column.
        dataset_path: CSV file to load; its first column becomes the index.
        possible_models: Parameter grids forwarded to ``GridSearchPipelineExecutor``.
        scoring_methods: Scoring specification forwarded to ``GridSearchPipelineExecutor``.

    Returns:
        The value returned by ``GridSearchPipelineExecutor.run_grid_search`` for
        the extracted features and target.
    """
    executor = GridSearchPipelineExecutor(
        possible_models=possible_models,
        scoring_methods=scoring_methods,
    )
    frame = pd.read_csv(dataset_path, index_col=0)
    features, target = extract_x_y(frame, target_column=target_col)
    return executor.run_grid_search(features, target)

In [None]:
def print_best_models(best_model: dict[str, Any], model_name: str) -> str:
    """Print a grid-search summary for one model and return the printed text.

    Args:
        best_model: Search result to render with its default string form.
        model_name: Label of the model, embedded in the header line.

    Returns:
        The exact text that was printed.
    """
    header = f"\n\nGrid Search Results for Model {model_name}: \n"
    report = f"{header}{best_model}"
    print(report)
    return report

In [None]:
def run_grid_search_on_all_models(target_col: str, dataset_path: str, model_options: dict[str, dict[str, Any]], scoring_methods: dict[str, str | Callable]) -> tuple[dict[str, dict[str, Any]], str]:
    best_model_per_type = {}
    full_results = ""
    for model_name, model in model_options.items():
        full_results += f"\n\n***** Starting Grid Search for Model {model_name}: *****\n"
        print(f"\n\n***** Starting Grid Search for Model {model_name}: *****\n")
        best_model_per_metric = run_grid_search(target_col, dataset_path, [model], scoring_methods)
        res = print_best_models(best_model_per_metric, model_name)
        full_results += res
        best_model_per_type[model_name] = best_model_per_metric
        print(f"\n\n***** Finished Grid Search for Model {model_name}: *****\n")
        full_results += f"\n\n***** Finished Grid Search for Model {model_name}: *****\n"

    return best_model_per_type, full_results


# Additional methods for metrics and loss functions

In [None]:
from sklearn.metrics import make_scorer

def negative_penalty(y_pred):
    """Return the mean magnitude of the negative predictions.

    Args:
        y_pred: Array-like of predictions.

    Returns:
        Mean absolute value of the entries below zero, or ``0.0`` when no
        entry is negative.
    """
    predictions = np.asarray(y_pred)
    negatives = predictions[predictions < 0]
    if negatives.size:
        return np.mean(np.abs(negatives))
    return 0.0


def tail_rmse(y_true, y_pred, percentile=95):
    """Root-mean-squared error restricted to the largest squared errors.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions.
        percentile: Percentile of the squared errors used as the cut-off;
            errors at or above it form the tail.

    Returns:
        Square root of the mean of the squared errors in the tail.
    """
    sq_err = (np.asarray(y_true) - np.asarray(y_pred)) ** 2
    cutoff = np.percentile(sq_err, percentile)
    in_tail = sq_err >= cutoff
    return np.sqrt(sq_err[in_tail].mean())

def combined_tail_rmse_and_negative_penalty_loss(y_true, y_pred, percentile=95, lambda_neg=10.0) -> float:
    """Tail RMSE plus a weighted penalty for negative predictions.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions.
        percentile: Cut-off percentile passed to ``tail_rmse``.
        lambda_neg: Weight applied to ``negative_penalty(y_pred)``.

    Returns:
        ``tail_rmse(...) + lambda_neg * negative_penalty(y_pred)``.
    """
    tail_term = tail_rmse(y_true, y_pred, percentile)
    weighted_penalty = lambda_neg * negative_penalty(y_pred)
    return tail_term + weighted_penalty


def system_additional_scorer(percentile=95, lambda_neg=5.0):
    """Build a scorer from the tail-RMSE + negative-penalty loss.

    Args:
        percentile: Cut-off percentile forwarded to the loss.
        lambda_neg: Negative-penalty weight forwarded to the loss.

    Returns:
        The object returned by ``make_scorer`` for the negated loss, created
        with ``greater_is_better=True``.
    """
    def _negated_loss(y_true, y_pred):
        # The loss is negated here so that a higher score means a lower loss.
        loss = combined_tail_rmse_and_negative_penalty_loss(
            y_true,
            y_pred,
            percentile=percentile,
            lambda_neg=lambda_neg,
        )
        return -loss

    return make_scorer(_negated_loss, greater_is_better=True)


In [None]:
def smape(y_true, y_pred, eps=1e-8):
    """Symmetric mean absolute percentage error, expressed in percent.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions.
        eps: Small constant added to the denominator so it is never zero.

    Returns:
        Mean of ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2 + eps)``
        multiplied by 100.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    abs_error = np.abs(predicted - actual)
    scale = (np.abs(actual) + np.abs(predicted)) / 2.0 + eps
    ratios = abs_error / scale

    return np.mean(ratios) * 100

def combined_smape_and_negative_penalty_loss(y_true, y_pred, lambda_neg=5.0):
    """sMAPE plus a weighted penalty for negative predictions.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions.
        lambda_neg: Weight applied to ``negative_penalty(y_pred)``.

    Returns:
        ``smape(y_true, y_pred) + lambda_neg * negative_penalty(y_pred)``.
    """
    smape_term = smape(y_true, y_pred)
    weighted_penalty = lambda_neg * negative_penalty(y_pred)
    return smape_term + weighted_penalty


def process_additional_scorer(lambda_neg=5.0):
    """Build a scorer from the sMAPE + negative-penalty loss.

    Args:
        lambda_neg: Negative-penalty weight forwarded to the loss.

    Returns:
        The object returned by ``make_scorer`` for the negated loss, created
        with ``greater_is_better=True``.
    """
    def _negated_loss(y_true, y_pred):
        # The loss is negated here so that a higher score means a lower loss.
        loss = combined_smape_and_negative_penalty_loss(
            y_true,
            y_pred,
            lambda_neg=lambda_neg,
        )
        return -loss

    return make_scorer(_negated_loss, greater_is_better=True)

# Find best system energy model

In [None]:
# Custom scorer built with the default percentile and penalty weight.
system_additional_scoring = system_additional_scorer()

# Metrics for the system-level search: two scikit-learn scorer names (each
# used as both label and value) followed by the custom scorer.
system_scoring_methods = {
    **{
        metric: metric
        for metric in ("neg_mean_squared_error", "neg_root_mean_squared_error")
    },
    "tail_rmse_and_negative_penalty": system_additional_scoring,
}

In [None]:
# Search every candidate grid against the system-only dataset.
best_system_models, results_system_txt = run_grid_search_on_all_models(
    target_col=system_target,
    dataset_path=system_only_df_path,
    model_options=all_possible_models,
    scoring_methods=system_scoring_methods,
)

# Fine-tune the system model

In [None]:
# Fresh instance of the custom scorer for the fine-tuning run.
system_additional_scoring = system_additional_scorer()

# Fine-tuning metrics: the two scikit-learn scorer names, the custom combined
# scorer, and plain tail RMSE (registered as a loss via greater_is_better=False).
system_updated_scoring_methods = {
    **{
        metric: metric
        for metric in ("neg_mean_squared_error", "neg_root_mean_squared_error")
    },
    "tail_rmse_and_negative_penalty": system_additional_scoring,
    "tail_rmse": make_scorer(tail_rmse, greater_is_better=False),
}

In [None]:
# Search only the fine-tune grids against the system-only dataset.
best_system_models_finetune, results_system_txt_finetune = run_grid_search_on_all_models(
    target_col=system_target,
    dataset_path=system_only_df_path,
    model_options=system_fine_tune_model,
    scoring_methods=system_updated_scoring_methods,
)

In [None]:
# Persist the log of the fine-tuning search. The file is named after the
# fine-tune run, so it must receive results_system_txt_finetune; the previous
# code wrote results_system_txt (the broad search log) into it instead.
# NOTE(review): the broad system search log (results_system_txt) is not saved
# by this cell - confirm whether it should be written to its own file.
with open("finetune_system_results.txt", "w") as results_file:
    results_file.write(results_system_txt_finetune)

# Find best process energy model

In [None]:
# Custom scorer built with the default penalty weight.
process_additional_scoring = process_additional_scorer()

# Metrics for the process-level search: two scikit-learn scorer names plus the
# custom scorer.
process_scoring_methods = {
    "neg_mean_squared_error": "neg_mean_squared_error",
    "neg_root_mean_squared_error": "neg_root_mean_squared_error",
    # process_additional_scorer wraps the sMAPE + negative-penalty loss, so the
    # label names that metric; it was previously mislabelled
    # "tail_rmse_and_negative_penalty", which is the system-level metric.
    # NOTE(review): confirm nothing downstream looks up the old key.
    "smape_and_negative_penalty": process_additional_scoring,
}

In [None]:
# Search every candidate grid against the process dataset.
best_process_models, results_process_txt = run_grid_search_on_all_models(
    target_col=process_target,
    dataset_path=process_df_path,
    model_options=all_possible_models,
    scoring_methods=process_scoring_methods,
)

In [None]:
# Persist the complete log of the process-level search.
with open("full_results_process.txt", mode="w") as results_file:
    results_file.write(results_process_txt)