# Import libraries

In [1]:
from sklearn_utils import get_all_regressors
import pandas as pd
from PersistanceManager import PersistenceManager
from predictions import create_model_machine_learning_algorithm
from own_utils import load_json
from cleaning import prepare_dataframe_from_db, process_time_series_data
from predictions import run_time_series_prediction_pipeline, process_model_machine_learning, evaluate_model
from own_utils import execute_concurrently
from own_utils import list_directories_by_depth

# Import dataset

In [2]:
df = pd.read_csv(r'..\data\instants_data_saved\2023-07-04_12-09-22.csv')
df = df.query("id_device == 'DBEM003'").reset_index(drop=True)
df

Unnamed: 0,id_data,id_device,id_sensor,id_variable,timestamp,value,unit,id_location
0,1,DBEM003,sWEA,00-temp,2023-04-18 09:31:00,18.57,ºC,
1,2,DBEM003,sWEA,00-temp,2023-04-18 09:32:00,18.56,ºC,
2,3,DBEM003,sWEA,00-temp,2023-04-18 09:33:00,18.55,ºC,
3,4,DBEM003,sWEA,00-temp,2023-04-18 09:34:00,18.53,ºC,
4,5,DBEM003,sWEA,00-temp,2023-04-18 09:35:00,18.53,ºC,
...,...,...,...,...,...,...,...,...
1527268,3261165,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:10,31.00,ppb,
1527269,3261166,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:20,37.00,ppb,
1527270,3261167,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:30,25.00,ppb,
1527271,3261168,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:40,26.00,ppb,


# Create dataframe tidy

## parameters

In [3]:
prepare_dataframe_from_db_cols_for_query = [
    "00-eco2",
    "00-temp",
    "01-hum",
    "01-tvoc",
    "02-pres",
    "03-siaq",
    "04-diaq"
]
preprocess_time_series_data_resample_freq = "60S"
preprocess_time_series_data_aggregation_func = "mean"
preprocess_time_series_data_method = "linear"
preprocess_time_series_data_outlier_cols = None

## Dataframe preprocessed

In [4]:

df = prepare_dataframe_from_db(
    df=df,
    cols_for_query = prepare_dataframe_from_db_cols_for_query,
)

df


Unnamed: 0,id_device,id_sensor,id_variable,timestamp,value,unit,id_location
0,DBEM003,sWEA,00-temp,2023-04-18 09:31:00+00:00,18.57,ºC,
1,DBEM003,sWEA,00-temp,2023-04-18 09:32:00+00:00,18.56,ºC,
2,DBEM003,sWEA,00-temp,2023-04-18 09:33:00+00:00,18.55,ºC,
3,DBEM003,sWEA,00-temp,2023-04-18 09:34:00+00:00,18.53,ºC,
4,DBEM003,sWEA,00-temp,2023-04-18 09:35:00+00:00,18.53,ºC,
...,...,...,...,...,...,...,...
1187874,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:10+00:00,31.00,ppb,
1187875,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:20+00:00,37.00,ppb,
1187876,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:30+00:00,25.00,ppb,
1187877,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:40+00:00,26.00,ppb,


In [5]:

# Process time series data: resample and interpolate
df_resampled_interpolated = process_time_series_data(
    df=df,
    resample_freq = preprocess_time_series_data_resample_freq,
    aggregation_func = preprocess_time_series_data_aggregation_func,
    method = preprocess_time_series_data_method,
    outlier_cols = preprocess_time_series_data_outlier_cols,
)
df_resampled_interpolated


Unnamed: 0,id_device,id_sensor,id_variable,timestamp,value
0,DBEM003,sAQU,00-eco2,2023-04-18 09:31:00+00:00,400.000000
1,DBEM003,sAQU,00-eco2,2023-04-18 09:32:00+00:00,400.000000
2,DBEM003,sAQU,00-eco2,2023-04-18 09:33:00+00:00,400.000000
3,DBEM003,sAQU,00-eco2,2023-04-18 09:34:00+00:00,400.000000
4,DBEM003,sAQU,00-eco2,2023-04-18 09:35:00+00:00,400.000000
...,...,...,...,...,...
227838,DBEM003,sWEA,04-diaq,2023-05-10 23:55:00+00:00,26.666667
227839,DBEM003,sWEA,04-diaq,2023-05-10 23:56:00+00:00,28.000000
227840,DBEM003,sWEA,04-diaq,2023-05-10 23:57:00+00:00,29.000000
227841,DBEM003,sWEA,04-diaq,2023-05-10 23:58:00+00:00,30.333333


In [6]:

# Pivot and rename columns for uniformity
df_preprocessed = pd.pivot_table(
    df_resampled_interpolated.reset_index()[["timestamp", "id_device", "id_variable", "value"]],
    index=["timestamp", "id_device"],
    columns=["id_variable"]
).reset_index()

df_preprocessed


Unnamed: 0_level_0,timestamp,id_device,value,value,value,value,value,value,value
id_variable,Unnamed: 1_level_1,Unnamed: 2_level_1,00-eco2,00-temp,01-hum,01-tvoc,02-pres,03-siaq,04-diaq
0,2023-04-18 09:31:00+00:00,DBEM003,400.000000,18.570000,33.050000,2.000000,934.700000,25.000000,27.000000
1,2023-04-18 09:32:00+00:00,DBEM003,400.000000,18.560000,33.000000,1.000000,934.720000,25.000000,25.000000
2,2023-04-18 09:33:00+00:00,DBEM003,400.000000,18.550000,33.030000,8.000000,934.700000,25.000000,25.000000
3,2023-04-18 09:34:00+00:00,DBEM003,400.000000,18.530000,33.090000,4.000000,934.660000,25.000000,26.000000
4,2023-04-18 09:35:00+00:00,DBEM003,400.000000,18.530000,33.050000,3.000000,934.680000,26.000000,29.000000
...,...,...,...,...,...,...,...,...,...
32544,2023-05-10 23:55:00+00:00,DBEM003,449.833333,25.281667,29.266667,38.000000,939.910000,26.166667,26.666667
32545,2023-05-10 23:56:00+00:00,DBEM003,427.000000,25.278333,29.221667,39.333333,939.913333,27.166667,28.000000
32546,2023-05-10 23:57:00+00:00,DBEM003,414.500000,25.271667,29.205000,22.333333,939.873333,27.666667,29.000000
32547,2023-05-10 23:58:00+00:00,DBEM003,407.166667,25.268333,29.241667,16.166667,939.873333,28.833333,30.333333


In [7]:

df_preprocessed.columns = [
    col[0] if col[-1] == '' else col[-1]
    for col in df_preprocessed.columns.to_flat_index()
]

df_preprocessed.rename(columns={"00-eco2":"y"}, inplace=True)

df_preprocessed

Unnamed: 0,timestamp,id_device,y,00-temp,01-hum,01-tvoc,02-pres,03-siaq,04-diaq
0,2023-04-18 09:31:00+00:00,DBEM003,400.000000,18.570000,33.050000,2.000000,934.700000,25.000000,27.000000
1,2023-04-18 09:32:00+00:00,DBEM003,400.000000,18.560000,33.000000,1.000000,934.720000,25.000000,25.000000
2,2023-04-18 09:33:00+00:00,DBEM003,400.000000,18.550000,33.030000,8.000000,934.700000,25.000000,25.000000
3,2023-04-18 09:34:00+00:00,DBEM003,400.000000,18.530000,33.090000,4.000000,934.660000,25.000000,26.000000
4,2023-04-18 09:35:00+00:00,DBEM003,400.000000,18.530000,33.050000,3.000000,934.680000,26.000000,29.000000
...,...,...,...,...,...,...,...,...,...
32544,2023-05-10 23:55:00+00:00,DBEM003,449.833333,25.281667,29.266667,38.000000,939.910000,26.166667,26.666667
32545,2023-05-10 23:56:00+00:00,DBEM003,427.000000,25.278333,29.221667,39.333333,939.913333,27.166667,28.000000
32546,2023-05-10 23:57:00+00:00,DBEM003,414.500000,25.271667,29.205000,22.333333,939.873333,27.666667,29.000000
32547,2023-05-10 23:58:00+00:00,DBEM003,407.166667,25.268333,29.241667,16.166667,939.873333,28.833333,30.333333


# Inputs

In [25]:
tidy_data = df_preprocessed  
ini_train = "2023-04-18 00:00:00+00:00"
fin_train = "2023-04-25 00:00:00+00:00"
fin_test = "2023-04-26 00:00:00+00:00"
model_sklearn_name = "ARDRegression"
X_name_features = list(set(df_preprocessed.columns)-set(['y','timestamp','id_device']))
Y_name_features = "y"
n_lags = 10
n_leads = 10 
lag_columns = list(set(df_preprocessed.columns)-set(['y','timestamp','id_device'])) + ["y"]
lead_columns = "y"
scale_in_preprocessing=True
name_time_column="timestamp"
name_id_sensor_column="id_device"
save_preprocessing=True
path_to_save_model= "paper"
folder_name_model= model_sklearn_name
folder_name_time_execution="execution-time-no-defined"
folder_name_preprocessed_data="preprocessed-data-to-use-in-model"
machine_learning_model_args= {
    "max_iter": 300,
    "tol": 0.001,
    "alpha_1": 1e-06,
    "alpha_2": 1e-06,
    "lambda_1": 1e-06,
    "lambda_2": 1e-06,
    "compute_score": False,
    "threshold_lambda": 10000.0,
    "fit_intercept": True,
    "copy_X": True,
    "verbose": False,
}
measure_time = True
logger = None

In [26]:
model_machine_learning = create_model_machine_learning_algorithm(
    tidy_data = tidy_data,
    ini_train = ini_train,
    fin_train = fin_train,
    fin_test = fin_test,
    model_sklearn_name = model_sklearn_name,
    X_name_features = X_name_features,
    Y_name_features = Y_name_features,
    n_lags = n_lags,
    n_leads = n_leads,
    lag_columns = lag_columns,
    lead_columns = lead_columns,
    scale_in_preprocessing = scale_in_preprocessing,
    name_time_column = name_time_column,
    save_preprocessing = save_preprocessing,
    path_to_save_model = path_to_save_model,
    folder_name_model = folder_name_model,
    folder_name_time_execution = folder_name_time_execution,
    folder_name_preprocessed_data = folder_name_preprocessed_data,
    machine_learning_model_args = machine_learning_model_args,
    measure_time = measure_time,
    logger = logger
)



{'execution_times': {'preprocessing_test': {'total': 0.03728485107421875, 'details': {'preprocess_columns': 0.0, 'preprocess_scaler': 0.011007547378540039, 'preprocess_lags': 0.02028369903564453, 'preprocess_leads': 0.003998756408691406}}}}


# Multiples hiperparameters concurrently

In [10]:
import concurrent.futures
import logging
from typing import List, Dict, Callable

#TODO: Arreglar el logger
def machine_learning_concurrently(
    model_func: Callable,
    hyperparameter_combinations: List[Dict],
    data: pd.DataFrame,
    logger: logging.Logger = None
):
    """
    Executes multiple machine learning concurrently with different hyperparameter sets.
    
    Parameters:
    - model_func (Callable): The function that trains (or another step in pipeline) the model.
    - hyperparameter_combinations (List[Dict]): A list of dictionaries, where each dictionary contains
                                                the hyperparameters for the model training.
    - data (pd.DataFrame): The preprocessed DataFrame to be passed to the model function.
    - logger (logging.Logger, optional): A logger object for tracking progress and errors.
    
    Returns:
    - List: A list of results from the model function for each hyperparameter set.
    """
    
    # Function wrapper to include data and logging
    def wrapped_model_func(hyperparameters):
        try:
            result = model_func(tidy_data=data, **hyperparameters)
            # logger.info(f"Experiment with params {hyperparameters} completed successfully.")
            return result
        except Exception as e:
            # logger.error(f"Experiment with params {hyperparameters} failed. Error: {e}")
            return e

    # Execute concurrently
    results = execute_concurrently(wrapped_model_func, hyperparameter_combinations)
    return results

In [11]:
from itertools import product

def generate_combinations(parameter_space, model_specific_args):
    """
    Generate combinations of parameters for machine learning experiments in an optimized manner.

    This function dynamically combines a generic set of parameters with model-specific hyperparameter
    domains, ensuring efficient generation of all possible parameter combinations. It avoids the
    creation of intermediate lists and unnecessary memory usage by leveraging generator expressions
    and efficient dictionary construction.

    Parameters:
    - parameter_space (dict): A dictionary defining the generic parameters and their possible values.
                              This includes common training parameters such as:
                              - `ini_train` (list of str): Start dates for training.
                              - `fin_train` (list of str): End dates for training.
                              - `fin_test` (list of str): End dates for testing.
                              - `model_sklearn_name` (list of str): Names of scikit-learn models to evaluate.
                              - `n_lags` (list of int): Number of lag features to include.
                              - `n_leads` (list of int): Number of lead features to include.
                              - `X_name_features` (list of list of str): Lists of feature names for predictors.
                              Other parameters can also be included as required.

    - model_specific_args (dict): A dictionary mapping each `model_sklearn_name` to its corresponding
                                  hyperparameter domains. Each model's domain is defined as a dictionary,
                                  where the keys are hyperparameter names and the values are lists of
                                  possible values. Example:
                                  {
                                      "ARDRegression": {
                                          "max_iter": [200, 300],
                                          "tol": [0.001, 0.01],
                                          "alpha_1": [1e-06, 1e-05],
                                      },
                                      "Ridge": {
                                          "alpha": [0.1, 1.0, 10.0],
                                          "solver": ["auto", "svd"]
                                      }
                                  }

    Returns:
    - Generator[Dict]: A generator that yields dictionaries representing unique parameter combinations.
                       Each dictionary contains both the generic parameters and the model-specific
                       hyperparameters for one combination. Example output:
                       {
                           "ini_train": "2023-04-18",
                           "fin_train": "2023-04-25",
                           "fin_test": "2023-04-27",
                           "model_sklearn_name": "ARDRegression",
                           "n_lags": 5,
                           "n_leads": 10,
                           "X_name_features": ["feature1", "feature2"],
                           "machine_learning_model_args": {
                               "max_iter": 200,
                               "tol": 0.001,
                               "alpha_1": 1e-06,
                           }
                       }
    """
    generic_keys, generic_values = zip(*((k, v) for k, v in parameter_space.items() if k != "model_sklearn_name"))
    model_names = parameter_space.get("model_sklearn_name", [])

    return (
        {
            **dict(zip(generic_keys, generic_comb)),
            "model_sklearn_name": model,
            "machine_learning_model_args": dict(zip(model_args_keys, model_args_comb))
        }
        for generic_comb in product(*generic_values)
        for model in model_names
        for model_args_keys, model_args_values in [(list(model_specific_args[model].keys()), list(model_specific_args[model].values()))]
        for model_args_comb in product(*model_args_values)
    )


In [23]:
# tidy_data = df_preprocessed  
# ini_train = "2023-04-18 00:00:00+00:00"
# fin_train = "2023-04-25 00:00:00+00:00"
# fin_test = "2023-04-26 00:00:00+00:00"
# model_sklearn_name = "ARDRegression"
# X_name_features = list(set(df_preprocessed.columns)-set(['y','timestamp','id_device']))
# Y_name_features = "y"
# n_lags = 10
# n_leads = 10 
# lag_columns = None 
# lead_columns = "y"
# scale_in_preprocessing=True
# name_time_column="timestamp"
# name_id_sensor_column="id_device"
# save_preprocessing=True
# path_to_save_model= "paper"
# folder_name_model= model_sklearn_name
# folder_name_time_execution="execution-time-no-defined"
# folder_name_preprocessed_data="preprocessed-data-to-use-in-model"
# machine_learning_model_args= {
#     "max_iter": 300,
#     "tol": 0.001,
#     "alpha_1": 1e-06,
#     "alpha_2": 1e-06,
#     "lambda_1": 1e-06,
#     "lambda_2": 1e-06,
#     "compute_score": False,
#     "threshold_lambda": 10000.0,
#     "fit_intercept": True,
#     "copy_X": True,
#     "verbose": False,
# }
# measure_time = True
# logger = None
parameter_space = {
    "tidy_data": [df_preprocessed],
    "ini_train": ["2023-04-18 00:00:00+00:00", "2023-04-19 00:00:00+00:00"],
    "fin_train": ["2023-04-25 00:00:00+00:00", "2023-04-26 00:00:00+00:00"],
    "fin_test": ["2023-04-26 00:00:00+00:00"],
    "model_sklearn_name": ["ARDRegression"],
    "n_lags": [5, 10],
    "n_leads": [5, 10],
    "X_name_features": [list(set(df_preprocessed.columns)-set(['y','timestamp','id_device']))],
    "Y_name_features": ["y"],
    "lag_columns": [None],
    "lead_columns": ["y"],
}

model_specific_args = {
    "ARDRegression": {
        "max_iter": [200, 300],
        "tol": [0.001, 0.01],
        "alpha_1": [1e-06, 1e-05],
    }
}

combinations = generate_combinations(parameter_space, model_specific_args)
hyperparameters = [combination for combination in combinations]
hyperparameters

[{'tidy_data':                       timestamp id_device           y    00-temp     01-hum  \
  0     2023-04-18 09:31:00+00:00   DBEM003  400.000000  18.570000  33.050000   
  1     2023-04-18 09:32:00+00:00   DBEM003  400.000000  18.560000  33.000000   
  2     2023-04-18 09:33:00+00:00   DBEM003  400.000000  18.550000  33.030000   
  3     2023-04-18 09:34:00+00:00   DBEM003  400.000000  18.530000  33.090000   
  4     2023-04-18 09:35:00+00:00   DBEM003  400.000000  18.530000  33.050000   
  ...                         ...       ...         ...        ...        ...   
  32544 2023-05-10 23:55:00+00:00   DBEM003  449.833333  25.281667  29.266667   
  32545 2023-05-10 23:56:00+00:00   DBEM003  427.000000  25.278333  29.221667   
  32546 2023-05-10 23:57:00+00:00   DBEM003  414.500000  25.271667  29.205000   
  32547 2023-05-10 23:58:00+00:00   DBEM003  407.166667  25.268333  29.241667   
  32548 2023-05-10 23:59:00+00:00   DBEM003  423.000000  25.275000  29.125000   
  
           0

In [21]:
hyperparameters[0]['X_name_features']

['00-temp', '04-diaq', '03-siaq', '01-hum', '02-pres', '01-tvoc']

In [24]:
execute_concurrently(create_model_machine_learning_algorithm, hyperparameters[:2])

Task with args {'tidy_data':                       timestamp id_device           y    00-temp     01-hum  \
0     2023-04-18 09:31:00+00:00   DBEM003  400.000000  18.570000  33.050000   
1     2023-04-18 09:32:00+00:00   DBEM003  400.000000  18.560000  33.000000   
2     2023-04-18 09:33:00+00:00   DBEM003  400.000000  18.550000  33.030000   
3     2023-04-18 09:34:00+00:00   DBEM003  400.000000  18.530000  33.090000   
4     2023-04-18 09:35:00+00:00   DBEM003  400.000000  18.530000  33.050000   
...                         ...       ...         ...        ...        ...   
32544 2023-05-10 23:55:00+00:00   DBEM003  449.833333  25.281667  29.266667   
32545 2023-05-10 23:56:00+00:00   DBEM003  427.000000  25.278333  29.221667   
32546 2023-05-10 23:57:00+00:00   DBEM003  414.500000  25.271667  29.205000   
32547 2023-05-10 23:58:00+00:00   DBEM003  407.166667  25.268333  29.241667   
32548 2023-05-10 23:59:00+00:00   DBEM003  423.000000  25.275000  29.125000   

         01-tvoc     0

[TypeError('expected str, bytes or os.PathLike object, not NoneType'),
 TypeError('expected str, bytes or os.PathLike object, not NoneType')]

In [13]:
stop

NameError: name 'stop' is not defined

In [None]:
machine_learning_concurrently(
    model_func=create_model_machine_learning_algorithm,
    hyperparameter_combinations=hyperparameters,
    data=df_preprocessed
)

Task with args {'ini_train': '2023-04-18 00:00:00+00:00', 'fin_train': '2023-04-26 00:00:00+00:00', 'fin_test': '2023-04-26 00:00:00+00:00', 'n_lags': 5, 'n_leads': 10, 'X_name_features': ['feature3', 'feature4'], 'model_sklearn_name': 'ARDRegression', 'machine_learning_model_args': {'max_iter': 300, 'tol': 0.01, 'alpha_1': 1e-05}} generated an exception: machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'
Task with args {'ini_train': '2023-04-18 00:00:00+00:00', 'fin_train': '2023-04-25 00:00:00+00:00', 'fin_test': '2023-04-26 00:00:00+00:00', 'n_lags': 10, 'n_leads': 10, 'X_name_features': ['feature3', 'feature4'], 'model_sklearn_name': 'Ridge', 'machine_learning_model_args': {'alpha': 10.0, 'solver': 'auto'}} generated an exception: machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'
Task with args {'ini_train': '2023-04-18 00:00:00+00:00', 'fin_train': '2023-04-25 00:00:00+

[TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_learning_concurrently.<locals>.wrapped_model_func() got an unexpected keyword argument 'ini_train'"),
 TypeError("machine_lear