# Import libraries

In [5]:
from sklearn_utils import get_all_regressors
import pandas as pd
from PersistenceManager import PersistenceManager
from predictions import create_model_machine_learning_algorithm
from own_utils import load_json
from cleaning import prepare_dataframe_from_db, process_time_series_data
from predictions import run_time_series_prediction_pipeline, process_model_machine_learning, evaluate_model
from own_utils import execute_concurrently
from own_utils import list_directories_by_depth

# Import dataset

In [6]:
df = pd.read_csv(r'..\data\instants_data_saved\2023-07-04_12-09-22.csv')
df = df.query("id_device == 'DBEM003'").reset_index(drop=True)
df

Unnamed: 0,id_data,id_device,id_sensor,id_variable,timestamp,value,unit,id_location
0,1,DBEM003,sWEA,00-temp,2023-04-18 09:31:00,18.57,ºC,
1,2,DBEM003,sWEA,00-temp,2023-04-18 09:32:00,18.56,ºC,
2,3,DBEM003,sWEA,00-temp,2023-04-18 09:33:00,18.55,ºC,
3,4,DBEM003,sWEA,00-temp,2023-04-18 09:34:00,18.53,ºC,
4,5,DBEM003,sWEA,00-temp,2023-04-18 09:35:00,18.53,ºC,
...,...,...,...,...,...,...,...,...
1527268,3261165,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:10,31.00,ppb,
1527269,3261166,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:20,37.00,ppb,
1527270,3261167,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:30,25.00,ppb,
1527271,3261168,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:40,26.00,ppb,


# Create dataframe tidy

## parameters

In [7]:
prepare_dataframe_from_db_cols_for_query = [
    "00-eco2",
    "00-temp",
    "01-hum",
    "01-tvoc",
    "02-pres",
    "03-siaq",
    "04-diaq"
]
preprocess_time_series_data_resample_freq = "60S"
preprocess_time_series_data_aggregation_func = "mean"
preprocess_time_series_data_method = "linear"
preprocess_time_series_data_outlier_cols = None

## Dataframe preprocessed

In [8]:
df = prepare_dataframe_from_db(
    df=df,
    cols_for_query = prepare_dataframe_from_db_cols_for_query,
)

df

Unnamed: 0,id_device,id_sensor,id_variable,timestamp,value,unit,id_location
0,DBEM003,sWEA,00-temp,2023-04-18 09:31:00+00:00,18.57,ºC,
1,DBEM003,sWEA,00-temp,2023-04-18 09:32:00+00:00,18.56,ºC,
2,DBEM003,sWEA,00-temp,2023-04-18 09:33:00+00:00,18.55,ºC,
3,DBEM003,sWEA,00-temp,2023-04-18 09:34:00+00:00,18.53,ºC,
4,DBEM003,sWEA,00-temp,2023-04-18 09:35:00+00:00,18.53,ºC,
...,...,...,...,...,...,...,...
1187874,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:10+00:00,31.00,ppb,
1187875,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:20+00:00,37.00,ppb,
1187876,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:30+00:00,25.00,ppb,
1187877,DBEM003,sAQU,01-tvoc,2023-05-10 23:59:40+00:00,26.00,ppb,


In [9]:
# Process time series data: resample and interpolate
df_resampled_interpolated = process_time_series_data(
    df=df,
    resample_freq = preprocess_time_series_data_resample_freq,
    aggregation_func = preprocess_time_series_data_aggregation_func,
    method = preprocess_time_series_data_method,
    outlier_cols = preprocess_time_series_data_outlier_cols,
)
df_resampled_interpolated

Unnamed: 0,id_device,id_sensor,id_variable,timestamp,value
0,DBEM003,sAQU,00-eco2,2023-04-18 09:31:00+00:00,400.000000
1,DBEM003,sAQU,00-eco2,2023-04-18 09:32:00+00:00,400.000000
2,DBEM003,sAQU,00-eco2,2023-04-18 09:33:00+00:00,400.000000
3,DBEM003,sAQU,00-eco2,2023-04-18 09:34:00+00:00,400.000000
4,DBEM003,sAQU,00-eco2,2023-04-18 09:35:00+00:00,400.000000
...,...,...,...,...,...
227838,DBEM003,sWEA,04-diaq,2023-05-10 23:55:00+00:00,26.666667
227839,DBEM003,sWEA,04-diaq,2023-05-10 23:56:00+00:00,28.000000
227840,DBEM003,sWEA,04-diaq,2023-05-10 23:57:00+00:00,29.000000
227841,DBEM003,sWEA,04-diaq,2023-05-10 23:58:00+00:00,30.333333


In [10]:

# Pivot and rename columns for uniformity
df_preprocessed = pd.pivot_table(
    df_resampled_interpolated.reset_index()[["timestamp", "id_device", "id_variable", "value"]],
    index=["timestamp", "id_device"],
    columns=["id_variable"]
).reset_index()

df_preprocessed


Unnamed: 0_level_0,timestamp,id_device,value,value,value,value,value,value,value
id_variable,Unnamed: 1_level_1,Unnamed: 2_level_1,00-eco2,00-temp,01-hum,01-tvoc,02-pres,03-siaq,04-diaq
0,2023-04-18 09:31:00+00:00,DBEM003,400.000000,18.570000,33.050000,2.000000,934.700000,25.000000,27.000000
1,2023-04-18 09:32:00+00:00,DBEM003,400.000000,18.560000,33.000000,1.000000,934.720000,25.000000,25.000000
2,2023-04-18 09:33:00+00:00,DBEM003,400.000000,18.550000,33.030000,8.000000,934.700000,25.000000,25.000000
3,2023-04-18 09:34:00+00:00,DBEM003,400.000000,18.530000,33.090000,4.000000,934.660000,25.000000,26.000000
4,2023-04-18 09:35:00+00:00,DBEM003,400.000000,18.530000,33.050000,3.000000,934.680000,26.000000,29.000000
...,...,...,...,...,...,...,...,...,...
32544,2023-05-10 23:55:00+00:00,DBEM003,449.833333,25.281667,29.266667,38.000000,939.910000,26.166667,26.666667
32545,2023-05-10 23:56:00+00:00,DBEM003,427.000000,25.278333,29.221667,39.333333,939.913333,27.166667,28.000000
32546,2023-05-10 23:57:00+00:00,DBEM003,414.500000,25.271667,29.205000,22.333333,939.873333,27.666667,29.000000
32547,2023-05-10 23:58:00+00:00,DBEM003,407.166667,25.268333,29.241667,16.166667,939.873333,28.833333,30.333333


In [11]:

df_preprocessed.columns = [
    col[0] if col[-1] == '' else col[-1]
    for col in df_preprocessed.columns.to_flat_index()
]

df_preprocessed.rename(columns={"00-eco2":"y"}, inplace=True)

df_preprocessed

Unnamed: 0,timestamp,id_device,y,00-temp,01-hum,01-tvoc,02-pres,03-siaq,04-diaq
0,2023-04-18 09:31:00+00:00,DBEM003,400.000000,18.570000,33.050000,2.000000,934.700000,25.000000,27.000000
1,2023-04-18 09:32:00+00:00,DBEM003,400.000000,18.560000,33.000000,1.000000,934.720000,25.000000,25.000000
2,2023-04-18 09:33:00+00:00,DBEM003,400.000000,18.550000,33.030000,8.000000,934.700000,25.000000,25.000000
3,2023-04-18 09:34:00+00:00,DBEM003,400.000000,18.530000,33.090000,4.000000,934.660000,25.000000,26.000000
4,2023-04-18 09:35:00+00:00,DBEM003,400.000000,18.530000,33.050000,3.000000,934.680000,26.000000,29.000000
...,...,...,...,...,...,...,...,...,...
32544,2023-05-10 23:55:00+00:00,DBEM003,449.833333,25.281667,29.266667,38.000000,939.910000,26.166667,26.666667
32545,2023-05-10 23:56:00+00:00,DBEM003,427.000000,25.278333,29.221667,39.333333,939.913333,27.166667,28.000000
32546,2023-05-10 23:57:00+00:00,DBEM003,414.500000,25.271667,29.205000,22.333333,939.873333,27.666667,29.000000
32547,2023-05-10 23:58:00+00:00,DBEM003,407.166667,25.268333,29.241667,16.166667,939.873333,28.833333,30.333333


In [12]:
list(set(df_preprocessed.columns)-set(['y','timestamp','id_device']))

['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres']

In [13]:
list(set(df_preprocessed.columns)-set(['y','timestamp','id_device'])) + ["y"]

['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres', 'y']

# Create function of standarization DataFrame

In [None]:
def preprocess_and_standardize_dataframe(
    df: pd.DataFrame,
    resample_freq: str,
    aggregation_func: str,
    interpolation_method: str,
    target_variable: str,
    outlier_cols: list = None,
    pivot: bool = False,
    pivot_index: list = None,
    pivot_columns: list = None,
    pivot_values: list = None,
    subset_cols: list = None,
    target_column_name: str = "y",
    save_metadata: bool = False,
) -> pd.DataFrame:
    """
    Preprocess and standardize a dataframe for cross-validation.

    This function processes time-series data with resampling, aggregation, and interpolation. 
    Optionally, it applies a pivot operation to reshape the dataframe into a standardized format.

    Parameters:
    ----------
    df : pd.DataFrame
        The dataframe to preprocess and standardize.
    resample_freq : str
        Frequency for resampling the time-series data.
    aggregation_func : str
        Aggregation function to apply during resampling.
    interpolation_method : str
        Method to use for interpolating missing values.
    target_variable : str, optional
        The name of the column to set as the target variable (default is None).
    outlier_cols : list, optional
        List of columns to apply outlier handling (default is None, no outlier handling applied).
    pivot : bool, optional
        Whether to apply a pivot operation to the dataframe (default is False).
    pivot_index : list, optional
        Columns to use as the index in the pivot table (required if `pivot=True`).
    pivot_columns : list, optional
        Columns to use as the columns in the pivot table (required if `pivot=True`).
    pivot_values : list, optional
        Columns to use as the values in the pivot table (required if `pivot=True`).
    subset_cols : list, optional
        Columns to subset the dataframe to (default is None, no subsetting applied).
    target_column_name : str, optional
        The new name for the target variable column in the standardized dataframe (default is "y").

    Returns:
    -------
    pd.DataFrame
        A preprocessed and standardized dataframe ready for further analysis or modeling.

    Example:
    --------
    ```python
    df_preprocessed = preprocess_and_standardize_dataframe(
        df=my_df,
        resample_freq="60S",
        aggregation_func="mean",
        interpolation_method="linear",
        target_variable="00-eco2",
        pivot=True,
        pivot_index=["timestamp", "id_device"],
        pivot_columns=["id_variable"],
        pivot_values=["value"]
    )
    ```
    """
    # Process time-series data: resample, aggregate, and interpolate
    df_resampled_interpolated = process_time_series_data(
        df=df,
        resample_freq=resample_freq,
        aggregation_func=aggregation_func,
        method=interpolation_method,
        outlier_cols=outlier_cols,
    )

    # Apply pivot operation if required
    if pivot:
        if not pivot_index or not pivot_columns or not pivot_values:
            raise ValueError("`pivot_index`, `pivot_columns`, and `pivot_values` must be provided if `pivot=True`.")
        
        df_resampled_interpolated = pd.pivot_table(
            df_resampled_interpolated.reset_index()[subset_cols],
            index=pivot_index,
            columns=pivot_columns,
            values=pivot_values
        ).reset_index()

        # Flatten column hierarchy if created by pivot_table
        df_resampled_interpolated.columns = [
            col[0] if col[-1] == '' else col[-1]
            for col in df_resampled_interpolated.columns.to_flat_index()
        ]

    # Rename target variable column if specified
    if target_variable:
        df_resampled_interpolated.rename(columns={target_variable: target_column_name}, inplace=True)

    return df_resampled_interpolated

df = pd.read_csv(r'..\data\instants_data_saved\2023-07-04_12-09-22.csv')
df = df.query("id_device == 'DBEM003'").reset_index(drop=True)
prepare_dataframe_from_db_cols_for_query = [
    "00-eco2",
    "00-temp",
    "01-hum",
    "01-tvoc",
    "02-pres",
    "03-siaq",
    "04-diaq"
]
# Prepare dataframe with selected columns
df = prepare_dataframe_from_db(
    df=df,
    cols_for_query=prepare_dataframe_from_db_cols_for_query,
)

preprocess_time_series_data_resample_freq = "60S"
preprocess_time_series_data_aggregation_func = "mean"
preprocess_time_series_data_method = "linear"
target_variable = "00-eco2"
outlier_cols = None
pivot = True
pivot_index = ["timestamp", "id_device"]
pivot_columns = ["id_variable"]
pivot_values = ["value"]
subset_cols = ["timestamp", "id_device", "id_variable", "value"]
target_column_name = "y"


preprocess_time_series_data_outlier_cols = None
df_preprocessed = preprocess_and_standardize_dataframe(
    df = df,
    resample_freq = preprocess_time_series_data_resample_freq,
    aggregation_func = preprocess_time_series_data_aggregation_func,
    interpolation_method = preprocess_time_series_data_method,
    target_variable=target_variable,
    outlier_cols=outlier_cols,
    pivot=pivot,
    pivot_index=pivot_index,
    pivot_columns=pivot_columns,
    pivot_values=pivot_values,
    subset_cols=subset_cols,
    target_column_name=target_column_name
)
df_preprocessed

Unnamed: 0,timestamp,id_device,y,00-temp,01-hum,01-tvoc,02-pres,03-siaq,04-diaq
0,2023-04-18 09:31:00+00:00,DBEM003,400.000000,18.570000,33.050000,2.000000,934.700000,25.000000,27.000000
1,2023-04-18 09:32:00+00:00,DBEM003,400.000000,18.560000,33.000000,1.000000,934.720000,25.000000,25.000000
2,2023-04-18 09:33:00+00:00,DBEM003,400.000000,18.550000,33.030000,8.000000,934.700000,25.000000,25.000000
3,2023-04-18 09:34:00+00:00,DBEM003,400.000000,18.530000,33.090000,4.000000,934.660000,25.000000,26.000000
4,2023-04-18 09:35:00+00:00,DBEM003,400.000000,18.530000,33.050000,3.000000,934.680000,26.000000,29.000000
...,...,...,...,...,...,...,...,...,...
32544,2023-05-10 23:55:00+00:00,DBEM003,449.833333,25.281667,29.266667,38.000000,939.910000,26.166667,26.666667
32545,2023-05-10 23:56:00+00:00,DBEM003,427.000000,25.278333,29.221667,39.333333,939.913333,27.166667,28.000000
32546,2023-05-10 23:57:00+00:00,DBEM003,414.500000,25.271667,29.205000,22.333333,939.873333,27.666667,29.000000
32547,2023-05-10 23:58:00+00:00,DBEM003,407.166667,25.268333,29.241667,16.166667,939.873333,28.833333,30.333333


In [29]:
stop

NameError: name 'stop' is not defined

# Inputs

In [None]:
tidy_data = df_preprocessed  
ini_train = "2023-04-18 00:00:00+00:00"
fin_train = "2023-04-25 00:00:00+00:00"
fin_test = "2023-04-26 00:00:00+00:00"
model_sklearn_name = "ARDRegression"
X_name_features = list(set(df_preprocessed.columns)-set(['y','timestamp','id_device']))
Y_name_features = "y"
n_lags = 10
n_leads = 10 
lag_columns = list(set(df_preprocessed.columns)-set(['y','timestamp','id_device'])) + ["y"]
lead_columns = "y"
scale_in_preprocessing=True
name_time_column="timestamp"
name_id_sensor_column="id_device"
save_preprocessing=True
path_to_save_model= "paper"
folder_name_model= model_sklearn_name
folder_name_time_execution="execution-time-no-defined"
folder_name_preprocessed_data="preprocessed-data-to-use-in-model"
machine_learning_model_args= {
    "max_iter": 300,
    "tol": 0.001,
    "alpha_1": 1e-06,
    "alpha_2": 1e-06,
    "lambda_1": 1e-06,
    "lambda_2": 1e-06,
    "compute_score": False,
    "threshold_lambda": 10000.0,
    "fit_intercept": True,
    "copy_X": True,
    "verbose": False,
}
measure_time = True
logger = None

In [None]:
model_machine_learning = create_model_machine_learning_algorithm(
    tidy_data = tidy_data,
    ini_train = ini_train,
    fin_train = fin_train,
    fin_test = fin_test,
    model_sklearn_name = model_sklearn_name,
    X_name_features = X_name_features,
    Y_name_features = Y_name_features,
    n_lags = n_lags,
    n_leads = n_leads,
    lag_columns = lag_columns,
    lead_columns = lead_columns,
    scale_in_preprocessing = scale_in_preprocessing,
    name_time_column = name_time_column,
    save_preprocessing = save_preprocessing,
    path_to_save_model = path_to_save_model,
    folder_name_model = folder_name_model,
    folder_name_time_execution = folder_name_time_execution,
    folder_name_preprocessed_data = folder_name_preprocessed_data,
    machine_learning_model_args = machine_learning_model_args,
    measure_time = measure_time,
    logger = logger
)

# Multiples hiperparameters concurrently

In [None]:
# import concurrent.futures
# import logging
# from typing import List, Dict, Callable

# #TODO: Arreglar el logger
# def machine_learning_concurrently(
#     model_func: Callable,
#     hyperparameter_combinations: List[Dict],
#     data: pd.DataFrame,
#     logger: logging.Logger = None
# ):
#     """
#     Executes multiple machine learning concurrently with different hyperparameter sets.
    
#     Parameters:
#     - model_func (Callable): The function that trains (or another step in pipeline) the model.
#     - hyperparameter_combinations (List[Dict]): A list of dictionaries, where each dictionary contains
#                                                 the hyperparameters for the model training.
#     - data (pd.DataFrame): The preprocessed DataFrame to be passed to the model function.
#     - logger (logging.Logger, optional): A logger object for tracking progress and errors.
    
#     Returns:
#     - List: A list of results from the model function for each hyperparameter set.
#     """
    
#     # Function wrapper to include data and logging
#     def wrapped_model_func(hyperparameters):
#         try:
#             result = model_func(tidy_data=data, **hyperparameters)
#             # logger.info(f"Experiment with params {hyperparameters} completed successfully.")
#             return result
#         except Exception as e:
#             # logger.error(f"Experiment with params {hyperparameters} failed. Error: {e}")
#             return e

#     # Execute concurrently
#     results = execute_concurrently(wrapped_model_func, hyperparameter_combinations)
#     return results

In [None]:
from itertools import product

def generate_combinations(parameter_space, model_specific_args):
    """
    Generate combinations of parameters for machine learning experiments in an optimized manner.

    This function dynamically combines a generic set of parameters with model-specific hyperparameter
    domains, ensuring efficient generation of all possible parameter combinations. It avoids the
    creation of intermediate lists and unnecessary memory usage by leveraging generator expressions
    and efficient dictionary construction.

    Parameters:
    - parameter_space (dict): A dictionary defining the generic parameters and their possible values.
                              This includes common training parameters such as:
                              - `ini_train` (list of str): Start dates for training.
                              - `fin_train` (list of str): End dates for training.
                              - `fin_test` (list of str): End dates for testing.
                              - `model_sklearn_name` (list of str): Names of scikit-learn models to evaluate.
                              - `n_lags` (list of int): Number of lag features to include.
                              - `n_leads` (list of int): Number of lead features to include.
                              - `X_name_features` (list of list of str): Lists of feature names for predictors.
                              Other parameters can also be included as required.

    - model_specific_args (dict): A dictionary mapping each `model_sklearn_name` to its corresponding
                                  hyperparameter domains. Each model's domain is defined as a dictionary,
                                  where the keys are hyperparameter names and the values are lists of
                                  possible values. Example:
                                  {
                                      "ARDRegression": {
                                          "max_iter": [200, 300],
                                          "tol": [0.001, 0.01],
                                          "alpha_1": [1e-06, 1e-05],
                                      },
                                      "Ridge": {
                                          "alpha": [0.1, 1.0, 10.0],
                                          "solver": ["auto", "svd"]
                                      }
                                  }

    Returns:
    - Generator[Dict]: A generator that yields dictionaries representing unique parameter combinations.
                       Each dictionary contains both the generic parameters and the model-specific
                       hyperparameters for one combination. Example output:
                       {
                           "ini_train": "2023-04-18",
                           "fin_train": "2023-04-25",
                           "fin_test": "2023-04-27",
                           "model_sklearn_name": "ARDRegression",
                           "n_lags": 5,
                           "n_leads": 10,
                           "X_name_features": ["feature1", "feature2"],
                           "machine_learning_model_args": {
                               "max_iter": 200,
                               "tol": 0.001,
                               "alpha_1": 1e-06,
                           }
                       }
    """
    generic_keys, generic_values = zip(*((k, v) for k, v in parameter_space.items() if k != "model_sklearn_name"))
    model_names = parameter_space.get("model_sklearn_name", [])

    return (
        {
            **dict(zip(generic_keys, generic_comb)),
            "model_sklearn_name": model,
            "machine_learning_model_args": dict(zip(model_args_keys, model_args_comb))
        }
        for generic_comb in product(*generic_values)
        for model in model_names
        for model_args_keys, model_args_values in [(list(model_specific_args[model].keys()), list(model_specific_args[model].values()))]
        for model_args_comb in product(*model_args_values)
    )


In [None]:
# tidy_data = df_preprocessed  
# ini_train = "2023-04-18 00:00:00+00:00"
# fin_train = "2023-04-25 00:00:00+00:00"
# fin_test = "2023-04-26 00:00:00+00:00"
# model_sklearn_name = "ARDRegression"
# X_name_features = list(set(df_preprocessed.columns)-set(['y','timestamp','id_device']))
# Y_name_features = "y"
# n_lags = 10
# n_leads = 10 
# lag_columns = list(set(df_preprocessed.columns)-set(['y','timestamp','id_device'])) + ["y"]
# lead_columns = "y"
# scale_in_preprocessing=True
# name_time_column="timestamp"
# save_preprocessing=True
# path_to_save_model= "paper"
# folder_name_model= model_sklearn_name
# folder_name_time_execution="execution-time-no-defined"
# folder_name_preprocessed_data="preprocessed-data-to-use-in-model"
# machine_learning_model_args= {
#     "max_iter": 300,
#     "tol": 0.001,
#     "alpha_1": 1e-06,
#     "alpha_2": 1e-06,
#     "lambda_1": 1e-06,
#     "lambda_2": 1e-06,
#     "compute_score": False,
#     "threshold_lambda": 10000.0,
#     "fit_intercept": True,
#     "copy_X": True,
#     "verbose": False,
# }
# measure_time = True
# logger = None
hyperparameters_model_space = {
    "tidy_data": [df_preprocessed],
    "ini_train": ["2023-04-18 00:00:00+00:00", "2023-04-19 00:00:00+00:00"],
    "fin_train": ["2023-04-25 00:00:00+00:00", "2023-04-26 00:00:00+00:00"],
    "fin_test": ["2023-04-26 00:00:00+00:00"],
    "model_sklearn_name": ["ARDRegression"],
    "n_lags": [5, 10],
    "n_leads": [5, 10],
    "X_name_features": [list(set(df_preprocessed.columns)-set(['y','timestamp','id_device']))],
    "Y_name_features": ["y"],
    "lag_columns": [list(set(df_preprocessed.columns)-set(['y','timestamp','id_device'])) + ["y"]],
    "lead_columns": ["y"],
    "scale_in_preprocessing": [True],
    "name_time_column": ["timestamp"],
    "save_preprocessing": [True],
    "path_to_save_model": ["paper"],
    "folder_name_model": ["ARDRegression"],
    "folder_name_time_execution": ["execution-time-no-defined"],
    "folder_name_preprocessed_data": ["preprocessed-data-to-use-in-model"],
    "measure_time": [True],
    "logger": [None],
    "kwargs_save_object": [{"rename_if_exists": True}],
}

hyperparameters_specific_regressor_args = {
    "ARDRegression": {
        "max_iter": [200, 300],
        "tol": [0.001, 0.01],
        "alpha_1": [1e-06, 1e-05],
    }
}

combinations = generate_combinations(hyperparameters_model_space, hyperparameters_specific_regressor_args)
hyperparameters = [combination for combination in combinations]
hyperparameters

In [None]:
stop

# Generate combinations of hyperparameters for preprocessing


In [None]:
def create_dataframe_combinations(parameter_space):
    """
    Generate and preprocess dataframes with optimized parameter combinations.

    This function dynamically generates all possible combinations of preprocessing parameters,
    applies the preprocessing to the given dataframe, and yields the resulting dataframes
    along with their parameter configurations.

    Parameters:
    ----------
    parameter_space : dict
        A dictionary defining the preprocessing parameters and their possible values.
        These parameters include:
        - `resample_freq` (list of str): Frequencies for resampling the time-series data.
        - `aggregation_func` (list of str): Aggregation functions to apply during resampling.
        - `interpolation_method` (list of str): Methods to use for interpolating missing values.
        - `target_variable` (list of str): Names of the columns to set as the target variable.
        - `pivot` (list of bool): Whether to apply a pivot operation.
        - `pivot_index` (list of list of str): Columns to use as the index in the pivot table.
        - `pivot_columns` (list of list of str): Columns to use as the columns in the pivot table.
        - `pivot_values` (list of list of str): Columns to use as the values in the pivot table.
        - `subset_cols` (list of list of str): Columns to subset the dataframe to.
        - `target_column_name` (list of str): New names for the target variable column.

    Yields:
    -------
    dict
        A dictionary containing:
        - `parameters`: The parameter configuration used for preprocessing.
        - `dataframe`: The resulting preprocessed dataframe.

    Example Usage:
    --------------
    ```python
    parameter_space = {
        "resample_freq": ["60S", "30S"],
        "aggregation_func": ["mean", "median"],
        "interpolation_method": ["linear", "quadratic"],
        "target_variable": ["00-eco2"],
        "pivot": [True, False],
        "pivot_index": [["timestamp", "id_device"]],
        "pivot_columns": [["id_variable"]],
        "pivot_values": [["value"]],
        "subset_cols": [["timestamp", "id_device", "id_variable", "value"]],
        "target_column_name": ["y"]
    }

    for result in create_dataframe_combinations(parameter_space):
        print(result["parameters"])
        print(result["dataframe"])
    ```
    """
    keys, values = zip(*parameter_space.items())

    for combination in product(*values):
        params = dict(zip(keys, combination))

        # Prepare and preprocess the dataframe
        try:
            df_preprocessed = preprocess_and_standardize_dataframe(
                df=params.get("df"),  # Assume df is passed in the parameter space
                resample_freq=params["resample_freq"],
                aggregation_func=params["aggregation_func"],
                interpolation_method=params["interpolation_method"],
                target_variable=params.get("target_variable"),
                outlier_cols=params.get("outlier_cols"),
                pivot=params["pivot"],
                pivot_index=params.get("pivot_index"),
                pivot_columns=params.get("pivot_columns"),
                pivot_values=params.get("pivot_values"),
                subset_cols=params.get("subset_cols"),
                target_column_name=params.get("target_column_name"),
            )

            yield {
                "parameters": params,
                "dataframe": df_preprocessed
            }

        except Exception as e:
            print(f"Error processing combination {params}: {e}")

In [None]:
stop

# Create function of cv

In [1]:
from sklearn_utils import get_all_regressors
import numpy as np 
import pandas as pd
from PersistenceManager import PersistenceManager
from predictions import create_model_machine_learning_algorithm
from own_utils import load_json
from cleaning import prepare_dataframe_from_db, process_time_series_data
from predictions import run_time_series_prediction_pipeline, process_model_machine_learning, evaluate_model
from own_utils import execute_concurrently
from own_utils import list_directories_by_depth
from itertools import product
from predictions import create_cv_hyperparameters_model
from ConfigManager import ConfigManager
from predictions import generate_cv_grid_regressor_sklearn

In [None]:
# hyperparameters_preprocessing = {
# #     "df": [df],
#     "resample_freq": ["60S", "30S"],
#     "aggregation_func": ["mean", "median"],
#     "interpolation_method": ["linear", "quadratic"],
#     "target_variable": ["00-eco2"],
#     "pivot": [True],
#     "pivot_index": [["timestamp", "id_device"]],
#     "pivot_columns": [["id_variable"]],
#     "pivot_values": [["value"]],
#     "subset_cols": [["timestamp", "id_device", "id_variable", "value"]],
#     "target_column_name": ["y"]
# }

# hyperparameters_model_space = {
# #     "tidy_data": [df_preprocessed],
#     "ini_train": ["2023-04-18 00:00:00+00:00", "2023-04-19 00:00:00+00:00"],
#     "fin_train": ["2023-04-25 00:00:00+00:00", "2023-04-26 00:00:00+00:00"],
#     "fin_test": ["2023-04-26 00:00:00+00:00"],
#     "model_sklearn_name": ["ARDRegression"],
#     "n_lags": [5, 10],
#     "n_leads": [5, 10],
#     "X_name_features": [['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres']],
#     "Y_name_features": ["y"],
#     "lag_columns": [['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres', 'y']],
#     "lead_columns": ["y"],
#     "scale_in_preprocessing": [True],
#     "name_time_column": ["timestamp"],
#     "save_preprocessing": [True],
#     "path_to_save_model": ["paper"],
#     "folder_name_model": ["ARDRegression"],
#     "folder_name_time_execution": ["execution-time-no-defined"],
#     "folder_name_preprocessed_data": ["preprocessed-data-to-use-in-model"],
#     "measure_time": [True],
#     "logger": [None],
#     "kwargs_save_object": [{"rename_if_exists": True}],
# }

# hyperparameters_specific_regressor_args = {
#     "ARDRegression": {
#         "max_iter": [200, 300],
#         "tol": [0.001, 0.01],
#         "alpha_1": [1e-06, 1e-05],
#     }
# }

# df = pd.read_csv(r'..\data\instants_data_saved\2023-07-04_12-09-22.csv')
# df = df.query("id_device == 'DBEM003'").reset_index(drop=True)
# prepare_dataframe_from_db_cols_for_query = [
#     "00-eco2",
#     "00-temp",
#     "01-hum",
#     "01-tvoc",
#     "02-pres",
#     "03-siaq",
#     "04-diaq"
# ]
# # Prepare dataframe with selected columns
# df = prepare_dataframe_from_db(
#     df=df,
#     cols_for_query=prepare_dataframe_from_db_cols_for_query,
# )

# test = create_cv_hyperparameters_model(
#         df = df,
#         hyperparameters_preprocessing = hyperparameters_preprocessing,
#         hyperparameters_model_space = hyperparameters_model_space,
#         hyperparameters_specific_regressor_args = hyperparameters_specific_regressor_args
# )

# test

Task with args {'ini_train': '2023-04-18 00:00:00+00:00', 'fin_train': '2023-04-25 00:00:00+00:00', 'fin_test': '2023-04-26 00:00:00+00:00', 'n_lags': 5, 'n_leads': 5, 'X_name_features': ['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres'], 'Y_name_features': 'y', 'lag_columns': ['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres', 'y'], 'lead_columns': 'y', 'scale_in_preprocessing': True, 'name_time_column': 'timestamp', 'save_preprocessing': True, 'path_to_save_model': 'paper', 'folder_name_model': 'ARDRegression', 'folder_name_time_execution': 'execution-time-no-defined', 'folder_name_preprocessed_data': 'preprocessed-data-to-use-in-model', 'measure_time': True, 'logger': None, 'kwargs_save_object': {'rename_if_exists': True}, 'tidy_data':                       timestamp id_device           y    00-temp     01-hum  \
0     2023-04-18 09:31:00+00:00   DBEM003  400.000000  18.570000  33.050000   
1     2023-04-18 09:32:00+00:00   DBEM003  400.000000  18.560000

[{'model': MultiOutputRegressor(estimator=ARDRegression(max_iter=300, tol=0.01)),
  'preprocessing_train': {'preprocess_columns': {'order': 1,
    'df':         04-diaq     01-hum   01-tvoc    03-siaq    00-temp     02-pres  \
    0     27.000000  33.050000  2.000000  25.000000  18.570000  934.700000   
    1     25.000000  33.000000  1.000000  25.000000  18.560000  934.720000   
    2     25.000000  33.030000  8.000000  25.000000  18.550000  934.700000   
    3     26.000000  33.090000  4.000000  25.000000  18.530000  934.660000   
    4     29.000000  33.050000  3.000000  26.000000  18.530000  934.680000   
    ...         ...        ...       ...        ...        ...         ...   
    9505  25.833333  32.765000  6.000000  25.166667  24.250000  938.323333   
    9506  25.666667  32.758333  2.833333  25.166667  24.250000  938.326667   
    9507  25.833333  32.758333  3.166667  25.500000  24.251667  938.323333   
    9508  25.166667  32.751667  3.833333  25.000000  24.253333  938.290

In [2]:
config_manager = ConfigManager("../config")

# Cargar los hiperparámetros y dominios
all_regressors_with_its_parameters_and_domains = config_manager.load_config(
    "models_parameters/metadata/all_regressors_with_its_parameters_and_domains"
)

distribution = "uniform"

# Selección de distribución por parámetro
distribution_per_param = {
    "ARDRegression": {"alpha_1": "log", "max_iter": "uniform"},
    "SVR": {"C": "log", "gamma": "log"}
}

# Límites personalizados
custom_limits = {
    "ARDRegression": {"alpha_1": [1e-6, 1e6], "max_iter": [10, 1000]},
    "SVR": {"C": [0.1, 100], "gamma": [1e-4, 1.0]}
}


# Longitudes de los parámetros
param_lengths = {
    "ARDRegression": {"alpha_1": 3, "max_iter": 3, "tol":3},
    "SVR": {"C": 3, "gamma": 2}
}

# Tipos forzados
forced_types = {
    "ARDRegression": {"max_iter": int},
    "SVR": {"C": float}
}

# Generación de CV grid para ARDRegression seleccionando ciertos hiperparámetros
cv_grid_ard = generate_cv_grid_regressor_sklearn(
    hyperparameter_dict=all_regressors_with_its_parameters_and_domains["ARDRegression"],
    distribution="uniform",
    param_lengths=param_lengths["ARDRegression"],
    custom_limits=custom_limits["ARDRegression"],
    distribution_per_param=distribution_per_param["ARDRegression"],
    selected_params=list(param_lengths["ARDRegression"].keys()),
    forced_types=forced_types["ARDRegression"]
)
cv_grid_ard = {"ARDRegression": cv_grid_ard}

# Generación de CV grid para SVR seleccionando ciertos hiperparámetros
cv_grid_svr = generate_cv_grid_regressor_sklearn(
    hyperparameter_dict=all_regressors_with_its_parameters_and_domains["SVR"],
    distribution=distribution,
    param_lengths=param_lengths["SVR"],
    custom_limits=custom_limits["SVR"],
    distribution_per_param=distribution_per_param["SVR"],
    selected_params=["C"],
    forced_types=forced_types["SVR"]
)
cv_grid_svr = {"SVR": cv_grid_svr}

# Imprimir resultados
print("ARDRegression CV Grid:")
print(cv_grid_ard)

print("\nSVR CV Grid:")
print(cv_grid_svr)

ARDRegression CV Grid:
{'ARDRegression': {'alpha_1': [2.0000000000000003e-06, 1.4142135623730951, 1000000.0], 'max_iter': [10, 505, 1000], 'tol': [0.0, 0.5, 1.0]}}

SVR CV Grid:
{'SVR': {'C': [0.100001, 3.162293471517151, 100.0]}}


In [12]:
hyperparameters_preprocessing = {
#     "df": [df],
    "resample_freq": ["60S", "30S"],
    "aggregation_func": ["mean", "median"],
    "interpolation_method": ["linear", "quadratic"],
    "target_variable": ["00-eco2"],
    "pivot": [True],
    "pivot_index": [["timestamp", "id_device"]],
    "pivot_columns": [["id_variable"]],
    "pivot_values": [["value"]],
    "subset_cols": [["timestamp", "id_device", "id_variable", "value"]],
    "target_column_name": ["y"]
}

hyperparameters_model_space = {
#     "tidy_data": [df_preprocessed],
    "ini_train": ["2023-04-18 00:00:00+00:00", "2023-04-19 00:00:00+00:00"],
    "fin_train": ["2023-04-25 00:00:00+00:00", "2023-04-26 00:00:00+00:00"],
    "fin_test": ["2023-04-26 00:00:00+00:00"],
    "model_sklearn_name": ["ARDRegression"],
    "n_lags": [5, 10],
    "n_leads": [5, 10],
    "X_name_features": [['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres']],
    "Y_name_features": ["y"],
    "lag_columns": [['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres', 'y']],
    "lead_columns": ["y"],
    "scale_in_preprocessing": [True],
    "name_time_column": ["timestamp"],
    "save_preprocessing": [True],
    "path_to_save_model": ["paper"],
    "folder_name_model": ["ARDRegression"],
    "folder_name_time_execution": ["execution-time-no-defined"],
    "folder_name_preprocessed_data": ["preprocessed-data-to-use-in-model"],
    "measure_time": [True],
    "logger": [None],
    "kwargs_save_object": [{"rename_if_exists": True}],
}

hyperparameters_specific_regressor_args = cv_grid_ard

df = pd.read_csv(r'..\data\instants_data_saved\2023-07-04_12-09-22.csv')
df = df.query("id_device == 'DBEM003'").reset_index(drop=True)
prepare_dataframe_from_db_cols_for_query = [
    "00-eco2",
    "00-temp",
    "01-hum",
    "01-tvoc",
    "02-pres",
    "03-siaq",
    "04-diaq"
]
# Prepare dataframe with selected columns
df = prepare_dataframe_from_db(
    df=df,
    cols_for_query=prepare_dataframe_from_db_cols_for_query,
)

test = create_cv_hyperparameters_model(
        df = df,
        hyperparameters_preprocessing = hyperparameters_preprocessing,
        hyperparameters_model_space = hyperparameters_model_space,
        hyperparameters_specific_regressor_args = hyperparameters_specific_regressor_args
)

test

Task with args {'ini_train': '2023-04-18 00:00:00+00:00', 'fin_train': '2023-04-25 00:00:00+00:00', 'fin_test': '2023-04-26 00:00:00+00:00', 'n_lags': 5, 'n_leads': 5, 'X_name_features': ['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres'], 'Y_name_features': 'y', 'lag_columns': ['04-diaq', '01-hum', '01-tvoc', '03-siaq', '00-temp', '02-pres', 'y'], 'lead_columns': 'y', 'scale_in_preprocessing': True, 'name_time_column': 'timestamp', 'save_preprocessing': True, 'path_to_save_model': 'paper', 'folder_name_model': 'ARDRegression', 'folder_name_time_execution': 'execution-time-no-defined', 'folder_name_preprocessed_data': 'preprocessed-data-to-use-in-model', 'measure_time': True, 'logger': None, 'kwargs_save_object': {'rename_if_exists': True}, 'tidy_data':                       timestamp id_device           y    00-temp     01-hum  \
0     2023-04-18 09:31:00+00:00   DBEM003  400.000000  18.570000  33.050000   
1     2023-04-18 09:32:00+00:00   DBEM003  400.000000  18.560000

[{'model': MultiOutputRegressor(estimator=ARDRegression(alpha_1=2.0000000000000003e-06,
                                               max_iter=10, tol=0.5)),
  'preprocessing_train': {'preprocess_columns': {'order': 1,
    'df':         04-diaq     01-hum   01-tvoc    03-siaq    00-temp     02-pres  \
    0     27.000000  33.050000  2.000000  25.000000  18.570000  934.700000   
    1     25.000000  33.000000  1.000000  25.000000  18.560000  934.720000   
    2     25.000000  33.030000  8.000000  25.000000  18.550000  934.700000   
    3     26.000000  33.090000  4.000000  25.000000  18.530000  934.660000   
    4     29.000000  33.050000  3.000000  26.000000  18.530000  934.680000   
    ...         ...        ...       ...        ...        ...         ...   
    9505  25.833333  32.765000  6.000000  25.166667  24.250000  938.323333   
    9506  25.666667  32.758333  2.833333  25.166667  24.250000  938.326667   
    9507  25.833333  32.758333  3.166667  25.500000  24.251667  938.3233

In [3]:
df = pd.read_csv(r'..\data\instants_data_saved\2023-07-04_12-09-22.csv')
# df = df.query("id_device == 'DBEM003'").reset_index(drop=True)
df

Unnamed: 0,id_data,id_device,id_sensor,id_variable,timestamp,value,unit,id_location
0,1,DBEM003,sWEA,00-temp,2023-04-18 09:31:00,18.57,ºC,
1,2,DBEM003,sWEA,00-temp,2023-04-18 09:32:00,18.56,ºC,
2,3,DBEM003,sWEA,00-temp,2023-04-18 09:33:00,18.55,ºC,
3,4,DBEM003,sWEA,00-temp,2023-04-18 09:34:00,18.53,ºC,
4,5,DBEM003,sWEA,00-temp,2023-04-18 09:35:00,18.53,ºC,
...,...,...,...,...,...,...,...,...
3477901,3477902,DBEM007,sAQU,01-tvoc,2023-05-11 13:25:00,179.00,ppb,
3477902,3477903,DBEM007,sAQU,01-tvoc,2023-05-11 14:10:00,200.00,ppb,
3477903,3477904,DBEM007,sAQU,01-tvoc,2023-05-11 14:40:00,206.00,ppb,
3477904,3477905,DBEM007,sAQU,01-tvoc,2023-05-11 15:49:00,213.00,ppb,
