# Solución etapa 4 - Training - Predict - Fallos v1

In [None]:
# Utilidades para print
from utils.print_utils import tabl, headr, titl
from utils.explore_utils import explr

In [102]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from skforecast.recursive import ForecasterRecursive
from skforecast.direct import ForecasterDirect

from skforecast.model_selection import backtesting_forecaster
from skforecast.datasets import fetch_dataset
from skforecast.model_selection import TimeSeriesFold
from skforecast.model_selection import grid_search_forecaster

In [None]:
version_to_load = 'v1'

In [None]:
# Cargar el dataset
csv_path = f'../data/preprocessed/preprocessed_data_{version_to_load}.csv'
print('... Loading:', csv_path, '...')
final_data = pd.read_csv(csv_path)

tabl(final_data)

In [None]:
final_data.info()

In [85]:
target_forecast_column = 'Horas_Operativas'
target_class_column = 'Fallo'

In [None]:
# Steps y lags
steps = 15
lags = 50

## Funciones procesado

In [None]:
# Preparar el dataset

def prep_equipo(equipo):
    equipo.drop(columns=['ID_Equipo', 'Tipo_Equipo', 'Modelo', 'Potencia_kW',
                'Horas_Recomendadas_Revision', 'Fabricante'], inplace=True)

    equipo['Fecha'] = pd.to_datetime(equipo['Fecha'], format='%Y-%m-%d')

    # fecha como índice
    equipo = equipo.set_index('Fecha')

    # Eliminar índices duplicados (mantener el primero)
    equipo_clean = equipo[~equipo.index.duplicated(keep='first')]
    equipo_clean[equipo_clean.index.duplicated()]

    # Convierte Timeseries a frecuencia especificada.
    equipo_fq = equipo_clean.asfreq('D', fill_value=0)

    # print(equipo_fq.info())

    return equipo_fq

In [None]:
# Separación datos train-test

def sep_train_test(equipo_fq, steps, doPlot=True):
    steps = steps
    datos_train = equipo_fq[:-steps]
    datos_test = equipo_fq[-steps:]
    # print(f"Fechas train : {datos_train.index.min()} --- {datos_train.index.max()}  (n={len(datos_train)})")
    # print(f"Fechas test  : {datos_test.index.min()} --- {datos_test.index.max()}  (n={len(datos_test)})")

    if (doPlot):
        fig, ax = plt.subplots(figsize=(16, 5))
        datos_train['Horas_Operativas'].plot(ax=ax, label='train')
        datos_test['Horas_Operativas'].plot(ax=ax, label='test')
        ax.legend()

    return datos_train, datos_test

In [None]:
# Búsqueda de hiperparámetros: grid search

def search_best_forecaster(steps, lags_grid, param_grid, datos_train, exogs_train, target_column):
    forecaster = ForecasterRecursive(
        regressor=Ridge(random_state=123),
        transformer_y=StandardScaler(),
        lags=30
    )

    # Particiones de entrenamiento y validación
    cv = TimeSeriesFold(
        steps=steps,
        initial_train_size=int(len(datos_train) * 0.5),
        refit=False,
        fixed_train_size=False,
    )

    resultados_grid = grid_search_forecaster(
        forecaster=forecaster,
        y=datos_train[target_column],
        cv=cv,
        param_grid=param_grid,
        lags_grid=lags_grid,
        metric='mean_squared_error',
        return_best=True,
        n_jobs='auto',
        verbose=False,
        exog=exogs_train,
    )

    # Resultados de la búsqueda de hiperparámetros
    bestfc = resultados_grid.loc[0]

    return bestfc

In [None]:
# Generar modelo final

def gen_finalModel(steps, best_lags, best_params, datos_train, exogs_train, target_column):
    # Crear y entrenar forecaster final
    forecaster = ForecasterRecursive(
        regressor=Ridge(alpha=best_params['alpha'], random_state=123),
        transformer_y=StandardScaler(),
        lags=best_lags
    )

    forecaster.fit(y=datos_train[target_column], exog=exogs_train)

    return forecaster

In [None]:
# Evaluar error

def get_error(target_column, predicciones, equipo_fq, datos_test):
    # Error test
    error_mse = mean_squared_error(
        y_true=datos_test[target_column],
        y_pred=predicciones
    )

    data_var = equipo_fq[target_column].var()

    mse2var = round(error_mse/data_var*100, 2)

    print(f"Error de test (mse): {error_mse}")
    print(f"Varianza datos: {data_var}")
    print(f"mse2var: {mse2var}%")

    return mse2var

In [None]:
# Ingeniería de características en residuos
def create_features_w_residuals(data, residuals, lags):
    features = pd.DataFrame(index=data.index)
    for lag in range(1, lags + 1):
        features[f'lag_{lag}'] = data.shift(lag)

    features['residuals'] = residuals
    features.dropna(inplace=True)

    return features

## Un equipo

In [51]:
eq = 1

best_lags = 50
best_params = {'alpha': 0.1}

In [52]:
# Filtrar el df del equipo
equipo = final_data[final_data['ID_Equipo'] == eq].copy()

# Preparar el dataset para el equipo seleccionado
equipo_fq = prep_equipo(equipo)

equipo_fq

Unnamed: 0_level_0,Temperatura_C,Vibracion_mm_s,Horas_Operativas,Fallo
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,34.749896,2.136812,614,0.0
2021-01-02,0.000000,0.000000,0,0.0
2021-01-03,56.200558,1.529395,658,0.0
2021-01-04,70.555608,3.411009,664,0.0
2021-01-05,97.509000,0.618757,683,0.0
...,...,...,...,...
2024-12-27,38.587761,2.748949,808,0.0
2024-12-28,90.397197,5.382844,812,0.0
2024-12-29,21.104524,5.686564,824,0.0
2024-12-30,30.559628,7.948501,828,0.0


In [69]:
# Exógenos para el modelo
exogs = equipo_fq.drop(columns=target_forecast_column)
exogs_train = equipo_fq.drop(columns=target_forecast_column)[:-steps]
exogs_test = equipo_fq.drop(columns=target_forecast_column)[-steps:]

In [66]:
exogs[:-steps]

Unnamed: 0_level_0,Temperatura_C,Vibracion_mm_s,Fallo
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-01-01,34.749896,2.136812,0.0
2021-01-02,0.000000,0.000000,0.0
2021-01-03,56.200558,1.529395,0.0
2021-01-04,70.555608,3.411009,0.0
2021-01-05,97.509000,0.618757,0.0
...,...,...,...
2024-12-12,47.524678,0.731239,0.0
2024-12-13,48.043295,8.836478,0.0
2024-12-14,72.829938,0.553675,0.0
2024-12-15,77.041463,6.231991,0.0


In [71]:
# modelo final
final_model = gen_finalModel(steps, best_lags, best_params, equipo_fq[:-steps], exogs_train, target_forecast_column)

# Predicciones
predicciones = final_model.predict(steps=steps, exog=exogs_test)

In [77]:
# Crear y entrenar forecaster final
forecaster = ForecasterRecursive(
    regressor=Ridge(alpha=best_params['alpha'], random_state=123),
    transformer_y=StandardScaler(),
    lags=best_lags
)

forecaster.fit(y=equipo_fq[target_forecast_column])

In [78]:
# Predicciones
predicciones = forecaster.predict(steps=len(equipo_fq))

In [80]:
predicciones.shape

(1461,)

In [81]:
# residuos
residuals = equipo_fq[target_forecast_column] - predicciones.values

residuals

Fecha
2021-01-01    572.291471
2021-01-02    -57.587057
2021-01-03    583.999992
2021-01-04    577.053389
2021-01-05    582.901752
                 ...    
2024-12-27    520.634529
2024-12-28    524.634529
2024-12-29    536.634529
2024-12-30    540.634529
2024-12-31   -265.365471
Freq: D, Name: Horas_Operativas, Length: 1461, dtype: float64

## Clasificador

In [99]:
X = create_features_w_residuals(equipo_fq[target_forecast_column], residuals, lags)
y = equipo_fq[target_class_column].iloc[len(equipo_fq) - len(X):]

tabl(X.tail())
y

Fecha                  lag_1    lag_2    lag_3    lag_4    lag_5    lag_6    lag_7    lag_8    lag_9    lag_10    lag_11    lag_12    lag_13    lag_14    lag_15    lag_16    lag_17    lag_18    lag_19    lag_20    lag_21    lag_22    lag_23    lag_24    lag_25    lag_26    lag_27    lag_28    lag_29    lag_30    lag_31    lag_32    lag_33    lag_34    lag_35    lag_36    lag_37    lag_38    lag_39    lag_40    lag_41    lag_42    lag_43    lag_44    lag_45    lag_46    lag_47    lag_48    lag_49    lag_50    residuals
-------------------  -------  -------  -------  -------  -------  -------  -------  -------  -------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------  ----

Fecha
2021-02-20    0.0
2021-02-21    0.0
2021-02-22    0.0
2021-02-23    0.0
2021-02-24    0.0
             ... 
2024-12-27    0.0
2024-12-28    0.0
2024-12-29    0.0
2024-12-30    0.0
2024-12-31    1.0
Freq: D, Name: Fallo, Length: 1411, dtype: float64

In [92]:
# Dividir en entrenamiento y prueba

X_train, X_test = X[:-steps], X[-steps:]
y_train, y_test = y[:-steps], y[-steps:]

In [93]:
# Ajustar bosque aleatorio en residuos
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [100]:
# Generar predicciones

y_pred = rf_model.predict(X_test)

print(y_pred)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [103]:
# evaluar el modelo
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc}')

Accuracy: 1.0
