# Solución etapa 3 - Training

In [2]:
# Utilidades para print
from utils.print_utils import tabl, headr, titl

In [3]:
# Version tag of the preprocessed dataset; it is interpolated into the input CSV
# path and into the results file name further down.
version_to_load = 'v3'

In [4]:
# Load the preprocessed dataset that matches the selected version tag.
import pandas as pd

csv_path = f'../data/preprocessed/preprocessed_data_{version_to_load}.csv'
print('... Loading:', csv_path, '...')

final_data = pd.read_csv(csv_path)

# Show the loaded frame through the project's table helper.
tabl(final_data)

... Loading: ../data/preprocessed/preprocessed_data_v3.csv ...
    Ubicacion       Fallo    Promedio_Horas_Entre_Fallos_por_mes    Promedio_Temperatura_C    Promedio_Vibracion_mm_s    Promedio_Horas_Operativas  Tipo_Equipo    Fabricante    Modelo      Potencia_kW    Horas_Recomendadas_Revision
--  ------------  -------  -------------------------------------  ------------------------  -------------------------  ---------------------------  -------------  ------------  --------  -------------  -----------------------------
 0  Planta Sur          1                                 133200                    103.35                       3.9                         37071  Motor          ABB           Z300               1706                           4455
 1  Planta Este         1                                 298800                     57.58                       3.65                          250  Generador      GE            Y200               3432                           8213
 2  Plant

In [5]:
# Name of the label column; every other column is treated as a predictor.
target_column = 'Fallo'

## Preprocesar

In [6]:
# Separate the target vector (y) from the predictor matrix (X).
y = final_data[target_column]
X = final_data.drop(target_column, axis=1)

# Show the predictors only, under a title banner.
print(titl('X:'))
tabl(X)


[1m[4m[94m** X: **[0m

    Ubicacion       Promedio_Horas_Entre_Fallos_por_mes    Promedio_Temperatura_C    Promedio_Vibracion_mm_s    Promedio_Horas_Operativas  Tipo_Equipo    Fabricante    Modelo      Potencia_kW    Horas_Recomendadas_Revision
--  ------------  -------------------------------------  ------------------------  -------------------------  ---------------------------  -------------  ------------  --------  -------------  -----------------------------
 0  Planta Sur                                   133200                    103.35                       3.9                         37071  Motor          ABB           Z300               1706                           4455
 1  Planta Este                                  298800                     57.58                       3.65                          250  Generador      GE            Y200               3432                           8213
 2  Planta Norte                                  18000                     71.2

In [7]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Candidatos

1. LogisticRegression
2. GradientBoostingClassifier
3. SVC (kernel='linear')

In [8]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.compose import make_column_selector as selector


### Pipelines

In [9]:
# Numerical features: every column whose dtype is not `object`.
numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(X)

# Categorical features: every column whose dtype is `object`.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(X)

# Report which columns fell into each group.
print(headr("Numerical columns"), numerical_columns)
print(headr("Categorical columns"), categorical_columns)


[4mNumerical columns[0m
 ['Promedio_Horas_Entre_Fallos_por_mes', 'Promedio_Temperatura_C', 'Promedio_Vibracion_mm_s', 'Promedio_Horas_Operativas', 'Potencia_kW', 'Horas_Recomendadas_Revision']

[4mCategorical columns[0m
 ['Ubicacion', 'Tipo_Equipo', 'Fabricante', 'Modelo']


In [10]:
# Categorical features: one-hot encode. `handle_unknown='ignore'` keeps transform
# from failing on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        # Imputation step currently disabled:
        # ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Continuous features: standardize.
numerical_transformer = Pipeline(
    steps=[
        # Imputation step currently disabled:
        # ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns),
    ]
)

In [11]:
# Baseline candidates: shared preprocessing followed by a classifier with its
# default hyperparameters. Only LogisticRegression is enabled at this stage.
pipelines = {
    "LogisticRegression": Pipeline(
        [
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression()),
        ]
    ),
    # Disabled candidates:
    # "GradientBoostingClassifier": Pipeline([('preprocessor', preprocessor),('classifier', GradientBoostingClassifier())]),
    # "SVC": Pipeline([('preprocessor', preprocessor),('classifier', SVC(kernel='linear'))]),
    # "SGDClassifier": Pipeline([('preprocessor', preprocessor),('classifier', SGDClassifier())]),
}

In [12]:
# Cargamos funciones para best_model
from utils.best_model_utils import generate_learning_curves, generate_validation_curves, cv_train

### CV

In [13]:
# Cross-validation scheme: 40 random shuffle splits, each holding out 20% of
# the rows for validation, seeded for reproducibility.
cvss = ShuffleSplit(
    n_splits=40,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Run the project's CV training helper on every candidate pipeline and collect
# what it returns, keyed by candidate name.
results = {}

for name, pipeline in pipelines.items():
    print(headr(f"Entrenar {name}"))
    results[name] = cv_train(
        name, pipeline, cvss,
        X_train, y_train,
        X_test, y_test,
    )

# One column per candidate.
print(titl("Resultados:"))
results_df = pd.DataFrame(data=results)
tabl(results_df)


[4mEntrenar LogisticRegression[0m

test score (mean-std): 0.98 - 0.01
train score (mean-std): 0.98 - 0.00
params: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

[1m[4m[94m** RESULTADOS: **[0m

       LogisticRegression
---  --------------------
acc                  0.98
f1                   0.99
(2, 1)


### Curva de aprendizaje


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Learning curves: five training-set fractions evenly spaced from 10% to 100%
# (both ends included), drawn for every candidate pipeline.
train_sizes = np.linspace(0.1, 1.0, 5)

for pipeline_name, pipeline_obj in pipelines.items():
    generate_learning_curves(
        pipeline_name, pipeline_obj, X, y, train_sizes, cvss
    )

### Curva validación

In [None]:
# Validation curve for LogisticRegression over its `C` hyperparameter.
pname = 'LogisticRegression'

# Values of C to sweep (alternative narrower range kept for reference).
Cs = [0.1, 1, 10, 100]
# Cs = [5, 10, 15]

generate_validation_curves(
    pname, pipelines[pname], X, y, 'classifier__C', Cs, cvss
)

## Afinar hiperparámetros

#### LogisticRegression

In [None]:
from sklearn.model_selection import GridSearchCV

pname = 'LogisticRegression'

# Search space over the pipeline's `classifier` step.
# The solver names must match scikit-learn's spelling exactly ('newton-cg',
# 'newton-cholesky'); a misspelled name is not a valid solver, so that
# candidate would never actually be evaluated.
param_grid = {
    'classifier__C': [5, 10, 15],
    # 'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
    'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    # 'classifier__max_iter': [1,10,100,1000]
}

# Exhaustive search scored by accuracy, reusing the notebook's shuffle-split CV.
grid_search = GridSearchCV(
    pipelines[pname], param_grid, cv=cvss, scoring="accuracy")

# Tune on the training split only; the held-out test split stays untouched.
grid_search.fit(X_train, y_train)

print(headr(pname))
print("Mejores hiperparámetros:", grid_search.best_params_)
print(f"Mejor accuracy: {grid_search.best_score_:.2f}")

### re-entrenar + re-evaluar


In [None]:
# Classifiers to re-evaluate: LogisticRegression with fixed C=5 / lbfgs
# settings, plus two additional linear models with default settings.
tuned_classifiers = {
    "LogisticRegression": LogisticRegression(C=5, solver='lbfgs'),
    # "GradientBoostingClassifier": GradientBoostingClassifier(),
    "SVC": SVC(kernel='linear'),
    "SGDClassifier": SGDClassifier(),
}

# Wrap each classifier behind the shared preprocessing step.
pipelines = {
    clf_name: Pipeline([('preprocessor', preprocessor), ('classifier', clf)])
    for clf_name, clf in tuned_classifiers.items()
}

# Re-run the CV training helper on every pipeline, starting from an empty
# results mapping.
results = {}

for name, pipeline in pipelines.items():
    print(headr(f"Entrenar {name}"))
    results[name] = cv_train(
        name, pipeline, cvss,
        X_train, y_train,
        X_test, y_test,
    )

## Comparar resultados

In [None]:
# Side-by-side comparison table: one column per candidate.
print(titl("Resultados:"))

results_df = pd.DataFrame(data=results)
tabl(results_df)

In [None]:
# Persist the comparison table for this dataset version.
result_path = f'../results/result_{version_to_load}.csv'

# Keep the index when writing: it holds the metric names (e.g. acc, f1) that
# identify each row. Writing with index=False would leave the saved values
# unlabeled.
results_df.to_csv(result_path, index=True, index_label='metric')

## Decisión
> Ninguno

- Muy por debajo de lo que se necesita
- Una columna con un peso descompensado en el resultado
- Se debe mejorar enfocando hacia interrelaciones a nivel de mes