In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv  # Necesario para habilitar HalvingGridSearchCV
from sklearn.model_selection import HalvingGridSearchCV
import optuna

In [9]:
df_data = pd.read_csv("process_dataset/train_data_scaled_robust.csv")
response = np.load("process_dataset/y_train.npy")

In [10]:
# Definir las métricas a evaluar
scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 4, 5, 7, 9, 11, 15]
}

#### Optimización usando GridSearch

In [11]:
model = DecisionTreeClassifier()
clf = GridSearchCV(model, parameters, cv=10, scoring=scoring, refit='recall')
clf.fit(df_data, response)
results = clf.cv_results_

print("Best Parameters:", clf.best_params_)
print("Best Accuracy Score:", clf.best_score_)

# Extraer las métricas de la mejor combinación de parámetros
best_index = clf.best_index_  # Índice de la mejor combinación
print("Metrics for best model:")
print("Accuracy:", results['mean_test_accuracy'][best_index])
print("Recall:", results['mean_test_recall'][best_index])
print("Precision:", results['mean_test_precision'][best_index])
print("F1 Score:", results['mean_test_f1'][best_index])


Best Parameters: {'criterion': 'entropy', 'max_depth': 7, 'splitter': 'random'}
Best Accuracy Score: 0.4883024279885266
Metrics for best model:
Accuracy: 0.4984513274336283
Recall: 0.4883024279885266
Precision: 0.48995989758832686
F1 Score: 0.4867925227930125


#### Optimización usando HalvingGridSearch 

In [22]:
model = DecisionTreeClassifier()

# Configurar el HalvingGridSearchCV
clf = HalvingGridSearchCV(model, parameters)
clf.fit(df_data, response)

print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)

Best Parameters: {'criterion': 'log_loss', 'max_depth': 7, 'splitter': 'best'}
Best Score: 0.5047724750277469


  _data = np.array(data, dtype=dtype, copy=copy,


#### Optimización bayesiana (Optuna)

In [13]:
results = []

def objective(trial):
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']) 
    splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    max_depth = trial.suggest_int('max_depth', 3, 15) 

    model = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth)

    accuracy = cross_val_score(model, df_data, response, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, df_data, response, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, df_data, response, cv=10, scoring='precision').mean()
    f1 = cross_val_score(model, df_data, response, cv=10, scoring='f1').mean()

    metrics = {
        "Model": f"DT(Criterion={criterion}, splitter={splitter}, max_depth={max_depth})",
        "Accuracy": accuracy,
        "Recall": recall,
        "Precision": precision,
        "F1": f1
    }
    results.append(metrics)

    return precision  # Retorna una métrica que deseas maximizar

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

df_results = pd.DataFrame(results)

print("Best Parameters:", study.best_params)
print("Best Precision:", study.best_value)

[I 2024-10-22 12:38:15,279] A new study created in memory with name: no-name-4ed057df-284b-456b-bf71-c52a7a67ab10
[I 2024-10-22 12:38:15,742] Trial 0 finished with value: 0.4873661193420916 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 10}. Best is trial 0 with value: 0.4873661193420916.
[I 2024-10-22 12:38:15,998] Trial 1 finished with value: 0.48086501386361746 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 4}. Best is trial 0 with value: 0.4873661193420916.
[I 2024-10-22 12:38:17,618] Trial 2 finished with value: 0.47663116766716296 and parameters: {'criterion': 'log_loss', 'splitter': 'best', 'max_depth': 9}. Best is trial 0 with value: 0.4873661193420916.
[I 2024-10-22 12:38:18,829] Trial 3 finished with value: 0.4826325069513038 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 8}. Best is trial 0 with value: 0.4873661193420916.
[I 2024-10-22 12:38:20,078] Trial 4 finished with value: 0.48452376278001114 an

Best Parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 15}
Best Precision: 0.5040357395224253


In [23]:
df_results.sort_values(by="Recall", ascending=False)

Unnamed: 0,Model,Accuracy,Recall,Precision,F1
78,"DT(Criterion=log_loss, splitter=random, max_de...",0.508628,0.494251,0.491072,0.463320
24,"DT(Criterion=entropy, splitter=random, max_dep...",0.508628,0.491930,0.486073,0.489503
88,"DT(Criterion=gini, splitter=random, max_depth=15)",0.495354,0.486549,0.492624,0.490145
23,"DT(Criterion=entropy, splitter=random, max_dep...",0.499779,0.482028,0.483970,0.468985
22,"DT(Criterion=entropy, splitter=random, max_dep...",0.494690,0.479326,0.496971,0.500942
...,...,...,...,...,...
68,"DT(Criterion=log_loss, splitter=best, max_dept...",0.494469,0.354862,0.484882,0.404644
44,"DT(Criterion=gini, splitter=best, max_depth=7)",0.505310,0.353935,0.499109,0.408782
37,"DT(Criterion=gini, splitter=best, max_depth=6)",0.499558,0.349939,0.491443,0.394841
45,"DT(Criterion=gini, splitter=best, max_depth=6)",0.500000,0.349039,0.490017,0.394825
