In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv  # Necesario para habilitar HalvingGridSearchCV
from sklearn.model_selection import HalvingGridSearchCV
import optuna

In [5]:
df_data = pd.read_csv("process_dataset/train_data_scaled_robust.csv")
response = np.load("process_dataset/y_train.npy")

In [6]:
# Definir las métricas a evaluar
scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 4, 5, 7, 9, 11, 15]
}

#### Optimización usando GridSearch

In [7]:
model = DecisionTreeClassifier()
clf = GridSearchCV(model, parameters, cv=10, scoring=scoring, refit='recall')
clf.fit(df_data, response)
results = clf.cv_results_

print("Best Parameters:", clf.best_params_)
print("Best Accuracy Score:", clf.best_score_)

# Extraer las métricas de la mejor combinación de parámetros
best_index = clf.best_index_  # Índice de la mejor combinación
print("Metrics for best model:")
print("Accuracy:", results['mean_test_accuracy'][best_index])
print("Recall:", results['mean_test_recall'][best_index])
print("Precision:", results['mean_test_precision'][best_index])
print("F1 Score:", results['mean_test_f1'][best_index])


Best Parameters: {'criterion': 'entropy', 'max_depth': 15, 'splitter': 'random'}
Best Accuracy Score: 0.49598230517513037
Metrics for best model:
Accuracy: 0.5044247787610621
Recall: 0.49598230517513037
Precision: 0.4959158014887731
F1 Score: 0.4956212511949172


#### Optimización usando HalvingGridSearch 

In [8]:
model = DecisionTreeClassifier()

# Configurar el HalvingGridSearchCV
clf = HalvingGridSearchCV(model, parameters)
clf.fit(df_data, response)

print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)

Best Parameters: {'criterion': 'gini', 'max_depth': 5, 'splitter': 'best'}
Best Score: 0.5036625971143175


  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


#### Optimización bayesiana (Optuna)

In [9]:
results = []

def objective(trial):
    criterion = trial.suggest_int('criterion', ['gini', 'entropy', 'log_loss'])
    splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    max_depth = trial.suggest_categorical('max_depth', 3, 15)

    model = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth)


    accuracy = cross_val_score(model, df_data, response, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, df_data, response, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, df_data, response, cv=10, scoring='precision').mean()
    f1 = cross_val_score(model, df_data, response, cv=10, scoring='f1').mean()

    metrics = {
        "Model": f"DT(Criterion={criterion}, splitter={splitter}, max_depth={max_depth})",
        "Accuracy": accuracy,
        "Recall": recall,
        "Precision": precision,
        "F1": f1
    }
    results.append(metrics)

    return precision

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

df_results = pd.DataFrame(results)

print("Best Parameters:", study.best_params)
print("Best Recall F1:", study.best_value)

[I 2024-10-21 11:27:09,680] A new study created in memory with name: no-name-bc075664-8976-49e0-840f-05b5419bcbb2
[W 2024-10-21 11:27:09,682] Trial 0 failed with parameters: {} because of the following error: TypeError("Trial.suggest_int() missing 1 required positional argument: 'high'").
Traceback (most recent call last):
  File "/home/nsoto/miniconda3/envs/ml/lib/python3.13/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_11423/1173875558.py", line 4, in objective
    criterion = trial.suggest_int('criterion', ['gini', 'entropy', 'log_loss'])
  File "/home/nsoto/miniconda3/envs/ml/lib/python3.13/site-packages/optuna/_convert_positional_args.py", line 83, in converter_wrapper
    return func(**kwargs)
TypeError: Trial.suggest_int() missing 1 required positional argument: 'high'
[W 2024-10-21 11:27:09,684] Trial 0 failed with value None.


TypeError: Trial.suggest_int() missing 1 required positional argument: 'high'

In [46]:
df_results.sort_values(by="F1", ascending=False)

Unnamed: 0,Model,Accuracy,Recall,Precision,F1
77,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
71,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
20,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
21,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
22,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
...,...,...,...,...,...
46,"KNN(n_neighbors=6, weights=uniform, metric=euc...",0.493584,0.300553,0.476347,0.367902
5,"KNN(n_neighbors=4, weights=uniform, metric=euc...",0.501770,0.294694,0.489922,0.367674
56,"KNN(n_neighbors=4, weights=uniform, metric=euc...",0.501770,0.294694,0.489922,0.367674
1,"KNN(n_neighbors=4, weights=uniform, metric=euc...",0.501770,0.294694,0.489922,0.367674
