In [39]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv  # Necesario para habilitar HalvingGridSearchCV
from sklearn.model_selection import HalvingGridSearchCV
import optuna

In [21]:
df_data = pd.read_csv("process_dataset/train_data_scaled_robust.csv")
response = np.load("process_dataset/y_train.npy")

In [33]:
# Definir las métricas a evaluar
scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1'
}

#### Optimización usando GridSearch

In [31]:
parameters = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

model = KNeighborsClassifier()
clf = GridSearchCV(model, parameters, cv=10, scoring=scoring, refit='recall')
clf.fit(df_data, response)
results = clf.cv_results_

print("Best Parameters:", clf.best_params_)
print("Best Accuracy Score:", clf.best_score_)

# Extraer las métricas de la mejor combinación de parámetros
best_index = clf.best_index_  # Índice de la mejor combinación
print("Metrics for best model:")
print("Accuracy:", results['mean_test_accuracy'][best_index])
print("Recall:", results['mean_test_recall'][best_index])
print("Precision:", results['mean_test_precision'][best_index])
print("F1 Score:", results['mean_test_f1'][best_index])


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Best Accuracy Score: 0.4869813759948289
Metrics for best model:
Accuracy: 0.5050884955752212
Recall: 0.4869813759948289
Precision: 0.4971935303492526
F1 Score: 0.49163991721166234


#### Optimización usando HalvingGridSearch 

In [23]:
parameters = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

model = KNeighborsClassifier()

# Configurar el HalvingGridSearchCV
clf = HalvingGridSearchCV(model, parameters)
clf.fit(df_data, response)

print("Best Parameters:", clf.best_params_)
print("Best Score:", clf.best_score_)

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'}
Best Score: 0.49612403100775193


#### Optimización bayesiana (Optuna)

In [44]:
results = []

def objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 3, 9)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan'])

    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric)

    accuracy = cross_val_score(model, df_data, response, cv=10, scoring='accuracy').mean()
    recall = cross_val_score(model, df_data, response, cv=10, scoring='recall').mean()
    precision = cross_val_score(model, df_data, response, cv=10, scoring='precision').mean()
    f1 = cross_val_score(model, df_data, response, cv=10, scoring='f1').mean()

    metrics = {
        "Model": f"KNN(n_neighbors={n_neighbors}, weights={weights}, metric={metric})",
        "Accuracy": accuracy,
        "Recall": recall,
        "Precision": precision,
        "F1": f1
    }
    results.append(metrics)

    return precision

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

df_results = pd.DataFrame(results)

print("Best Parameters:", study.best_params)
print("Best Recall F1:", study.best_value)

[I 2024-10-21 01:13:40,726] A new study created in memory with name: no-name-49eaa806-77e4-489e-8e70-9ad9c8bbedd7
[I 2024-10-21 01:13:41,313] Trial 0 finished with value: 0.48225444188008126 and parameters: {'n_neighbors': 6, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 0.48225444188008126.
[I 2024-10-21 01:13:42,482] Trial 1 finished with value: 0.4899224283634223 and parameters: {'n_neighbors': 4, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.4899224283634223.
[I 2024-10-21 01:13:42,966] Trial 2 finished with value: 0.48050445867597497 and parameters: {'n_neighbors': 7, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 1 with value: 0.4899224283634223.
[I 2024-10-21 01:13:43,937] Trial 3 finished with value: 0.49428318956048534 and parameters: {'n_neighbors': 7, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 3 with value: 0.49428318956048534.
[I 2024-10-21 01:13:44,847] Trial 4 finished with value: 0

Best Parameters: {'n_neighbors': 5, 'weights': 'uniform', 'metric': 'manhattan'}
Best Recall Score: 0.4972149920203851


In [46]:
df_results.sort_values(by="F1", ascending=False)

Unnamed: 0,Model,Accuracy,Recall,Precision,F1
77,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
71,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
20,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
21,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
22,"KNN(n_neighbors=5, weights=distance, metric=ma...",0.505088,0.486981,0.497194,0.491640
...,...,...,...,...,...
46,"KNN(n_neighbors=6, weights=uniform, metric=euc...",0.493584,0.300553,0.476347,0.367902
5,"KNN(n_neighbors=4, weights=uniform, metric=euc...",0.501770,0.294694,0.489922,0.367674
56,"KNN(n_neighbors=4, weights=uniform, metric=euc...",0.501770,0.294694,0.489922,0.367674
1,"KNN(n_neighbors=4, weights=uniform, metric=euc...",0.501770,0.294694,0.489922,0.367674
