In [21]:
import os
import numpy as np
import pandas as pd

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import train_test_split

import joblib

In [22]:
print("="*80)
print("1. CARGA DE DATOS")
print("="*80)

DATA_DIRS = ["../Data", "../datos", "./Data", "./datos"]
CANDIDATE_FILES = {
    "train": ["train.csv", "train_df.csv"],
    "val":   ["validation.csv", "val.csv", "valid.csv"],
    "test":  ["test.csv", "test_df.csv"],
    "one":   ["Food_Delivery_Times.csv", "Food_Delivery_times.csv"]
}

def find_path(kind):
    for d in DATA_DIRS:
        for f in CANDIDATE_FILES[kind]:
            p = os.path.join(d, f)
            if os.path.exists(p):
                return p
    return None

train_path = find_path("train")
val_path   = find_path("val")
test_path  = find_path("test")
one_path   = find_path("one")

if train_path and val_path and test_path:
    train_df = pd.read_csv(train_path)
    validation_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)
else:
    assert one_path is not None, "No se encontraron CSV; ajusta rutas o nombres."
    df = pd.read_csv(one_path)
    num_df = df.select_dtypes(include=[np.number]).dropna()
    train_df, tmp_df = train_test_split(num_df, test_size=0.4, random_state=42)
    validation_df, test_df = train_test_split(tmp_df, test_size=0.5, random_state=42)

feature_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()

train_x = train_df[feature_cols].dropna().to_numpy()
validation_x = validation_df[feature_cols].dropna().to_numpy()
test_x = test_df[feature_cols].dropna().to_numpy()

print(f"\nColumnas numéricas usadas ({len(feature_cols)}): {feature_cols}")
print(f"Train: {train_x.shape} | Validation: {validation_x.shape} | Test: {test_x.shape}")

1. CARGA DE DATOS

Columnas numéricas usadas (5): ['Order_ID', 'Distance_km', 'Preparation_Time_min', 'Courier_Experience_yrs', 'Delivery_Time_min']
Train: (582, 5) | Validation: (194, 5) | Test: (194, 5)


In [23]:
def compute_centroids(X, labels):
    ks = np.unique(labels)
    return np.vstack([X[labels == k].mean(axis=0) for k in ks])

def assign_by_nearest_centroid(X, centroids):
    d = np.sqrt(((X[:, None, :] - centroids[None, :, :])**2).sum(axis=2))
    return d.argmin(axis=1)

In [24]:
print("\nSe va a realizar un experimento donde se varían hiperparámetros y se busca la mejor solución.")

results = []
n_clusters_values = [2, 3, 4, 5, 6, 8, 10]
linkage_values    = ["ward", "complete", "average", "single"]

for n_clusters in n_clusters_values:
    for linkage in linkage_values:
        metric = "euclidean"
        try:
            model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage, metric=metric)
            train_labels = model.fit_predict(train_x)
            centroids = compute_centroids(train_x, train_labels)
            val_labels = assign_by_nearest_centroid(validation_x, centroids)
            if len(np.unique(val_labels)) < 2:
                print(f"[SKIP] k={n_clusters:2d} | {linkage:8s} -> 1 solo cluster en validación")
                continue
            sil = silhouette_score(validation_x, val_labels, metric="euclidean")
            db  = davies_bouldin_score(validation_x, val_labels)
            results.append({
                "n_clusters": n_clusters,
                "linkage": linkage,
                "metric": metric,
                "val_silhouette": sil,
                "val_davies_bouldin": db
            })
            print(f"[OK] k={n_clusters:2d} | {linkage:8s} | Sil={sil:.4f} | DB={db:.4f}")
        except Exception as e:
            print(f"[FAIL] k={n_clusters:2d} | {linkage:8s} | {e}")

if not results:
    raise RuntimeError("No se obtuvieron resultados de validación. Revisa columnas numéricas o parámetros.")


Se va a realizar un experimento donde se varían hiperparámetros y se busca la mejor solución.
[OK] k= 2 | ward     | Sil=0.6232 | DB=0.4967
[OK] k= 2 | complete | Sil=0.6208 | DB=0.4851
[OK] k= 2 | average  | Sil=0.6217 | DB=0.4871
[OK] k= 2 | single   | Sil=0.5568 | DB=0.4890
[OK] k= 3 | ward     | Sil=0.5680 | DB=0.5194
[OK] k= 3 | complete | Sil=0.5688 | DB=0.5142
[OK] k= 3 | average  | Sil=0.5806 | DB=0.5112
[OK] k= 3 | single   | Sil=0.4312 | DB=0.5559
[OK] k= 4 | ward     | Sil=0.5403 | DB=0.5457
[OK] k= 4 | complete | Sil=0.5254 | DB=0.5570
[OK] k= 4 | average  | Sil=0.5323 | DB=0.5566
[OK] k= 4 | single   | Sil=0.4973 | DB=0.5594
[OK] k= 5 | ward     | Sil=0.5194 | DB=0.5729
[OK] k= 5 | complete | Sil=0.5112 | DB=0.5780
[OK] k= 5 | average  | Sil=0.5001 | DB=0.5836
[OK] k= 5 | single   | Sil=0.3549 | DB=0.6081
[OK] k= 6 | ward     | Sil=0.4809 | DB=0.6116
[OK] k= 6 | complete | Sil=0.4850 | DB=0.6004
[OK] k= 6 | average  | Sil=0.4752 | DB=0.5995
[OK] k= 6 | single   | Sil=0.31

In [25]:
print("\nResultados en VALIDATION (ordenados por Silhouette desc y DB asc):")
results_df = (
    pd.DataFrame(results)
    .sort_values(by=["val_silhouette", "val_davies_bouldin"], ascending=[False, True])
    .reset_index(drop=True)
)
results_df


Resultados en VALIDATION (ordenados por Silhouette desc y DB asc):


Unnamed: 0,n_clusters,linkage,metric,val_silhouette,val_davies_bouldin
0,2,ward,euclidean,0.623181,0.496659
1,2,average,euclidean,0.621741,0.487138
2,2,complete,euclidean,0.62082,0.485075
3,3,average,euclidean,0.580607,0.511157
4,3,complete,euclidean,0.568849,0.514189
5,3,ward,euclidean,0.567983,0.519433
6,2,single,euclidean,0.556752,0.489046
7,4,ward,euclidean,0.54029,0.545701
8,4,average,euclidean,0.532342,0.55664
9,4,complete,euclidean,0.525413,0.556966


In [26]:
print("\nSelección del mejor conjunto de hiperparámetros según VALIDATION:")
best_params = results_df.iloc[0]
best_params


Selección del mejor conjunto de hiperparámetros según VALIDATION:


n_clusters                    2
linkage                    ward
metric                euclidean
val_silhouette         0.623181
val_davies_bouldin     0.496659
Name: 0, dtype: object

In [27]:
print("\nSe entrena el modelo con los mejores parámetros.")
best_model = AgglomerativeClustering(
    n_clusters=int(best_params["n_clusters"]),
    linkage=best_params["linkage"],
    metric=best_params["metric"]
)

trainval_x = np.vstack([train_x, validation_x])
trainval_labels = best_model.fit_predict(trainval_x)
trainval_centroids = compute_centroids(trainval_x, trainval_labels)
print("Entrenamiento final completado.")


Se entrena el modelo con los mejores parámetros.
Entrenamiento final completado.


In [28]:
print('\nFinalmente se usan los datos de "test" para validar las métricas del modelo.')

test_labels = assign_by_nearest_centroid(test_x, trainval_centroids)
test_sil = silhouette_score(test_x, test_labels, metric="euclidean")
test_db  = davies_bouldin_score(test_x, test_labels)

print(f"\nTest Silhouette: {test_sil:.6f}")
print(f"Test Davies-Bouldin: {test_db:.6f}")


Finalmente se usan los datos de "test" para validar las métricas del modelo.

Test Silhouette: 0.570961
Test Davies-Bouldin: 0.575263


In [29]:
print("\nComo ejemplo: se tiene un nuevo registro y se debe predecir su cluster.")

new_example = {c: [train_df[c].median()] for c in feature_cols}
new_df = pd.DataFrame(new_example)
new_x = new_df[feature_cols].to_numpy()

pred_cluster = assign_by_nearest_centroid(new_x, trainval_centroids)[0]
print("Cluster asignado al nuevo registro:", int(pred_cluster))
new_df


Como ejemplo: se tiene un nuevo registro y se debe predecir su cluster.
Cluster asignado al nuevo registro: 0


Unnamed: 0,Order_ID,Distance_km,Preparation_Time_min,Courier_Experience_yrs,Delivery_Time_min
0,533.0,9.675,17.0,5.0,55.0


In [30]:
print("\nPara guardar el modelo y poder usarlo después con joblib.")

artifact = "agnes_model.joblib"
payload = {
    "params": {
        "n_clusters": int(best_params["n_clusters"]),
        "linkage": best_params["linkage"],
        "metric": best_params["metric"]
    },
    "centroids": trainval_centroids,
    "feature_names": feature_cols
}
joblib.dump(payload, artifact)
print("Model saved successfully!", artifact)


Para guardar el modelo y poder usarlo después con joblib.
Model saved successfully! agnes_model.joblib


In [31]:
print("\nPara usarlo:")

p = joblib.load("agnes_model.joblib")
new_x2 = new_df[p["feature_names"]].to_numpy()
pred2 = assign_by_nearest_centroid(new_x2, p["centroids"])[0]
print("Predicted Cluster:", int(pred2))


Para usarlo:
Predicted Cluster: 0
