1.1 - CARGA Y PREPROCESAMIENTO<br>
======================================================

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
import pickle, os, sys

In [2]:
# File name of the input dataset. It is used as a relative path by the
# existence check and by pd.read_csv, so it is resolved against the current
# working directory.
CSV_NAME = "dataset_t2.csv"

In [3]:
# Fail fast when the dataset is missing.
# An exception is raised instead of calling a bare sys.exit(): sys.exit() with
# no argument exits with status 0 (success) even though this is a failure, and
# an exception also halts a notebook "Run All" at this cell.
# The message is built from CSV_NAME so it stays correct if the constant changes.
if not os.path.exists(CSV_NAME):
    raise FileNotFoundError(
        f"No se encontró el {CSV_NAME}. "
        "Colócalo en la misma carpeta y vuelve a ejecutar el script."
    )

In [4]:
# Read the CSV into a DataFrame, then report its dimensions and show the
# first rows as a quick sanity check.
df = pd.read_csv(CSV_NAME)
n_rows, n_cols = df.shape
print(f"Dataset cargado con {n_rows} filas y {n_cols} columnas.")
print(df.head())

Dataset cargado con 15000 filas y 9 columnas.
   User-ID        ISBN  Book-Rating  Avg_User_Rating  Num_Ratings_User  \
0   276725  034545104X            0             4.37               412   
1   276726  0155061224            5             9.56               278   
2   276727  0446520802            0             7.59               493   
3   276729  052165615X            3             6.39                16   
4   276729  0521795028            6             2.40                13   

   Book_Popularity  Book_Pages  Book_Year Rating_Category  
0             55.7         308       1994            Bajo  
1             36.7        1177       2000           Medio  
2             19.6         413       1983            Bajo  
3             24.9         418       1997            Bajo  
4             22.2        1007       1986           Medio  


1.2 - VARIABLES X e Y<br>
======================================================

In [5]:
# Target (label) column; it is excluded from the clustering features.
Y_col = "Rating_Category"

# Feature columns: every numeric column except the label.
# The check is an explicit "is numeric" test rather than `dtype != 'object'`,
# because text columns are not guaranteed to have the object dtype (e.g. the
# dedicated pandas string dtype); such a column would slip through and make
# StandardScaler fail later.
# NOTE(review): 'User-ID' is numeric and therefore ends up in X_cols. It looks
# like an identifier rather than a real feature - confirm whether it should be
# excluded from the clustering input.
X_cols = [
    c for c in df.columns
    if c != Y_col and pd.api.types.is_numeric_dtype(df[c])
]

In [6]:
# Split the frame into the feature matrix (X) and the label series (Y).
X, Y = df[X_cols], df[Y_col]

In [7]:
# Summarize which columns feed the clustering and which distinct classes the
# label column contains.
class_values = df[Y_col].unique()
print("Variables usadas para clustering:")
print(X_cols)
print(f"Etiqueta Y: {Y_col} (clases: {class_values})")

Variables usadas para clustering:
['User-ID', 'Book-Rating', 'Avg_User_Rating', 'Num_Ratings_User', 'Book_Popularity', 'Book_Pages', 'Book_Year']
Etiqueta Y: Rating_Category (clases: ['Bajo' 'Medio' 'Alto'])


1.3 - DIVISIÓN DE DATOS<br>
======================================================

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
n_train, n_test = len(X_train), len(X_test)
print(f"Train: {n_train} filas, Test: {n_test} filas")

Train: 12000 filas, Test: 3000 filas


1.4 - NORMALIZACIÓN<br>
======================================================

In [9]:
# Standardize the features. The scaler statistics are learned from the
# training split only and then applied unchanged to both splits, so no
# information from the test split leaks into the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Create the output folder (no error if it already exists) and persist the
# fitted scaler. The context manager guarantees the file handle is flushed and
# closed, which a bare open(...) passed straight to pickle.dump does not.
os.makedirs("clustering_outputs", exist_ok=True)
with open("clustering_outputs/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

1.5 - ENTRENAMIENTO Y EVALUACIÓN<br>
======================================================

In [11]:
# Accumulator for every fitted configuration. The training cells append one
# dict per model with the keys "model" (fitted estimator), "name" (display
# label) and "silhouette" (score on the scaled training data).
models_info = []

Configuraciones de KMeans

In [12]:
# KMeans sweep: every cluster count is paired with both initialisation
# strategies. Each fitted model is scored with the silhouette coefficient on
# the scaled training data and recorded in models_info.
kmeans_grid = [
    (n_clusters, init_name)
    for n_clusters in (3, 4, 5, 6)
    for init_name in ("random", "k-means++")
]
for n_clusters, init_name in kmeans_grid:
    kmeans = KMeans(n_clusters=n_clusters, init=init_name, random_state=42)
    cluster_ids = kmeans.fit_predict(X_train_scaled)
    sil = silhouette_score(X_train_scaled, cluster_ids)
    entry = {
        "model": kmeans,
        "name": f"KMeans_k={n_clusters}_{init_name}",
        "silhouette": sil,
    }
    models_info.append(entry)
    print(f"KMeans(k={n_clusters}, init={init_name}) - Silhouette = {sil:.3f}")

KMeans(k=3, init=random) - Silhouette = 0.139
KMeans(k=3, init=k-means++) - Silhouette = 0.112
KMeans(k=4, init=random) - Silhouette = 0.145
KMeans(k=4, init=k-means++) - Silhouette = 0.125
KMeans(k=5, init=random) - Silhouette = 0.133
KMeans(k=5, init=k-means++) - Silhouette = 0.132
KMeans(k=6, init=random) - Silhouette = 0.134
KMeans(k=6, init=k-means++) - Silhouette = 0.136


Configuraciones de MeanShift

In [13]:
# MeanShift sweep: the kernel bandwidth is estimated from the scaled training
# data for several quantiles, and each fitted model is scored with the
# silhouette coefficient.
for quantile in [0.2, 0.3, 0.4, 0.5]:
    bandwidth = estimate_bandwidth(X_train_scaled, quantile=quantile)
    model = MeanShift(bandwidth=bandwidth)
    labels = model.fit_predict(X_train_scaled)

    # silhouette_score raises ValueError when fewer than two clusters are
    # found, which MeanShift can produce with a wide bandwidth. Skip such a
    # configuration instead of aborting the whole sweep.
    n_found = np.unique(labels).size
    if n_found < 2:
        print(f"MeanShift(q={quantile:.1f}) - omitido: solo se encontró {n_found} cluster")
        continue

    score = silhouette_score(X_train_scaled, labels)
    models_info.append({
        "model": model,
        "name": f"MeanShift_q={quantile:.1f}",
        "silhouette": score
    })
    print(f"MeanShift(q={quantile:.1f}) - Silhouette = {score:.3f}")

# Overall summary: one row per evaluated configuration (KMeans and MeanShift),
# ordered from best to worst silhouette score.
results_df = pd.DataFrame([
    {"Modelo": m["name"], "Silhouette": round(m["silhouette"], 4)}
    for m in models_info
]).sort_values("Silhouette", ascending=False)

MeanShift(q=0.2) - Silhouette = 0.377
MeanShift(q=0.3) - Silhouette = 0.377
MeanShift(q=0.4) - Silhouette = 0.377
MeanShift(q=0.5) - Silhouette = 0.377


In [14]:
# Show the ranking header followed by the full, already sorted results table.
print("RANKING MODELOS (Top 12)", results_df, sep="\n")

RANKING MODELOS (Top 12)
                  Modelo  Silhouette
8        MeanShift_q=0.2      0.3774
9        MeanShift_q=0.3      0.3774
10       MeanShift_q=0.4      0.3774
11       MeanShift_q=0.5      0.3774
2      KMeans_k=4_random      0.1453
0      KMeans_k=3_random      0.1390
7   KMeans_k=6_k-means++      0.1359
6      KMeans_k=6_random      0.1344
4      KMeans_k=5_random      0.1332
5   KMeans_k=5_k-means++      0.1317
3   KMeans_k=4_k-means++      0.1254
1   KMeans_k=3_k-means++      0.1115


Guarda los resultados en un CSV; se pueden revisar desde el editor de código.

In [15]:
# Persist the silhouette ranking as CSV; the DataFrame index is left out.
results_df.to_csv(
    path_or_buf="clustering_outputs/model_silhouette_scores.csv",
    index=False,
)

1.6 - APLICAR LOS 3 MEJORES MODELOS AL TEST<br>
======================================================

In [16]:
# Evaluate the three best-ranked models on the held-out test split.
top3 = results_df.head(3)

# Fallback label for a predicted cluster that has no training members (and
# therefore no entry in the cluster -> class mapping).
majority_class = Y_train.mode().iloc[0]

for i, name in enumerate(top3["Modelo"], 1):
    # First fitted estimator registered under this name.
    model = next(m["model"] for m in models_info if m["name"] == name)

    # Persist the model; the context manager closes the file handle.
    with open(f"clustering_outputs/model_top{i}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Cluster -> dominant class mapping, learned from the TRAINING split only.
    # model.labels_ holds the cluster assigned to each row of X_train_scaled
    # during fitting, in the same row order as Y_train. Deriving the mapping
    # from the test labels (as before) and then scoring against those same
    # labels leaks the answers into the evaluation.
    train_assignments = pd.DataFrame({
        "Cluster": model.labels_,
        "Y_real": Y_train.values
    })
    mapping = train_assignments.groupby("Cluster")["Y_real"].agg(
        lambda s: s.mode().iloc[0]
    )

    # Prediction on test (original author's note: judging by the density, the
    # low class predominates).
    preds = model.predict(X_test_scaled)
    df_result = pd.DataFrame({
        "Cluster": preds,
        "Y_real": Y_test.values
    })
    df_result["Y_pred"] = df_result["Cluster"].map(mapping).fillna(majority_class)

    # Simple accuracy: share of test rows whose mapped class equals the real one.
    accuracy = (df_result["Y_real"] == df_result["Y_pred"]).mean()
    print(f"Modelo {i}: {name} - Precisión mapeada = {accuracy:.3f}")

    # Save the per-row test results (cluster, real class, mapped class).
    df_result.to_csv(f"clustering_outputs/test_labels_model_{i}.csv", index=False)

Modelo 1: MeanShift_q=0.2 - Precisión mapeada = 0.618
Modelo 2: MeanShift_q=0.3 - Precisión mapeada = 0.618
Modelo 3: MeanShift_q=0.4 - Precisión mapeada = 0.618


In [17]:
# Final notice pointing the user to the folder where the scaler, models and
# CSV reports were written.
print("Proceso completado. Revisa la carpeta clustering_outputs/")

Proceso completado. Revisa la carpeta clustering_outputs/
