#1. Dataset

Cargar los dos archivos (`simplified_coffee.csv` y `synthetic_coffee_health_10000.csv`) en el directorio de trabajo y ejecutar todas las celdas en orden.

In [None]:
import os
import re
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve, silhouette_score,
    auc as sk_auc  # usado en la ROC multiclase
)

# Gensim
try:
    from gensim.models import Word2Vec
    GENSIM_OK = True
except Exception:
    GENSIM_OK = False

# Idempotent check/download of the NLTK resources: each package is looked up
# in the local NLTK data path and downloaded only when that lookup fails.
_NLTK_RESOURCE_PATHS = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "wordnet": "corpora/wordnet",
}
for pkg, resource_path in _NLTK_RESOURCE_PATHS.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(pkg, quiet=True)

# People/health CSV: load it and print its column summary.
df2 = pd.read_csv("synthetic_coffee_health_10000.csv")
df2.info()

In [None]:
# Load both CSV files
df1 = pd.read_csv("simplified_coffee.csv")  # coffee reviews
df2 = pd.read_csv("synthetic_coffee_health_10000.csv")  # people / health records

# Strip surrounding whitespace from every column name
df1 = df1.rename(columns=str.strip)
df2 = df2.rename(columns=str.strip)

# Show the available columns
print("Columnas df1:", df1.columns)
print("Columnas df2:", df2.columns)

# Inner join on the country, which is named differently in each frame:
# 'loc_country' in df1 and 'Country' in df2. Only countries present in both
# frames survive.
merged_df = df1.merge(df2, how='inner', left_on='loc_country', right_on='Country')

# 'Country' duplicates 'loc_country' after the join, so drop it
merged_df = merged_df.drop(columns=['Country'])

# Show the result
print("Filas del merge:", len(merged_df))
print(merged_df.head())

In [None]:
# Print pandas' summary of the merged frame.
merged_df.info()

#2. Preprocesamiento de texto

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Force the download of the tokenizer, stop-word and WordNet resources
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

# Shared helpers used by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
import nltk

# Download every NLTK resource requested by this notebook (non-quiet, so the
# download log is printed).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def clean_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace (digits and punctuation are removed as well), tokenize, discard
    English stop words and lemmatize the remaining tokens.

    Args:
        text: Raw review. Missing values (``None``/``NaN``) yield ``""``; any
            other non-string value is converted with ``str`` first.

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    # Coerce before lowercasing: a numeric cell has no ``.lower()`` method.
    text = str(text).lower()
    # Keep letters and whitespace only (this also strips digits).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    # Stop-word removal + lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply the cleaning to the 'review' column
merged_df['review_clean'] = merged_df['review'].apply(clean_text)

In [None]:
# English stop-word set and WordNet lemmatizer used by clean_text below
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace (digits and punctuation are removed as well), tokenize, discard
    English stop words and lemmatize the remaining tokens.

    Args:
        text: Raw review. Missing values (``None``/``NaN``) yield ``""``; any
            other non-string value is converted with ``str`` first.

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    # Lowercase; coerce first because a numeric cell has no ``.lower()``.
    text = str(text).lower()
    # Keep letters and whitespace only (this also strips digits).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize with the directly imported word_tokenize
    tokens = word_tokenize(text)
    # Stop-word removal + lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Clean the 'review' column
merged_df['review_clean'] = merged_df['review'].apply(clean_text)

In [None]:
# TF-IDF representation of the cleaned reviews, capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['review_clean'])

In [None]:
# Word2Vec representation
# The gensim import at the top of the notebook is optional and only records a
# flag; check the dependency here so a missing gensim produces an actionable
# error instead of a NameError on Word2Vec.
if "Word2Vec" not in globals():
    raise ImportError(
        "gensim no está instalado: instala `gensim` y vuelve a ejecutar la celda de imports."
    )

# Tokenize the cleaned reviews for Word2Vec
sentences = [nltk.word_tokenize(text) for text in merged_df['review_clean']]

# Train a Word2Vec model on this dataset
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

# Average embedding of one review
def get_w2v_vector(tokens, model):
    """Average the embeddings of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object exposing ``wv`` (supports ``in`` and lookup by token)
            and ``vector_size``.

    Returns:
        The element-wise mean of the vectors that were found, or a zero
        vector of length ``model.vector_size`` when no token is known.
    """
    known = [model.wv[tok] for tok in tokens if tok in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Reuse the token lists already built for training (`sentences`, one per row
# of merged_df['review_clean'], in the same order) instead of tokenizing the
# whole corpus a second time; the resulting vectors are the same.
merged_df['review_w2v'] = [get_w2v_vector(tokens, w2v_model) for tokens in sentences]

In [None]:
# Print the shapes of the merged dataset and of the TF-IDF matrix
print("Merge completado. Shape del dataset:", merged_df.shape)
print("TF-IDF shape:", tfidf_matrix.shape)

#3. Modelado

3.1 KNN (Clasificación)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from scipy.stats import mode

TARGET_COL = "Health_Issues"

# 1) Filter empty reviews and keep TF-IDF / W2V aligned with the kept rows
mask_valid = merged_df['review_clean'].fillna("").str.len() > 0
dfm = merged_df.loc[mask_valid].copy()
idx = np.flatnonzero(mask_valid.values)

# TF-IDF rows at the valid positions
X_tfidf = tfidf_matrix[idx]
# W2V: stack the per-row vectors into a 2-D array
X_w2v_f = np.vstack(dfm['review_w2v'].values)

# 2) Binary label when every value can be interpreted as yes/no, otherwise
#    fall back to a multiclass encoding of the raw values.
s_raw = dfm[TARGET_COL].astype(str).str.strip().str.lower()
neg_set = {"none","no","false","0","low","none/low","nan",""}
pos_set = {"yes","true","1","mild","moderate","severe","high"}

y_bin = s_raw.map(lambda x: (1 if x in pos_set else (0 if x in neg_set else np.nan)))
if y_bin.isna().any():
    # Numeric fallback for values outside both sets. Values that are not
    # numeric must stay NaN: comparing NaN with `> 0` yields False, which
    # would otherwise label every unknown category as 0 and make the
    # multiclass branch below unreachable.
    y_num = pd.to_numeric(dfm[TARGET_COL], errors="coerce")
    y_from_num = (y_num > 0).astype(float).where(y_num.notna())
    y_bin = y_bin.fillna(y_from_num)

if y_bin.isna().any():
    # Some value is neither a known yes/no token nor numeric: treat the target
    # as multiclass and encode the raw values as 0..n_classes-1.
    le = LabelEncoder()
    y = le.fit_transform(dfm[TARGET_COL].astype(str).values)
    classes_ = le.classes_
else:
    y = y_bin.astype(int).values
    classes_ = np.array([0,1])

n_classes = len(classes_)
print("Etiquetas detectadas:", classes_)

# 3) Evaluación KNN (grid de K) + métricas + detalles para gráficas
def eval_knn_grid_general(X, y, classes_, k_values=(3,5,7,9,11), test_size=0.25, seed=90989):
    """Evaluate KNN over a grid of ``k`` on one stratified train/test split.

    Args:
        X: Feature matrix; converted to a dense array when it has ``toarray``.
        y: Labels aligned with the rows of ``X``.
        classes_: Class labels; two classes select binary metrics, otherwise
            weighted precision/recall/F1 and macro one-vs-rest AUC are used.
        k_values: Neighbour counts to try.
        test_size: Fraction of the rows held out for testing.
        seed: ``random_state`` of the split.

    Returns:
        tuple: ``(results, best_k, best_details)`` where ``results`` is a
        DataFrame sorted by F1 then accuracy (descending), ``best_k`` is the
        ``k`` of its first row, and ``best_details`` holds ``y_test``,
        ``y_pred``, ``y_score`` and ``cm`` for that ``k``.
    """
    features = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, y, test_size=test_size, random_state=seed, stratify=y
    )

    n_classes = len(classes_)
    two_classes = n_classes == 2
    averaging = "binary" if two_classes else "weighted"

    summary, per_k = [], {}
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_tr, y_tr)
        y_pred = knn.predict(X_te)
        y_score = knn.predict_proba(X_te) if hasattr(knn, "predict_proba") else None

        acc = accuracy_score(y_te, y_pred)
        prec = precision_score(y_te, y_pred, average=averaging, zero_division=0)
        rec = recall_score(y_te, y_pred, average=averaging, zero_division=0)
        f1 = f1_score(y_te, y_pred, average=averaging, zero_division=0)

        if y_score is None:
            auc_val = np.nan
        elif two_classes:
            auc_val = roc_auc_score(y_te, y_score[:,1])
        else:
            # One indicator column per class for the one-vs-rest AUC
            Y_bin = label_binarize(y_te, classes=np.arange(n_classes))
            auc_val = roc_auc_score(Y_bin, y_score, multi_class="ovr", average="macro")

        summary.append({"k": k, "accuracy": acc, "precision": prec,
                        "recall": rec, "f1": f1, "roc_auc": auc_val})
        per_k[k] = {"y_test": y_te, "y_pred": y_pred, "y_score": y_score,
                    "cm": confusion_matrix(y_te, y_pred)}

    ranking = pd.DataFrame(summary)
    ranking = ranking.sort_values(["f1","accuracy"], ascending=False).reset_index(drop=True)
    best_k = int(ranking.iloc[0]["k"])
    return ranking, best_k, per_k[best_k]

def plot_confusion_matrix(cm, title="Confusion Matrix"):
    """Draw a confusion matrix as an image with the count written in each cell.

    Args:
        cm: 2-D array of counts (rows are true labels, columns are predictions).
        title: Figure title.
    """
    plt.figure()
    plt.imshow(cm, interpolation='nearest')
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.colorbar()
    # Text x is the column index and text y is the row index
    for (row, col), count in np.ndenumerate(cm):
        plt.text(col, row, str(count), ha='center', va='center')
    plt.tight_layout()
    plt.show()

def plot_roc_general(y_true, y_score, classes_, title="ROC Curve"):
    """Plot ROC curves for a binary or a multiclass problem.

    Args:
        y_true: True labels of the evaluated samples.
        y_score: Per-class scores with one column per class, or ``None`` (in
            which case a message is printed and nothing is drawn).
        classes_: Class labels; its length selects the binary or the
            one-curve-per-class branch.
        title: Title prefix; the AUC (or the macro AUC) is appended to it.
    """
    if y_score is None:
        print("No hay probabilidades para ROC.")
        return

    n_classes = len(classes_)
    if n_classes != 2:
        from sklearn.metrics import auc as sk_auc
        # One indicator column per class, compared with the matching score column
        Y_bin = label_binarize(y_true, classes=np.arange(n_classes))
        plt.figure()
        aucs = []
        for i, class_label in enumerate(classes_):
            fpr, tpr, _ = roc_curve(Y_bin[:, i], y_score[:, i])
            class_auc = sk_auc(fpr, tpr)
            aucs.append(class_auc)
            plt.plot(fpr, tpr, label=f"Clase {class_label} (AUC={class_auc:.2f})")
        plt.plot([0,1],[0,1], "--")
        plt.title(f"{title} (macro AUC={np.mean(aucs):.3f})")
        plt.xlabel("FPR")
        plt.ylabel("TPR")
        plt.legend()
        plt.tight_layout()
        plt.show()
        return

    # Binary case: use the score of the second column
    positive_score = y_score[:,1]
    fpr, tpr, _ = roc_curve(y_true, positive_score)
    auc_val = roc_auc_score(y_true, positive_score)
    plt.figure()
    plt.plot(fpr, tpr, linewidth=2)
    plt.plot([0,1],[0,1], "--")
    plt.title(f"{title} (AUC={auc_val:.3f})")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.tight_layout()
    plt.show()

# 4) Run KNN on TF-IDF and on W2V
def _run_knn_report(tag, X_repr):
    """Evaluate the KNN grid on one representation and print/plot the best k."""
    table, k_star, best = eval_knn_grid_general(X_repr, y, classes_, k_values=(3,5,7,9,11))
    print(f"KNN ({tag}) – resultados:")
    display(table.round(3))
    print(f"Mejor k ({tag}):", k_star)
    plot_confusion_matrix(best["cm"], f"Confusion – KNN {tag} (k={k_star})")
    plot_roc_general(best["y_test"], best["y_score"], classes_, f"ROC – KNN {tag} (k={k_star})")
    return table, k_star, best


res_knn_tfidf, best_k_tfidf, best_tf = _run_knn_report("TF-IDF", X_tfidf)

res_knn_w2v, best_k_w2v, best_w2v = _run_knn_report("W2V", X_w2v_f)


3.2 K-Means (Clustering)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score,confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

def kmeans_analysis(X, k_min=2, k_max=10, seed=90989):
    """Fit K-Means for every k in ``[k_min, k_max]`` and collect quality metrics.

    Args:
        X: Feature matrix; converted to a dense array when it has ``toarray``.
        k_min: Smallest number of clusters to try.
        k_max: Largest number of clusters to try (inclusive).
        seed: ``random_state`` passed to every KMeans fit.

    Returns:
        tuple: ``(metrics_df, models, labels_dict)``. ``metrics_df`` is indexed
        by ``k`` with columns ``Inertia``, ``Silhouette``,
        ``Calinski_Harabasz`` and ``Davies_Bouldin`` (NaN when a score could
        not be computed); ``models`` and ``labels_dict`` map each ``k`` to the
        fitted estimator and to its cluster labels.
    """
    Xd = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    ks = list(range(k_min, k_max + 1))

    def _score_or_nan(score_fn, labels):
        # The scorers reject degenerate labelings with ValueError; record NaN
        # for that k instead of aborting the sweep.
        try:
            return score_fn(Xd, labels)
        except ValueError:
            return np.nan

    inertia_vals, sil_vals, ch_vals, db_vals = [], [], [], []
    models, labels_dict = {}, {}

    for k in ks:
        km = KMeans(n_clusters=k, n_init=10, random_state=seed)
        labels = km.fit_predict(Xd)

        models[k] = km
        labels_dict[k] = labels
        inertia_vals.append(km.inertia_)
        sil_vals.append(_score_or_nan(silhouette_score, labels))
        ch_vals.append(_score_or_nan(calinski_harabasz_score, labels))
        db_vals.append(_score_or_nan(davies_bouldin_score, labels))

    # Calinski-Harabasz and Davies-Bouldin were computed but dropped before;
    # they are now returned next to inertia and silhouette.
    metrics_df = pd.DataFrame({
        'k': ks,
        'Inertia': inertia_vals,
        'Silhouette': sil_vals,
        'Calinski_Harabasz': ch_vals,
        'Davies_Bouldin': db_vals,
    }).set_index('k')

    return metrics_df, models, labels_dict

def plot_elbow(ks, inertia, title):
    """Plot inertia against the number of clusters (elbow method).

    Args:
        ks: Cluster counts, used as x values and as x ticks.
        inertia: Inertia (SSE) for each entry of ``ks``.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(ks, inertia, marker="o", linestyle='-', color='b', linewidth=2)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel("Número de clusters (k)", fontsize=12)
    ax.set_ylabel("Inertia (SSE)", fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(ks)
    fig.tight_layout()
    plt.show()

def plot_silhouette(ks, sil, title):
    """Plot the silhouette score against the number of clusters.

    Args:
        ks: Cluster counts, used as x values and as x ticks.
        sil: Silhouette score for each entry of ``ks``.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(ks, sil, marker="o", linestyle='-', color='g', linewidth=2)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel("Número de clusters (k)", fontsize=12)
    ax.set_ylabel("Silhouette Score", fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(ks)
    fig.tight_layout()
    plt.show()

def plot_all_metrics(metrics_df, title):
    """Plot every available clustering metric on a 2x2 grid of panels.

    The grid previously left two panels empty and was never laid out or
    shown. Each panel is now drawn only when its column exists in
    ``metrics_df``; unused panels are hidden and the figure is displayed.

    Args:
        metrics_df: DataFrame indexed by ``k``. Recognized columns are
            ``Inertia``, ``Silhouette``, ``Calinski_Harabasz`` and
            ``Davies_Bouldin``.
        title: Prefix of the figure title.
    """
    # (column, panel title, y label, line color)
    panels = [
        ('Inertia', 'Método del Codo (Inertia)', 'Inertia (SSE)', 'b'),
        ('Silhouette', 'Silhouette Score', 'Silhouette Score', 'g'),
        ('Calinski_Harabasz', 'Calinski-Harabasz', 'Calinski-Harabasz', 'r'),
        ('Davies_Bouldin', 'Davies-Bouldin', 'Davies-Bouldin', 'm'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'{title} - Métricas de Clustering', fontsize=16, fontweight='bold')

    k_values = metrics_df.index

    for ax, (column, panel_title, y_label, color) in zip(axes.ravel(), panels):
        if column not in metrics_df.columns:
            # No data for this panel: hide it instead of showing empty axes.
            ax.set_visible(False)
            continue
        ax.plot(k_values, metrics_df[column], marker='o', linestyle='-', color=color, linewidth=2)
        ax.set_title(panel_title)
        ax.set_xlabel('Número de clusters (k)')
        ax.set_ylabel(y_label)
        ax.grid(True, alpha=0.3)
        ax.set_xticks(k_values)

    fig.tight_layout()
    plt.show()

# 1) K-Means analysis for TF-IDF and W2V
def _plot_kmeans_curves(metrics, tag):
    """Show the elbow and silhouette curves of one representation."""
    plot_elbow(metrics.index, metrics['Inertia'], f"Método del Codo - {tag}")
    plot_silhouette(metrics.index, metrics['Silhouette'], f"Silhouette Score - {tag}")


print("ANÁLISIS K-MEANS - CLUSTERING NO SUPERVISADO")
print("="*60)

# TF-IDF
print("\nTF-IDF - Análisis de Clustering:")
metrics_tfidf, models_tfidf, labels_tfidf_all = kmeans_analysis(X_tfidf, 2, 10)
_plot_kmeans_curves(metrics_tfidf, "TF-IDF")

# W2V
print("\nW2V - Análisis de Clustering:")
metrics_w2v, models_w2v, labels_w2v_all = kmeans_analysis(X_w2v_f, 2, 10)
_plot_kmeans_curves(metrics_w2v, "W2V")

# 2) Interpretación de clusters
def top_terms_tfidf(kmeans_model, vectorizer, topn=10):
    """List the highest-weighted vocabulary terms of every cluster centroid.

    Args:
        kmeans_model: Object exposing ``cluster_centers_`` (clusters x features).
        vectorizer: Object exposing ``get_feature_names_out()``; its entries
            are matched to the centroid columns by position.
        topn: Number of terms to keep per cluster.

    Returns:
        list: One list per cluster with its ``topn`` terms, ordered by
        decreasing centroid weight.
    """
    # Column indices of each centroid sorted by decreasing weight
    ranked_columns = np.argsort(-kmeans_model.cluster_centers_, axis=1)
    vocabulary = np.array(vectorizer.get_feature_names_out())
    top_per_cluster = []
    for cluster_ranking in ranked_columns:
        top_per_cluster.append([vocabulary[col] for col in cluster_ranking[:topn]])
    return top_per_cluster

print(f"\n{'='*60}")
print("INTERPRETACIÓN DE CLUSTERS")
print(f"{'='*60}")


def _best_cluster_k(metrics):
    """Return the k with the highest silhouette (the first k if all are NaN)."""
    sil = metrics['Silhouette'].dropna()
    return int(sil.idxmax()) if not sil.empty else int(metrics.index[0])


# The number of clusters must come from the clustering sweep itself.
# best_k_tfidf / best_k_w2v are KNN neighbour counts (3, 5, 7, 9 or 11): using
# them as a cluster count is meaningless and raises KeyError for 11, because
# the sweep only covers k = 2..10.
k_clusters_tfidf = _best_cluster_k(metrics_tfidf)
k_clusters_w2v = _best_cluster_k(metrics_w2v)

# TF-IDF
print(f"\nTF-IDF - Términos representativos (k={k_clusters_tfidf}):")
km_tfidf = models_tfidf[k_clusters_tfidf]
labels_tfidf = labels_tfidf_all[k_clusters_tfidf]
tfidf_top_terms = top_terms_tfidf(km_tfidf, tfidf_vectorizer, topn=8)

for c, terms in enumerate(tfidf_top_terms):
    print(f"Cluster {c}: {', '.join(terms)}")

# 3) Cluster assignment vs. real labels
if hasattr(y, '__len__'):
    print(f"\n{'='*60}")
    print("DISTRIBUCIÓN CLUSTERS vs ETIQUETAS REALES")
    print(f"{'='*60}")

    # TF-IDF
    print(f"\nTF-IDF (k={k_clusters_tfidf}):")
    ct_tfidf = pd.crosstab(labels_tfidf, y, rownames=["Cluster"], colnames=["Etiqueta Real"])
    print(ct_tfidf)

    # W2V
    print(f"\nW2V (k={k_clusters_w2v}):")
    km_w2v = models_w2v[k_clusters_w2v]
    labels_w2v = labels_w2v_all[k_clusters_w2v]
    ct_w2v = pd.crosstab(labels_w2v, y, rownames=["Cluster"], colnames=["Etiqueta Real"])
    print(ct_w2v)

# 4) Final comparison between representations
print(f"\n{'='*60}")
print("COMPARATIVA FINAL ENTRE REPRESENTACIONES")
print(f"{'='*60}")

print(f"\nMejor k seleccionado:")
print(f"TF-IDF: k = {k_clusters_tfidf}")
print(f"W2V: k = {k_clusters_w2v}")

print(f"\nMétricas comparativas (para k óptimo de cada representación):")
print(f"{'Métrica':<20} {'TF-IDF':<10} {'W2V':<10}")
print("-" * 45)

# Compare every metric that both sweeps actually report
metrics_to_compare = [
    metric for metric in ('Silhouette', 'Calinski_Harabasz', 'Davies_Bouldin')
    if metric in metrics_tfidf.columns and metric in metrics_w2v.columns
]
for metric in metrics_to_compare:
    tfidf_val = metrics_tfidf.loc[k_clusters_tfidf, metric]
    w2v_val = metrics_w2v.loc[k_clusters_w2v, metric]

    if metric == 'Davies_Bouldin':
        # Davies-Bouldin: lower is better
        better = "TF-IDF" if tfidf_val < w2v_val else "W2V"
    else:
        # Other metrics: higher is better
        better = "TF-IDF" if tfidf_val > w2v_val else "W2V"

    print(f"{metric:<20} {tfidf_val:.4f}    {w2v_val:.4f}    ← {better}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix, roc_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# 1) Build 0/1 labels from dfm; values outside both sets are left out
s_raw = dfm["Health_Issues"].astype(str).str.strip().str.lower()
neg_set = {"none","no","false","0","low","none/low","nan",""}
pos_set = {"yes","true","1","mild","moderate","severe","high"}
y_all = s_raw.map(lambda x: 1 if x in pos_set else (0 if x in neg_set else np.nan))
mask_valid = ~y_all.isna()
y_vec = y_all[mask_valid].astype(int).values
print("Número de muestras válidas:", len(y_vec))
print("Etiquetas detectadas:", np.unique(y_vec))

# 2) Features (X_tfidf and X_w2v_f come from the KNN section).
#    Keep only the rows with a valid label so X stays aligned with y_vec.
#    Index with a plain boolean array: mask_valid is a pandas Series carrying
#    dfm's index, while both matrices are addressed by position.
row_mask = mask_valid.to_numpy()
X_tfidf_clf = X_tfidf[row_mask]      # sparse matrix
X_w2v_clf   = X_w2v_f[row_mask]      # dense np.ndarray

# Sanity checks
assert X_tfidf_clf.shape[0] == len(y_vec), "X_tfidf_clf no alinea con y"
assert X_w2v_clf.shape[0]   == len(y_vec), "X_w2v_clf no alinea con y"

# Preview of the TF-IDF vocabulary. The fitted vectorizer is named
# `tfidf_vectorizer`; the previous lookup of `vectorizer` always raised a
# NameError that the blanket `except Exception: pass` hid, so nothing printed.
if "tfidf_vectorizer" in globals():
    feature_names = tfidf_vectorizer.get_feature_names_out()
    print("Ejemplo de features TF-IDF:", feature_names[:20], "...")
    print("Total de features TF-IDF:", len(feature_names))

# 3) Función de entrenamiento y evaluación
#    (convierte a denso para árboles si la X es sparse)
from scipy import sparse

def _make_compatible(model, X):
    """Return ``X`` as a dense array when a decision tree receives sparse input.

    Args:
        model: Estimator that will consume ``X``.
        X: Feature matrix, sparse or dense.

    Returns:
        ``X.toarray()`` if ``model`` is a ``DecisionTreeClassifier`` and ``X``
        is sparse; otherwise ``X`` unchanged.
    """
    needs_dense = isinstance(model, DecisionTreeClassifier) and sparse.issparse(X)
    return X.toarray() if needs_dense else X
def train_eval_model(X, y, model, model_name):
    """Fit ``model`` on a stratified 75/25 split and report binary metrics.

    Prints the metrics and the confusion matrix, and plots the ROC curve when
    the model exposes ``predict_proba``.

    Args:
        X: Feature matrix (sparse or dense).
        y: Binary labels aligned with ``X``.
        model: Unfitted estimator; it is fitted in place.
        model_name: Label used in the printed output and in the plot.

    Returns:
        tuple: ``(report, auc_val, cm)`` where ``report`` maps ``accuracy``,
        ``precision``, ``recall``, ``f1`` and ``roc_auc`` to their values,
        ``auc_val`` is the ROC AUC (NaN without ``predict_proba``) and ``cm``
        is the confusion matrix.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )
    X_tr, X_te = (_make_compatible(model, part) for part in (X_tr, X_te))

    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    y_score = None
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_te)[:,1]

    report = {
        "accuracy": accuracy_score(y_te, y_pred),
        "precision": precision_score(y_te, y_pred),
        "recall": recall_score(y_te, y_pred),
        "f1": f1_score(y_te, y_pred),
    }
    auc_val = np.nan if y_score is None else roc_auc_score(y_te, y_score)
    report["roc_auc"] = auc_val
    cm = confusion_matrix(y_te, y_pred)

    print(f"\n== {model_name} ==")
    print("Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, "
          "F1: {f1:.3f}, AUC-ROC: {roc_auc:.3f}".format(**report))
    print("Confusion Matrix:\n", cm)

    # ROC curve
    if y_score is not None:
        fpr, tpr, _ = roc_curve(y_te, y_score)
        plt.figure()
        plt.plot(fpr, tpr, label=f"{model_name} (AUC={auc_val:.3f})")
        plt.plot([0,1],[0,1], "--")
        plt.xlabel("FPR")
        plt.ylabel("TPR")
        plt.title(f"ROC Curve - {model_name}")
        plt.legend()
        plt.show()
    return report, auc_val, cm

# 4) Define the models: one logistic regression and one decision tree per
#    text representation (TF-IDF first, then W2V)
models = [
    (build(), f"{label} ({representation})")
    for representation in ("TF-IDF", "W2V")
    for label, build in (
        ("LogisticRegression", lambda: LogisticRegression(max_iter=1000)),
        ("DecisionTree", lambda: DecisionTreeClassifier(max_depth=10, random_state=42)),
    )
]

# 5) Train each model on the matrix named in its label and keep the results
results = {}
for mdl, name in models:
    if "TF-IDF" in name:
        X_input = X_tfidf_clf
    else:
        X_input = X_w2v_clf
    report, auc_val, cm = train_eval_model(X_input, y_vec, mdl, name)
    results[name] = dict(report=report, auc=auc_val, cm=cm)

# 6) Metrics summary: one row per model
metrics_df = pd.DataFrame({name: res["report"] for name, res in results.items()})
metrics_df = metrics_df.T.round(3)
print("\n== Comparativa de modelos ==")
display(metrics_df)

## Implementación de GridSearchCV y/o RandomizedSearchCV.

GridSearchCV conviene cuando el espacio de hiperparámetros es discreto y pequeño. Es exhaustivo y estable. Ej.: RandomForest (n_estimators, max_depth, etc.) con pocas opciones razonables.

RandomizedSearchCV es mejor cuando el espacio es amplio/continuo o no sabemos bien las mejores regiones. Es eficiente con pocos intentos. Ej.: SVC (C, gamma) o XGBoost (learning_rate, max_depth…), donde hay rangos continuos.

imports

In [None]:
# Setup y utilidades para synthetic_coffee_health_10000.csv (target='Health_Issues')
import os, time, numpy as np, pandas as pd, matplotlib.pyplot as plt
from IPython.display import display

# Model selection / pipelines / preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Métricas
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Modelos
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# XGBoost
# Import XGBClassifier; if the import fails for any reason, install the
# `xgboost` package with pip through the running interpreter and import again.
try:
    from xgboost import XGBClassifier
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "xgboost"])
    from xgboost import XGBClassifier

# Shared seed: used as random_state below and to seed NumPy's global RNG
SEED = 90989
np.random.seed(SEED)

def _ohe():
    """Build a dense one-hot encoder that ignores unknown categories.

    Tries the ``sparse_output`` keyword first (sklearn >= 1.2) and falls back
    to ``sparse`` (sklearn < 1.2) when that keyword raises ``TypeError``.
    """
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    return encoder

def _pre(X: pd.DataFrame) -> ColumnTransformer:
    """Build the preprocessing transformer for the columns of ``X``.

    Numeric columns get median imputation followed by standard scaling; all
    other columns get most-frequent imputation followed by one-hot encoding.

    Args:
        X: Feature frame whose dtypes decide which branch each column takes.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``num`` and a ``cat`` branch.
    """
    numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
    categorical_cols = list(X.select_dtypes(exclude=[np.number]).columns)

    numeric_branch = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", _ohe()),
    ])
    return ColumnTransformer([
        ("num", numeric_branch, numeric_cols),
        ("cat", categorical_branch, categorical_cols),
    ])

def _load_health():
    csv_path = "synthetic_coffee_health_10000.csv"
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"No encuentro {csv_path}")

    df = (pd.read_csv(csv_path)
            .replace([np.inf, -np.inf], np.nan)
            .dropna(axis=1, how="all"))

    target = "Health_Issues"
    assert target in df.columns, f"Target '{target}' no está en columnas: {df.columns.tolist()}"
    df = df[~df[target].isna()].copy()

    y_raw = df[target]
    X = df.drop(columns=[target])

    if y_raw.nunique() == 2:
        le = LabelEncoder()
        y = pd.Series(le.fit_transform(y_raw), index=y_raw.index, name=target)
        is_binary = True
    else:
        y = y_raw
        is_binary = False

    return X, y, csv_path, is_binary

def _metrics_row(model_name, est, Xte, yte, is_binary, train_secs):
    """Score a fitted estimator on the test split and return one result row.

    Args:
        model_name: Label stored under ``model``.
        est: Fitted estimator exposing ``predict``.
        Xte: Test features.
        yte: Test labels.
        is_binary: Selects ``binary`` averaging when true, ``macro`` otherwise.
        train_secs: Training time in seconds; stored rounded to 3 decimals.

    Returns:
        dict: ``model``, ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``train_time_s``.
    """
    predictions = est.predict(Xte)
    averaging = "binary" if is_binary else "macro"
    row = {"model": model_name, "accuracy": accuracy_score(yte, predictions)}
    row["precision"] = precision_score(yte, predictions, average=averaging, zero_division=0)
    row["recall"] = recall_score(yte, predictions, average=averaging, zero_division=0)
    row["f1"] = f1_score(yte, predictions, average=averaging, zero_division=0)
    row["train_time_s"] = round(train_secs, 3)
    return row

# Load and split
X, y, path, is_binary = _load_health()

# Stratify the split and the CV folds when the target is binary or has at
# most 20 distinct values.
_stratifiable = is_binary or (y.nunique() <= 20)
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=(y if _stratifiable else None),
    random_state=SEED,
)

pre = _pre(X)
if _stratifiable:
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
else:
    cv = 3
scoring = "f1" if is_binary else "f1_macro"

print(f"Dataset: {os.path.basename(path)} | target=Health_Issues | clases={y.nunique()} | CV=3 | métrica={scoring}")

GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
# GridSearchCV is only imported by a later cell; import it here so this cell
# runs when the notebook is executed top to bottom.
from sklearn.model_selection import GridSearchCV

# Base random forest
rf = RandomForestClassifier(random_state=SEED, n_jobs=-1)

# Pipeline: preprocessing -> model
pipe_rf = Pipeline([("pre", pre), ("clf", rf)])

# Hyperparameter grid (a small, discrete space)
param_grid = {
    "clf__n_estimators": [200, 500],
    "clf__max_depth": [None, 10],
    "clf__min_samples_split": [2, 10],
    "clf__min_samples_leaf": [1, 2],
    "clf__max_features": ["sqrt"],
}

# Grid search
# If scoring were a dict, refit would have to name one metric ("f1_macro" or "f1")
gs = GridSearchCV(pipe_rf, param_grid, cv=cv, scoring=scoring, n_jobs=-1, refit=True)

# Fit and time the search
t0 = time.time()
gs.fit(Xtr, ytr)
t_rf = time.time() - t0

# Best model, scored on the held-out test split
best_rf = gs.best_estimator_
row_rf = _metrics_row("RandomForest (GridSearchCV)", best_rf, Xte, yte,
                      is_binary=is_binary, train_secs=t_rf)
row_rf["cv_best_score"] = round(gs.best_score_, 6)
row_rf["best_params"] = gs.best_params_

print("== RandomForest (GridSearchCV) ==")
display(pd.DataFrame([row_rf]))

RandomizedSearchCV

In [None]:
from sklearn.svm import SVC
from scipy.stats import loguniform
# RandomizedSearchCV is only imported by a later cell; import it here so this
# cell runs when the notebook is executed top to bottom.
from sklearn.model_selection import RandomizedSearchCV

# Base SVC (probability=True enables ROC/AUC but makes training slower)
svc = SVC(probability=True, random_state=SEED)

# Pipeline: preprocessing -> model
pipe_svc = Pipeline([("pre", pre), ("clf", svc)])

# Search distributions (continuous spaces sampled log-uniformly)
param_dist = {
    "clf__C": loguniform(1e-3, 1e2),
    "clf__gamma": loguniform(1e-4, 1e-1),
    "clf__kernel": ["rbf"],
}

# RandomizedSearchCV
# If 'scoring' were a dict, refit would have to name one metric ("f1_macro" or "f1")
rs = RandomizedSearchCV(
    pipe_svc,
    param_distributions=param_dist,
    n_iter=20,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    random_state=SEED,
    refit=True,
    verbose=0
)

# Fit and time the search
t0 = time.time()
rs.fit(Xtr, ytr)
t_svc = time.time() - t0

# Best model, scored on the held-out test split
best_svc = rs.best_estimator_
row_svc = _metrics_row(
    "SVC (RandomizedSearchCV)",
    best_svc,
    Xte,
    yte,
    is_binary=is_binary,   # selects binary vs. macro averaging
    train_secs=t_svc
)
row_svc["cv_best_score"] = round(rs.best_score_, 6)
row_svc["best_params"]   = rs.best_params_

print("== SVC (RandomizedSearchCV) ==")
display(pd.DataFrame([row_svc]))

Visualización de métricas

In [None]:
# Comparación + métricas (con AUC si binario) + CM + ROC
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, ConfusionMatrixDisplay, RocCurveDisplay
)

# Helper para calcular fila de métricas (usa is_binary global si está disponible)
def _metrics_row_with_roc(name, est, Xte, yte, is_binary_override=None):
    """Score a fitted estimator and add the ROC AUC when the task is binary.

    Args:
        name: Label stored under ``model``.
        est: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        Xte: Test features.
        yte: Test labels.
        is_binary_override: Explicit binary flag; when ``None`` the task is
            considered binary if ``yte`` has exactly two distinct values.

    Returns:
        tuple: ``(row, is_binary_local)`` where ``row`` holds ``model``,
        ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc`` (NaN
        unless the task is binary and the estimator provides scores).
    """
    if is_binary_override is not None:
        is_binary_local = bool(is_binary_override)
    else:
        is_binary_local = (len(np.unique(yte)) == 2)

    predictions = est.predict(Xte)
    averaging = "binary" if is_binary_local else "macro"
    row = {
        "model": name,
        "accuracy": accuracy_score(yte, predictions),
        "precision": precision_score(yte, predictions, average=averaging, zero_division=0),
        "recall": recall_score(yte, predictions, average=averaging, zero_division=0),
        "f1": f1_score(yte, predictions, average=averaging, zero_division=0),
        "roc_auc": np.nan,
    }

    if not is_binary_local:
        # AUC is only reported for binary tasks
        return row, is_binary_local

    # Prefer probabilities of the second column; fall back to decision scores
    if hasattr(est, "predict_proba"):
        row["roc_auc"] = roc_auc_score(yte, est.predict_proba(Xte)[:, 1])
    elif hasattr(est, "decision_function"):
        row["roc_auc"] = roc_auc_score(yte, est.decision_function(Xte))

    return row, is_binary_local

# 1) Metrics table (with AUC when the task is binary).
#    Look the flag up instead of wrapping the calls in `except NameError`: the
#    handler also caught NameErrors unrelated to `is_binary` and re-ran the
#    scoring. A missing flag becomes None, which makes the helper infer it
#    from yte.
_is_binary_hint = globals().get("is_binary")
row_rf, is_bin = _metrics_row_with_roc("RandomForest (Grid)", best_rf, Xte, yte,
                                       is_binary_override=_is_binary_hint)
row_svc, _     = _metrics_row_with_roc("SVC (Randomized)",   best_svc, Xte, yte,
                                       is_binary_override=_is_binary_hint)

res = (pd.DataFrame([row_rf, row_svc])
       .sort_values(by=["f1", "roc_auc"], ascending=[False, False], na_position="last")
       .reset_index(drop=True))

print("=== Métricas comparadas ===")
display(res[["model", "accuracy", "precision", "recall", "f1", "roc_auc"]])

# 2) Plots: confusion matrix + ROC curve (binary only)
for name, est in [("RandomForest (Grid)", best_rf), ("SVC (Randomized)", best_svc)]:
    # Confusion matrix
    ConfusionMatrixDisplay.from_estimator(est, Xte, yte)
    plt.title(f"Matriz de confusión — {name}")
    plt.show()

    # ROC curve: only for a binary task and only when the estimator can
    # produce scores. The call is the same for both scoring methods, and
    # without either there is no curve to title or show.
    can_score = hasattr(est, "predict_proba") or hasattr(est, "decision_function")
    if is_bin and can_score:
        RocCurveDisplay.from_estimator(est, Xte, yte, name=name)
        plt.title(f"Curva ROC — {name}")
        plt.show()

In [None]:
# Entrenamiento de modelos (LogReg, RandomForest+Grid, XGBoost+Randomized)
import numpy as np, time, pandas as pd
from IPython.display import display
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import randint, uniform, loguniform
from xgboost import XGBClassifier

# Precondition: Section 1 must already have defined these variables
needed = ["Xtr","Xte","ytr","yte","pre","cv","scoring","SEED","y","is_binary"]
_defined_names = globals()
missing = [name for name in needed if name not in _defined_names]
assert not missing, f"Falta correr la Sección 1. Variables no definidas: {missing}"

def _row(name, est, Xte, yte, ttrain):
    """Score a fitted estimator on the test split and return one results row.

    Precision, recall and F1 use ``average="binary"`` when the module-level
    ``is_binary`` flag is truthy and ``"macro"`` otherwise, always with
    ``zero_division=0``.

    Args:
        name: Label stored under ``"model"``.
        est: Fitted estimator exposing ``predict``.
        Xte: Test features passed to ``est.predict``.
        yte: True test labels.
        ttrain: Training time in seconds (stored rounded to 3 decimals).

    Returns:
        dict with keys ``model``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``train_time_s``, ``cv_best_score`` (NaN placeholder) and
        ``best_params`` (empty dict placeholder).
    """
    predictions = est.predict(Xte)
    averaging = "binary" if is_binary else "macro"

    result = {"model": name, "accuracy": accuracy_score(yte, predictions)}
    for key, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        result[key] = scorer(yte, predictions, average=averaging, zero_division=0)

    # Placeholders; callers that ran a hyperparameter search overwrite them.
    result["train_time_s"] = round(ttrain, 3)
    result["cv_best_score"] = np.nan
    result["best_params"] = {}
    return result

# Registries filled by every model trained in this cell:
# metric rows (short key) and fitted estimators (display name).
rows = {}
models = {}

# 2.1 Logistic regression (baseline, no hyperparameter search)
lr = LogisticRegression(max_iter=2000, random_state=SEED)
pipe_lr = Pipeline(steps=[("pre", pre), ("clf", lr)])

t0 = time.time()
pipe_lr.fit(Xtr, ytr)
t_lr = time.time() - t0

rows["LogReg"] = _row("LogisticRegression", pipe_lr, Xte, yte, t_lr)
models["LogisticRegression"] = pipe_lr

# 2.2 RandomForest tuned with an exhaustive grid search
rf = RandomForestClassifier(random_state=SEED, n_jobs=-1)
pipe_rf = Pipeline(steps=[("pre", pre), ("clf", rf)])
param_grid = dict(
    clf__n_estimators=[200, 500],
    clf__max_depth=[None, 10],
    clf__min_samples_split=[2, 10],
    clf__min_samples_leaf=[1, 2],
    clf__max_features=["sqrt"],
)
gs = GridSearchCV(
    pipe_rf,
    param_grid,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    refit=True,
    verbose=0,
)

# Time the whole search, including the final refit on the training split.
t0 = time.time()
gs.fit(Xtr, ytr)
t_rf = time.time() - t0

best_rf = gs.best_estimator_
row_rf = _row("RandomForest (GridSearchCV)", best_rf, Xte, yte, t_rf)
row_rf.update(
    cv_best_score=round(gs.best_score_, 6),
    best_params=gs.best_params_,
)
rows["RF"] = row_rf
models["RandomForest (GridSearchCV)"] = best_rf

# 2.3 XGBoost tuned with a randomized search
# Only the objective/metric and the training target differ between the binary
# and the multiclass case; everything else is shared below.
if is_binary:
    # Binary: labels are used as-is, binary logistic objective
    y_fit = ytr
    xgb_task = {"objective": "binary:logistic", "eval_metric": "logloss"}
else:
    # Multiclass: encode labels as integer codes, multiclass objective
    le = LabelEncoder().fit(pd.concat([ytr, yte], axis=0))
    ytr_enc = le.transform(ytr)
    y_fit = ytr_enc
    xgb_task = {
        "objective": "multi:softprob",
        "num_class": len(le.classes_),
        "eval_metric": "mlogloss",
    }

xgb = XGBClassifier(
    tree_method="hist",
    random_state=SEED,
    n_jobs=-1,
    **xgb_task,
)
pipe_xgb = Pipeline(steps=[("pre", pre), ("clf", xgb)])
param_dist = {
    "clf__n_estimators": randint(300, 800),
    "clf__max_depth": randint(3, 10),
    "clf__learning_rate": loguniform(1e-3, 3e-1),
    "clf__subsample": uniform(0.6, 0.4),
    "clf__colsample_bytree": uniform(0.6, 0.4),
    "clf__reg_lambda": loguniform(1e-2, 1e2),
    "clf__min_child_weight": loguniform(1e-1, 10),
}
rs = RandomizedSearchCV(
    pipe_xgb,
    param_distributions=param_dist,
    n_iter=20,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    random_state=SEED,
    refit=True,
    verbose=0,
)

t0 = time.time()
rs.fit(Xtr, y_fit)
t_xgb = time.time() - t0
best_xgb = rs.best_estimator_

if is_binary:
    yhat_xgb = best_xgb.predict(Xte)
else:
    # NOTE: in the multiclass case best_xgb predicts LabelEncoder codes, so the
    # predictions are mapped back to the original labels before scoring.
    yhat_xgb_enc = best_xgb.predict(Xte).astype(int)
    yhat_xgb = le.inverse_transform(yhat_xgb_enc)

avg_xgb = "binary" if is_binary else "macro"
row_xgb = {
    "model": "XGBoost (RandomizedSearchCV)",
    "accuracy": accuracy_score(yte, yhat_xgb),
    "precision": precision_score(yte, yhat_xgb, average=avg_xgb, zero_division=0),
    "recall": recall_score(yte, yhat_xgb, average=avg_xgb, zero_division=0),
    "f1": f1_score(yte, yhat_xgb, average=avg_xgb, zero_division=0),
    "train_time_s": round(t_xgb, 3),
    "cv_best_score": round(rs.best_score_, 6),
    "best_params": rs.best_params_,
}
rows["XGB"] = row_xgb
models["XGBoost (RandomizedSearchCV)"] = best_xgb

# Summary: one row per model, best test F1 first
res = pd.DataFrame([rows[key] for key in ("LogReg", "RF", "XGB")])
res = res.loc[
    :,
    ["model","accuracy","precision","recall","f1","train_time_s","cv_best_score","best_params"],
]
res = res.sort_values(by="f1", ascending=False).reset_index(drop=True)
print("Comparación de los modelos.")
display(res)

Generamos 3 personas ficticias para verificar las predicciones de nuestros 3 modelos.

In [None]:
# Personas dummy + predicciones 3 modelos
# (tipos corregidos y columnas alineadas)
import numpy as np, pandas as pd

# --- Helpers ---
def _estimate_binary_p(series: pd.Series) -> float:
    """Estimate P(value == 1) for a binary column (numeric or Yes/No-style text).

    Missing values are ignored. Numeric data is clipped to [0, 1] before
    averaging; text is lower-cased and stripped, then matched against a small
    set of affirmative tokens.

    Args:
        series: Column to inspect.

    Returns:
        The estimated share of positive values, or 0.5 when the column has no
        non-null values.
    """
    s = series.dropna()
    if s.empty:
        return 0.5
    if pd.api.types.is_numeric_dtype(s):
        return float(s.clip(0, 1).mean())
    affirmative = {"yes", "y", "true", "1", "si", "sí"}
    normalized = s.astype(str).str.lower().str.strip()
    # `s` is known to be non-empty here, so the mean is always defined.
    return float(normalized.isin(affirmative).mean())


def _build_dummy_rows_from_X(X: pd.DataFrame, n: int = 3, seed: int = 90989) -> pd.DataFrame:
    """Create ``n`` plausible dummy rows that mimic the columns of ``X``.

    Numeric columns take the 0.2 / 0.5 / 0.8 quantile (cycling per row) plus a
    small Gaussian jitter and are clipped to the 1%-99% quantile range.
    Non-numeric columns cycle through their (up to) three most frequent values.
    ``Age`` / ``Heart_Rate`` / ``ID`` are forced to non-negative integers and
    ``Smoking`` / ``Alcohol_Consumption`` to 0/1 draws following the observed
    prevalence.

    Args:
        X: Reference feature frame.
        n: Number of dummy rows to build.
        seed: Seed for the jitter generator; ``seed + 7`` seeds the binary draws.

    Returns:
        DataFrame with ``n`` rows and the same columns (and order) as ``X``.
    """
    rng = np.random.default_rng(seed)
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    # Columns whose type is forced after sampling
    bin_cols = [c for c in ["Smoking", "Alcohol_Consumption"] if c in X.columns]
    int_cols = [c for c in ["Age", "Heart_Rate", "ID"] if c in X.columns]

    num_q = [0.2, 0.5, 0.8]  # three profiles

    # Per-column statistics do not depend on the row, so compute them once.
    spans = {c: float(X[c].max() - X[c].min()) for c in num_cols}
    # A column without any non-null value has no frequent values (and no mode),
    # so it falls back to a single placeholder category.
    top_values = {
        c: X[c].value_counts(dropna=True).index.tolist()[:3] or ["Unknown"]
        for c in cat_cols
    }

    dummies = []
    for i in range(n):
        q = num_q[i % len(num_q)]
        row = {}
        # Numeric: quantile + light jitter proportional to the column range
        for c in num_cols:
            base = float(X[c].quantile(q))
            span = spans[c]
            scale = 0.01 * span if span > 0 else 1e-3
            row[c] = base + scale * rng.normal(0, 0.3)
        # Categorical: cycle through the most frequent values
        for c in cat_cols:
            choices = top_values[c]
            row[c] = choices[i % len(choices)]
        dummies.append(row)

    df_dum = pd.DataFrame(dummies, columns=X.columns)

    # Clip to the 1%-99% quantile range to avoid odd outliers
    for c in num_cols:
        lo, hi = X[c].quantile([0.01, 0.99])
        df_dum[c] = df_dum[c].clip(lo, hi)

    # Force integers
    for c in int_cols:
        vals = np.abs(np.round(df_dum[c].fillna(0))).astype(int)
        # Avoid duplicated IDs when an 'ID' column exists
        if c == "ID":
            vals += np.arange(1, len(vals) + 1)
        df_dum[c] = vals

    # Force 0/1 binaries following the dataset prevalence. The generator is
    # created once: re-seeding it for every column would make all binary
    # columns functions of the very same random stream (identical columns when
    # their prevalences coincide).
    bin_rng = np.random.default_rng(seed + 7)
    for c in bin_cols:
        p = float(np.clip(_estimate_binary_p(X[c]), 0.05, 0.95))  # avoid absolute 0/1
        df_dum[c] = bin_rng.binomial(1, p, size=len(df_dum)).astype(int)

    return df_dum

def _align_to_pipeline_cols(est, df_row):
    """Return ``df_row`` restricted to the columns the pipeline preprocessor uses.

    The expected columns are collected from the fitted ``transformers_`` of the
    pipeline's ``"pre"`` step. Missing columns are added filled with NaN, extra
    columns are removed, and the result follows the transformers' column order.
    The input frame is not modified.

    Args:
        est: Pipeline-like object with ``named_steps["pre"]`` exposing
            ``transformers_`` as ``(name, transformer, columns)`` tuples.
        df_row: DataFrame with the rows to align (one or more).

    Returns:
        A new DataFrame containing exactly the expected columns.
    """
    ct = est.named_steps["pre"]
    # Names seen at fit time, used to resolve positional column specs.
    fitted_names = getattr(ct, "feature_names_in_", None)

    expected = []
    for _name, trans, cols in ct.transformers_:
        # 'drop' lives in the transformer slot (not in the column spec); the
        # columns of a dropped transformer are not part of the expected input.
        if cols is None or (isinstance(trans, str) and trans == "drop"):
            continue
        # Object array keeps elements as-is: a lone string stays one column name
        # and slices end up as a single element that is ignored below.
        for col in np.atleast_1d(np.asarray(cols, dtype=object)):
            if isinstance(col, str):
                label = col
            elif (
                isinstance(col, (int, np.integer))
                and not isinstance(col, (bool, np.bool_))
                and fitted_names is not None
            ):
                # Positional index (e.g. remainder columns): map back to the name.
                label = fitted_names[int(col)]
            else:
                # Masks, slices or unresolvable indices cannot be aligned by name.
                continue
            if label not in expected:
                expected.append(label)

    # reindex adds the missing columns as NaN, drops extras and reorders.
    return df_row.reindex(columns=expected)

# --- Build the dummy people (the builder enforces the column types) ---
personas = _build_dummy_rows_from_X(X, n=3, seed=SEED)
personas.index = [f"Persona_{k}" for k in range(1, len(personas) + 1)]

print("Predicciones usando nuestros modelos de datos falsos")
display(personas.head())

# Persist only the preview rows (head) as CSV, without the index labels
personas.head().to_csv("personas_head.csv", index=False)

# Download the file.
# NOTE(review): `files` is not defined in this cell; presumably
# `google.colab.files` imported in another cell. Confirm.
files.download("personas_head.csv")

# --- Predict with the 3 models and map mild -> 1 ---
model_names = ["LogisticRegression", "RandomForest (GridSearchCV)", "XGBoost (RandomizedSearchCV)"]
missing_models = [m for m in model_names if m not in models]
assert not missing_models, f"Faltan modelos entrenados en 'models': {missing_models}"

# In the multiclass case the XGBoost pipeline was fitted on LabelEncoder codes,
# so its raw predictions are integer codes. They are mapped back with the same
# encoder (`le`) so the three models report comparable labels.
decode_xgb = (not is_binary) and ("le" in globals())

rows_pred = []
for p_id in personas.index:
    # Select the row as a one-row frame: unlike iterrows() + to_frame().T, this
    # keeps the per-column dtypes instead of upcasting everything to object.
    X_one = personas.loc[[p_id]]
    for mname in model_names:
        est = models[mname]
        # Align the columns with what the pipeline expects (avoids errors for
        # missing columns such as 'ID')
        X_one_aligned = _align_to_pipeline_cols(est, X_one.copy())
        y_pred = est.predict(X_one_aligned)[0]
        if decode_xgb and mname == "XGBoost (RandomizedSearchCV)":
            y_pred = le.inverse_transform([int(y_pred)])[0]
        y_pred_out = 1 if str(y_pred).lower() == "mild" else y_pred
        rows_pred.append({
            "persona": p_id,
            "model": mname,
            "pred_Health_Issues": y_pred_out
        })

res_personas = pd.DataFrame(rows_pred).sort_values(["persona", "model"]).reset_index(drop=True)
print("=== Predicciones por persona y modelo (target: Health_Issues, mild→1) ===")
display(res_personas)
