In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# ==============================================================================
# 1. DATA LOADING
# ==============================================================================
print("‚è≥ Cargando datos...")
try:
    # Paths are relative to the notebook's working directory; adjust
    # '../data/01_raw/' if the notebook lives somewhere else.
    df_customers = pd.read_csv("../data/01_raw/customer_profile_dataset.csv")
    df_products = pd.read_csv("../data/01_raw/products_dataset.csv")
    df_history = pd.read_csv("../data/01_raw/purchase_history_dataset.csv")
except FileNotFoundError:
    print("‚ùå Error: No encuentro los archivos CSV. Verifica que est√©n en 'data/01_raw/'.")
    # Re-raise after the friendly message: everything below needs df_merged,
    # so swallowing the error would only postpone the failure to a confusing
    # NameError in the next section.
    raise

# Build the master table: one row per purchase, enriched with the customer
# and product attributes. Left joins keep every purchase row even when the
# lookup tables have no match (the unmatched columns become NaN).
df_merged = df_history.merge(df_customers, on="customer_id", how="left")
df_merged = df_merged.merge(df_products, on="product_id", how="left")
print(f"‚úÖ Datos cargados. Filas: {df_merged.shape[0]}, Columnas: {df_merged.shape[1]}")

# ==============================================================================
# 2. FEATURE ENGINEERING (derive model-ready variables)
# ==============================================================================
print("\nüõ†Ô∏è Generando nuevos features...")

# A. Customer AGE at the time of the purchase
df_merged['date_of_birth'] = pd.to_datetime(df_merged['date_of_birth'])
df_merged['purchase_date'] = pd.to_datetime(df_merged['purchase_date'])
birth = df_merged['date_of_birth']
purchase = df_merged['purchase_date']
# A plain year difference over-counts by one whenever the purchase happens
# before the customer's birthday in that calendar year, so subtract 1 in that
# case. Rows with a missing date compare as False and keep a NaN age, which
# the dropna() below removes.
before_birthday = (purchase.dt.month < birth.dt.month) | (
    (purchase.dt.month == birth.dt.month) & (purchase.dt.day < birth.dt.day)
)
df_merged['age'] = purchase.dt.year - birth.dt.year - before_birthday.astype(int)

# B. Calendar features of the purchase
df_merged['month'] = purchase.dt.month
df_merged['day_of_week'] = purchase.dt.dayofweek  # Monday=0 ... Sunday=6

# C. Select the modelling columns and drop incomplete rows
features_utiles = ['gender', 'city', 'state', 'age', 'month', 'day_of_week', 'brand', 'price_per_unit']
target = 'category'

df_model = df_merged[features_utiles + [target]].copy()
df_model = df_model.dropna()

# Encode text columns as integer codes. One encoder per column is kept in
# `encoders` so any column can later be decoded with inverse_transform();
# reusing a single encoder would discard every mapping except the last one.
cols_categoricas = ['gender', 'city', 'state', 'brand', 'category']
encoders = {}
for col in cols_categoricas:
    encoders[col] = LabelEncoder()
    df_model[col] = encoders[col].fit_transform(df_model[col])
# Keep `le` bound to the encoder of the last column ('category'), which is
# what this name referred to when a single shared encoder was used.
le = encoders[cols_categoricas[-1]]

print("‚úÖ Features creados: Age, Month, Day_of_week.")

# ==============================================================================
# 3. CLUSTERING (K-Means)
# ==============================================================================
print("\nü§ñ Ejecutando Clustering (K-Means)...")

# Cluster on every column except the target. K-Means is distance based, so
# the features are standardised first to stop large-valued columns from
# dominating the distance.
X_cluster = df_model.drop(columns='category')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Fit a 5-cluster K-Means and read the per-row assignment from the fitted model.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Expose the cluster assignment as one more feature column.
df_model['cluster_id'] = clusters

# Silhouette score of the assignment on the scaled features.
score = silhouette_score(X_scaled, clusters)
print(f"‚úÖ Clusters generados. Silhouette Score: {score:.4f}")
print("   (Un score cercano a 1 es perfecto, cercano a 0 es solapado)")

# ==============================================================================
# 4. ACID TEST (supervised model using the cluster feature)
# ==============================================================================
print("\nüìà Entrenando modelo supervisado (Logistic Regression) con las mejoras...")

# Split features and target. X holds every column of df_model except the
# target, which includes 'cluster_id' once the clustering section has run.
# NOTE(review): 'cluster_id' comes from a scaler and K-Means fitted on all
# rows before this split, so the test rows influenced that feature; treat the
# accuracy below as slightly optimistic.
X = df_model.drop(columns=['category'])
y = df_model['category']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features before fitting: the default LogisticRegression is
# L2-regularised, so columns on very different scales (price, encoded city,
# month, ...) are penalised unevenly and slow the solver down. The scaler is
# fitted on the training split only so no test statistics leak into training.
# The result is wrapped back into DataFrames to keep the column names.
feature_scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    feature_scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    feature_scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Train
clf = LogisticRegression(max_iter=3000, random_state=42)
clf.fit(X_train_scaled, y_train)

# Evaluate on the held-out rows
y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"‚úÖ Accuracy del modelo supervisado: {accuracy:.4f}")

‚è≥ Cargando datos...
‚úÖ Datos cargados. Filas: 10308, Columnas: 22

üõ†Ô∏è Generando nuevos features...
‚úÖ Features creados: Age, Month, Day_of_week.

ü§ñ Ejecutando Clustering (K-Means)...
