# 04 - Análisis de Clustering de Riesgo

Este notebook analiza el clustering de pacientes por perfil de riesgo:
- **K-Means**: Agrupamiento con K óptimo (elbow + silhouette)
- **DBSCAN**: Detección de outliers como pacientes de riesgo crítico
- **PCA**: Visualización 2D de los clusters
- **Anomaly Detection**: Autoencoder para valores de laboratorio

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from app.core.ml.risk_clusterer import RiskClusterer
from app.core.ml.anomaly_detector import LabAnomalyDetector
from app.core.ml.feature_engineering import FeatureEngineer

print('Imports OK')

## 1. Generar Datos Sintéticos de Pacientes

In [None]:
# Synthetic cohort: three equally sized patient profiles drawn from one
# seeded RNG so the notebook is reproducible.
rng = np.random.RandomState(42)
n_patients = 150
# n_patients drives the size of every group (integer division: any remainder
# of n_patients / 3 is dropped). Cells further down pick groups by position in
# all_patients, so if n_patients is changed, check that their slice bounds
# still match group_size.
group_size = n_patients // 3

# One entry per risk profile.
#   age / visits / alerts / days_since : (low, high) passed to rng.randint,
#                                        i.e. high is exclusive.
#   labs                               : (mean, std) passed to rng.normal.
#   alerts = None                      : alert_count is the constant 0 and no
#                                        random number is drawn for it.
# The key order inside 'labs' is the order the values are drawn in; keep it
# stable, otherwise seed 42 produces a different cohort.
_PROFILES = {
    # Profile 1: young, healthy patients.
    'young_healthy': {
        'age': (25, 40),
        'conditions': [],
        'medications': [],
        'visits': (0, 2),
        'labs': {
            'glucosa': (85, 5),
            'hemoglobina': (14, 1),
            'colesterol': (170, 15),
            'trigliceridos': (120, 15),
            'creatinina': (0.9, 0.1),
            'presion_sistolica': (115, 5),
            'presion_diastolica': (75, 5),
        },
        'alerts': None,
        'days_since': (30, 180),
    },
    # Profile 2: adults with a controlled chronic condition.
    'controlled': {
        'age': (45, 65),
        'conditions': ['diabetes'],
        'medications': ['Metformina'],
        'visits': (2, 5),
        'labs': {
            'glucosa': (110, 10),
            'hemoglobina': (13.5, 1),
            'colesterol': (200, 20),
            'trigliceridos': (160, 20),
            'creatinina': (1.0, 0.15),
            'presion_sistolica': (130, 8),
            'presion_diastolica': (82, 5),
        },
        'alerts': (0, 2),
        'days_since': (15, 60),
    },
    # Profile 3: high-risk patients with several chronic conditions.
    'high_risk': {
        'age': (55, 80),
        'conditions': ['diabetes', 'hipertension', 'dislipidemia'],
        'medications': ['Metformina', 'Losartan', 'Atorvastatina'],
        'visits': (4, 8),
        'labs': {
            'glucosa': (180, 30),
            'hemoglobina': (11, 1.5),
            'colesterol': (260, 25),
            'trigliceridos': (250, 40),
            'creatinina': (1.5, 0.3),
            'presion_sistolica': (155, 12),
            'presion_diastolica': (95, 8),
        },
        'alerts': (2, 6),
        'days_since': (5, 30),
    },
}


def _make_patient(rng, profile):
    """Draw one synthetic patient dict from ``profile`` using ``rng``.

    Args:
        rng: a ``numpy.random.RandomState``; it is advanced by every draw.
        profile: one entry of ``_PROFILES``.

    Returns:
        A dict with the keys ``age``, ``gender``, ``chronic_conditions``,
        ``active_medications``, ``visit_frequency_6m``, ``recent_lab_values``,
        ``alert_count`` and ``days_since_last_visit``.
    """
    alert_range = profile['alerts']
    # Dict displays evaluate their values top to bottom, so the RNG is consumed
    # in key order: age, gender, visits, labs, alerts, days since last visit.
    return {
        'age': rng.randint(*profile['age']),
        'gender': rng.choice(['M', 'F']),
        # Copy the lists so patients never share one mutable list object.
        'chronic_conditions': list(profile['conditions']),
        'active_medications': list(profile['medications']),
        'visit_frequency_6m': rng.randint(*profile['visits']),
        'recent_lab_values': {
            lab: rng.normal(mean, std)
            for lab, (mean, std) in profile['labs'].items()
        },
        'alert_count': 0 if alert_range is None else rng.randint(*alert_range),
        'days_since_last_visit': rng.randint(*profile['days_since']),
    }


# Groups are generated one after another from the same RNG, in this order.
young_healthy = [
    _make_patient(rng, _PROFILES['young_healthy']) for _ in range(group_size)
]
controlled = [
    _make_patient(rng, _PROFILES['controlled']) for _ in range(group_size)
]
high_risk = [
    _make_patient(rng, _PROFILES['high_risk']) for _ in range(group_size)
]

# Concatenation order matters: healthy first, then controlled, then high risk.
all_patients = young_healthy + controlled + high_risk

# Run feature extraction over the whole cohort; the call returns a
# (features, feature_names) pair, unpacked on the next line.
engineer = FeatureEngineer()
extracted = engineer.extract_patient_features_batch(all_patients)
features, feature_names = extracted

print(f'Patient features shape: {features.shape}')
print(f'Feature names: {feature_names}')

## 2. K-Means Clustering

In [None]:
# Fit K-Means on the extracted features and print a per-cluster summary.
clusterer = RiskClusterer()
result = clusterer.fit_kmeans(features, feature_names=feature_names)

print(f'Optimal clusters: {result.n_clusters}')
print(f'Silhouette score: {result.silhouette:.4f}')
print('\nCluster Descriptions:')
for cluster in result.descriptions:
    print(f'  Cluster {cluster.cluster_id}: {cluster.size} patients, risk={cluster.risk_level}')
    # Print at most the first three (feature, value) pairs; a missing or
    # empty top_features prints nothing.
    for feature, value in (cluster.top_features or [])[:3]:
        print(f'    {feature}: {value:.2f}')

In [None]:
from IPython.display import Image, display

# Plot the K-Means clusters; the value returned by visualize_clusters is used
# as the filename of the image to show inline (presumably the path of the
# written 'clusters_kmeans.png' -- confirm against RiskClusterer).
kmeans_labels = np.array(result.labels)
path = clusterer.visualize_clusters(
    features,
    kmeans_labels,
    output_path='clusters_kmeans.png',
)
display(Image(filename=path))

## 3. DBSCAN Clustering

In [None]:
# Run DBSCAN on the same features. Label / cluster_id -1 is the group this
# notebook reports as "Outliers".
dbscan_result = clusterer.fit_dbscan(
    features,
    eps=1.5,
    min_samples=5,
    feature_names=feature_names,
)

# Number of points labelled -1.
n_outliers = list(dbscan_result.labels).count(-1)
print(f'DBSCAN clusters: {dbscan_result.n_clusters}')
print(f'Outliers (critical risk): {n_outliers}')
print(f'Silhouette: {dbscan_result.silhouette:.4f}')

for desc in dbscan_result.descriptions:
    if desc.cluster_id == -1:
        label = 'Outliers'
    else:
        label = f'Cluster {desc.cluster_id}'
    print(f'  {label}: {desc.size} patients, risk={desc.risk_level}')

## 4. Anomaly Detection en Laboratorios

In [None]:
# Build the lab matrix: one row per patient, one column per entry of
# lab_names (a lab missing from a patient's dict is filled with 0).
lab_names = ['glucosa', 'hemoglobina', 'colesterol', 'trigliceridos', 'creatinina']
lab_features = [
    [patient['recent_lab_values'].get(lab, 0) for lab in lab_names]
    for patient in all_patients
]
lab_data = np.array(lab_features)

# Train only on the first 50 rows, i.e. the young-healthy group that comes
# first in all_patients, so the detector learns what "normal" looks like.
normal_data = lab_data[:50]

detector = LabAnomalyDetector(threshold_percentile=95.0)
train_metrics = detector.train(
    normal_data,
    epochs=50,
    batch_size=16,
    feature_names=lab_names,
)
print(f'Training metrics: {train_metrics}')

In [None]:
# Score the rows from index 100 onward (the high-risk group, which comes last
# in all_patients) with the trained detector.
high_risk_lab = lab_data[100:]
results = detector.detect_anomalies(high_risk_lab, feature_names=lab_names)

flagged = [r for r in results if r.is_anomaly]
anomaly_count = len(flagged)
print(f'Anomalies detected in high-risk group: {anomaly_count}/{len(results)}')

# Print the five highest anomaly scores, largest first, with up to two of
# the features each result lists as most anomalous.
print('\nTop anomalies:')
sorted_results = list(results)
sorted_results.sort(key=lambda item: item.anomaly_score, reverse=True)
for r in sorted_results[:5]:
    print(f'  Score: {r.anomaly_score:.2f}, Features: {r.most_anomalous_features[:2]}')

In [None]:
# Score every patient so the three groups can be compared on one bar chart.
all_results = detector.detect_anomalies(lab_data, feature_names=lab_names)
scores = [r.anomaly_score for r in all_results]

# all_patients is three equal groups concatenated in this order, so each group
# occupies one third of the score list.
group_styles = [
    ('Healthy', '#16A34A'),
    ('Controlled', '#F59E0B'),
    ('High Risk', '#DC2626'),
]
per_group = len(scores) // len(group_styles)

fig, ax = plt.subplots(figsize=(10, 5))
# One bar series per group, each with its own label. A single ax.bar call
# yields only one legend handle, so passing four labels to ax.legend() would
# silently drop the 'Controlled' and 'High Risk' entries.
for position, (group_label, color) in enumerate(group_styles):
    start = position * per_group
    # The last group runs to the end so no score is left out when the total
    # is not a multiple of three.
    is_last = position == len(group_styles) - 1
    stop = len(scores) if is_last else start + per_group
    ax.bar(
        range(start, stop),
        scores[start:stop],
        color=color,
        alpha=0.7,
        label=group_label,
    )
# NOTE(review): the threshold line is drawn at a fixed 1.0 -- presumably
# anomaly_score is normalised so that 1.0 is the detector's cut-off; confirm
# against LabAnomalyDetector.
ax.axhline(y=1.0, color='red', linestyle='--', label='Threshold')
ax.set_xlabel('Patient Index')
ax.set_ylabel('Anomaly Score')
ax.set_title('Anomaly Scores by Patient Group')
# Labels come from the artists themselves, so every entry matches its colour.
ax.legend()
plt.tight_layout()
plt.show()