In [0]:
# === LEER ARCHIVO .env DE FORMA SEGURA ===

%pip install python-dotenv

from dotenv import load_dotenv
import os

# The .env file is read through the local-file path, so the /dbfs prefix is part of the path.
# NOTE(review): this relies on the Databricks /dbfs mount being available on the cluster — confirm.
env_path = "/dbfs/Users/pansezapata@gmail.com/.env"

# Load the variables from the .env file; the returned flag is printed so a failed
# load is visible in the cell output.
result = load_dotenv(env_path)
print(f".env cargado: {result}")

# Read the storage credentials from the environment.
storage_account = os.getenv('AZURE_STORAGE_ACCOUNT')
storage_key = os.getenv('AZURE_STORAGE_KEY')

# Only the account name is echoed; the key itself is never printed.
print(f"Storage Account: {storage_account}")
print(f"Storage Key: {'*' * 12} (protegida)")

# Configure Spark only when both values are present (None or empty strings are rejected).
if storage_account and storage_key:
    # Register the account key so Spark can read abfss:// paths on this storage account.
    spark.conf.set(
        f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
        storage_key
    )
    print("Configuración Spark completada!")
else:
    print("Error: Variables de entorno están vacías")

In [0]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Load the training CSV from ADLS Gen2 with Spark (header row, inferred schema)
# and materialise it as a pandas DataFrame for the scikit-learn steps below.
print("📖 Cargando datos...")
file_path = "abfss://sistecredito2@sistecreditofinal.dfs.core.windows.net/data/v1/train/credit_train.csv"
spark_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)
df = spark_df.toPandas()

print(f"Dataset cargado: {len(df)} filas, {len(df.columns)} columnas")
print("Columnas:", df.columns.tolist())

In [0]:
def explore_credit_data(df):
    """Print an exploratory summary of a credit dataset and guess its target column.

    The summary covers shape, column names, ``df.info()``, ``df.describe()``,
    and the per-column null counts. The target column is guessed as the first
    column whose lower-cased name contains 'default', 'risk' or 'target'; when
    one is found its value counts and percentages are printed too.

    Args:
        df: pandas DataFrame to inspect. Column names must be strings because
            ``.lower()`` is called on each of them.

    Returns:
        The name of the first matching column, or None when no column matches.
    """
    print("🔍 === ANÁLISIS EXPLORATORIO ===")
    print(f"Dimensiones: {df.shape}")
    print(f"Columnas: {list(df.columns)}")

    # General structure (dtypes, non-null counts); info() prints by itself.
    print("\nInformación del dataset:")
    df.info()

    # Descriptive statistics
    print("\nEstadísticas descriptivas:")
    print(df.describe())

    # Null counts, showing only the columns that actually have nulls
    print("\nValores nulos por columna:")
    missing_per_column = df.isnull().sum()
    if missing_per_column.sum() <= 0:
        print("No hay valores nulos")
    else:
        print(missing_per_column[missing_per_column > 0])

    # Candidate target columns, matched on the column name only
    keywords = ('default', 'risk', 'target')
    candidates = [
        name for name in df.columns
        if any(keyword in name.lower() for keyword in keywords)
    ]
    if not candidates:
        return None

    detected = candidates[0]
    counts = df[detected].value_counts()
    percentages = df[detected].value_counts(normalize=True) * 100
    print(f"\nDistribución de la variable objetivo '{detected}':")
    print(counts)
    print("Porcentaje:")
    print(percentages)

    return detected

# Run the EDA. The return value is the first column whose name contains
# 'default', 'risk' or 'target' (case-insensitive), or None when no column matches.
target_column = explore_credit_data(df)

In [0]:
# Preview only the first rows instead of rendering the whole frame.
# NOTE(review): the data appears to contain customer identifiers (see the columns
# excluded in the training cell) — clear this output before sharing the notebook.
df.head()

In [0]:
def preprocess_credit_data(df, target_col):
    """Clean the dataset and label-encode its categorical columns.

    Works on a copy of ``df``: rows containing any null and duplicated rows are
    dropped, every object-dtype feature column is label-encoded (after casting
    to ``str``), and the target column is label-encoded too when it is
    object-dtype.

    Args:
        df: pandas DataFrame with the raw data. It is not modified.
        target_col: name of the target column, or None/empty when unknown. When
            falsy, no column receives target treatment and every object-dtype
            column is encoded as a feature.

    Returns:
        A tuple ``(df_processed, numeric_cols, categorical_cols, label_encoders)``:
        the cleaned and encoded DataFrame, the numeric feature names, the
        categorical feature names (both lists exclude the target), and a dict
        mapping each encoded column name (including the target, when it was
        encoded) to its fitted ``LabelEncoder``.
    """

    print(" === PREPROCESAMIENTO ===")

    # Work on a copy so the caller's DataFrame is left untouched
    df_processed = df.copy()

    # Drop incomplete and duplicated rows
    print("Limpiando datos...")
    df_processed = df_processed.dropna()
    df_processed = df_processed.drop_duplicates()

    print(f"✅ Datos después de limpieza: {df_processed.shape}")

    # Split columns by dtype
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()

    # Record whether the target is categorical BEFORE removing it from the feature
    # lists. Checking afterwards (as the code previously did) is always False, so
    # the target was never encoded.
    target_is_categorical = bool(target_col) and target_col in categorical_cols

    # The target is not a feature: keep it out of both lists
    if target_col:
        numeric_cols = [col for col in numeric_cols if col != target_col]
        categorical_cols = [col for col in categorical_cols if col != target_col]

    print(f"Columnas numéricas ({len(numeric_cols)}): {numeric_cols}")
    print(f"Columnas categóricas ({len(categorical_cols)}): {categorical_cols}")

    # Encode categorical features, keeping each fitted encoder for later reuse
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le

    # Encode the target with its own encoder when it is categorical
    if target_is_categorical:
        le_target = LabelEncoder()
        df_processed[target_col] = le_target.fit_transform(df_processed[target_col].astype(str))
        label_encoders[target_col] = le_target

    return df_processed, numeric_cols, categorical_cols, label_encoders

# Run preprocessing. NOTE: target_column here is whatever explore_credit_data
# detected (possibly None); it is only set to "PerdidaCartera" in the training cell.
df_processed, num_cols, cat_cols, encoders = preprocess_credit_data(df, target_column)

In [0]:
def train_random_forest_model(df, target_col, feature_cols):
    """
    Train and evaluate a Random Forest classifier on ``df``.

    Pipeline: validate inputs, drop rows with a null target, label-encode a
    non-numeric target, clean the features (drop all-null columns, median-fill
    numeric nulls, label-encode object columns, coerce everything to numeric and
    zero-fill), split into train/test, fit a baseline forest, run a small grid
    search (falling back to the baseline if it fails), then compute metrics and
    feature importances.

    Args:
        df: pandas DataFrame containing the target and the feature columns.
        target_col: name of the target column.
        feature_cols: list of feature column names to use.

    Returns:
        A tuple ``(best_rf, X_test, y_test, y_pred, y_pred_proba,
        feature_importance, detailed_metrics)``. ``y_pred_proba`` is None unless
        the fitted model has exactly two classes; ``feature_importance`` is a
        DataFrame with columns 'feature' and 'importance' sorted descending;
        ``detailed_metrics`` is a dict of scores and dataset sizes.

    Raises:
        ValueError: if ``target_col`` or any of ``feature_cols`` is missing from ``df``.
    """

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
    import pandas as pd
    import numpy as np

    print("🤖 === ENTRENAMIENTO RANDOM FOREST ===")

    # 1. VALIDATE INPUTS
    if target_col not in df.columns:
        raise ValueError(f"❌ Target column '{target_col}' no encontrada en el dataset")

    missing_features = [col for col in feature_cols if col not in df.columns]
    if missing_features:
        raise ValueError(f"❌ Features no encontradas: {missing_features}")

    # 2. PREPARE DATA
    print("📊 Preparando datos...")

    # Copies, so the caller's DataFrame is never modified
    X = df[feature_cols].copy()
    y = df[target_col].copy()

    print(f"📊 Shape inicial - X: {X.shape}, y: {y.shape}")
    print(f"🎯 Target: '{target_col}'")
    print(f"📊 Tipo de y: {y.dtype}")

    # 3. CLEAN AND VALIDATE TARGET
    print("🔍 Validando target...")

    # Rows without a target cannot be used for supervised training
    if y.isnull().sum() > 0:
        print(f"⚠️ Target tiene {y.isnull().sum()} valores nulos - eliminando...")
        valid_indices = ~y.isnull()
        X = X[valid_indices]
        y = y[valid_indices]

    print(f"📊 Valores únicos en target: {y.unique()}")

    # Make sure the target is numeric
    if y.dtype == 'object' or not pd.api.types.is_numeric_dtype(y):
        print("🔄 Convirtiendo target a numérico...")
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        y = pd.Series(le.fit_transform(y), index=y.index)
        print(f"✅ Target codificado - clases: {le.classes_}")

    # 4. CLEAN AND VALIDATE FEATURES
    # NOTE(review): imputation and encoding below are fitted on the full data
    # before the train/test split, so test rows influence the medians/encodings.
    print("🔍 Validando features...")

    # Drop columns that carry no information at all
    null_cols = X.columns[X.isnull().all()]
    if len(null_cols) > 0:
        print(f"🗑️ Eliminando {len(null_cols)} columnas con todos valores nulos")
        X = X.drop(columns=null_cols)

    # Median-fill nulls in numeric features
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

    # Label-encode object-dtype features
    categorical_cols = X.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"🔤 Convirtiendo {len(categorical_cols)} columnas categóricas...")
        from sklearn.preprocessing import LabelEncoder
        for col in categorical_cols:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))

    # Final safety net: coerce everything to numeric and zero-fill what could not be parsed
    X = X.apply(pd.to_numeric, errors='coerce')
    X = X.fillna(0)

    print(f"✅ Datos limpiados - X: {X.shape}, y: {y.shape}")

    # 5. CHECK TARGET DISTRIBUTION
    target_distribution = y.value_counts().to_dict()
    print(f"📈 Distribución del target: {target_distribution}")

    # Share of the least frequent class among all samples
    target_counts = y.value_counts()
    minority_class_ratio = target_counts.min() / target_counts.sum()
    print(f"⚖️ Ratio clase minoritaria: {minority_class_ratio:.3f}")

    if minority_class_ratio < 0.1:
        print("⚠️ Dataset muy desbalanceado - usando class_weight='balanced'")
        class_weight = 'balanced'
    else:
        print("✅ Dataset relativamente balanceado")
        class_weight = None

    # 6. TRAIN/TEST SPLIT
    print("\n🔄 Dividiendo datos en train/test...")

    # Smaller datasets keep a larger share for testing
    test_size = 0.2 if len(df) > 5000 else 0.3

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=42,
            stratify=y
        )
        print(f"✅ Split estratificado exitoso")
    except ValueError as e:
        # Stratification raised ValueError; fall back to a plain random split
        print(f"⚠️ Error en split estratificado: {e}")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=42
        )
        print(f"✅ Split sin estratificar completado")

    print(f"📊 Train set: {X_train.shape[0]} muestras")
    # Fixed: report the number of rows, not the (rows, cols) tuple
    print(f"📊 Test set: {X_test.shape[0]} muestras")

    # 7. BASELINE MODEL
    print("\n⚡ Entrenando modelo baseline...")

    rf_baseline = RandomForestClassifier(
        n_estimators=50,  # reduced for speed
        random_state=42,
        n_jobs=-1,
        class_weight=class_weight,
        max_depth=10  # limited depth to curb overfitting
    )

    rf_baseline.fit(X_train, y_train)
    baseline_score = rf_baseline.score(X_test, y_test)

    print(f"📊 Accuracy modelo baseline: {baseline_score:.4f}")

    # 8. SIMPLIFIED HYPERPARAMETER SEARCH
    print("\n🔧 Optimización simplificada de hiperparámetros...")

    # Deliberately small grid to keep the search cheap
    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [10, 15, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    print(f"🔍 Probando {np.prod([len(v) for v in param_grid.values()])} combinaciones...")

    try:
        grid_search = GridSearchCV(
            RandomForestClassifier(
                random_state=42,
                n_jobs=-1,
                class_weight=class_weight
            ),
            param_grid,
            cv=3,  # 3 folds instead of 5 for speed
            scoring='accuracy',
            n_jobs=-1,
            verbose=0
        )

        grid_search.fit(X_train, y_train)
        best_rf = grid_search.best_estimator_

        print(f"✅ Optimización completada")
        print(f"📊 Mejores parámetros:")
        for param, value in grid_search.best_params_.items():
            print(f"  {param}: {value}")

    except Exception as e:
        # Best effort: a failed search must not lose the already trained baseline
        print(f"⚠️ Error en optimización: {e}")
        print("🔄 Usando modelo baseline...")
        best_rf = rf_baseline

    # 9. PREDICTIONS AND METRICS
    print("\n🎯 Realizando predicciones...")

    y_pred = best_rf.predict(X_test)

    # Positive-class probabilities (binary classification only)
    y_pred_proba = None
    if len(best_rf.classes_) == 2:
        try:
            y_pred_proba = best_rf.predict_proba(X_test)[:, 1]
        except Exception as e:
            print(f"⚠️ No se pudieron calcular probabilidades: {e}")
            y_pred_proba = None

    # Weighted averages so multi-class targets are supported as well
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # ROC AUC (binary only)
    roc_auc = None
    if len(best_rf.classes_) == 2 and y_pred_proba is not None:
        try:
            roc_auc = roc_auc_score(y_test, y_pred_proba)
        except Exception as e:
            print(f"⚠️ No se pudo calcular ROC AUC: {e}")
            roc_auc = None

    # Cross-validation on the training split
    try:
        cv_scores = cross_val_score(best_rf, X_train, y_train, cv=3, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()
    except Exception as e:
        # Keep numeric values so downstream formatting works, but say that they
        # are a fallback and not a real cross-validation result.
        print(f"⚠️ Cross-validation falló ({e}) - usando accuracy de test como respaldo")
        cv_mean = accuracy
        cv_std = 0.0

    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"\n === RESULTADOS FINALES ===")
    print(f" Accuracy: {accuracy:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" Recall: {recall:.4f}")
    print(f" F1-Score: {f1:.4f}")
    # Fixed: compare against None so a legitimate score of 0.0 is still reported
    if roc_auc is not None:
        print(f" ROC AUC: {roc_auc:.4f}")
    print(f" CV Score: {cv_mean:.4f} ± {cv_std:.4f}")

    print(f"\n Matriz de Confusión:")
    print(conf_matrix)

    # 10. FEATURE IMPORTANCE
    print("\n🔝 === FEATURE IMPORTANCE ===")

    # X.columns (not feature_cols) because all-null columns may have been dropped
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("Top 10 Features más importantes:")
    for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
        print(f"  {i+1:2d}. {row['feature']:<40} {row['importance']:.4f}")

    # 11. METRICS RETURNED TO THE CALLER
    detailed_metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'confusion_matrix': conf_matrix.tolist(),
        'baseline_accuracy': baseline_score,
        'improvement': accuracy - baseline_score,
        'n_features': len(X.columns),
        'n_samples_train': len(X_train),
        'n_samples_test': len(X_test),
        'class_distribution': target_distribution
    }

    improvement = detailed_metrics['improvement']
    print(f"\n🚀 Mejora vs baseline: {improvement:+.4f}")

    if improvement > 0:
        print(f"✅ Optimización exitosa - modelo mejorado")
    else:
        print(f"⚠️ Modelo optimizado igual o peor que baseline")

    print(f"\n✅ === ENTRENAMIENTO COMPLETADO ===")

    return best_rf, X_test, y_test, y_pred, y_pred_proba, feature_importance, detailed_metrics

# === RUN TRAINING WITH INPUT VALIDATION ===

print("🔧 === CONFIGURACIÓN INICIAL ===")

target_column = "PerdidaCartera"

# Training only proceeds when the target column survived preprocessing
if target_column in df_processed.columns:
    print(f"Target '{target_column}' encontrado")

    # The target plus identifier/contact columns are never used as features
    excluded_columns = [
        target_column,
        'PersonaCreditoCodigo',
        'IdentificacionCliente',
        'TipoIdentificacion',
        'CorreoElectronicoCliente',
        'LocalCreditMasterIdSistecredito'
    ]

    feature_columns = [
        name for name in df_processed.columns
        if name not in excluded_columns
    ]

    print(f"Target: {target_column}")
    print(f"Features seleccionadas: {len(feature_columns)}")
    print(f"Columnas excluidas: {len(excluded_columns)}")

    if feature_columns:
        print(f"Listo para entrenar con {len(feature_columns)} features")

        # Train the model
        print(f"\n{'=' * 60}")
        print("INICIANDO ENTRENAMIENTO DEL MODELO")
        print("=" * 60)

        try:
            (
                model,
                X_test,
                y_test,
                y_pred,
                y_pred_proba,
                feature_importance,
                detailed_metrics,
            ) = train_random_forest_model(df_processed, target_column, feature_columns)

            print(f"\n{'=' * 60}")
            print("ENTRENAMIENTO COMPLETADO EXITOSAMENTE")
            print("=" * 60)

            # Final summary
            print("\n=== RESUMEN FINAL ===")
            print(" Modelo: Random Forest Classifier")
            print(f"Target: {target_column}")
            print(f"Features: {detailed_metrics['n_features']}")
            print(f"Train samples: {detailed_metrics['n_samples_train']}")
            print(f"Test samples: {detailed_metrics['n_samples_test']}")
            print(f"Final Accuracy: {detailed_metrics['accuracy']:.4f}")
            print(f"Cross-Val Score: {detailed_metrics['cv_mean']:.4f}")
            print(f"Feature más importante: {feature_importance.iloc[0]['feature']}")

        except Exception as e:
            # Report the failure with its traceback instead of aborting the cell
            print(f"ERROR durante entrenamiento: {e}")
            import traceback
            traceback.print_exc()
    else:
        print("ERROR: No hay features válidas para entrenar")
else:
    print(f"ERROR: Target '{target_column}' no encontrado")
    print(f"Columnas disponibles: {list(df_processed.columns)}")



In [0]:
def save_model_to_adls_only(model, manifest, encoders, storage_account_name, container_name="sistecredito2"):
    """Save the model, manifest, encoders and a README to ADLS Gen2 only (no repository).

    All artifacts are written under ``models/random_forest_perdida_cartera_<timestamp>/``
    in the given container. Serialization happens in memory, so no temporary
    files are created (and none can be left behind when an upload fails).

    Args:
        model: fitted estimator, serialized with joblib as ``model.joblib``.
        manifest: dict produced by the manifest builder; must contain
            ``model_performance`` and ``feature_importance['top_10_features']``,
            which are read to build the README and the final summary.
        encoders: dict of fitted encoders, or None/empty to skip ``encoders.joblib``.
        storage_account_name: ADLS Gen2 storage account name.
        container_name: file system (container) to write to. Defaults to the
            container this notebook has always used.

    Returns:
        dict with 'adls_path', 'folder_name', 'timestamp' and 'files_saved'
        (which holds None in place of 'encoders.joblib' when no encoders were
        saved), or None when the client could not be configured or any upload
        failed. The account key is read from the AZURE_STORAGE_KEY environment
        variable.
    """

    import io
    import os
    import joblib
    import json
    from datetime import datetime

    print("💾 === GUARDANDO MODELO SOLO EN ADLS GEN2 ===")

    # Configure the ADLS Gen2 client
    try:
        from azure.storage.filedatalake import DataLakeServiceClient

        storage_key = os.getenv('AZURE_STORAGE_KEY')
        if not storage_key:
            # Fail here with a clear message rather than later on the first upload
            raise ValueError("AZURE_STORAGE_KEY no está definida en el entorno")

        service_client = DataLakeServiceClient(
            account_url=f"https://{storage_account_name}.dfs.core.windows.net",
            credential=storage_key
        )

        file_system_client = service_client.get_file_system_client(container_name)

        print("✅ Cliente ADLS Gen2 configurado")

    except Exception as e:
        print(f"Error configurando ADLS Gen2: {e}")
        return None

    # A single clock reading, so the folder name and the README date always agree
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    model_folder = f"models/random_forest_perdida_cartera_{timestamp}"

    def _upload(file_name, payload):
        """Upload ``payload`` (bytes) as ``file_name`` inside the model folder."""
        file_client = file_system_client.get_file_client(f"{model_folder}/{file_name}")
        file_client.upload_data(payload, overwrite=True)

    def _joblib_bytes(obj):
        """Serialize ``obj`` with joblib into an in-memory bytes payload."""
        buffer = io.BytesIO()
        joblib.dump(obj, buffer)
        return buffer.getvalue()

    try:
        # 1. SAVE MODEL
        print("Guardando modelo...")
        _upload("model.joblib", _joblib_bytes(model))
        print(f"Modelo guardado: {model_folder}/model.joblib")

        # 2. SAVE MANIFEST
        print("📋 Guardando manifest...")

        # default=str keeps the dump from failing on values json cannot encode natively
        manifest_json = json.dumps(manifest, indent=2, ensure_ascii=False, default=str)
        _upload("manifest.json", manifest_json.encode('utf-8'))

        print(f"✅ Manifest guardado: {model_folder}/manifest.json")

        # 3. SAVE ENCODERS (IF ANY)
        if encoders and len(encoders) > 0:
            print("🔤 Guardando encoders...")
            _upload("encoders.joblib", _joblib_bytes(encoders))
            print(f"✅ Encoders guardados: {model_folder}/encoders.joblib")
        else:
            print("ℹ️ No hay encoders para guardar")

        # 4. CREATE INFORMATIVE README
        print("📄 Creando README...")

        performance = manifest['model_performance']
        top_features_md = "\n".join(
            f"- {feat['feature']}: {feat['importance']:.4f}"
            for feat in manifest['feature_importance']['top_10_features'][:5]
        )

        readme_content = f"""# Modelo Random Forest - Perdida de Cartera

## Información del Modelo
- **Timestamp**: {timestamp}
- **Fecha**: {now.strftime('%Y-%m-%d %H:%M:%S')}
- **Tipo**: Random Forest Classifier
- **Target**: PerdidaCartera
- **Accuracy**: {performance['accuracy']:.4f}

## Archivos en esta Carpeta
- `model.joblib` - Modelo entrenado serializado
- `manifest.json` - Metadatos completos del modelo
- `encoders.joblib` - Encoders para preprocesamiento (si aplica)
- `README.md` - Este archivo

## Top 5 Features Más Importantes
{top_features_md}

## Performance
- Accuracy: {performance['accuracy']:.4f}
- Precision: {performance['precision']:.4f}
- Recall: {performance['recall']:.4f}
- F1-Score: {performance['f1_score']:.4f}
- Cross-Validation: {performance['cross_validation_mean']:.4f} ± {performance['cross_validation_std']:.4f}

## Uso del Modelo

---
*Generado automáticamente por Pipeline MLOps*
"""

        _upload("README.md", readme_content.encode('utf-8'))

        print(f"README creado: {model_folder}/README.md")

        # 5. VERIFY SAVED FILES
        print("\n=== ARCHIVOS GUARDADOS EN ADLS GEN2 ===")

        try:
            files = file_system_client.get_paths(path=model_folder)
            for file_path in files:
                if not file_path.is_directory:
                    file_size = file_path.content_length or 0
                    print(f"  {file_path.name} ({file_size:,} bytes)")
        except Exception as e:
            # Listing is informational only; the uploads above already succeeded
            print(f"  No se pudo listar archivos: {e}")

        # 6. FINAL INFORMATION
        full_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{model_folder}/"

        print(f"\n=== GUARDADO COMPLETADO EXITOSAMENTE ===")
        print(f"Ubicación: {full_path}")
        print(f"Timestamp: {timestamp}")
        print(f"Accuracy del modelo: {performance['accuracy']:.4f}")
        print(f"Feature más importante: {manifest['feature_importance']['top_10_features'][0]['feature']}")

        return {
            'adls_path': full_path,
            'folder_name': model_folder,
            'timestamp': timestamp,
            # None marks the skipped encoders file; the list shape is kept for existing callers
            'files_saved': ['model.joblib', 'manifest.json', 'encoders.joblib' if encoders else None, 'README.md']
        }

    except Exception as e:
        print(f"Error guardando en ADLS Gen2: {e}")
        import traceback
        traceback.print_exc()
        return None

def create_model_manifest_for_perdida_cartera(model, df, feature_cols, target_col, metrics, feature_importance, encoders=None):
    """Build the metadata manifest for the PerdidaCartera model.

    Args:
        model: fitted estimator exposing ``get_params()``.
        df: DataFrame the model was trained from (only its ``shape`` is recorded).
        feature_cols: list of feature names used by the model.
        target_col: name of the target column.
        metrics: dict of training metrics. 'accuracy', 'precision', 'recall',
            'f1_score', 'cv_mean', 'cv_std' and 'confusion_matrix' are required;
            'roc_auc', 'baseline_accuracy', 'improvement', 'class_distribution',
            'n_samples_train' and 'n_samples_test' are optional.
        feature_importance: DataFrame of importances, exported as records.
        encoders: optional dict of fitted encoders keyed by column name.

    Returns:
        A nested dict with the sections 'model_info', 'data_info',
        'model_performance', 'feature_importance', 'preprocessing',
        'model_usage' and 'business_metrics'.
    """

    from datetime import datetime

    # Record the scikit-learn version that is actually installed; fall back to the
    # previous generic label when the package cannot be imported.
    try:
        import sklearn
        sklearn_version = sklearn.__version__
    except ImportError:
        sklearn_version = "1.0+"

    # Compare against None so a legitimate ROC AUC of 0.0 is not dropped
    roc_auc = metrics.get('roc_auc')

    # Derive the balance label from the class counts, using the same 10% minority
    # threshold the training function uses to enable class_weight='balanced'.
    class_distribution = metrics.get('class_distribution', {})
    if class_distribution:
        counts = list(class_distribution.values())
        total = sum(counts)
        minority_ratio = min(counts) / total if total else 0.0
        class_balance = "balanced" if minority_ratio >= 0.1 else "imbalanced"
    else:
        class_balance = "unknown"

    manifest = {
        "model_info": {
            "model_type": "RandomForestClassifier",
            "algorithm": "Random Forest",
            "created_date": datetime.now().isoformat(),
            "sklearn_version": sklearn_version,
            "model_parameters": model.get_params(),
            "use_case": "Predicción de Pérdida de Cartera",
            "business_problem": "Clasificar clientes según riesgo de pérdida en cartera"
        },
        "data_info": {
            "dataset_shape": df.shape,
            "total_features": len(feature_cols),
            "target_column": target_col,
            "feature_columns": feature_cols,
            "categorical_features": list(encoders.keys()) if encoders else [],
            "data_source": "Sistema Sistecredito - Datos de Cartera",
            "excluded_columns": [
                "PersonaCreditoCodigo",
                "IdentificacionCliente",
                "TipoIdentificacion",
                "CorreoElectronicoCliente",
                "LocalCreditMasterIdSistecredito"
            ]
        },
        "model_performance": {
            "accuracy": float(metrics['accuracy']),
            "precision": float(metrics['precision']),
            "recall": float(metrics['recall']),
            "f1_score": float(metrics['f1_score']),
            "roc_auc": float(roc_auc) if roc_auc is not None else None,
            "cross_validation_mean": float(metrics['cv_mean']),
            "cross_validation_std": float(metrics['cv_std']),
            "confusion_matrix": metrics['confusion_matrix'],
            "baseline_accuracy": float(metrics.get('baseline_accuracy', 0)),
            "improvement_vs_baseline": float(metrics.get('improvement', 0))
        },
        "feature_importance": {
            "top_10_features": feature_importance.head(10).to_dict('records'),
            "all_features": feature_importance.to_dict('records')
        },
        "preprocessing": {
            # Store each encoder's classes so the mapping can be rebuilt without the pickle
            "label_encoders": {k: v.classes_.tolist() if hasattr(v, 'classes_') else str(v)
                              for k, v in encoders.items()} if encoders else {},
            "missing_values_treatment": "median_fill_numeric_zero_fill_categorical",
            "duplicates_removed": True,
            "null_columns_removed": True
        },
        "model_usage": {
            "prediction_example": "model.predict(X_new)",
            "probability_example": "model.predict_proba(X_new)",
            "required_features": feature_cols,
            "input_validation": "Ensure all features are numeric, no nulls allowed"
        },
        "business_metrics": {
            "target_distribution": class_distribution,
            "training_samples": metrics.get('n_samples_train', 0),
            "test_samples": metrics.get('n_samples_test', 0),
            "class_balance_ratio": class_balance
        }
    }

    return manifest

# === FULL SAVE STEP, TO BE RUN AFTER TRAINING ===

# Requires the variables produced by the training cell
# (model, feature_importance, detailed_metrics, target_column).

print("\n=== GUARDANDO MODELO EN ADLS GEN2 ===")

# Build the full manifest; the feature list is taken from the importance table
manifest = create_model_manifest_for_perdida_cartera(
    model,
    df_processed,
    feature_importance['feature'].tolist(),
    target_column,
    detailed_metrics,
    feature_importance,
    encoders if 'encoders' in locals() else None,
)

# Save to ADLS Gen2 ONLY
result = save_model_to_adls_only(
    model,
    manifest,
    encoders if 'encoders' in locals() else None,
    "sistecreditofinal",
)

if not result:
    print("Error guardando modelo")
else:
    print("\n === GUARDADO EXITOSO ===")
    print(f"Ubicación: {result['adls_path']}")
    # Entries for files that were not saved are None and are skipped here
    print(f"Archivos: {', '.join(filter(None, result['files_saved']))}")
    print(f"ID del modelo: {result['timestamp']}")

    # Keep the folder name around for later use
    model_path_for_tests = result['folder_name']
    print("\nPara tests CI/CD usa esta ruta:")
    print(f"model_path = '{model_path_for_tests}'")
