# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import mlflow
import mlflow.xgboost
import shap
import matplotlib.pyplot as plt

# Read the raw credit-application records from disk.
df = pd.read_csv('data/credit_applications.csv')

# Columns used as model inputs; every other column in the CSV is ignored.
features = [
    'age',
    'annual_income',
    'debt_to_income_ratio',
    'num_credit_lines',
    'num_late_payments',
    'credit_utilization',
    'months_since_last_delinquency',
    'num_credit_inquiries',
    'purchase_amount',
]

# Input matrix and target column.
X = df[features]
y = df['default_risk']

# Hold out 20% of the rows for testing. Stratifying on y keeps the target
# distribution the same in both partitions, and the fixed seed makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Quick sanity report: partition sizes and the mean of the target in each.
print(
    f"Training set: {len(X_train)} samples",
    f"Test set: {len(X_test)} samples",
    f"Default rate - Train: {y_train.mean():.2%}, Test: {y_test.mean():.2%}",
    sep="\n",
)

# Helpers needed only by this training step. Importing joblib here (rather
# than halfway through the run) makes a missing dependency fail before any
# training or model registration has happened.
import os

import joblib

# The importance plot and the pickled model are both written under models/.
# Create the directory up front so savefig()/joblib.dump() cannot fail on a
# fresh checkout after the model has already been trained.
os.makedirs('models', exist_ok=True)

# Start MLflow experiment
mlflow.set_experiment("credit-scoring-model")

# Everything inside the `with` block is recorded against a single MLflow run;
# `run` is the handle for that run and is used for the run ID printed at the end.
with mlflow.start_run(run_name="xgboost_baseline") as run:

    # Model parameters
    params = {
        'max_depth': 5,
        'learning_rate': 0.1,
        'n_estimators': 100,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': 42
    }

    # Log parameters so the run can be reproduced/compared later
    mlflow.log_params(params)

    # Train model. eval_set is supplied so the eval metric is computed on
    # both partitions; no early-stopping option is set in params.
    model = XGBClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=False
    )

    # Make predictions: column 1 of predict_proba is the probability of the
    # positive class; predict() returns the hard class labels.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # Calculate metrics
    auc_score = roc_auc_score(y_test, y_pred_proba)

    # Log metrics (accuracy is the fraction of hard predictions equal to y_test)
    mlflow.log_metric("auc_roc", auc_score)
    mlflow.log_metric("test_accuracy", (y_pred == y_test).mean())

    # Feature importance, most important first
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n📊 Model Performance:")
    print(f"AUC-ROC: {auc_score:.4f}")
    print("\nFeature Importance:")
    print(feature_importance)

    # Save feature importance plot to disk, then attach it to the run
    importance_plot_path = 'models/feature_importance.png'
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'], feature_importance['importance'])
    plt.xlabel('Importance')
    plt.title('Feature Importance for Credit Scoring Model')
    plt.tight_layout()
    plt.savefig(importance_plot_path)
    mlflow.log_artifact(importance_plot_path)

    # Log model to the run and register it under "CreditScoringModel"
    mlflow.xgboost.log_model(
        model,
        "credit_scoring_model",
        registered_model_name="CreditScoringModel"
    )

    # Save model locally too
    joblib.dump(model, 'models/credit_model.pkl')

    print("\n✅ Model trained and logged to MLflow!")
    print(f"Run ID: {run.info.run_id}")