# Credit Risk Model Training - XGBoost 3.1.0\n\n**Objective**: Train an optimized XGBoost model for credit risk prediction with high precision and recall.\n\n**Dataset**: Credit risk dataset with 32,583 samples\n**Target**: `loan_status` (1 = default, 0 = no default)\n**XGBoost Version**: 3.1.0

In [None]:
# Install required packages (if needed)\n# !pip install xgboost==3.1.0 scikit-learn pandas numpy matplotlib seaborn

In [None]:
import pandas as pd\nimport numpy as np\nimport xgboost as xgb\nfrom sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import (\n    classification_report, confusion_matrix, \n    precision_score, recall_score, f1_score, roc_auc_score\n)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport joblib\nimport warnings\nwarnings.filterwarnings('ignore')\n\nprint(f'XGBoost version: {xgb.__version__}')

## 1. Load and Explore Data

In [None]:
# Read the raw credit-risk table from disk.
df = pd.read_csv('usecases/credit_risk_dataset.csv')

# Summarise the table size and the balance of the binary target column.
target_column = df["loan_status"]
class_counts = target_column.value_counts()
default_rate = target_column.mean()

print(f'Dataset shape: {df.shape}')
print(f'\nTarget distribution:\n{class_counts}')
print(f'\nDefault rate: {default_rate:.2%}')

# Last expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
# Count the null entries in every column.
null_counts = df.isnull().sum()
print('Missing values per column:')
print(null_counts)

# Show the dtype pandas inferred for every column.
column_types = df.dtypes
print('\nData types:')
print(column_types)

## 2. Data Preprocessing

In [None]:
# Handle missing values: fill each numeric column that has gaps with its own
# median.
#
# The filled Series is assigned back to the column rather than calling
# df[col].fillna(..., inplace=True). That chained in-place form operates on an
# intermediate Series: pandas 2.x emits a FutureWarning for it (silenced in this
# notebook by warnings.filterwarnings('ignore')), and with Copy-on-Write enabled
# it leaves df unchanged, so the NaNs would silently survive.
#
# NOTE(review): the medians are computed over the full dataset, i.e. before the
# train/test split performed later in the notebook - confirm this is acceptable.
for col in ('loan_int_rate', 'person_emp_length'):
    df[col] = df[col].fillna(df[col].median())

# Total number of remaining nulls across the whole frame.
print('Missing values after imputation:')
print(df.isnull().sum().sum())

In [None]:
# Columns holding string categories that must become integer codes.
categorical_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

# One encoder per column; each is kept in label_encoders after fitting.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for col, le in label_encoders.items():
    # Fit on the column and overwrite it in place with the integer codes.
    df[col] = le.fit_transform(df[col])

print('Encoded categorical variables')
df.head()

In [None]:
# The target is the loan_status column; every other column is a feature.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

feature_names = list(X.columns)
print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')
print(f'\nFeature names: {feature_names}')

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f'Training set: {n_train_rows} samples')
print(f'Test set: {n_test_rows} samples')

# Report the default rate of each partition.
train_default_rate = y_train.mean()
test_default_rate = y_test.mean()
print(f'\nTraining set default rate: {train_default_rate:.2%}')
print(f'Test set default rate: {test_default_rate:.2%}')

## 3. Train XGBoost Model (Optimized for Precision & Recall)

In [None]:
# Class-imbalance weight: ratio of negative (0) to positive (1) training labels.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive

print(f'Scale pos weight: {scale_pos_weight:.2f}')

In [None]:
# Hyper-parameters handed to xgb.XGBClassifier when the model is built.
params = dict(
    objective='binary:logistic',
    eval_metric=['logloss', 'auc'],
    max_depth=6,
    learning_rate=0.05,
    n_estimators=300,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,  # imbalance ratio computed from y_train
    random_state=42,
    n_jobs=-1,
    tree_method='hist',  # Fast histogram-based algorithm
    device='cpu',  # Use 'cuda' if GPU available
)

# Echo the configuration, one setting per line.
print('Model parameters:')
for key, value in params.items():
    print(f'  {key}: {value}')

In [None]:
# Build the classifier from the parameter dict and fit it on the training split.
# No early stopping is configured (params carries no early_stopping_rounds), so
# every boosting round is trained; eval_set is passed only so the evaluation
# metrics for both splits are logged every 50 rounds.
model = xgb.XGBClassifier(**params)

monitored_sets = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_set=monitored_sets, verbose=50)

print('\nModel training complete!')

## 4. Model Evaluation

In [None]:
# Hard class predictions and positive-class probabilities for the test split.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the hard predictions; ROC-AUC uses the scores.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

banner = '=' * 60
print(banner)
print('MODEL PERFORMANCE')
print(banner)
for metric_label, metric_value in (
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
    ('ROC-AUC', auc),
):
    print(f'{metric_label}: {metric_value:.4f}')
print(banner)

In [None]:
# Per-class precision / recall / F1 table for the test predictions.
class_names = ['No Default', 'Default']
report_text = classification_report(y_test, y_pred, target_names=class_names)

print('\nClassification Report:')
print(report_text)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap; the same class names label both axes.
tick_labels = ['No Default', 'Default']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

In [None]:
# Pair every feature name with the importance reported by the fitted model,
# then order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

# head() defaults to five rows, matching the heading.
top_features = feature_importance.head()
print('\nTop 5 Most Important Features:')
print(top_features)

## 5. Cross-Validation

In [None]:
# Stratified, shuffled and seeded 5-fold split of the training data.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score the estimator on each fold with ROC-AUC.
cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
)

mean_score = cv_scores.mean()
score_spread = cv_scores.std()
print('Cross-Validation ROC-AUC Scores:')
print(f'  Fold scores: {cv_scores}')
print(f'  Mean: {mean_score:.4f}')
print(f'  Std: {score_spread:.4f}')

## 6. Save Model

In [None]:
# Serialise the fitted classifier to disk with joblib.
model_path = 'usecases/xgboost_credit_risk_new.pkl'
joblib.dump(model, model_path)

print(f'✅ Model saved to: {model_path}')

# Recap of the saved model and the test-set metrics computed earlier.
summary_lines = [
    f'  XGBoost version: {xgb.__version__}',
    f'  Features: {len(X.columns)}',
    f'  Classes: {model.classes_}',
    f'  Precision: {precision:.4f}',
    f'  Recall: {recall:.4f}',
    f'  F1-Score: {f1:.4f}',
    f'  ROC-AUC: {auc:.4f}',
]
print('\nModel info:')
for summary_line in summary_lines:
    print(summary_line)

## 7. Test Model Loading

In [None]:
# Read the serialised classifier back from model_path.
loaded_model = joblib.load(model_path)

# Smoke test: the reloaded model must be able to predict on a few test rows.
sample_rows = X_test[:5]
test_pred = loaded_model.predict(sample_rows)
print('Test predictions from loaded model:')
print(test_pred)

print('\n✅ Model successfully saved and loaded!')

## Summary\n\n**Model Performance:**\n- Precision: Measures how many predicted defaults are actual defaults\n- Recall: Measures how many actual defaults are correctly identified\n- F1-Score: Harmonic mean of precision and recall\n- ROC-AUC: Overall model discrimination ability\n\n**Next Steps:**\n1. Use the saved model (`xgboost_credit_risk_new.pkl`) in the HEXEval framework\n2. Run XAI evaluation (SHAP, LIME, Anchor, DiCE)\n3. Get persona-based recommendations\n\n**Model is compatible with:**\n- XGBoost 3.1.0\n- Python 3.10+ (minimum version supported by XGBoost 3.x)\n- HEXEval framework