In [3]:
"""
Solution to class assignment
trees_ensembles_econ.py

Teaching example for Random Forests and Gradient Boosting using scikit-learn.
Context: Master's students in economics learning applied ML.

We build a synthetic default-risk dataset with economic-style covariates,
fit RandomForest and GradientBoosting models, evaluate them, and look at
feature importances and simple hyperparameter tuning.
"""

import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def make_dataset(n_samples=2000, random_state=0):
    """
    Create a synthetic binary classification dataset dressed up as an
    economics application: predicting loan default.

    The data come from ``sklearn.datasets.make_classification``. The
    economic-sounding column names are labels only: with the generator's
    default ``shuffle=True`` the informative and pure-noise columns end up in
    arbitrary positions, so no name carries a built-in economic meaning
    (for example, "income" is NOT constructed to lower default risk).

    Parameters
    ----------
    n_samples : int, default=2000
        Number of observations (rows) to generate.
    random_state : int, default=0
        Seed forwarded to ``make_classification`` for reproducibility.

    Returns
    -------
    df : pandas.DataFrame
        One column per entry of ``feature_names`` plus a binary ``"default"``
        target column.
    feature_names : list of str
        Names of the feature columns, in column order.
    """
    # Economic-style labels for the generated columns (cosmetic only; see the
    # docstring). Defined first so the feature count below stays in sync.
    feature_names = [
        "income",
        "age",
        "balance_outstanding",
        "num_past_loans",
        "payment_history_score",
        "regional_risk_index",
    ]

    X, y = make_classification(
        n_samples=n_samples,
        n_features=len(feature_names),  # one generated column per name
        n_informative=4,                # remaining columns are pure noise
        n_redundant=0,
        n_repeated=0,
        n_clusters_per_class=2,
        weights=[0.7, 0.3],   # roughly 30% default rate (class 1)
        class_sep=1.2,
        random_state=random_state,
    )

    df = pd.DataFrame(X, columns=feature_names)
    df["default"] = y
    return df, feature_names


def train_random_forest(X_train, y_train, X_test, y_test, feature_names):
    """
    Fit a Random Forest classifier, print an evaluation report, and return
    the fitted model.

    Printed output: training and test accuracy, the test-set classification
    report and confusion matrix, and the forest's feature importances sorted
    from most to least important.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels.
    X_test, y_test : array-like
        Hold-out features and labels used for out-of-sample evaluation.
    feature_names : list of str
        One name per column of ``X_train``; used to label the importances.

    Returns
    -------
    RandomForestClassifier
        The fitted forest, so callers can reuse it (e.g. for
        ``predict_proba``) instead of refitting.
    """
    print("\n" + "=" * 70)
    print("RANDOM FOREST CLASSIFIER")
    print("=" * 70)

    # A reasonably standard RF setup
    rf = RandomForestClassifier(
        n_estimators=200,      # number of trees
        max_depth=None,        # let trees grow deep
        max_features="sqrt",   # random subset of features at each split
        min_samples_leaf=5,    # regularization
        random_state=0,
        n_jobs=-1,
    )

    rf.fit(X_train, y_train)

    # Evaluate in-sample and out-of-sample; the gap hints at overfitting
    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)

    print("\nTraining accuracy:  {:.3f}".format(accuracy_score(y_train, y_pred_train)))
    print("Test accuracy:       {:.3f}".format(accuracy_score(y_test, y_pred_test)))

    # target_names are matched to the sorted labels, i.e. this assumes
    # 0 = no default and 1 = default.
    print("\nClassification report (test set):")
    print(classification_report(y_test, y_pred_test, target_names=["No default", "Default"]))

    print("Confusion matrix (test set):")
    print(confusion_matrix(y_test, y_pred_test))

    # Feature importances, largest first
    importances = rf.feature_importances_
    importance_df = pd.DataFrame(
        {"feature": feature_names, "importance": importances}
    ).sort_values("importance", ascending=False)

    print("\nRandom Forest feature importances (higher = more important):")
    print(importance_df.to_string(index=False))

    return rf


def train_gradient_boosting(X_train, y_train, X_test, y_test, feature_names):
    """
    Fit a Gradient Boosting classifier, print an evaluation report, and
    return the fitted model.

    Printed output: training and test accuracy, the test-set classification
    report and confusion matrix, and the model's feature importances sorted
    from most to least important.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels.
    X_test, y_test : array-like
        Hold-out features and labels used for out-of-sample evaluation.
    feature_names : list of str
        One name per column of ``X_train``; used to label the importances.

    Returns
    -------
    GradientBoostingClassifier
        The fitted model, so callers can reuse it instead of refitting.
    """
    print("\n" + "=" * 70)
    print("GRADIENT BOOSTING CLASSIFIER")
    print("=" * 70)

    # A simple GBM; learning_rate and n_estimators are the key knobs
    gb = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,     # depth of individual trees (weak learners)
        subsample=0.8,   # stochastic gradient boosting
        random_state=0,
    )

    gb.fit(X_train, y_train)

    # Evaluate in-sample and out-of-sample; the gap hints at overfitting
    y_pred_train = gb.predict(X_train)
    y_pred_test = gb.predict(X_test)

    print("\nTraining accuracy:  {:.3f}".format(accuracy_score(y_train, y_pred_train)))
    print("Test accuracy:       {:.3f}".format(accuracy_score(y_test, y_pred_test)))

    # target_names are matched to the sorted labels, i.e. this assumes
    # 0 = no default and 1 = default.
    print("\nClassification report (test set):")
    print(classification_report(y_test, y_pred_test, target_names=["No default", "Default"]))

    print("Confusion matrix (test set):")
    print(confusion_matrix(y_test, y_pred_test))

    # Feature importances, largest first
    importances = gb.feature_importances_
    importance_df = pd.DataFrame(
        {"feature": feature_names, "importance": importances}
    ).sort_values("importance", ascending=False)

    print("\nGradient Boosting feature importances (higher = more important):")
    print(importance_df.to_string(index=False))

    return gb


def random_forest_with_cv(X_train, y_train):
    """
    Simple example of using cross-validation to choose RF hyperparameters.
    This is just for teaching; the grid is tiny (2 x 3 x 2 = 12 candidates,
    each scored with 5-fold cross-validation on accuracy).

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels. Only this split is used, so the test
        set stays untouched for the final evaluation.

    Returns
    -------
    RandomForestClassifier
        The best configuration found, as exposed by
        ``GridSearchCV.best_estimator_``.
    """
    print("\n" + "=" * 70)
    print("RANDOM FOREST WITH GRID SEARCH (CROSS-VALIDATION)")
    print("=" * 70)

    # Parallelise at one level only: the grid search already fans the
    # (candidate, fold) fits out over all cores, so each forest is fitted
    # single-job. Nesting n_jobs=-1 inside n_jobs=-1 can oversubscribe CPUs.
    # The fitted trees do not depend on n_jobs because random_state is fixed.
    rf = RandomForestClassifier(random_state=0, n_jobs=1)

    param_grid = {
        "n_estimators": [100, 200],
        "max_depth": [None, 5, 10],
        "max_features": ["sqrt", 0.7],
    }

    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,              # 5-fold cross-validation
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )

    grid_search.fit(X_train, y_train)

    print("\nBest parameters from GridSearchCV:")
    print(grid_search.best_params_)

    print("\nBest cross-validated accuracy: {:.3f}".format(grid_search.best_score_))

    # Hand the model back configured for parallel prediction, as before;
    # set_params on a fitted estimator does not discard the fit.
    best_rf = grid_search.best_estimator_
    best_rf.set_params(n_jobs=-1)
    return best_rf



def gradient_boosting_with_cv(X_train, y_train):
    """
    Tune a Gradient Boosting classifier with a small cross-validated grid.

    The grid covers the four key knobs (n_estimators, learning_rate,
    max_depth, subsample) with two values each, scored by accuracy under
    5-fold cross-validation. It is deliberately small for teaching purposes.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels; the test split is never seen here.

    Returns
    -------
    GradientBoostingClassifier
        The best configuration found (``GridSearchCV.best_estimator_``).
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print("GRADIENT BOOSTING WITH GRID SEARCH (CROSS-VALIDATION)")
    print(rule)

    # Candidate values for each hyperparameter
    search_space = dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1],
        max_depth=[2, 3],
        subsample=[0.8, 1.0],
    )

    tuner = GridSearchCV(
        GradientBoostingClassifier(random_state=0),
        search_space,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    tuner.fit(X_train, y_train)

    # Header and parameter dict on separate lines
    print("\nBest parameters from GB GridSearchCV:", tuner.best_params_, sep="\n")
    print(f"Best cross-validated accuracy: {tuner.best_score_:.3f}")

    return tuner.best_estimator_



def main():
    """
    Run the whole teaching workflow end to end.

    Builds the synthetic dataset, makes a stratified 70/30 train/test split,
    fits and reports the baseline Random Forest and Gradient Boosting
    models, then tunes each with grid search on the training split and
    reports the tuned model's accuracy and confusion matrix on the test set.
    """
    # 1. Create dataset and separate features from the target
    data, feature_names = make_dataset()
    features = data[feature_names].to_numpy()
    target = data["default"].to_numpy()

    # 2. Train-test split (hold-out set mimics out-of-sample prediction);
    #    stratify keeps the default rate equal in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=0, stratify=target
    )

    print("Training observations:", len(X_train))
    print("Test observations:     ", len(X_test))

    # 3. Fit and evaluate the baseline models: Random Forest, then
    #    Gradient Boosting
    for fit_and_report in (train_random_forest, train_gradient_boosting):
        fit_and_report(X_train, y_train, X_test, y_test, feature_names)

    # 4. Select hyperparameters by cross-validation on the training split,
    #    then evaluate each tuned model on the test set (same order as above)
    tuning_steps = (
        ("Random Forest", random_forest_with_cv),
        ("Gradient Boosting", gradient_boosting_with_cv),
    )
    for label, tune in tuning_steps:
        tuned_model = tune(X_train, y_train)
        tuned_pred = tuned_model.predict(X_test)
        print(f"\nPerformance of tuned {label} on test set:")
        print(f"Test accuracy: {accuracy_score(y_test, tuned_pred):.3f}")
        print("Confusion matrix:")
        print(confusion_matrix(y_test, tuned_pred))


# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Training observations: 1400
Test observations:      600

RANDOM FOREST CLASSIFIER

Training accuracy:  0.969
Test accuracy:       0.917

Classification report (test set):
              precision    recall  f1-score   support

  No default       0.93      0.96      0.94       418
     Default       0.89      0.82      0.86       182

    accuracy                           0.92       600
   macro avg       0.91      0.89      0.90       600
weighted avg       0.92      0.92      0.92       600

Confusion matrix (test set):
[[400  18]
 [ 32 150]]

Random Forest feature importances (higher = more important):
              feature  importance
       num_past_loans    0.260945
  balance_outstanding    0.238456
payment_history_score    0.230637
               income    0.205016
                  age    0.033159
  regional_risk_index    0.031788

GRADIENT BOOSTING CLASSIFIER

Training accuracy:  0.966
Test accuracy:       0.918

Classification report (test set):
              precision    reca