## Tree-based and ensemble models

Code from the slides in an executable notebook.

## Example: the heart data set

In [None]:
import pandas as pd
# Load the heart data set; the first CSV column is used as the row index
# rather than being read as a feature.
heart = pd.read_csv("data/Heart.csv", index_col=0)
# Summarise column dtypes and non-null counts (shows which columns have gaps).
heart.info()

In [None]:
# Preview the first few rows of the raw data.
heart.head()

## Do we have a class imbalance?

In [None]:
# Proportion (not count) of each class of the target column `AHD`.
heart['AHD'].value_counts(normalize=True)

## Data splitting

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# No `random_state` is passed to `train_test_split` below, so the split is
# made reproducible by seeding NumPy's global random number generator here.
np.random.seed(2024)

# 80/20 split, stratified on the target so both parts keep the class balance.
heart_train, heart_test = train_test_split(
    heart,
    train_size=0.8,
    stratify=heart["AHD"],
)

# Separate the features (everything except `AHD`) from the target (`AHD`).
X_train, y_train = heart_train.drop(columns=['AHD']), heart_train['AHD']
X_test, y_test = heart_test.drop(columns=['AHD']), heart_test['AHD']

## One hot encoding & pre-processing

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector

# Columns grouped by the treatment they receive in the column transformer.
numeric_feats = [
    'Age', 'RestBP', 'Chol', 'RestECG',
    'MaxHR', 'Oldpeak', 'Slope', 'Ca',
]
passthrough_feats = ['Sex', 'Fbs', 'ExAng']
categorical_feats = ['ChestPain', 'Thal']

# The order of the entries below fixes the order of the output columns:
# standardised numeric columns, then untouched passthrough columns, then the
# one-hot encoded categorical columns.
heart_preprocessor = make_column_transformer(
    (StandardScaler(), numeric_feats),
    ("passthrough", passthrough_feats),
    # Categories unseen during fit are ignored instead of raising an error.
    (OneHotEncoder(handle_unknown="ignore"), categorical_feats),
)

## Fitting a dummy classifier

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

# Baseline: a classifier with default settings that ignores the features,
# wrapped in the same preprocessing pipeline as the real models.
dummy = DummyClassifier()
dummy_pipeline = make_pipeline(heart_preprocessor, dummy)

# 10-fold cross-validation on the training split; one row per fold.
cv_10_dummy = pd.DataFrame(
    cross_validate(estimator=dummy_pipeline, X=X_train, y=y_train, cv=10)
)

# Mean and standard error of every column across the folds.
cv_10_dummy_metrics = cv_10_dummy.agg(["mean", "sem"])

# Start the running comparison table with the baseline's test score.
results = pd.DataFrame(
    {
        'mean': [cv_10_dummy_metrics.loc["mean", "test_score"]],
        'sem': [cv_10_dummy_metrics.loc["sem", "test_score"]],
    },
    index=['Dummy classifier'],
)
results

## Fitting a decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A single decision tree with default hyperparameters (seeded so the fit is
# repeatable), wrapped in the shared preprocessing pipeline.
decision_tree = DecisionTreeClassifier(random_state=2026)

dt_pipeline = make_pipeline(heart_preprocessor, decision_tree)

# 10-fold cross-validation on the training split; one row per fold.
cv_10_dt = pd.DataFrame(
    cross_validate(
        estimator=dt_pipeline,
        cv=10,
        X=X_train,
        y=y_train
    )
)

# Mean and standard error of every column across the folds.
cv_10_dt_metrics = cv_10_dt.agg(["mean", "sem"])
results_dt = pd.DataFrame(
    {
        'mean': [cv_10_dt_metrics.loc["mean", "test_score"]],
        'sem': [cv_10_dt_metrics.loc["sem", "test_score"]],
    },
    index=['Decision tree'],
)

# Remove any row left behind by a previous run of this cell before appending,
# so re-running the cell does not add a duplicate 'Decision tree' row.
results = pd.concat(
    [results.drop(index=results_dt.index, errors="ignore"), results_dt]
)
results

## Random forest in `scikit-learn` & missing values

How many rows have missing observations:

In [None]:
# Flag each row that has at least one missing value, then count those rows.
heart.isna().any(axis=1).sum()


Drop rows with missing observations:

In [None]:
# Complete-case training data: every row with a missing value is discarded.
heart_train_drop_na = heart_train.dropna()

# Features (all columns except `AHD`) and target (`AHD`) for the reduced set.
X_train_drop_na, y_train_drop_na = (
    heart_train_drop_na.drop(columns=['AHD']),
    heart_train_drop_na['AHD'],
)

## Random forest in `scikit-learn`

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters (seeded so the fit is
# repeatable), wrapped in the shared preprocessing pipeline.
random_forest = RandomForestClassifier(random_state=2026)
rf_pipeline = make_pipeline(heart_preprocessor, random_forest)

# 10-fold cross-validation on the complete-case training data (rows with
# missing values removed); one row per fold.
cv_10_rf = pd.DataFrame(
    cross_validate(
        estimator=rf_pipeline,
        cv=10,
        X=X_train_drop_na,
        y=y_train_drop_na
    )
)

# Mean and standard error of every column across the folds.
cv_10_rf_metrics = cv_10_rf.agg(["mean", "sem"])
results_rf = pd.DataFrame(
    {
        'mean': [cv_10_rf_metrics.loc["mean", "test_score"]],
        'sem': [cv_10_rf_metrics.loc["sem", "test_score"]],
    },
    index=['Random forest'],
)

# Remove any row left behind by a previous run of this cell before appending,
# so re-running the cell does not add a duplicate 'Random forest' row.
results = pd.concat(
    [results.drop(index=results_rf.index, errors="ignore"), results_rf]
)
results

## Tuning random forest in `scikit-learn`

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid; keys are `<pipeline step name>__<parameter>`.
rf_param_grid = {'randomforestclassifier__n_estimators': [200],
              'randomforestclassifier__max_depth': [1, 3, 5, 7, 9],
              'randomforestclassifier__max_features': [1, 2, 3, 4, 5, 6, 7]}

# Exhaustive search over the grid, scoring each candidate by 10-fold CV.
rf_tune_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=10,
    n_jobs=-1 # tells computer to use all available CPUs
)
rf_tune_grid.fit(
    X_train_drop_na,
    y_train_drop_na
)

# One row per candidate parameter combination.
cv_10_rf_tuned_metrics = pd.DataFrame(rf_tune_grid.cv_results_)

# The standard error must come from the same candidate as `best_score_`, so
# the row is looked up with `best_index_` rather than a fixed position. The
# fold count is read from the fitted search instead of being repeated here.
# NOTE(review): `std_test_score` appears to be a population std (ddof=0),
# whereas pandas `.sem()` used for the untuned models uses ddof=1 -- confirm
# whether the two SEM columns need to be strictly comparable.
results_rf_tuned = pd.DataFrame(
    {
        'mean': [rf_tune_grid.best_score_],
        'sem': [
            cv_10_rf_tuned_metrics['std_test_score'].iloc[rf_tune_grid.best_index_]
            / np.sqrt(rf_tune_grid.n_splits_)
        ],
    },
    index=['Random forest tuned'],
)

# Remove any row left behind by a previous run of this cell before appending,
# so re-running the cell does not add a duplicate row.
results = pd.concat(
    [results.drop(index=results_rf_tuned.index, errors="ignore"), results_rf_tuned]
)

In [None]:
# Display the comparison table accumulated so far.
results

## Tuning `GradientBoostingClassifier` with `scikit-learn`

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier (seeded so the fit is repeatable), wrapped in
# the shared preprocessing pipeline.
gradient_boosted_classifier = GradientBoostingClassifier(random_state=2026)
gb_pipeline = make_pipeline(heart_preprocessor, gradient_boosted_classifier)

# Hyperparameter grid; keys are `<pipeline step name>__<parameter>`.
gb_param_grid = {'gradientboostingclassifier__n_estimators': [200],
              'gradientboostingclassifier__max_depth': [1, 3, 5, 7, 9],
              'gradientboostingclassifier__learning_rate': [0.001, 0.005, 0.01]}

# Exhaustive search over the grid, scoring each candidate by 10-fold CV.
gb_tune_grid = GridSearchCV(
    estimator=gb_pipeline,
    param_grid=gb_param_grid,
    cv=10,
    n_jobs=-1 # tells computer to use all available CPUs
)
gb_tune_grid.fit(
    X_train_drop_na,
    y_train_drop_na
)

# One row per candidate parameter combination.
cv_10_gb_tuned_metrics = pd.DataFrame(gb_tune_grid.cv_results_)

# The standard error must come from the same candidate as `best_score_`, so
# the row is looked up with `best_index_` rather than a fixed position. The
# fold count is read from the fitted search instead of being repeated here.
# NOTE(review): `std_test_score` appears to be a population std (ddof=0),
# whereas pandas `.sem()` used for the untuned models uses ddof=1 -- confirm
# whether the two SEM columns need to be strictly comparable.
results_gb_tuned = pd.DataFrame(
    {
        'mean': [gb_tune_grid.best_score_],
        'sem': [
            cv_10_gb_tuned_metrics['std_test_score'].iloc[gb_tune_grid.best_index_]
            / np.sqrt(gb_tune_grid.n_splits_)
        ],
    },
    index=['Gradient boosted classifier tuned'],
)

# Remove any row left behind by a previous run of this cell before appending,
# so re-running the cell does not add a duplicate row.
results = pd.concat(
    [results.drop(index=results_gb_tuned.index, errors="ignore"), results_gb_tuned]
)

In [None]:
# Display the comparison table accumulated so far.
results

## Precision and recall on the tuned random forest model

In [None]:
from sklearn.metrics import make_scorer, precision_score, recall_score

# Three metrics are tracked for every candidate; precision and recall treat
# the label 'Yes' as the positive class.
scoring = dict(
    accuracy='accuracy',
    precision=make_scorer(precision_score, pos_label='Yes'),
    recall=make_scorer(recall_score, pos_label='Yes'),
)

# Same pipeline and grid as the earlier random forest search, now scored on
# all three metrics. `refit='accuracy'` names the metric used to pick the
# best candidate, which is then refit.
rf_tune_grid = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    scoring=scoring,
    refit='accuracy',
    cv=10,
    n_jobs=-1,
)

rf_tune_grid.fit(X_train_drop_na, y_train_drop_na)

In [None]:
# One row per candidate; only the row at `best_index_` is reported below.
cv_results = pd.DataFrame(rf_tune_grid.cv_results_)

# Precision: cross-validated mean, and fold std scaled by sqrt(10 folds).
mean_precision = cv_results['mean_test_precision'].iat[rf_tune_grid.best_index_]
sem_precision = (
    cv_results['std_test_precision'].iat[rf_tune_grid.best_index_] / np.sqrt(10)
)

# Recall: same two summaries.
mean_recall = cv_results['mean_test_recall'].iat[rf_tune_grid.best_index_]
sem_recall = (
    cv_results['std_test_recall'].iat[rf_tune_grid.best_index_] / np.sqrt(10)
)

# Assemble one row per metric; accuracy's mean is the search's `best_score_`.
results_rf_tuned = pd.DataFrame(
    [
        [
            rf_tune_grid.best_score_,
            cv_results['std_test_accuracy'].iat[rf_tune_grid.best_index_] / np.sqrt(10),
        ],
        [mean_precision, sem_precision],
        [mean_recall, sem_recall],
    ],
    index=['accuracy', 'precision', 'recall'],
    columns=['mean', 'sem'],
)

results_rf_tuned

## Feature importances in `scikit-learn`

In [None]:
# Access the best pipeline
# The refit pipeline for the winning hyperparameters, and the forest inside it.
best_pipeline = rf_tune_grid.best_estimator_
best_rf = best_pipeline.named_steps['randomforestclassifier']

# Rebuild the post-preprocessing column names in the order the column
# transformer emits them: numeric, passthrough, then one-hot encoded columns.
numeric_features = numeric_feats
passthrough_features = passthrough_feats
# Entry [2] of the fitted transformer list is used as the one-hot encoder;
# this relies on it being the third transformer declared in the preprocessor.
categorical_feature_names = (
    best_pipeline
    .named_steps['columntransformer']
    .transformers_[2][1]
    .get_feature_names_out(categorical_feats)
)
feature_names = np.concatenate(
    [numeric_features, passthrough_features, categorical_feature_names]
)

# Importance scores reported by the fitted forest, one per transformed column.
feature_importances = best_rf.feature_importances_

# Pair names with scores and order from most to least important.
importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

## Visualizing the results

In [None]:
import altair as alt

# Horizontal bar chart of the feature importances: importance on the x axis,
# one bar per feature on the y axis.
bar_chart = alt.Chart(importances_df).mark_bar().encode(
    x=alt.X('Importance:Q', title='Feature Importance'),
    # sort='-x' orders the bars by descending x value (importance).
    y=alt.Y('Feature:N', sort='-x', title='Feature'),
    tooltip=['Feature', 'Importance']
).properties(
    title='Feature Importances from Random Forest Model',
    width=600,
    height=400
)
# Last expression of the cell, so the chart is rendered as the cell output.
bar_chart

## Evaluating on the test set

In [None]:
# Complete-case test data: the tuned models were fit on rows without missing
# values, so the test rows with missing values are removed in the same way.
heart_test_drop_na = heart_test.dropna()
X_test_drop_na = heart_test_drop_na.drop(columns=['AHD'])
y_test_drop_na = heart_test_drop_na['AHD']

# Attach the predictions with `.assign`, which returns a new frame, instead of
# setting a column on the result of `dropna()`; on pandas versions before 3.0
# the direct assignment can raise a SettingWithCopyWarning.
heart_test_drop_na = heart_test_drop_na.assign(
    predicted=rf_tune_grid.predict(X_test_drop_na)
)

Accuracy

In [None]:
# Score the fitted search on the complete-case test rows.
# NOTE(review): with the multi-metric search configured with refit='accuracy',
# this is expected to report accuracy -- confirm if that configuration changes.
rf_tune_grid.score(
    X_test_drop_na,
    y_test_drop_na
)

Precision

In [None]:
# Test-set precision, treating the label 'Yes' as the positive class.
precision_score(
    y_true=heart_test_drop_na["AHD"],
    y_pred=heart_test_drop_na["predicted"],
    pos_label='Yes'
)

Recall

In [None]:
# Test-set recall, treating the label 'Yes' as the positive class.
recall_score(
    y_true=heart_test_drop_na["AHD"],
    y_pred=heart_test_drop_na["predicted"],
    pos_label='Yes'
)

Confusion matrix

In [None]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
conf_matrix = pd.crosstab(
    heart_test_drop_na["AHD"],
    heart_test_drop_na["predicted"]
)
# Leave the frame as the cell's last expression so the notebook renders it as
# a table rather than the plain-text output of print().
conf_matrix

## ReferencesGareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani and Jonathan Taylor. An Introduction to Statistical Learning with Applications in Python. Springer, 1st edition, 2023. URL: https://www.statlearning.com/.Kolhatkar, V., and Ostblom, J. (2024). UBC DSCI 573: Feature and Model Selection course notes. URL: https://ubc-mds.github.io/DSCI_573_feat-model-selectPedregosa, F. et al., 2011. Scikit-learn: Machine learning in Python. Journal of machine learning research, 12(Oct), pp.2825–2830.