In [None]:
''''''
# Pedestrian crash severity analysis using Stacking Ensemble Models
# Author:   Amir Rafe (amir.rafe@usu.edu)
# File:     StackingEnsembleAIM.ipynb
# Date:     Spring 2024
# Version:  1.02  
''''''

In [None]:
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , f1_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
import shap
import matplotlib.pyplot as plt

# Stacking Ensemble Models

In [None]:

# Load the dataset
Ped_df = pd.read_csv('df.csv')

# List of features to use for modeling
features = ['PersonType','Sex','AgeText','Aggressive','AlcoholSuspected','AlcResult',
                       'UrbanRural','FunctionClass','CommercialMotorVehInvolved','Light',
                       'Weather','RoadwaySurf','DisregardTrafficControl','DistractedDriving',
                       'DomesticAnimalRelated','DrowsyDriving','DrugsSuspected','OlderDriverInvolved',
                       'TeenageDriverInvolved','DUI','HeavyTruckInvolved','OverturnRollover','RightTurn',
                       'TransitVehicleInvolved','HolidayCrash','HolidayCrashYN','Intersection',
                       'LeftUTurnInvolved','VerticalAlignment','WorkZoneInvolved','WrongWayDriving']

# Handle any NaN values: every missing entry (in any column) is replaced by 0
Ped_df = Ped_df.fillna(0)
Ped_df['Severity'] = Ped_df['Severity'].astype(int)

# Split data into features (X) and label (y)
X = Ped_df[features].values
y = Ped_df['Severity'].values

# Hold out the test set BEFORE oversampling. SMOTE creates synthetic rows by
# interpolating between real neighbours, so resampling the full dataset first
# would let information from test rows leak into the training data and inflate
# every downstream metric. `stratify=y` keeps the severity mix of the test set
# representative of the real (imbalanced) data.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25, stratify=y)

# Oversample the minority classes in the training split only.
# NOTE(review): SMOTE needs more samples per class than its k_neighbors
# (default 5) in the training split -- confirm the rarest class satisfies this.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_raw, y_train_raw)

# Balanced training data used for tuning and model fitting; X_test / y_test
# remain untouched, real observations.
X_train, y_train = X_res, y_res

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [2, 4, 6],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Only the settings that differ from (or pin) library behaviour are passed.
# The previous call was a pasted model repr that included deprecated/removed
# arguments (gpu_id, use_label_encoder) and string-typed placeholders
# (interaction_constraints='', monotone_constraints='()'). The scikit-learn
# wrapper derives the number of classes from y, so num_class is not needed.
# Every hyperparameter listed in param_grid is overridden by the search.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    learning_rate=0.3,
    tree_method='exact',
    n_jobs=16,
    random_state=0)

# NOTE(review): the folds here are cut from already-oversampled data, so the
# cross-validated accuracy is optimistic; it is used only to rank parameter
# settings. The held-out X_test / y_test give the unbiased estimate.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best parameters found: ",grid_search.best_params_)


In [None]:

# Define the base models
# - multi_class='multinomial' is omitted: it is deprecated in recent
#   scikit-learn releases and is already what the 'saga' solver uses for
#   multi-class targets.
# - random_state is set on every stochastic estimator so a fresh
#   "Restart & Run All" reproduces the same scores.
# - The XGBoost hyperparameters below are hard-coded.
#   NOTE(review): presumably taken from a previous grid-search run -- confirm
#   they still match grid_search.best_params_.
level0 = [
    ('lr', LogisticRegression(solver='saga', max_iter=5000, random_state=42)),
    ('xgb', XGBClassifier(
        objective='multi:softmax',
        num_class=5,
        random_state=42,
        n_estimators=300,
        max_depth=6,
        min_child_weight=1,
        gamma=0.5,
        subsample=0.6,
        colsample_bytree=1.0)),
    ('et', ExtraTreesClassifier(random_state=42)),
]

# Define meta learner model
level1 = LogisticRegression(solver='saga', max_iter=5000, random_state=42)

# Define the stacking ensemble (base-model predictions for the meta learner
# are produced with 5-fold cross-validation)
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# Standardize using statistics learned from the training data only, then
# apply the same transform to the held-out test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the training split ONLY. Fitting on X_res / y_res (as before) trained
# the model on data that includes the test rows, so the reported accuracy and
# F1 were not a genuine held-out evaluation.
model.fit(X_train_scaled, y_train)

# Make predictions on the held-out test set
preds = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Evaluate the model using F1-score
f1 = f1_score(y_test, preds, average='weighted')
print("F1-score: %.2f" % f1)


# SHAP interpretation 

In [None]:
# Sample a background dataset from the training features. This subset is the
# reference point against which SHAP values are computed; 150 rows keeps
# KernelExplainer tractable.
background = shap.sample(X_train, 150)

# The stacking model was fitted on standardized features, so anything passed to
# model.predict_proba must go through the same fitted scaler. `background`
# itself is kept in original units so plots can colour points by the real
# feature values.
background_scaled = scaler.transform(background)

# Initialize a model-agnostic SHAP KernelExplainer. It needs a prediction
# function and a background dataset; `model.predict_proba` is used so that SHAP
# values are produced for the predicted probability of each severity class.
explainer = shap.KernelExplainer(model.predict_proba, background_scaled)

# Compute SHAP values for the 150 sampled rows (not the whole training set).
# KernelExplainer.shap_values documents nsamples / l1_reg / silent / gc_collect;
# it has no n_jobs option, so the former `n_jobs=-1` argument did not enable
# parallel execution and has been removed.
shap_values = explainer.shap_values(background_scaled)


In [None]:
#shap summary plot (feature importance)

# Display labels for the plot, derived from the modeling `features` list so the
# order can never drift from the columns the SHAP values were computed for.
# The previous hand-written list labelled the 'AlcoholSuspected' column as
# 'DrugResult' and the 'HolidayCrash' column as a second 'Aggressive'.
display_names = {
    'AgeText': 'Age',
    'CommercialMotorVehInvolved': 'CommercialVehInvolved',
    'Light': 'LightingCondition',
    'RightTurn': 'RightTurnInvolved',
}
features2 = [display_names.get(name, name) for name in features]

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class; newer releases return a single (n_samples, n_features, n_classes)
# array. Normalize to the per-class list form that the multi-class bar plot
# expects.
if isinstance(shap_values, list):
    shap_by_class = shap_values
else:
    shap_by_class = [shap_values[:, :, k] for k in range(shap_values.shape[-1])]

# Legend labels indexed by the integer severity code.
# NOTE(review): this code -> label mapping is taken as given; confirm it
# matches how 'Severity' is encoded in df.csv.
severity_labels = [
    'Possible injury',
    'Minor injury',
    'No injury/PDO',
    'Serious injury',
    'Fatal',
]

# The feature matrix must be the rows the SHAP values were computed for
# (`background`), not the full X_train.
shap.summary_plot(shap_by_class, background, feature_names=features2, plot_type='bar',
                  plot_size=(12, 12), class_names=severity_labels, show=False)

# Get the current figure and axes objects
fig, ax = plt.gcf(), plt.gca()

# Modifying main plot parameters
ax.tick_params(labelsize=16)
ax.set_xlabel("mean(|SHAP value|) (average impact on model output magnitude)", fontsize=16 ,  labelpad=15)

# Redraw the class legend with a larger font (a bar plot has no colorbar)
plt.legend(fontsize=14)
plt.tight_layout()
plt.savefig("shap_summary_Stacking.png",dpi=300)
plt.show()

In [None]:
#shap summary plot (dot plot)

# Select the SHAP values of class 0 (the output file name labels this class
# "Possible"). Older SHAP releases return a list with one array per class;
# newer releases return one (n_samples, n_features, n_classes) array, where
# `shap_values[0]` would pick the first SAMPLE instead of the first class.
if isinstance(shap_values, list):
    shap_class0 = shap_values[0]
else:
    shap_class0 = shap_values[:, :, 0]

# The dot plot colours each point by its feature value, so the feature matrix
# must have exactly the rows the SHAP values were computed for (`background`).
# Passing the full X_train fails SHAP's row-count check.
shap.summary_plot(shap_class0, background, feature_names=features2, plot_type='dot',
                  plot_size=(10, 10), show=False)

# Get the current figure and axes objects
fig, ax = plt.gcf(), plt.gca()

# Modifying main plot parameters
ax.tick_params(labelsize=16)
ax.set_xlabel("SHAP value (impact on model output)", fontsize=16  , labelpad=15)

# Get colorbar (second axes of the figure, added by the dot plot)
cb_ax = fig.axes[1]
cb_ax.tick_params(labelsize=16)
cb_ax.set_ylabel("Feature value", fontsize=16)
plt.tight_layout()
plt.savefig("shap_summary_Stacking_Possible.png", dpi=300)
plt.show()