# Model Training & Evaluation

## Importing key functions

In [None]:
from scripts import (
    load_and_split_data, 
    build_glm_pipeline,
    tune_glm_pipeline,
    build_lgbm_pipeline,
    tune_lgbm_pipeline,
    pipe_preprocessing,
    evaluate_model,
    plot_predicted_vs_actual_bar,
    feature_analysis)

## Initial stage (definitions and pipeline pre-processing)

In [None]:
# Name of the column the models predict
target_column = "Accident_Severity"

# Columns handed to the preprocessor as numerical inputs
numerical_features = [
    "Speed_limit",
    "Number_of_Casualties",
    "Number_of_Vehicles",
]

# Columns handed to the preprocessor as categorical inputs
categorical_features = [
    "Did_Police_Officer_Attend_Scene_of_Accident",
    "Junction_Control",
    "Junction_Detail",
    "Light_Conditions",
    "Pedestrian_Crossing-Human_Control",
    "Pedestrian_Crossing-Physical_Facilities",
    "Road_Type",
    "Urban_or_Rural_Area",
    "Weather_Conditions",
    "Road_Surface_Conditions",
    "Month",
    "Day_of_Week",
    "Hour_of_Day",
]

In [None]:
# Load the dataset via the project helper and unpack its two return values
# as the training and test partitions.
# NOTE(review): the split ratio/seed are decided inside load_and_split_data — confirm there.
train_data, test_data = load_and_split_data()

In [None]:
# Columns removed from the model inputs: the target itself and "Accident_Index"
non_feature_columns = [target_column, "Accident_Index"]

# Training partition -> target series and feature frame
y_train = train_data[target_column]
X_train = train_data.drop(columns=non_feature_columns)

# Test partition -> target series and feature frame
y_test = test_data[target_column]
X_test = test_data.drop(columns=non_feature_columns)

In [None]:
# Build the shared preprocessing step from the two feature lists defined above;
# the same object is passed to both the GLM and the LGBM pipeline builders.
preprocessor = pipe_preprocessing(
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

## GLM

In [None]:
# Build the baseline GLM pipeline from the train/test data and the shared
# preprocessor. The helper returns the pipeline together with a second value,
# kept here as glm_accuracy (presumably the test accuracy — confirm in scripts).
glm_pipeline, glm_accuracy = build_glm_pipeline(
    train_data=train_data,
    test_data=test_data,
    target_column=target_column,
    preprocessor=preprocessor,
)

In [None]:
# Print the GLM pipeline object returned by build_glm_pipeline so its steps can be inspected
print(glm_pipeline)

In [None]:
# Tune the baseline GLM pipeline on the training data. The helper returns three
# values, unpacked as the tuned pipeline, its parameters and a score.
# NOTE(review): the search strategy and scoring metric live in tune_glm_pipeline — confirm there.
best_glm_pipeline, best_params, best_score = tune_glm_pipeline(glm_pipeline, train_data)

In [None]:
# Evaluate the tuned GLM on the held-out test data; the result is kept so the
# next cell can plot it (presumably actual-vs-predicted values — confirm in scripts).
glm_ap = evaluate_model(best_glm_pipeline, test_data=test_data)

In [None]:
# Bar plot of the tuned GLM's evaluation output (predicted vs. actual, per the helper's name)
plot_predicted_vs_actual_bar(glm_ap)

Based on the confusion matrix and bar plot, our tuned GLM overestimates the prevalence of 'Slight' accidents and underestimates the prevalence of 'Serious' and 'Fatal' accidents.

In [None]:
# Feature analysis of the tuned GLM on the training features/target,
# limited to the 5 highest-ranked features.
feature_analysis(
    best_glm_pipeline,
    X_train=X_train,
    y_train=y_train,
    top_n=5,
)

Within our tuned GLM, the top 5 features are `Month`, `Pedestrian_Crossing-Human_Control`, `Light_Conditions`, `Day_of_Week`, and `Road_Surface_Conditions`.

## LGBM

In [None]:
# Build the baseline LGBM pipeline with the same data, target and preprocessor
# as the GLM. The helper returns the pipeline together with a second value,
# kept here as lgbm_accuracy (presumably the test accuracy — confirm in scripts).
lgbm_pipeline, lgbm_accuracy = build_lgbm_pipeline(
    train_data=train_data,
    test_data=test_data,
    target_column=target_column,
    preprocessor=preprocessor,
)

In [None]:
# Tune the baseline LGBM pipeline on the training data; two values are unpacked
# here (tuned pipeline and its parameters), whereas the GLM tuning cell unpacks three.
# NOTE(review): this rebinds best_params, overwriting the value assigned by the
# GLM tuning cell — use a model-specific name if both parameter sets are needed.
best_lgbm_pipeline, best_params = tune_lgbm_pipeline(lgbm_pipeline, train_data)

In [None]:
# Evaluate the tuned LGBM on the held-out test data; the result is kept so the
# next cell can plot it (presumably actual-vs-predicted values — confirm in scripts).
lgbm_ap = evaluate_model(best_lgbm_pipeline, test_data=test_data)

In [None]:
# Bar plot of the tuned LGBM's evaluation output (predicted vs. actual, per the helper's name)
plot_predicted_vs_actual_bar(lgbm_ap)

In [None]:
# Feature analysis of the tuned LGBM on the training features/target,
# limited to the 5 highest-ranked features.
feature_analysis(
    best_lgbm_pipeline,
    X_train=X_train,
    y_train=y_train,
    top_n=5,
)

The top 5 features of the LGBM are similar to those of the GLM, except that `Pedestrian_Crossing-Physical_Facilities` replaces `Light_Conditions`.

## PDP

In [None]:
import dalex as dx
import matplotlib.pyplot as plt

# Partial dependence profiles for the tuned LGBM pipeline.
# Relies on best_lgbm_pipeline, X_train and y_train created in the cells above.
# The variables are named after the model actually explained (LGBM); the earlier
# "glm" names and comment did not match the pipeline passed to the explainer.

# Wrap the tuned pipeline and the training data in a dalex Explainer
explainer_lgbm = dx.Explainer(best_lgbm_pipeline, X_train, y_train, label="Tuned Pipeline")

# Compute the model profile with dalex's defaults and plot it
pdp_lgbm = explainer_lgbm.model_profile()
pdp_lgbm.plot()
