In [69]:
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer #, IterativeImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MaxAbsScaler

from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn import set_config
# Show estimators and pipelines as diagrams (instead of plain-text reprs) in notebook output.
set_config(display ="diagram")

import matplotlib.pyplot as plt
%matplotlib inline

In [131]:

# Load the processed crash records from the gzip-compressed CSV.
# low_memory=False turns off chunked parsing so column dtypes are inferred
# from the whole file rather than piece by piece.
full_df = pd.read_csv(
    '../data/processed/crashes.gz',
    compression='gzip',
    low_memory=False,
)

In [None]:
# Keep the target ('GUILTY') plus the candidate feature columns; the modelling
# cells below narrow this set further.
selected_df = full_df[[
    'GUILTY',
    'DEVICE_CONDITION',
    'FIRST_CRASH_TYPE',
    'FIRST_CONTACT_POINT',
    'TRAFFICWAY_TYPE',
    'ROADWAY_SURFACE_COND',
    'VEHICLE_DEFECT',
    'DRIVERS_LICENSE_CLASS',
    'AGE_GROUP',
    'VEHICLE_AGE',
    'AIRBAG_DEPLOYED',
    'PHYSICAL_CONDITION',
    'MANEUVER',
    'DRIVER_VISION',
    'ALIGNMENT',
    'TRAFFIC_CONTROL_DEVICE',
    'NUM_PASSENGERS',
    'SUN_GLARE',
    'VEHICLE_YEAR',
    'AGE',
]]



In [140]:
# Draw a reproducible random subset of the rows (fixed random_state).
# The requested size is capped at the number of available rows: DataFrame.sample
# draws without replacement by default and raises ValueError when asked for more
# rows than the frame holds. Set SAMPLE_SIZE >= len(selected_df) to use every row.
SAMPLE_SIZE = 100000
sample_df = selected_df.sample(min(SAMPLE_SIZE, len(selected_df)), random_state=100)

# 'GUILTY' is the target; every other selected column is a candidate feature.
y = sample_df['GUILTY']
x = sample_df.drop(columns=['GUILTY'])

# 75/25 train/test split, stratified on the target so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=100, stratify=y
)


In [141]:
# Numeric feature list. It has to be defined here: the column selection below and
# the pipeline-building cell both reference `numeric_columns`, so leaving this
# line commented out raises NameError when the notebook is run on a fresh kernel.
numeric_columns = ['NUM_PASSENGERS']

# Categorical features that are one-hot encoded by the categorical pipeline.
nominal_columns = [
    'FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT',
    'ROADWAY_SURFACE_COND', 'VEHICLE_DEFECT', 'AIRBAG_DEPLOYED',
    'DRIVERS_LICENSE_CLASS', 'VEHICLE_AGE', 'AGE_GROUP',
    'PHYSICAL_CONDITION', 'MANEUVER', 'TRAFFIC_CONTROL_DEVICE',
]

# Candidate columns currently left out of the feature set:
# 'TRAFFICWAY_TYPE', 'SUN_GLARE', 'ALIGNMENT', 'DEVICE_CONDITION', 'DRIVER_VISION'

# Restrict both splits to the chosen features, numeric columns first.
feature_columns = numeric_columns + nominal_columns
X_train = X_train[feature_columns]
X_test = X_test[feature_columns]



In [142]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed set of columns out of a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the selector itself so it can be chained."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` as a NumPy array."""
        subset = X[self.attribute_names]
        return subset.values
    

# Numeric branch: pick the numeric columns and replace missing values with 0.
# (A StandardScaler step was tried here and is currently disabled.)
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(numeric_columns)),
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
])

# Categorical branch: pick the nominal columns and one-hot encode them into a
# dense array; categories unseen during fit are ignored at transform time.
# (A constant 'NA' imputer was tried before the encoder and is currently disabled.)
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and removed in 1.4 -- switch when the environment is upgraded.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(nominal_columns)),
    ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Only the categorical branch is combined; num_pipeline is defined above but is
# left out of the union.
fu = FeatureUnion(transformer_list=[
    ('cat_pipeline', cat_pipeline),
])

# Fit the union on the training features and keep the encoded matrix.
X_train_transformed = fu.fit_transform(X_train[numeric_columns + nominal_columns])
#X_train.info(verbose=True, show_counts=True)


In [148]:
# Wrap the encoded training matrix in a DataFrame with readable column labels
# instead of anonymous integer positions.
# The union currently holds only the categorical branch, so its output columns
# are exactly the one-hot encoder's output features, in order. Look the encoder
# up by step name rather than by position so this keeps working if the union
# is reordered.
# NOTE: if the numeric branch is re-enabled in the union, prepend numeric_columns
# to the labels below.
fitted_encoder = dict(fu.transformer_list)['cat_pipeline'].named_steps['cat_encoder']
encoded_columns = fitted_encoder.get_feature_names_out(nominal_columns)
transformed_df = pd.DataFrame(X_train_transformed, columns=encoded_columns)


In [214]:
# Full modelling pipeline: feature union (one-hot encoded categoricals) followed
# by a random forest.
# random_state is fixed so that repeated runs build the same forest; it matches
# the seed already used for sampling and for the train/test split.
# Earlier experiments used a depth-limited DecisionTreeClassifier, or a
# BaggingClassifier of such trees, as the 'model' step instead.
random_forest_pipe = Pipeline([
    ('fu', fu),
    ('model', RandomForestClassifier(n_estimators=100, random_state=100)),
])


In [215]:
#bagged_tree_pipe.fit(X_train, y_train)
#y_pred_test = bagged_tree_pipe.predict(X_test)
#y_pred_train = bagged_tree_pipe.predict(X_train)

#print(classification_report(y_train, y_pred_train))
#print('------------')
#print(classification_report(y_test, y_pred_test))


In [216]:
# Hyper-parameter grid for the pipeline's 'model' step:
# 4 depths x 3 leaf sizes x 4 split sizes = 48 combinations.
pipe_grid = {
    'model__max_depth': [5, 7, 9, 13],
    'model__min_samples_leaf': [3, 5, 8],
    'model__min_samples_split': [4, 6, 8, 12],
}

# Exhaustive search over the grid using 2-fold cross-validation.
gs_pipe = GridSearchCV(random_forest_pipe, pipe_grid, cv=2)

# Show the configured (not yet fitted) search object.
gs_pipe

In [217]:
# Run the grid search: the pipeline is fitted for every combination in pipe_grid
# under 2-fold CV; with GridSearchCV's default refit behaviour the best
# combination is then refitted on the whole training set.
gs_pipe.fit(X_train, y_train)

In [218]:
# Predict labels for the training and the test split (in that order) with the
# fitted grid search.
y_pred_train, y_pred_test = (
    gs_pipe.predict(features) for features in (X_train, X_test)
)

In [219]:
# Parameter combination selected by the grid search.
gs_pipe.best_params_

{'model__max_depth': 13,
 'model__min_samples_leaf': 3,
 'model__min_samples_split': 6}

In [213]:
# Per-class precision/recall/F1: training split first, then the test split,
# separated by a dashed line.
# NOTE(review): this cell's execution count (213) precedes the fit/predict cells
# (214-219), and the stored reports total 8,000 rows rather than the 100,000-row
# sample drawn earlier -- the saved output appears to come from an earlier run.
# Re-run this cell after fitting.
print(
    classification_report(y_train, y_pred_train),
    '------------',
    classification_report(y_test, y_pred_test),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.89      0.78      0.83      3907
           1       0.67      0.82      0.73      2093

    accuracy                           0.79      6000
   macro avg       0.78      0.80      0.78      6000
weighted avg       0.81      0.79      0.80      6000

------------
              precision    recall  f1-score   support

           0       0.87      0.78      0.82      1303
           1       0.65      0.77      0.71       697

    accuracy                           0.78      2000
   macro avg       0.76      0.78      0.76      2000
weighted avg       0.79      0.78      0.78      2000

