In [2]:
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer #, IterativeImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, roc_curve, auc

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline


from sklearn import set_config
set_config(display ="diagram")

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:

# Read the gzip-compressed crash table from the processed-data folder.
crashes_path = '../data/processed/crashes.gz'
full_df = pd.read_csv(crashes_path, low_memory=False, compression='gzip')

In [118]:
# Target column ('GUILTY') followed by the candidate predictor columns,
# in the order they should appear in the selected frame.
selected_columns = [
    'GUILTY',
    'DEVICE_CONDITION',
    'FIRST_CRASH_TYPE',
    'FIRST_CONTACT_POINT',
    'TRAFFICWAY_TYPE',
    'ROADWAY_SURFACE_COND',
    'VEHICLE_DEFECT',
    'DRIVERS_LICENSE_CLASS',
    'AGE_GROUP',
    'VEHICLE_AGE',
    'AIRBAG_DEPLOYED',
    'PHYSICAL_CONDITION',
    'MANEUVER',
    'DRIVER_VISION',
    'ALIGNMENT',
    'TRAFFIC_CONTROL_DEVICE',
    'NUM_PASSENGERS',
    'SUN_GLARE',
    'VEHICLE_YEAR',
]
selected_df = full_df[selected_columns]


In [119]:
# Work on an independent copy of the full selection. For quicker experiments,
# swap in selected_df.sample(80000, random_state=100) instead.
sample_df = selected_df.copy()

# 'GUILTY' is the label; every other selected column is a candidate feature.
target_column = 'GUILTY'
y = sample_df[target_column]
x = sample_df.drop(columns=[target_column])

# Hold out 25% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
split_options = dict(test_size=0.25, stratify=y, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)


In [120]:
# No numeric features are used at the moment.
numeric_columns = []

# Nominal predictors currently fed to the model. Columns that appear in
# earlier, commented-out variants of this list: ROADWAY_SURFACE_COND,
# VEHICLE_DEFECT, AIRBAG_DEPLOYED, DRIVERS_LICENSE_CLASS, VEHICLE_AGE,
# PHYSICAL_CONDITION, TRAFFIC_CONTROL_DEVICE, AGE_GROUP, TRAFFICWAY_TYPE,
# SUN_GLARE, ALIGNMENT, DEVICE_CONDITION, DRIVER_VISION.
nominal_columns = ['FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT', 'MANEUVER']

feature_columns = numeric_columns + nominal_columns

# .copy() so the column assignments below write into frames we own rather
# than into a selection of the frames returned by train_test_split.
X_train = X_train[feature_columns].copy()
X_test = X_test[feature_columns].copy()

# Integer-encode every selected column (same scope as before: all selected
# columns, not only the nominal ones).
#
# The category -> code mapping is learned from the TRAINING split only and
# then reused for the test split. Fitting a separate encoder on each split
# (the previous approach) makes the code of a category depend on which
# categories happen to occur in that split, so the same category could be
# given different integers in train and test.
#
# Codes follow the sorted category order. Values that are missing, or that
# were never seen in the training split, are encoded as -1.
for column in feature_columns:
    train_categories = pd.Categorical(X_train[column]).categories
    X_train[column] = pd.Categorical(
        X_train[column], categories=train_categories
    ).codes.astype('int64')
    X_test[column] = pd.Categorical(
        X_test[column], categories=train_categories
    ).codes.astype('int64')

In [155]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks named columns out of a DataFrame.

    Parameters
    ----------
    attribute_names : column label(s)
        Passed unchanged to ``X[...]`` in :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the selected columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values
    

# Numeric branch: select the numeric columns, fill gaps with the constant 0,
# then standardise. It is built here but not registered in the union below.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(numeric_columns)),
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: currently only selects the nominal columns.
# Disabled follow-up steps, kept for reference:
#   ('imputer', SimpleImputer(strategy='constant', fill_value='NA'))
#   ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(nominal_columns)),
])

# Only the categorical branch is active; to re-enable the numeric one, add
# ("num_pipeline", num_pipeline) to this list.
fu = FeatureUnion([
    ("cat_pipeline", cat_pipeline),
])




In [156]:
# Fit the feature union on the training features and keep the resulting array.
X_train_transformed = fu.fit_transform(X_train)

# Wrap the array in a DataFrame for inspection. No column names are attached,
# so the frame gets default integer column labels and a fresh RangeIndex.
transformed_df = pd.DataFrame(data=X_train_transformed)

#transformed_df.info(verbose=True, show_counts=True)
#transformed_df.info()

In [153]:
# k-nearest-neighbours classifier; n_neighbors / weights / p are starting
# values that the grid search may override.
knn_model = KNeighborsClassifier(n_neighbors=120, weights='uniform', p=1, n_jobs=-1)

# Standardise the features, then classify. The feature union `fu` is
# deliberately not a step of this pipeline.
# NOTE(review): the scaler and the distance metric operate on integer
# category codes here; confirm that treating them as ordered is intended.
knn_pipe = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('model', knn_model),
])


In [157]:
# Candidate hyper-parameters for the 'model' step of knn_pipe: three
# neighbourhood sizes, with weights and p each held at a single value.
pipe_grid = {
    'model__n_neighbors': [60, 120, 200],
    'model__weights': ['uniform'],
    'model__p': [1],
}

# Exhaustive search over the grid, 2-fold cross-validation, ranked by ROC AUC.
gs_pipe = GridSearchCV(knn_pipe, pipe_grid, scoring='roc_auc', cv=2)

gs_pipe

In [158]:
# Run the grid search on the training split (fit returns the search object
# itself) and display the best parameter combination it found.
gs_pipe.fit(X_train, y_train).best_params_

{'model__n_neighbors': 120, 'model__p': 1, 'model__weights': 'uniform'}

In [159]:
# Class predictions from the fitted search for the training split first,
# then the held-out test split.
y_pred_train, y_pred_test = (
    gs_pipe.predict(features) for features in (X_train, X_test)
)

In [163]:
# Score of the fitted search on the TRAINING split. The search was built with
# scoring='roc_auc', so this is presumably ROC AUC rather than accuracy --
# see the GridSearchCV.score documentation.
gs_pipe.score(X=X_train, y=y_train)

0.8474092524156449

In [164]:
# Per-class precision / recall / F1: training split first, then the test
# split, separated by a dashed line.
train_report = classification_report(y_train, y_pred_train)
test_report = classification_report(y_test, y_pred_test)
print(train_report, '------------', test_report, sep='\n')

              precision    recall  f1-score   support

           0       0.82      0.87      0.85    149107
           1       0.73      0.64      0.68     79432

    accuracy                           0.79    228539
   macro avg       0.77      0.76      0.76    228539
weighted avg       0.79      0.79      0.79    228539

------------
              precision    recall  f1-score   support

           0       0.82      0.87      0.84     49703
           1       0.72      0.64      0.68     26477

    accuracy                           0.79     76180
   macro avg       0.77      0.75      0.76     76180
weighted avg       0.78      0.79      0.79     76180



In [77]:
#bagged_tree_pipe.fit(X_train, y_train)
#y_pred_test = bagged_tree_pipe.predict(X_test)
#y_pred_train = bagged_tree_pipe.predict(X_train)

#print(classification_report(y_train, y_pred_train))
#print('------------')
#print(classification_report(y_test, y_pred_test))
