In [1]:
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer #, IterativeImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, roc_curve, auc

from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline




from sklearn import set_config
# Show estimators and pipelines as diagrams when they are displayed in a cell.
set_config(display="diagram")

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

# Load the processed crash records from the gzip-compressed CSV.
# low_memory=False is passed so pandas parses the file in a single pass.
full_df = pd.read_csv(
    '../data/processed/crashes.gz',
    compression='gzip',
    low_memory=False,
)

In [3]:
# Columns carried forward: the 'GUILTY' target first, then the candidate predictors.
selected_columns = [
    'GUILTY',
    'DEVICE_CONDITION',
    'FIRST_CRASH_TYPE',
    'FIRST_CONTACT_POINT',
    'TRAFFICWAY_TYPE',
    'ROADWAY_SURFACE_COND',
    'VEHICLE_DEFECT',
    'DRIVERS_LICENSE_CLASS',
    'AGE_GROUP',
    'VEHICLE_AGE',
    'AIRBAG_DEPLOYED',
    'PHYSICAL_CONDITION',
    'MANEUVER',
    'DRIVER_VISION',
    'ALIGNMENT',
    'TRAFFIC_CONTROL_DEVICE',
    'NUM_PASSENGERS',
    'SUN_GLARE',
    'VEHICLE_YEAR',
]
selected_df = full_df[selected_columns]


In [4]:
# Draw a reproducible random subsample (fixed seed). The requested size is capped at
# the number of available rows because DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement).
sample_size = min(100000, len(selected_df))
sample_df = selected_df.sample(n=sample_size, random_state=100)
# Alternative: use every row instead of a subsample.
#sample_df=selected_df.copy()

# Split the frame into the target ('GUILTY') and the predictor columns.
y = sample_df['GUILTY']
x = sample_df.drop(columns=['GUILTY'])

# Hold out 25% of the rows for testing; stratify on the target so both splits
# keep the same class proportions. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=100,
    stratify=y,
)


In [5]:
# Feature lists used by the preprocessing step; the numeric list is currently empty.
numeric_columns = []
nominal_columns = ['FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT', 'MANEUVER']

# Keep only the chosen features in both splits (numeric columns first, then nominal).
feature_columns = numeric_columns + nominal_columns
X_train = X_train[feature_columns]
X_test = X_test[feature_columns]

# Disabled alternative, kept for reference: label-encode every column.
#X_train=X_train.apply(LabelEncoder().fit_transform)
#X_test=X_test.apply(LabelEncoder().fit_transform)

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed selection of columns from a DataFrame.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Stored as-is and used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` converted to an array via ``.values``."""
        chosen = X[self.attribute_names]
        return chosen.values
    



# OneHotEncoder's dense-output switch was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2, and the old keyword was removed in 1.4. Try the current name
# first and fall back to the legacy one, so the cell runs on old and new releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Raised by scikit-learn < 1.2, which only knows the `sparse` keyword.
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categorical branch: one-hot encode (categories unseen during fit are ignored at
# transform time), then standardise the resulting indicator columns.
cat_pipeline = Pipeline([
#        ('selector', DataFrameSelector(nominal_columns)),
#        ('imputer', SimpleImputer(strategy = 'constant', fill_value='NA')),
    ('cat_encoder', cat_encoder),
    ('std_scaler', StandardScaler()),
])

# Feature union with only the categorical branch active; the numeric branch is disabled.
fu = FeatureUnion(transformer_list=[
#        ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])




In [7]:
# Fit the feature union on the training predictors and keep the transformed matrix.
X_train_transformed = fu.fit_transform(X_train)

# Wrap the matrix in a DataFrame; no column names are supplied, so pandas assigns
# default integer labels.
transformed_df = pd.DataFrame(data=X_train_transformed)

# Disabled inspection helpers, kept for reference:
#X_train.info(verbose=True, show_counts=True)
#transformed_df.info(verbose=True, show_counts=True)

In [8]:
# Base learners for the voting ensemble.
# Every estimator that draws random numbers during fitting is seeded with the same
# value (100) used for sampling and the train/test split, so a fresh
# Restart & Run All reproduces the reported scores.
logreg = LogisticRegression(max_iter=5000, class_weight='balanced')
extra = ExtraTreesClassifier(
    max_depth=13,
    min_samples_leaf=3,
    min_samples_split=6,
    n_estimators=200,
    class_weight='balanced',
    random_state=100,
)
rand = RandomForestClassifier(
    min_samples_leaf=8,
    n_estimators=200,
    class_weight='balanced',
    random_state=100,
)
dtree = DecisionTreeClassifier(criterion='entropy', random_state=100, class_weight='balanced')
# NOTE(review): this variable is called `naive` but holds a quadratic discriminant
# analysis model, not a naive Bayes one. The name is kept because it is also the
# estimator key in the ensemble below.
naive = QuadraticDiscriminantAnalysis()
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=120, weights='uniform', p=1)
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    gamma=0.01,
    max_depth=3,
    n_estimators=90,
    n_jobs=-1,
    random_state=100,
)
# probability=False is sufficient: hard voting only needs predicted labels.
svc = SVC(probability=False)


# Full pipeline: preprocessing (`fu`) followed by a majority-vote ensemble.
vote_pipe = Pipeline([
    ('fu', fu),
    ('model', VotingClassifier(
        estimators=[
            ('logreg', logreg),
            ('extra', extra),
            ('rand', rand),
            ('dtree', dtree),
            ('naive', naive),
            ('knn', knn),
            ('xgb', xgb),
            ('svc', svc),
        ],
        voting='hard',
    )),
])




In [9]:
# Disabled grids, kept for reference:
#pipe_grid = { 'model__min_samples_split':[4,6,8,12,24]}
#pipe_grid = {'model__n_neighbors':[60, 120,200], 'model__weights': ['uniform'],
#             'model__p':[1] }

# An empty grid makes the search evaluate the pipeline once with its current settings.
pipe_grid = {}

# Two-fold cross-validated search over the voting pipeline, scored on accuracy.
gs_pipe = GridSearchCV(
    estimator=vote_pipe,
    param_grid=pipe_grid,
    scoring='accuracy',
    cv=2,
)

gs_pipe

In [10]:
# Fit the voting pipeline directly on the training split.
# Disabled alternative: fit through the grid search wrapper and inspect its best params.
#gs_pipe.fit(X_train, y_train)
#gs_pipe.best_params_
vote_pipe.fit(X=X_train, y=y_train)



In [11]:
# Predict labels for the held-out split with the fitted voting pipeline.
# Disabled alternatives: predictions from the grid search wrapper.
#y_pred_train=gs_pipe.predict(X_train)
#y_pred_test=gs_pipe.predict(X_test)
y_pred_test = vote_pipe.predict(X=X_test)

In [12]:
#gs_pipe.score(X_train, y_train)

In [13]:
# Training-set report is disabled because y_pred_train is not computed above.
#print(classification_report(y_train, y_pred_train))

# Separator line, then per-class precision / recall / F1 on the held-out split.
print('------------')
test_report = classification_report(y_true=y_test, y_pred=y_pred_test)
print(test_report)

------------
              precision    recall  f1-score   support

           0       0.84      0.83      0.84     16288
           1       0.69      0.70      0.70      8712

    accuracy                           0.79     25000
   macro avg       0.77      0.77      0.77     25000
weighted avg       0.79      0.79      0.79     25000



In [None]:
#bagged_tree_pipe.fit(X_train, y_train)
#y_pred_test = bagged_tree_pipe.predict(X_test)
#y_pred_train = bagged_tree_pipe.predict(X_train)

#print(classification_report(y_train, y_pred_train))
#print('------------')
#print(classification_report(y_test, y_pred_test))
