In [1]:
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer #, IterativeImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MaxAbsScaler

from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, roc_curve, auc, SCORERS
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn import set_config
set_config(display ="diagram")

import matplotlib.pyplot as plt
%matplotlib inline

In [94]:

full_df=pd.read_csv('../data/processed/crashes.gz', compression='gzip', low_memory=False)



In [95]:
selected_df=full_df.dropna(subset=['SAFETY_EQUIPMENT'], axis=0)
selected_df=selected_df[full_df['SAFETY_EQUIPMENT'] != 'UNKNOWN']

selected_df=full_df[['GUILTY','DEVICE_CONDITION', 'FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT', 
           'TRAFFICWAY_TYPE','ROADWAY_SURFACE_COND','VEHICLE_DEFECT',
             'DRIVERS_LICENSE_CLASS', 'AGE_GROUP', 'VEHICLE_AGE', 'AIRBAG_DEPLOYED',
            'PHYSICAL_CONDITION', 'MANEUVER',  'DRIVER_VISION',  'ALIGNMENT' ,  'TRAFFIC_CONTROL_DEVICE',
                    'NUM_PASSENGERS','SUN_GLARE','VEHICLE_YEAR','AGE','SAFETY_EQUIPMENT']]

#SAFETY_EQUIPMENT

#explore_cat_vars(full_df, 42)

In [110]:
sample_df=selected_df.sample(100000, random_state=100)
#sample_df=selected_df.copy()

y = sample_df['GUILTY']
x = sample_df.drop(['GUILTY'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=100, test_size=0.25, stratify=y)



In [111]:
numeric_columns = ['AGE']

nominal_columns = [ 'FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT', 
           'ROADWAY_SURFACE_COND','VEHICLE_DEFECT', 
           'MANEUVER',   'TRAFFIC_CONTROL_DEVICE', 'SAFETY_EQUIPMENT']

# 'AIRBAG_DEPLOYED',  'PHYSICAL_CONDITION', 'AGE_GROUP', 'VEHICLE_AGE', 'DRIVERS_LICENSE_CLASS', 
#, 'TRAFFICWAY_TYPE', ,'SUN_GLARE' , 'AGE',
#'ALIGNMENT' ,  'DEVICE_CONDITION','DRIVER_VISION',
#'NUM_PASSENGERS'

X_train = X_train[numeric_columns+nominal_columns]
X_test = X_test[numeric_columns+nominal_columns]

#X_train.info()

In [112]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names=attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(numeric_columns)),
#        ('imputer', SimpleImputer(strategy = 'constant', fill_value=0))
 #       ('std_scaler', StandardScaler())
 #        ( 'polynomial', PolynomialFeatures(degree = 3, include_bias = False ))
        
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(nominal_columns)),
#        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown =  'ignore' )),
    ])


fu = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])




In [113]:
#X_train_transformed = fu.fit_transform(X_train[numeric_columns+nominal_columns])
#X_train.info(verbose=True, show_counts=True)

#columns=fu.transformer_list[1][1][2].get_feature_names_out()
#columns=np.insert(columns, 0, 'Intercept')
#transformed_df=pd.DataFrame(X_train_transformed, columns=X_train[numeric_columns+nominal_columns].columns)
#transformed_df=pd.DataFrame(X_train_transformed)

#transformed_df.info()
#transformed_df.info(verbose=True, show_counts=True)


In [114]:
XGB_pipeline = Pipeline([ ( 'fu', fu ),
                  ('boost', XGBClassifier(use_label_encoder=False, 
                                          eval_metric='auc', gamma= 0.02, max_depth=3, n_estimators=90, n_jobs=-1))
                 ])



In [115]:
pipe_grid={}
#pipe_grid={'boost__gamma':[0.01,0.015], 'boost__n_estimators':[110,130,150]}

gs_pipe = GridSearchCV(estimator=XGB_pipeline, param_grid=pipe_grid,  cv=2, scoring='roc_auc')

#pd.DataFrame(SCORERS.keys())

In [116]:
gs_pipe.fit(X_train, y_train)
#XGB_pipeline.fit(X_train, y_train)

In [117]:
y_pred_train=gs_pipe.predict(X_train)
y_pred_test=gs_pipe.predict(X_test)

#y_pred_train=XGB_pipeline.predict(X_train)
#y_pred_test=XGB_pipeline.predict(X_test)

In [118]:
gs_pipe.best_params_



{}

In [119]:
#print(XGB_pipeline.score(X_train, y_train ))
print(gs_pipe.score(X_train, y_train ))
print(classification_report(y_train, y_pred_train))
print('------------')
print(classification_report(y_test, y_pred_test))

0.855586760303131
              precision    recall  f1-score   support

           0       0.82      0.87      0.85     48863
           1       0.73      0.65      0.69     26137

    accuracy                           0.80     75000
   macro avg       0.78      0.76      0.77     75000
weighted avg       0.79      0.80      0.79     75000

------------
              precision    recall  f1-score   support

           0       0.82      0.87      0.85     16288
           1       0.73      0.65      0.69      8712

    accuracy                           0.79     25000
   macro avg       0.78      0.76      0.77     25000
weighted avg       0.79      0.79      0.79     25000



In [616]:
0.8773987570283558
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      3951
           1       0.76      0.68      0.72      2049

    accuracy                           0.82      6000
   macro avg       0.80      0.78      0.79      6000
weighted avg       0.81      0.82      0.81      6000

------------
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      1317
           1       0.73      0.62      0.67       683

    accuracy                           0.79      2000
   macro avg       0.78      0.75      0.76      2000
weighted avg       0.79      0.79      0.79      2000

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 4)

In [41]:
def explore_cat_vars(df, n):
    col_counts=pd.DataFrame(df.nunique(),).reset_index()
    col_counts.columns=(["Col_Name", "Count"])
    for col in col_counts[col_counts['Count'] < n]['Col_Name']:
        print('--------------------------------------')
        print(col)
        print('--------------------------------------')
        print(df[col].value_counts())
    print("=================================================================================================")
    print(col_counts[col_counts['Count'] >= n])