In [None]:
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer #, IterativeImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MaxAbsScaler

from sklearn.metrics import precision_score, recall_score, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.metrics import roc_auc_score, roc_curve, auc, SCORERS
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

from sklearn import set_config
set_config(display ="diagram")

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:

# Load the pre-processed crash records from the gzip-compressed CSV.
full_df = pd.read_csv(
    '../data/processed/crashes.gz',
    compression='gzip',
    low_memory=False,
)

In [None]:
# Keep the target column (GUILTY) together with the candidate predictors.
selected_df = full_df[[
    'GUILTY',
    'DEVICE_CONDITION',
    'FIRST_CRASH_TYPE',
    'FIRST_CONTACT_POINT',
    'TRAFFICWAY_TYPE',
    'ROADWAY_SURFACE_COND',
    'VEHICLE_DEFECT',
    'DRIVERS_LICENSE_CLASS',
    'AGE_GROUP',
    'VEHICLE_AGE',
    'AIRBAG_DEPLOYED',
    'PHYSICAL_CONDITION',
    'MANEUVER',
    'DRIVER_VISION',
    'ALIGNMENT',
    'TRAFFIC_CONTROL_DEVICE',
    'NUM_PASSENGERS',
    'SUN_GLARE',
]]

In [None]:
# Work on a copy of the whole selection. For quicker experiments, swap in
# selected_df.sample(8000, random_state=100) instead.
sample_df = selected_df.copy()

# Separate the target from the predictor columns.
y = sample_df['GUILTY']
x = sample_df.drop(columns=['GUILTY'])

# 75/25 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=100,
    stratify=y,
)


In [None]:
# Numeric predictors. This list has to be defined: it is used right below and
# again when num_pipeline is built and when the feature union is fitted.
# Leaving it commented out makes the notebook fail with NameError on a fresh
# kernel.
numeric_columns = ['NUM_PASSENGERS']

# Categorical predictors; these are the columns fed to the one-hot encoder.
nominal_columns = [
    'FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT',
    'ROADWAY_SURFACE_COND', 'VEHICLE_DEFECT', 'AIRBAG_DEPLOYED',
    'DRIVERS_LICENSE_CLASS', 'VEHICLE_AGE',
    'PHYSICAL_CONDITION', 'MANEUVER', 'TRAFFIC_CONTROL_DEVICE',
]

# Columns present in the selection but currently not used as features:
# 'AGE_GROUP', 'TRAFFICWAY_TYPE', 'SUN_GLARE',
# 'ALIGNMENT', 'DEVICE_CONDITION', 'DRIVER_VISION'

# Restrict both splits to the chosen features (numeric first, then nominal).
X_train = X_train[numeric_columns + nominal_columns]
X_test = X_test[numeric_columns + nominal_columns]

#X_train.info()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a fixed set of columns out of its input.

    Parameters
    ----------
    attribute_names : list
        Column labels that ``transform`` selects from ``X``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``.

        ``X`` is expected to support column-list indexing and ``.values``
        (presumably a pandas DataFrame).
        """
        subset = X[self.attribute_names]
        return subset.values
    

# Numeric branch: select the numeric columns and fill missing values with 0.
# (A StandardScaler step is currently disabled.)
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(numeric_columns)),
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
])

# Categorical branch: select the nominal columns and one-hot encode them into
# a dense array; categories not seen during fit are ignored at transform time.
# (A constant 'NA' imputer step is currently disabled.)
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# update this argument when the environment's scikit-learn is upgraded.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(nominal_columns)),
    ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Only the categorical branch is part of the union; num_pipeline is defined
# above but not included here.
fu = FeatureUnion(transformer_list=[
    ("cat_pipeline", cat_pipeline),
])



In [None]:
# Fit the feature union on the training predictors and keep the encoded
# matrix for inspection.
X_train_transformed = fu.fit_transform(
    X_train[numeric_columns + nominal_columns]
)

# Wrap the encoded matrix in a DataFrame. No column names are passed, so the
# columns get default integer labels.
transformed_df = pd.DataFrame(data=X_train_transformed)

#transformed_df.info()
#transformed_df.info(verbose=True, show_counts=True)


In [None]:
# Model pipeline: feature union followed by a bagging ensemble of 100
# logistic regressions.
bag_log_pipe = Pipeline([
    ('fu', fu),
    ('model', BaggingClassifier(
        # Base estimator is passed positionally; its keyword name differs
        # between scikit-learn versions (base_estimator -> estimator in 1.2).
        LogisticRegression(solver='newton-cg', max_iter=5000),
        n_estimators=100,
        # Seed the bootstrap resampling so repeated runs give the same
        # ensemble; same seed value as the train/test split.
        random_state=100,
    )),
])


In [None]:
# Empty grid: no hyper-parameters are varied, so the search only
# cross-validates the pipeline with its current settings.
# (An earlier grid over 'model__criterion' is disabled.)
pipe_grid = {}

# 2-fold cross-validation scored by ROC AUC.
gs_pipe = GridSearchCV(
    estimator=bag_log_pipe,
    param_grid=pipe_grid,
    scoring='roc_auc',
    cv=2,
)

#pd.DataFrame(SCORERS.values())

In [None]:
# Run the cross-validated search on the training split.
gs_pipe.fit(X_train, y_train)

In [None]:
# Class predictions from the fitted search object for both splits; used by
# the classification reports in the evaluation cell.
y_pred_train=gs_pipe.predict(X_train)
y_pred_test=gs_pipe.predict(X_test)

In [None]:
# Display the best parameter combination found by the search
# (pipe_grid is empty, so there is nothing to vary).
gs_pipe.best_params_


In [None]:
# gs_pipe.score applies the scorer the search was configured with
# (scoring='roc_auc'). Report it for both splits so the held-out performance
# is visible next to the training figure.
print('Train ROC AUC:', gs_pipe.score(X_train, y_train))
print('Test ROC AUC:', gs_pipe.score(X_test, y_test))

# Per-class precision / recall / F1, labelled so the two reports cannot be
# mixed up.
print('Train classification report')
print(classification_report(y_train, y_pred_train))
print('------------')
print('Test classification report')
print(classification_report(y_test, y_pred_test))