In [1]:
# Uncomment only if you want to reprocess the raw data using initial_prep.py
# The script will re-create crashes.gz, comma separated gzipped file in ../data/processed

## %run ../src/preprocessing/initial_prep.py

In [2]:
%run ../src/import_libraries.py

%matplotlib inline

In [3]:
full_df=pd.read_csv('../data/processed/crashes.gz', compression='gzip', low_memory=False)

In [67]:
sample_df=full_df.sample(10000, random_state=100)
#sample_df=full_df.copy()


y = sample_df['GUILTY']
x = sample_df.drop(['GUILTY'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=100, test_size=0.25, stratify=y)


In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names=attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    

cat_pipeline = Pipeline([
        ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown =  'ignore' )),
         ('std_scaler', StandardScaler()),
    
    ])


fu = FeatureUnion(transformer_list=[
#        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [12]:

XGB_pipeline = Pipeline([ ( "cat_pipeline", cat_pipeline ),
                  ('boost', XGBClassifier(use_label_encoder=False, 
                                          eval_metric='auc', gamma= 0.02, max_depth=3, n_estimators=90, n_jobs=-1))
                 ])


#pipe_grid={'boost__gamma':[0.01,0.015]}
pipe_grid={'boost__gamma':[0.01,0.015], 'boost__n_estimators':[90,110,130]}

gs_pipe = GridSearchCV(estimator=XGB_pipeline, param_grid=pipe_grid,  cv=3, scoring='roc_auc')


In [117]:
model_dic={}


#nominal_columns=[ 'FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT', 'MANEUVER','TRAFFIC_CONTROL_DEVICE', 'PHYSICAL_CONDITION']

def fit_model (X_train, X_test, y_train, all_cols, default_col):
    all_models_df = pd.DataFrame()
 
    for col in all_cols:
        nominal_columns=default_col
        nominal_columns.append(col)

        X_sel_train = X_train [nominal_columns] 
        X_sel_test = X_test [nominal_columns]

        cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(nominal_columns)),
            ('cat_encoder', OneHotEncoder(sparse=False, handle_unknown =  'ignore' )),
        ])
        print('---------------------')
        print ("Working with", col)

        fu = FeatureUnion(transformer_list=[
            ("cat_pipeline", cat_pipeline),
        ])


        XGB_pipeline = Pipeline([ ( 'fu', fu ),
                      ('boost', XGBClassifier(use_label_encoder=False, 
                                              eval_metric='auc', gamma= 0.02, max_depth=3, n_estimators=90, n_jobs=-1))
                     ])

        pipe_grid={'boost__gamma':[0.01,0.015]}

        gs_pipe = GridSearchCV(estimator=XGB_pipeline, param_grid=pipe_grid,  cv=3, scoring='roc_auc')    


        gs_pipe.fit(X_sel_train, y_train)
        y_pred_test=gs_pipe.predict(X_sel_test)

        dic_key=f'XGB_{col}'
        model_dic[dic_key]=gs_pipe

        model_df=pd.DataFrame(gs_pipe.cv_results_)
        model_df["Extra Columns"]=dic_key
        all_models_df=pd.concat([model_df, all_models_df], axis=0 )

    return all_models_df

   


In [120]:

all_cols=['AIRBAG_DEPLOYED',  'PHYSICAL_CONDITION', 'AGE_GROUP', 'VEHICLE_AGE', 
'DRIVERS_LICENSE_CLASS','TRAFFICWAY_TYPE', 'ALIGNMENT' ,  
'DEVICE_CONDITION','DRIVER_VISION', 'SAFETY_EQUIPMENT',  'WEATHER_CONDITION',
'ROAD_DEFECT', 'ROADWAY_SURFACE_COND', 'TRAFFIC_CONTROL_DEVICE', 'VEHICLE_DEFECT']


all_models_df = fit_model (X_train, X_test, y_train, all_cols, [])
all_models_df[["Extra Columns", "mean_test_score", "std_test_score", "rank_test_score"]].sort_values(by='mean_test_score', ascending=False)

---------------------
Working with AIRBAG_DEPLOYED
---------------------
Working with PHYSICAL_CONDITION
---------------------
Working with AGE_GROUP
---------------------
Working with VEHICLE_AGE
---------------------
Working with DRIVERS_LICENSE_CLASS
---------------------
Working with TRAFFICWAY_TYPE
---------------------
Working with ALIGNMENT
---------------------
Working with DEVICE_CONDITION
---------------------
Working with DRIVER_VISION
---------------------
Working with SAFETY_EQUIPMENT
---------------------
Working with WEATHER_CONDITION
---------------------
Working with ROAD_DEFECT
---------------------
Working with ROADWAY_SURFACE_COND
---------------------
Working with TRAFFIC_CONTROL_DEVICE
---------------------
Working with VEHICLE_DEFECT


Unnamed: 0,Extra Columns,mean_test_score,std_test_score,rank_test_score
1,XGB_TRAFFIC_CONTROL_DEVICE,0.625977,0.007153,1
0,XGB_VEHICLE_DEFECT,0.625484,0.008156,1
1,XGB_VEHICLE_DEFECT,0.625484,0.008156,1
0,XGB_TRAFFIC_CONTROL_DEVICE,0.625219,0.007322,2
1,XGB_ROAD_DEFECT,0.624864,0.013324,1
0,XGB_ROADWAY_SURFACE_COND,0.624778,0.009726,1
1,XGB_ROADWAY_SURFACE_COND,0.624778,0.009726,1
0,XGB_ROAD_DEFECT,0.624146,0.010273,2
0,XGB_WEATHER_CONDITION,0.622015,0.013453,1
1,XGB_WEATHER_CONDITION,0.622015,0.013453,1


In [121]:

all_cols=['AIRBAG_DEPLOYED',  'PHYSICAL_CONDITION', 'AGE_GROUP', 'VEHICLE_AGE', 
'DRIVERS_LICENSE_CLASS','TRAFFICWAY_TYPE', 'ALIGNMENT' ,  
'DEVICE_CONDITION','DRIVER_VISION', 'SAFETY_EQUIPMENT',  'WEATHER_CONDITION',
'ROAD_DEFECT', 'ROADWAY_SURFACE_COND', 'TRAFFIC_CONTROL_DEVICE', 'VEHICLE_DEFECT', 
          ]


all_models_df = fit_model (X_train, X_test, y_train, all_cols, [ 'FIRST_CRASH_TYPE', 'FIRST_CONTACT_POINT', 'MANEUVER','TRAFFIC_CONTROL_DEVICE'])
all_models_df[["Extra Columns", "mean_test_score", "std_test_score", "rank_test_score"]].sort_values(by='mean_test_score', ascending=False)

---------------------
Working with AIRBAG_DEPLOYED
---------------------
Working with PHYSICAL_CONDITION
---------------------
Working with AGE_GROUP
---------------------
Working with VEHICLE_AGE
---------------------
Working with DRIVERS_LICENSE_CLASS
---------------------
Working with TRAFFICWAY_TYPE
---------------------
Working with ALIGNMENT
---------------------
Working with DEVICE_CONDITION
---------------------
Working with DRIVER_VISION
---------------------
Working with SAFETY_EQUIPMENT
---------------------
Working with WEATHER_CONDITION
---------------------
Working with ROAD_DEFECT
---------------------
Working with ROADWAY_SURFACE_COND
---------------------
Working with TRAFFIC_CONTROL_DEVICE
---------------------
Working with VEHICLE_DEFECT


Unnamed: 0,Extra Columns,mean_test_score,std_test_score,rank_test_score
1,XGB_SAFETY_EQUIPMENT,0.844784,0.006963,1
0,XGB_SAFETY_EQUIPMENT,0.844784,0.006963,1
0,XGB_WEATHER_CONDITION,0.844513,0.008756,1
1,XGB_WEATHER_CONDITION,0.844513,0.008756,1
0,XGB_ROAD_DEFECT,0.844413,0.006556,1
1,XGB_ROAD_DEFECT,0.844413,0.006556,1
1,XGB_DRIVER_VISION,0.84391,0.008318,1
0,XGB_VEHICLE_DEFECT,0.843615,0.00693,1
0,XGB_DRIVER_VISION,0.843578,0.008006,2
1,XGB_VEHICLE_DEFECT,0.843462,0.007144,2
