## data import

In [1]:
import subprocess
import sys

import pkg_resources
import pip

# Third-party packages this notebook needs on top of the base scientific stack.
# NOTE(review): `graphviz` is imported further down but is not listed here (its install
# line was commented out, as was xgboost's) - confirm it is provided by the environment.
required = {'researchpy', 'missingno', 'folium', 'pydotplus','bokeh','imblearn', 'catboost'}
installedPackages = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installedPackages
if missing:
    # Install only what is actually absent, and do it through the interpreter that runs
    # this kernel so the packages land in the environment the notebook imports from.
    subprocess.check_call([sys.executable, "-m", "pip", "install", *sorted(missing)])

Collecting catboost
  Downloading catboost-0.24.3-cp38-none-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (11.5 MB)
     |████████████████████████████████| 11.5 MB 5.7 MB/s eta 0:00:01


Collecting plotly
  Downloading plotly-4.12.0-py2.py3-none-any.whl (13.1 MB)
     |████████████████████████████████| 13.1 MB 466 kB/s eta 0:00:01
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... done
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=4b8d544325ebef9074b423fce0719dd00ff8cfd8e82bffa052ca9f2f6de2f761
  Stored in directory: /Users/shellylin/Library/Caches/pip/wheels/c4/a7/48/0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: retrying, plotly, catboost
Successfully installed catboost-0.24.3 plotly-4.12.0 retrying-1.3.3


In [2]:
#Disable the warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
import timeit
start_time = timeit.default_timer()  # reference timestamp taken at start-up, used to calculate the notebook's total runtime

import pandas as pd
import numpy as np

import researchpy as rp
import missingno as msno
import itertools
import scipy.stats as ss

import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium import plugins
import graphviz

from sklearn import tree
from sklearn import feature_selection
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from collections import Counter

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, accuracy_score, \
precision_score, recall_score, roc_auc_score, f1_score, precision_recall_curve, auc 

%matplotlib inline
pd.set_option('display.max_columns', None)  # prevent column output trancation
sns.set()  # change plot styling from Matlab's 90s feel to today's Seaborn.

In [4]:
# Relative locations of the three sampled source tables
path_crashes = '../crashes.sample2020.csv'
path_vehicles = '../vehicles.sample.csv'
path_people = '../people.sample.csv'

# Read each sample with dtype=object (no automatic type inference) and ask pandas
# to parse only the listed date columns.
crashes = pd.read_csv(
    path_crashes,
    dtype=object,
    low_memory=False,
    parse_dates=["CRASH_DATE", "CRASH_DATE_EST_I", "DATE_POLICE_NOTIFIED"],
)
vehicles = pd.read_csv(
    path_vehicles,
    dtype=object,
    low_memory=False,
    parse_dates=["CRASH_DATE"],
)
people = pd.read_csv(
    path_people,
    dtype=object,
    low_memory=False,
    parse_dates=["CRASH_DATE"],
)

In [5]:
# Joining datasets
# Keep only people whose PERSON_ID starts with "O". `na=False` makes a missing ID count as
# "no match"; without it the boolean mask would contain NaN and the row selection would raise.
non_passengers = people[people.PERSON_ID.str.startswith('O', na=False)]

# Attach the selected person rows (where present) to each vehicle row; the left join keeps
# vehicles that have no matching person.
vehicles_with_people = vehicles.merge(
    non_passengers,
    how='left',
    on=['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE', 'VEHICLE_ID'],
)

# Keep only crashes that have at least one vehicle row (inner join).
data = crashes.merge(
    vehicles_with_people,
    how='inner',
    on=['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE'],
)

# Feature Selection
filter_list=["AGE","LANE_CNT","AIRBAG_DEPLOYED","PRIM_CONTRIBUTORY_CAUSE","POSTED_SPEED_LIMIT","NUM_UNITS","TRAFFICWAY_TYPE",  
             "SEC_CONTRIBUTORY_CAUSE","FIRST_CRASH_TYPE","MOST_SEVERE_INJURY","LIGHTING_CONDITION","SEX","CRASH_DATE",
             "CRASH_HOUR","VEHICLE_YEAR"]

# Data that will be used in predictions
modeling_data = data[filter_list]

In [None]:
def preprocessor(dataframe):
    '''Turn joined crash records into a one-hot encoded feature matrix and a binary target.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain LIGHTING_CONDITION, TRAFFICWAY_TYPE, AIRBAG_DEPLOYED, SEX,
        MOST_SEVERE_INJURY, VEHICLE_YEAR, NUM_UNITS, POSTED_SPEED_LIMIT, AGE, LANE_CNT,
        CRASH_HOUR and CRASH_DATE. CRASH_DATE must have a datetime dtype because its
        `.dt.year` is read. Any additional columns are passed through to the encoder.

    Returns
    -------
    X : pandas.DataFrame
        Features after one-hot encoding. Dummy columns whose name contains UNABLE,
        UNKNOWN, NOT APPLICABLE or OTHER are removed.
    y : array
        Output of `label_binarize` with 'NOT INJURED' as the first class and
        'INJURED' as the second.

    Raises
    ------
    ValueError
        If a numeric column holds text that `pd.to_numeric` cannot parse.
        Converting a missing CRASH_HOUR with `int` also raises.

    Notes
    -----
    Rows without MOST_SEVERE_INJURY are dropped. Among the numeric columns only LANE_CNT
    is imputed here; missing values in AGE, NUM_UNITS, POSTED_SPEED_LIMIT and VEHICLE_AGE
    are passed through, so impute them before fitting a model.
    '''
    # Rows without a recorded injury outcome cannot be labelled. Dropping them first and
    # copying gives an independent frame to modify.
    df = dataframe.dropna(subset=['MOST_SEVERE_INJURY']).copy()

    # Treat the explicit "unknown" codes as missing values...
    unknown_codes = {
        "LIGHTING_CONDITION": "UNKNOWN",
        "TRAFFICWAY_TYPE": "UNKNOWN",
        "AIRBAG_DEPLOYED": "DEPLOYMENT UNKNOWN",
    }
    for column, code in unknown_codes.items():
        df.loc[df[column] == code, column] = np.nan
    # ...then give every missing categorical its fallback label.
    df = df.fillna({'LIGHTING_CONDITION': 'DAYLIGHT', 'TRAFFICWAY_TYPE': 'NOT DIVIDED',
                    'SEX': 'UNABLE TO DETERMINE', 'AIRBAG_DEPLOYED': 'UNABLE TO DETERMINE'})

    # Handle numerical features (they arrive as text/object columns)
    for column in ("VEHICLE_YEAR", "NUM_UNITS", "POSTED_SPEED_LIMIT", "AGE", "LANE_CNT"):
        df[column] = pd.to_numeric(df[column])

    # Missing lane counts default to 2 and values above 6 are capped at 6. The result is
    # assigned back explicitly: a chained `df[col].fillna(..., inplace=True)` does not
    # update the frame when pandas Copy-on-Write is active.
    default_lanes, max_lanes = 2, 6
    df['LANE_CNT'] = df['LANE_CNT'].fillna(default_lanes).clip(upper=max_lanes)

    # Function definitions
    def injury(x):
        # INJURED when the description mentions one of the listed outcomes.
        if any(s in x for s in ["FATAL","NONINCAPACITATING INJURY","INCAPACITATING INJURY"]):
            return "INJURED"
        return "NOT INJURED"

    def airbag(x):
        # Collapse the deployment descriptions into DEPLOYED / NOT DEPLOYED;
        # every other label is returned unchanged.
        if ("DEPLOY" in x) and ("UNKNOWN" not in x):
            return "NOT DEPLOYED" if "NOT" in x else "DEPLOYED"
        return x

    def crash_hour(x):
        # Bucket the hour of day; anything outside 2 <= x < 18 counts as "Night".
        if 2 <= x < 8:
            return "Early_morning"
        if 8 <= x < 12:
            return "Morning"
        if 12 <= x < 18:
            return "Afternoon"
        return "Night"

    def traffic_way(x):
        # Labels containing NOT or ONE-WAY are grouped as NOT_DIVIDED.
        if ("NOT" in x) or ("ONE-WAY" in x):
            return "NOT_DIVIDED"
        return "DIVIDED"

    # Feature Engineering
    df["INJURY"] = df["MOST_SEVERE_INJURY"].apply(injury)
    df["AIRBAG_DEPLOYED"] = df["AIRBAG_DEPLOYED"].apply(airbag)
    df["CRASH_HOUR"] = df["CRASH_HOUR"].apply(lambda hour: crash_hour(int(hour)))
    df["TRAFFICWAY_TYPE"] = df["TRAFFICWAY_TYPE"].apply(traffic_way)

    # Vehicle age in years at the time of the crash; negative ages are floored at 0.
    df["VEHICLE_AGE"] = (df["CRASH_DATE"].dt.year - df["VEHICLE_YEAR"]).clip(lower=0)
    df = df.drop(columns=["VEHICLE_YEAR", "CRASH_DATE", "MOST_SEVERE_INJURY"])

    # Splitting df into X and y, binarizing y
    y = preprocessing.label_binarize(df["INJURY"], classes=['NOT INJURED', 'INJURED'])
    X = df.drop(columns=["INJURY"])

    # One-Hot Encoding. With no explicit `columns`, get_dummies encodes every object,
    # string and category column, so text columns are covered whichever string dtype
    # pandas uses for them.
    X = pd.get_dummies(X)
    uninformative = X.columns.str.contains("UNABLE|UNKNOWN|NOT APPLICABLE|OTHER")
    X = X.loc[:, ~uninformative]

    return X, y