# Packages and Datasets

In [68]:
import timeit
start_time = timeit.default_timer()  #timestamp to calculate total runtime

import pkg_resources
import pip
installedPackages = {pkg.key for pkg in pkg_resources.working_set}
required = {'researchpy', 'missingno', 'folium', 'fancyimpute', 'pdpbox', 'dtreeviz.trees', 'graphviz'}
missing = required - installedPackages
if missing:
    !pip install researchpy
    !pip install missingno
    !pip install folium
    !pip install fancyimpute
    !pip install pdpbox
    !pip install dtreeviz
    !pip install graphviz
    
#Disable the warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import researchpy as rp
import missingno as msno

import seaborn as sns
import matplotlib.pyplot as plt
import graphviz

import itertools
import scipy.stats as ss

import folium
from folium import plugins

from sklearn.model_selection import train_test_split
from sklearn import feature_selection
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.inspection import plot_partial_dependence

%matplotlib inline
pd.set_option('display.max_columns', None)  # prevent column output trancation
sns.set()  # change plot styling from Matlab's 90s feel to today's Seaborn.





In [3]:
# File Directories
path_crashes = '../crashes.sample2020.csv'
path_vehicles = '../vehicles.sample.csv'
path_people = '../people.sample.csv'

# Import samples
crashes = pd.read_csv(
    path_crashes,
    parse_dates=["CRASH_DATE", "CRASH_DATE_EST_I", "DATE_POLICE_NOTIFIED"],
    low_memory=False,
    dtype=object)

vehicles = pd.read_csv(path_vehicles,
                       parse_dates=["CRASH_DATE"],
                       low_memory=False,
                       dtype=object)

people = pd.read_csv(path_people,
                     parse_dates=["CRASH_DATE"],
                     low_memory=False,
                     dtype=object)

# Data Merge & Split

In [4]:
# Filter only non-pessenger people (drivers, pedestrian, cyclist, etc.)
non_passengers = people[people.PERSON_ID.str.contains('^O')]

# Left join non-peseenger people with vehicles
vehicles_with_people = vehicles.merge(non_passengers, 
                                      how='left', 
                                      on=['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE', 'VEHICLE_ID'])

merged_data = crashes.merge(vehicles_with_people, 
                            how='inner', 
                            on=['CRASH_RECORD_ID', 'RD_NO', 'CRASH_DATE'])

print('Dimensions of the data after merging', merged_data.shape)

Dimensions of the data after merging (68503, 144)


In [5]:
# Alternative: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
raw_train, raw_test = train_test_split(merged_data, test_size=0.20, random_state=42, shuffle=True)

print("Train shape", raw_train.shape)
print("Test shape", raw_test.shape)

Train shape (54802, 144)
Test shape (13701, 144)


In [6]:
# save the train and test file
#train.to_csv('~/repos/vision_zero_chicago_project/sprint 2/raw_train.csv')
#test.to_csv('~/repos/vision_zero_chicago_project/sprint 2/raw_test.csv')

# Feature Reduction

In [7]:
# Drop the features that are not needed
feature_drop_train = ["CRASH_DATE_EST_I", "REPORT_TYPE", "CRASH_DATE_EST_I", "REPORT_TYPE", 
                      "DATE_POLICE_NOTIFIED","BEAT_OF_OCCURRENCE", "PHOTOS_TAKEN_I", "STATEMENTS_TAKEN_I", 
                      "WORK_ZONE_TYPE", "WORKERS_PRESENT_I","INJURIES_NO_INDICATION", "INJURIES_UNKNOWN",
                      "CRASH_DAY_OF_WEEK", "CRASH_MONTH", "RD_NO", "VEHICLE_ID", "CRASH_RECORD_ID",
                      "CRASH_DATE", "SEAT_NO", "STATE", "ZIPCODE", "DRIVERS_LICENSE_STATE", "PERSON_ID",
                      "DRIVERS_LICENSE_CLASS", "INJURY_CLASSIFICATION", "HOSPITAL", "EMS_AGENCY", 
                      "EMS_RUN_NO", "PEDPEDAL_LOCATION", "LOCATION","DAMAGE","CRASH_TYPE","MODEL",
                      "BAC_RESULT", "CRASH_UNIT_ID", "RD_NO", "UNIT_NO", "UNIT_TYPE", "VEHICLE_ID", 
                      "CMRC_VEH_I", "MAKE", "TOWED_I", "FIRE_I", "TOWED_BY", "STREET_NO","STREET_NAME",
                      "TOWED_TO", "AREA_00_I", "AREA_01_I", "AREA_02_I", "AREA_03_I", "AREA_04_I", 
                      "AREA_05_I", "AREA_06_I", "AREA_07_I", "AREA_08_I", "AREA_09_I", "AREA_10_I", 
                      "AREA_11_I", "AREA_12_I", "AREA_99_I", "CMV_ID", "USDOT_NO", "CCMC_NO", 
                      "ILCC_NO", "COMMERCIAL_SRC", "GVWR", "CARRIER_NAME", "CARRIER_STATE", "CARRIER_CITY",
                      "HAZMAT_PLACARDS_I", "HAZMAT_NAME", "UN_NO", "HAZMAT_PRESENT_I", "HAZMAT_REPORT_I",
                      "HAZMAT_REPORT_NO", "MCS_REPORT_I", "MCS_REPORT_NO", "HAZMAT_VIO_CAUSE_CRASH_I", 
                      "MCS_VIO_CAUSE_CRASH_I", "IDOT_PERMIT_NO", "WIDE_LOAD_I", "TRAILER1_WIDTH", 
                      "TRAILER2_WIDTH", "TRAILER1_LENGTH", "TRAILER2_LENGTH", "TOTAL_VEHICLE_LENGTH",
                      "AXLE_CNT", "VEHICLE_CONFIG", "CARGO_BODY_TYPE", "LOAD_TYPE", "HAZMAT_OUT_OF_SERVICE_I",
                      "INJURIES_FATAL","INJURIES_INCAPACITATING","INJURIES_NON_INCAPACITATING","INJURIES_REPORTED_NOT_EVIDENT",
                      "MCS_OUT_OF_SERVICE_I", "HAZMAT_CLASS"]

train = raw_train.loc[:, ~raw_train.columns.isin(feature_drop_train)]

**Columns needs to be fixed**

TRAFFIC_CONTROL_DEVICE- UNKNOWN, OTHER
DEVICE_CONDITION - UNKNOWN, OTHER
WEATHER_CONDITION - UNKNOWN, OTHER
LIGHTING_CONDITION - UNKNOWN
TRAFFICWAY_TYPE - UNKNOWN, OTHER
ROADWAY_SURFACE_COND - UNKNOWN, OTHER
ROAD_DEFECT - UNKNOWN, OTHER
PRIM_CONTRIBUTORY_CAUSE- UNABLE TO DETERMINE
SEC_CONTRIBUTORY_CAUSE - UNABLE TO DETERMINE
MODEL - UNKNOWN
VEHICLE_DEFECT - UNKNOWN, OTHER
VEHICLE_TYPE - UNKNOWN/NA
TRAVEL_DIRECTION - UNKNOWN
MANEUVER - UNKNOWN/NA
SEX - X
SAFETY_EQUIPMENT - USAGE UNKNOWN
AIRBAG_DEPLOYED - DEPLOYMENT UNKNOWN
EJECTION - UNKNOWN
DRIVER_ACTION - UNKNOWN
DRIVER_VISION - UNKNOWN
PHYSICAL_CONDITION - UNKNOWN
PEDPEDAL_ACTION - UNKNOWN/NA 

In [8]:
train.loc[train["TRAFFIC_CONTROL_DEVICE"]=="UNKNOWN","TRAFFIC_CONTROL_DEVICE"]=np.nan
train.loc[train["DEVICE_CONDITION"]=="UNKNOWN","DEVICE_CONDITION"]=np.nan
train.loc[train["WEATHER_CONDITION"]=="UNKNOWN","WEATHER_CONDITION"]=np.nan
train.loc[train["LIGHTING_CONDITION"]=="UNKNOWN","LIGHTING_CONDITION"]=np.nan
train.loc[train["TRAFFICWAY_TYPE"]=="UNKNOWN","TRAFFICWAY_TYPE"]=np.nan
train.loc[train["ROADWAY_SURFACE_COND"]=="UNKNOWN","ROADWAY_SURFACE_COND"]=np.nan
train.loc[train["ROAD_DEFECT"]=="UNKNOWN","ROAD_DEFECT"]=np.nan

train.loc[train["VEHICLE_DEFECT"]=="UNKNOWN","VEHICLE_DEFECT"]=np.nan
train.loc[train["VEHICLE_TYPE"]=="UNKNOWN/NA","VEHICLE_TYPE"]=np.nan
train.loc[train["TRAVEL_DIRECTION"]=="UNKNOWN","TRAVEL_DIRECTION"]=np.nan
train.loc[train["MANEUVER"]=="UNKNOWN/NA","MANEUVER"]=np.nan

train.loc[train["SAFETY_EQUIPMENT"]=="USAGE UNKNOWN","SAFETY_EQUIPMENT"]=np.nan
train.loc[train["AIRBAG_DEPLOYED"]=="DEPLOYMENT UNKNOWN","AIRBAG_DEPLOYED"]=np.nan
train.loc[train["EJECTION"]=="UNKNOWN","EJECTION"]=np.nan
train.loc[train["DRIVER_ACTION"]=="UNKNOWN","DRIVER_ACTION"]=np.nan
train.loc[train["DRIVER_VISION"]=="UNKNOWN","DRIVER_VISION"]=np.nan
train.loc[train["PHYSICAL_CONDITION"]=="UNKNOWN","PHYSICAL_CONDITION"]=np.nan
train.loc[train["PEDPEDAL_ACTION"]=="UNKNOWN/NA","PEDPEDAL_ACTION"]=np.nan

## Missing value overview on train data

In [9]:
# replacing TRAFFIC_CONTROL_DEVICE with No Controls 
# replacing DEVICE_CONDITION with No Controls 

train.fillna({
    'TRAFFIC_CONTROL_DEVICE': 'NO CONTROLS',
    'DEVICE_CONDITION': 'NO CONTROLS',
    'WEATHER_CONDITION': 'CLEAR',
    'LIGHTING_CONDITION': 'DAYLIGHT',
    'TRAFFICWAY_TYPE': 'NOT DIVIDED',
    'ROADWAY_SURFACE_COND': 'NO DEFECTS',
    'ROAD_DEFECT': 'CLEAR',
    'INTERSECTION_RELATED_I': 'N',
    
    'NOT_RIGHT_OF_WAY_I': 'N',
    'HIT_AND_RUN_I': 'N',
    'DOORING_I': 'N',
    'WORK_ZONE_I': 'N',
    'NUM_PASSENGERS': 0,
    'LIC_PLATE_STATE': 'IL',
    'VEHICLE_DEFECT': 'UNABLE TO DETERMINE',
    'VEHICLE_TYPE': 'OTHER',
    
    'VEHICLE_USE': 'OTHER',
    'TRAVEL_DIRECTION': 'UNABLE TO DETERMINE', # UPDATED
    'MANEUVER': 'OTHER',
    'OCCUPANT_CNT': 0,
    'EXCEED_SPEED_LIMIT_I': 'N',
    'FIRST_CONTACT_POINT': 'OTHER',
    'PERSON_TYPE': 'UNABLE TO DETERMINE',
    'CITY': 'OTHER',
    
    'SEX': 'UNABLE TO DETERMINE', #or X ?
    'AIRBAG_DEPLOYED': 'UNABLE TO DETERMINE',
    'EJECTION': 'UNABLE TO DETERMINE',
    'DRIVER_ACTION': 'OTHER',
    'DRIVER_VISION': 'OTHER',
    'PHYSICAL_CONDITION': 'UNABLE TO DETERMINE',
    'PEDPEDAL_ACTION': 'UNABLE TO DETERMINE',
    'PEDPEDAL_VISIBILITY': 'UNABLE TO DETERMINE',
    
    'CELL_PHONE_USE': 'UNABLE TO DETERMINE',
    'SAFETY_EQUIPMENT': 'UNABLE TO DETERMINE',
    'BAC_RESULT VALUE': 0
}, inplace=True)

#### Replace with frequency (mode):
“LANE_CNT”: 30,763 missing data. Replace them with mode “2”

In [10]:
train['LANE_CNT'].fillna("2",inplace=True)

print("Total NAs in LANE_CNT is", train['LANE_CNT'].isnull().sum())

Total NAs in LANE_CNT is 0


#### Replace with median

In [11]:
train["AGE"] = pd.to_numeric(train["AGE"])
train['AGE'].fillna((train['AGE'].median()), inplace=True)

print("TOTAL NAs in AGE is", train['AGE'].isnull().sum())

TOTAL NAs in AGE is 0


#### Replace with mode/median

In [12]:
train['VEHICLE_YEAR'].fillna(train['VEHICLE_YEAR'].mode()[0], inplace=True)

train["OCCUPANT_CNT"] = pd.to_numeric(train["OCCUPANT_CNT"])
train['OCCUPANT_CNT'].fillna((train['OCCUPANT_CNT'].median()), inplace=True)

print("TOTAL NAs in VEHICLE_YEAR is", train['VEHICLE_YEAR'].isnull().sum())

TOTAL NAs in VEHICLE_YEAR is 0


#### Drop the missing rows


In [13]:
drop_rows = ['INJURIES_TOTAL', 'LATITUDE','MOST_SEVERE_INJURY']
train.dropna(how='any', subset=drop_rows, inplace = True)
#train.isnull().sum()

In [14]:
train.shape

(54388, 50)

# Handling Outliers

In [15]:
numericals = ['LANE_CNT', 'NUM_UNITS', 'VEHICLE_YEAR', 'CRASH_HOUR']

for col in numericals:
    #print(train[col].value_counts())
    train[col] = pd.to_numeric(train[col])
    #print()

In [16]:
# Change all rows with value greater than 6 to 6
train.loc[(train['LANE_CNT'] > 6), 'LANE_CNT'] = 6
train['LANE_CNT'].value_counts()

2    44921
4     4408
1     2970
3      769
6      575
0      543
5      202
Name: LANE_CNT, dtype: int64

# Feature Engineering

##  Target Feature Engineering

In [17]:
def injury(x): 
    if any(s in x for s in ["FATAL","NONINCAPACITATING INJURY","INCAPACITATING INJURY"]):
        return "INJURED"
    else:
        return "NOT INJURED"
    
# replacing NA values with OTHER
train["INJURY"] = train["MOST_SEVERE_INJURY"].apply(lambda x: injury(x))
train.drop("MOST_SEVERE_INJURY", axis=1, inplace=True)

In [18]:
train["INJURY"].value_counts()

NOT INJURED    48496
INJURED         5892
Name: INJURY, dtype: int64

##  Other Feature Engineering / Adaptation

### CONTACT_POINT

Reduced # of layers in `FIRST_CONTACT_POINT` to:
- FRONT
- SIDE
- READ
- OTHER

In [19]:
def contact_point(x):
    if "FRONT" in x:
        return "FRONT"
    elif "SIDE" in x:
        return "SIDE"
    elif "REAR" in x:
        return "REAR"
    else:
        return "OTHER"
    
# replacing NA values with OTHER
train["FIRST_CONTACT_POINT"] = train["FIRST_CONTACT_POINT"].apply(lambda x: contact_point(x))

In [20]:
train["MANEUVER"].unique()

array(['STRAIGHT AHEAD', 'CHANGING LANES', 'TURNING LEFT',
       'TURNING RIGHT', 'PARKED', 'MERGING', 'SLOW/STOP IN TRAFFIC',
       'OTHER', 'ENTERING TRAFFIC LANE FROM PARKING',
       'PASSING/OVERTAKING', 'LEAVING TRAFFIC LANE TO PARK', 'BACKING',
       'STARTING IN TRAFFIC', 'SKIDDING/CONTROL LOSS',
       'SLOW/STOP - LEFT TURN', 'SLOW/STOP - RIGHT TURN',
       'PARKED IN TRAFFIC LANE', 'AVOIDING VEHICLES/OBJECTS',
       'SLOW/STOP - LOAD/UNLOAD', 'ENTER FROM DRIVE/ALLEY',
       'TURNING ON RED', 'DRIVING WRONG WAY', 'U-TURN',
       'NEGOTIATING A CURVE', 'DISABLED', 'DRIVERLESS', 'DIVERGING'],
      dtype=object)

### MANEUVER, VEHICLE_YEAR, POSTED_SPEED_LIMIT

Reduced # of layers in `MANEUVER` to:
- UNKNOWN/NA
- TURN
- LANE
- OTHER

Reduced # layers in `VEHICLE_YEAR` to:
- NEW
- OLD (<2010)
- UNKNOWN

Reduced # layers in `POSTED_SPEED_LIMIT` to:
- LOW_SPEED (< 30mph)
- HIGH_SPEED

In [21]:
# merging same type of Maneuver in one
train["MANEUVER"] = train["MANEUVER"].apply(lambda x: "TURN" if "TURN" in x else("LANE" if any(s in x for s in ["LANE","OVER","ENTER"]) else x))
train["MANEUVER"] = train["MANEUVER"].apply(lambda x: "OTHER" if all(s not in x for s in ["AHEAD","TURN","UNKNOWN","LANE","BACKING"]) else x)

# converting VEHICLE_YEAR into new category of OLD or NEW
train["VEHICLE_YEAR"]= train["VEHICLE_YEAR"].apply(lambda x: "UNKNOWN" if pd.isnull(x) else ("OLD (<2010)" if int(x)<2010 else "NEW"))

# converting POSTED_SPEED_LIMIT into new category of LOW SPEED or HIGH SPEED
train["POSTED_SPEED_LIMIT"]=train["POSTED_SPEED_LIMIT"].apply(lambda x: "LOW_SPEED" if int(x)<=30 else "HIGH_SPEED")

### SAFETY_EQUIPMENT, AIRBAG_DEPLOYED

Reduced # of layers in `SAFETY_EQUIPMENT` to:
- USED SAFETY EQUIPMENT
- DID NOT USE SAFETY EQUIPMENT

Reduced # of layers in `AIRBAG_DEPLOYED` to:
- DEPLOYED
- NOT DEPLOYED

In [22]:
# grouping the SAFETY_EQUIPMENT used as SAFE and UNSAFE
def equip_used(x):
    if ("USED" in x) or ("HELMET" in x) or ("NONE PRESENT" in x):
        if any(s in x for s in ["NOT","IMPROPER","NONE PRESENT"]):
            return "DID NOT USE SAFETY EQUIP"
        else:
            return "USED SAFETY EQUIP"
    else:
        return x

train["SAFETY_EQUIPMENT"] = train["SAFETY_EQUIPMENT"].apply(lambda x: equip_used(x))


# grouping the AIRBAG_DEPLOYED used as DEPLOYED and NOT DEPLOYED
def airbag(x):
    if ("DEPLOY" in x) and ("UNKNOWN" not in x):
        if "NOT" in x:
            return "NOT DEPLOYED"
        else:
            return "DEPLOYED"
    else:
        return x
 
train["AIRBAG_DEPLOYED"] = train["AIRBAG_DEPLOYED"].apply(lambda x: airbag(x))

### CRASH_HOUR

Transform the crash hour to 4 categories:
- Early morning
- Morning
- Afternoon
- Night

In [23]:
def crash_hour(x):
    if  2 <= x < 8:
        return "Early_morning"
    elif 8 <= x < 12:
        return "Morning"
    elif 12 <= x < 18:
        return "Afternoon"
    else:
        return "Night"
    
# replacing NA values with OTHER
train["CRASH_HOUR"] = train["CRASH_HOUR"].apply(lambda x: crash_hour(int(x)))

### TRAFFIC_CONTROL_DEVICE

Transform the signs

- 'TRAFFIC SIGNAL', 'STOP SIGN/FLASHER', 'LANE USE MARKING', 'YIELD', 'DELINEATORS', 
'OTHER REG. SIGN', 'OTHER WARNING SIGN', 'POLICE/FLAGMAN', 'RAILROAD CROSSING GATE', 
'OTHER RAILROAD CROSSING', 'FLASHING CONTROL SIGNAL', 'PEDESTRIAN CROSSING SIGN',
'NO PASSING', 'SCHOOL ZONE', 'BICYCLE CROSSING SIGN' as **`SIGN`**
    
    
- 'NO CONTROLS','UNKNOWN', 'OTHER' as **`NO_SIGN`**

In [24]:
def traffic_control(x):
    if ("NO CONTROLS" in x) or ("UNKNOWN" in x) or ("OTHER" in x):
        return "NO_SIGN"
    else:
        return "SIGN"

train["TRAFFIC_CONTROL_DEVICE"]= train["TRAFFIC_CONTROL_DEVICE"].apply(lambda x: traffic_control(x))

In [25]:
def location(x1,x2):
    if (41.875297 <= float(x1) <= 41.9000064) and (-87.6321459 <= float(x2) <= -87.6096752):
        return "Downtown"
    else:
        return "Not Downtown"

#df['col_3'] = df.apply(lambda x: f(x.col_1, x.col_2), axis=1)
train["Location"] = train.apply(lambda x: location(x["LATITUDE"],x["LONGITUDE"]), axis=1)

In [26]:
train["Location"].unique()

array(['Downtown', 'Not Downtown'], dtype=object)

In [27]:
# NO MISSING VALUES LEFT!
# train.isnull().sum()

In [28]:
train.shape

(54388, 51)

In [29]:
train.head()

Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,LANE_CNT,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,STREET_DIRECTION,DOORING_I,WORK_ZONE_I,NUM_UNITS,INJURIES_TOTAL,CRASH_HOUR,LATITUDE,LONGITUDE,NUM_PASSENGERS,LIC_PLATE_STATE,VEHICLE_YEAR,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,TRAVEL_DIRECTION,MANEUVER,OCCUPANT_CNT,EXCEED_SPEED_LIMIT_I,FIRST_CONTACT_POINT,PERSON_TYPE,CITY,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,EJECTION,DRIVER_ACTION,DRIVER_VISION,PHYSICAL_CONDITION,PEDPEDAL_ACTION,PEDPEDAL_VISIBILITY,BAC_RESULT VALUE,CELL_PHONE_USE,INJURY,Location
29244,LOW_SPEED,SIGN,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,ANGLE,DIVIDED - W/MEDIAN BARRIER,4,STRAIGHT ON GRADE,OTHER,CLEAR,Y,N,N,DISREGARDING TRAFFIC SIGNALS,FAILING TO YIELD RIGHT-OF-WAY,E,N,N,2,0,Afternoon,41.891233581,-87.613501254,0,IL,NEW,OTHER,SPORT UTILITY VEHICLE (SUV),PERSONAL,S,STRAIGHT AHEAD,1,N,SIDE,DRIVER,CHICAGO,M,26.0,USED SAFETY EQUIP,DEPLOYED,NONE,NONE,NOT OBSCURED,NORMAL,UNABLE TO DETERMINE,UNABLE TO DETERMINE,0,UNABLE TO DETERMINE,NOT INJURED,Downtown
22376,LOW_SPEED,SIGN,FUNCTIONING PROPERLY,RAIN,DAYLIGHT,SIDESWIPE SAME DIRECTION,ONE-WAY,3,STRAIGHT AND LEVEL,WET,NO DEFECTS,N,N,N,IMPROPER OVERTAKING/PASSING,UNABLE TO DETERMINE,S,N,N,2,0,Afternoon,41.879610255,-87.63097225,0,IL,NEW,NONE,PASSENGER,TAXI/FOR HIRE,S,LANE,1,N,SIDE,DRIVER,HOFFMAN ESTATES,M,51.0,UNABLE TO DETERMINE,NOT DEPLOYED,NONE,IMPROPER LANE CHANGE,OTHER,UNABLE TO DETERMINE,UNABLE TO DETERMINE,UNABLE TO DETERMINE,0,UNABLE TO DETERMINE,NOT INJURED,Downtown
49790,LOW_SPEED,NO_SIGN,NO CONTROLS,CLEAR,DAYLIGHT,OTHER OBJECT,NOT DIVIDED,2,STRAIGHT AND LEVEL,DRY,"RUT, HOLES",N,N,N,ROAD CONSTRUCTION/MAINTENANCE,NOT APPLICABLE,N,N,N,1,0,Afternoon,41.966268686,-87.836830283,0,IL,NEW,NONE,PASSENGER,PERSONAL,S,STRAIGHT AHEAD,1,N,OTHER,DRIVER,CHICAGO,M,73.0,USED SAFETY EQUIP,NOT DEPLOYED,NONE,NONE,NOT OBSCURED,NORMAL,UNABLE TO DETERMINE,UNABLE TO DETERMINE,0,UNABLE TO DETERMINE,NOT INJURED,Not Downtown
32552,LOW_SPEED,NO_SIGN,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,ALLEY,2,STRAIGHT AND LEVEL,DRY,NO DEFECTS,N,N,N,UNABLE TO DETERMINE,NOT APPLICABLE,W,N,N,2,0,Morning,41.92902255,-87.659229555,0,IL,NEW,NONE,TRUCK - SINGLE UNIT,UNKNOWN/NA,E,TURN,1,N,OTHER,DRIVER,OTHER,F,38.0,UNABLE TO DETERMINE,NOT APPLICABLE,NONE,NONE,NOT OBSCURED,NORMAL,UNABLE TO DETERMINE,UNABLE TO DETERMINE,0,UNABLE TO DETERMINE,NOT INJURED,Not Downtown
1957,LOW_SPEED,SIGN,FUNCTIONING PROPERLY,CLEAR,DARKNESS,TURNING,FOUR WAY,2,STRAIGHT AND LEVEL,DRY,NO DEFECTS,N,N,Y,IMPROPER TURNING/NO SIGNAL,UNABLE TO DETERMINE,S,N,N,2,0,Early_morning,41.707182175,-87.620361492,0,IL,OLD (<2010),NONE,PASSENGER,PERSONAL,E,TURN,1,N,OTHER,DRIVER,CHICAGO,F,30.0,USED SAFETY EQUIP,NOT DEPLOYED,NONE,OTHER,NOT OBSCURED,NORMAL,UNABLE TO DETERMINE,UNABLE TO DETERMINE,0,UNABLE TO DETERMINE,NOT INJURED,Not Downtown


# Baseline Model

These features require feature engineering.
- `LIC_PLATE_STATE`
- `PEDPEDAL_ACTION`
- `BAC_RESULT VALUE`
- `CITY`

Tony 33, Shantanu 46 features.

**Shantanu's extras**
ALIGNMENT, STREET_DIRECTION, DOORING_I, WORK_ZONE_I, NUM_PASSENGERS, LIC_PLATE_STATE, CITY, 
PEDPEDAL_ACTION, PEDPEDAL_VISIBILITY, BAC_RESULT VALUE, CELL_PHONE_USE

In [30]:
drop_list=["SEC_CONTRIBUTORY_CAUSE", "LATITUDE", "LONGITUDE", "INJURIES_TOTAL", "STREET_DIRECTION", "ALIGNMENT", 
          "NUM_PASSENGERS", "LIC_PLATE_STATE", "PEDPEDAL_ACTION", "BAC_RESULT VALUE", "CITY"]

train_new = train.drop(drop_list, axis=1)

In [31]:
# splitting the dataset into x_train and y_train
X_train = train_new.drop(["INJURY"],axis=1)
y_train = train_new["INJURY"]

In [32]:
# performing one hot encoding on all the object features
X_train = pd.get_dummies(X_train, columns=X_train.select_dtypes(['object']).columns, drop_first=True)

In [33]:
print(X_train.shape)

X_train.head()

(54388, 258)


Unnamed: 0,LANE_CNT,NUM_UNITS,OCCUPANT_CNT,AGE,POSTED_SPEED_LIMIT_LOW_SPEED,TRAFFIC_CONTROL_DEVICE_SIGN,DEVICE_CONDITION_FUNCTIONING PROPERLY,DEVICE_CONDITION_MISSING,DEVICE_CONDITION_NO CONTROLS,DEVICE_CONDITION_NOT FUNCTIONING,DEVICE_CONDITION_OTHER,DEVICE_CONDITION_WORN REFLECTIVE MATERIAL,WEATHER_CONDITION_BLOWING SNOW,WEATHER_CONDITION_CLEAR,WEATHER_CONDITION_CLOUDY/OVERCAST,WEATHER_CONDITION_FOG/SMOKE/HAZE,WEATHER_CONDITION_FREEZING RAIN/DRIZZLE,WEATHER_CONDITION_OTHER,WEATHER_CONDITION_RAIN,WEATHER_CONDITION_SEVERE CROSS WIND GATE,WEATHER_CONDITION_SLEET/HAIL,WEATHER_CONDITION_SNOW,"LIGHTING_CONDITION_DARKNESS, LIGHTED ROAD",LIGHTING_CONDITION_DAWN,LIGHTING_CONDITION_DAYLIGHT,LIGHTING_CONDITION_DUSK,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OTHER NONCOLLISION,FIRST_CRASH_TYPE_OTHER OBJECT,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING,TRAFFICWAY_TYPE_CENTER TURN LANE,TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN (NOT RAISED),TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN BARRIER,TRAFFICWAY_TYPE_DRIVEWAY,"TRAFFICWAY_TYPE_FIVE POINT, OR MORE",TRAFFICWAY_TYPE_FOUR WAY,TRAFFICWAY_TYPE_L-INTERSECTION,TRAFFICWAY_TYPE_NOT DIVIDED,TRAFFICWAY_TYPE_NOT REPORTED,TRAFFICWAY_TYPE_ONE-WAY,TRAFFICWAY_TYPE_OTHER,TRAFFICWAY_TYPE_PARKING LOT,TRAFFICWAY_TYPE_RAMP,TRAFFICWAY_TYPE_ROUNDABOUT,TRAFFICWAY_TYPE_T-INTERSECTION,TRAFFICWAY_TYPE_TRAFFIC ROUTE,TRAFFICWAY_TYPE_UNKNOWN INTERSECTION TYPE,TRAFFICWAY_TYPE_Y-INTERSECTION,ROADWAY_SURFACE_COND_ICE,ROADWAY_SURFACE_COND_NO DEFECTS,ROADWAY_SURFACE_COND_OTHER,"ROADWAY_SURFACE_COND_SAND, MUD, DIRT",ROADWAY_SURFACE_COND_SNOW OR SLUSH,ROADWAY_SURFACE_COND_WET,ROAD_DEFECT_DEBRIS ON ROADWAY,ROAD_DEFECT_NO DEFECTS,ROAD_DEFECT_OTHER,"ROAD_DEFECT_RUT, HOLES",ROAD_DEFECT_SHOULDER DEFECT,ROAD_DEFECT_WORN SURFACE,INTERSECTION_RELATED_I_Y,NOT_RIGHT_OF_WAY_I_Y,HIT_AND_RUN_I_Y,PRIM_CONTRIBUTORY_CAUSE_BICYCLE ADVANCING LEGALLY ON RED LIGHT,PRIM_CONTRIBUTORY_CAUSE_CELL PHONE USE OTHER THAN TEXTING,PRIM_CONTRIBUTORY_CAUSE_DISREGARDING OTHER TRAFFIC SIGNS,PRIM_CONTRIBUTORY_CAUSE_DISREGARDING ROAD MARKINGS,PRIM_CONTRIBUTORY_CAUSE_DISREGARDING STOP SIGN,PRIM_CONTRIBUTORY_CAUSE_DISREGARDING TRAFFIC SIGNALS,PRIM_CONTRIBUTORY_CAUSE_DISREGARDING YIELD SIGN,PRIM_CONTRIBUTORY_CAUSE_DISTRACTION - FROM INSIDE VEHICLE,PRIM_CONTRIBUTORY_CAUSE_DISTRACTION - FROM OUTSIDE VEHICLE,"PRIM_CONTRIBUTORY_CAUSE_DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)",PRIM_CONTRIBUTORY_CAUSE_DRIVING ON WRONG SIDE/WRONG WAY,PRIM_CONTRIBUTORY_CAUSE_DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,PRIM_CONTRIBUTORY_CAUSE_EQUIPMENT - VEHICLE CONDITION,"PRIM_CONTRIBUTORY_CAUSE_EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST",PRIM_CONTRIBUTORY_CAUSE_EXCEEDING AUTHORIZED SPEED LIMIT,PRIM_CONTRIBUTORY_CAUSE_EXCEEDING SAFE SPEED FOR CONDITIONS,PRIM_CONTRIBUTORY_CAUSE_FAILING TO REDUCE SPEED TO AVOID CRASH,PRIM_CONTRIBUTORY_CAUSE_FAILING TO YIELD RIGHT-OF-WAY,PRIM_CONTRIBUTORY_CAUSE_FOLLOWING TOO CLOSELY,PRIM_CONTRIBUTORY_CAUSE_HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE),PRIM_CONTRIBUTORY_CAUSE_IMPROPER BACKING,PRIM_CONTRIBUTORY_CAUSE_IMPROPER LANE USAGE,PRIM_CONTRIBUTORY_CAUSE_IMPROPER OVERTAKING/PASSING,PRIM_CONTRIBUTORY_CAUSE_IMPROPER TURNING/NO SIGNAL,PRIM_CONTRIBUTORY_CAUSE_MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT,PRIM_CONTRIBUTORY_CAUSE_NOT APPLICABLE,"PRIM_CONTRIBUTORY_CAUSE_OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER",PRIM_CONTRIBUTORY_CAUSE_PASSING STOPPED SCHOOL BUS,PRIM_CONTRIBUTORY_CAUSE_PHYSICAL CONDITION OF DRIVER,PRIM_CONTRIBUTORY_CAUSE_RELATED TO BUS STOP,PRIM_CONTRIBUTORY_CAUSE_ROAD CONSTRUCTION/MAINTENANCE,PRIM_CONTRIBUTORY_CAUSE_ROAD ENGINEERING/SURFACE/MARKING DEFECTS,PRIM_CONTRIBUTORY_CAUSE_TEXTING,PRIM_CONTRIBUTORY_CAUSE_TURNING RIGHT ON RED,PRIM_CONTRIBUTORY_CAUSE_UNABLE TO DETERMINE,PRIM_CONTRIBUTORY_CAUSE_UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED),"PRIM_CONTRIBUTORY_CAUSE_VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)",PRIM_CONTRIBUTORY_CAUSE_WEATHER,DOORING_I_Y,WORK_ZONE_I_Y,CRASH_HOUR_Early_morning,CRASH_HOUR_Morning,CRASH_HOUR_Night,VEHICLE_YEAR_OLD (<2010),VEHICLE_DEFECT_CARGO,VEHICLE_DEFECT_ENGINE/MOTOR,VEHICLE_DEFECT_FUEL SYSTEM,VEHICLE_DEFECT_LIGHTS,VEHICLE_DEFECT_NONE,VEHICLE_DEFECT_OTHER,VEHICLE_DEFECT_RESTRAINT SYSTEM,VEHICLE_DEFECT_SIGNALS,VEHICLE_DEFECT_STEERING,VEHICLE_DEFECT_SUSPENSION,VEHICLE_DEFECT_TIRES,VEHICLE_DEFECT_UNABLE TO DETERMINE,VEHICLE_DEFECT_WHEELS,VEHICLE_DEFECT_WINDOWS,VEHICLE_TYPE_ALL-TERRAIN VEHICLE (ATV),VEHICLE_TYPE_AUTOCYCLE,VEHICLE_TYPE_BUS OVER 15 PASS.,VEHICLE_TYPE_BUS UP TO 15 PASS.,VEHICLE_TYPE_FARM EQUIPMENT,VEHICLE_TYPE_MOPED OR MOTORIZED BICYCLE,VEHICLE_TYPE_MOTOR DRIVEN CYCLE,VEHICLE_TYPE_MOTORCYCLE (OVER 150CC),VEHICLE_TYPE_OTHER,VEHICLE_TYPE_OTHER VEHICLE WITH TRAILER,VEHICLE_TYPE_PASSENGER,VEHICLE_TYPE_PICKUP,VEHICLE_TYPE_SINGLE UNIT TRUCK WITH TRAILER,VEHICLE_TYPE_SNOWMOBILE,VEHICLE_TYPE_SPORT UTILITY VEHICLE (SUV),VEHICLE_TYPE_TRACTOR W/ SEMI-TRAILER,VEHICLE_TYPE_TRACTOR W/O SEMI-TRAILER,VEHICLE_TYPE_TRUCK - SINGLE UNIT,VEHICLE_TYPE_VAN/MINI-VAN,VEHICLE_USE_AMBULANCE,VEHICLE_USE_CAMPER/RV - SINGLE UNIT,VEHICLE_USE_COMMERCIAL - MULTI-UNIT,VEHICLE_USE_COMMERCIAL - SINGLE UNIT,VEHICLE_USE_CONSTRUCTION/MAINTENANCE,VEHICLE_USE_CTA,VEHICLE_USE_DRIVER EDUCATION,VEHICLE_USE_FIRE,VEHICLE_USE_LAWN CARE/LANDSCAPING,VEHICLE_USE_MASS TRANSIT,VEHICLE_USE_MILITARY,VEHICLE_USE_NOT IN USE,VEHICLE_USE_OTHER,VEHICLE_USE_OTHER TRANSIT,VEHICLE_USE_PERSONAL,VEHICLE_USE_POLICE,VEHICLE_USE_RIDESHARE SERVICE,VEHICLE_USE_SCHOOL BUS,VEHICLE_USE_STATE OWNED,VEHICLE_USE_TAXI/FOR HIRE,VEHICLE_USE_TOW TRUCK,VEHICLE_USE_UNKNOWN/NA,TRAVEL_DIRECTION_N,TRAVEL_DIRECTION_NE,TRAVEL_DIRECTION_NW,TRAVEL_DIRECTION_S,TRAVEL_DIRECTION_SE,TRAVEL_DIRECTION_SW,TRAVEL_DIRECTION_UNABLE TO DETERMINE,TRAVEL_DIRECTION_W,MANEUVER_LANE,MANEUVER_OTHER,MANEUVER_STRAIGHT AHEAD,MANEUVER_TURN,EXCEED_SPEED_LIMIT_I_Y,FIRST_CONTACT_POINT_OTHER,FIRST_CONTACT_POINT_REAR,FIRST_CONTACT_POINT_SIDE,PERSON_TYPE_DRIVER,PERSON_TYPE_NON-CONTACT VEHICLE,PERSON_TYPE_NON-MOTOR VEHICLE,PERSON_TYPE_PEDESTRIAN,PERSON_TYPE_UNABLE TO DETERMINE,SEX_M,SEX_UNABLE TO DETERMINE,SEX_X,SAFETY_EQUIPMENT_DID NOT USE SAFETY EQUIP,SAFETY_EQUIPMENT_UNABLE TO DETERMINE,SAFETY_EQUIPMENT_USED SAFETY EQUIP,SAFETY_EQUIPMENT_WHEELCHAIR,AIRBAG_DEPLOYED_NOT APPLICABLE,AIRBAG_DEPLOYED_NOT DEPLOYED,AIRBAG_DEPLOYED_UNABLE TO DETERMINE,EJECTION_PARTIALLY EJECTED,EJECTION_TOTALLY EJECTED,EJECTION_TRAPPED/EXTRICATED,EJECTION_UNABLE TO DETERMINE,DRIVER_ACTION_DISREGARDED CONTROL DEVICES,DRIVER_ACTION_EMERGENCY VEHICLE ON CALL,DRIVER_ACTION_EVADING POLICE VEHICLE,DRIVER_ACTION_FAILED TO YIELD,DRIVER_ACTION_FOLLOWED TOO CLOSELY,DRIVER_ACTION_IMPROPER BACKING,DRIVER_ACTION_IMPROPER LANE CHANGE,DRIVER_ACTION_IMPROPER PARKING,DRIVER_ACTION_IMPROPER PASSING,DRIVER_ACTION_IMPROPER TURN,DRIVER_ACTION_LICENSE RESTRICTIONS,DRIVER_ACTION_NONE,DRIVER_ACTION_OTHER,DRIVER_ACTION_OVERCORRECTED,DRIVER_ACTION_STOPPED SCHOOL BUS,DRIVER_ACTION_TEXTING,DRIVER_ACTION_TOO FAST FOR CONDITIONS,DRIVER_ACTION_WRONG WAY/SIDE,DRIVER_VISION_BLINDED - SUNLIGHT,DRIVER_VISION_BLOWING MATERIALS,DRIVER_VISION_BUILDINGS,DRIVER_VISION_EMBANKMENT,DRIVER_VISION_HILLCREST,DRIVER_VISION_MOVING VEHICLES,DRIVER_VISION_NOT OBSCURED,DRIVER_VISION_OTHER,DRIVER_VISION_PARKED VEHICLES,DRIVER_VISION_SIGNBOARD,"DRIVER_VISION_TREES, PLANTS",DRIVER_VISION_WINDSHIELD (WATER/ICE),PHYSICAL_CONDITION_FATIGUED/ASLEEP,PHYSICAL_CONDITION_HAD BEEN DRINKING,PHYSICAL_CONDITION_ILLNESS/FAINTED,PHYSICAL_CONDITION_IMPAIRED - ALCOHOL,PHYSICAL_CONDITION_IMPAIRED - ALCOHOL AND DRUGS,PHYSICAL_CONDITION_IMPAIRED - DRUGS,PHYSICAL_CONDITION_MEDICATED,PHYSICAL_CONDITION_NORMAL,PHYSICAL_CONDITION_OTHER,PHYSICAL_CONDITION_REMOVED BY EMS,PHYSICAL_CONDITION_UNABLE TO DETERMINE,PEDPEDAL_VISIBILITY_NO CONTRASTING CLOTHING,PEDPEDAL_VISIBILITY_OTHER LIGHT SOURCE USED,PEDPEDAL_VISIBILITY_REFLECTIVE MATERIAL,PEDPEDAL_VISIBILITY_UNABLE TO DETERMINE,CELL_PHONE_USE_UNABLE TO DETERMINE,CELL_PHONE_USE_Y,Location_Not Downtown
29244,4,2,1,26.0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0
22376,3,2,1,51.0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0
49790,2,1,1,73.0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1
32552,2,2,1,38.0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1
1957,2,2,1,30.0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,1


**NOT INJURED will be 0 !**

In [34]:
# Encode the y_train labels
#lb = preprocessing.LabelBinarizer()
#y_train=lb.fit_transform(train_new["INJURY"])

# NOT INJURED will be 0
y_train = preprocessing.label_binarize(train_new['INJURY'], classes=['NOT INJURED', 'INJURED']) 
y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

**Real Baseline Scores**

- **Accuracy: ~0.91**

- **Recall: ~0.31**

- **Precision: ~0.76**

- **F1: ~0.44**

In [35]:
#Create a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)

#Train the model using the training sets y_pred=rf.predict(X_test)
scores = cross_val_score(rf, X_train, y_train, cv=5)
print('Accuracy', np.mean(scores), scores)

recall = cross_val_score(rf, X_train, y_train, cv=5, scoring='recall')
print('Recall', np.mean(recall), recall)

precision = cross_val_score(rf, X_train, y_train, cv=5, scoring='precision')
print('Precision', np.mean(precision), precision)

f1 = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1')
print('F1', np.mean(f1), f1)

Accuracy 0.9145583630210432 [0.91763192 0.91294356 0.91285163 0.91293555 0.91642916]
Recall 0.3058390250435249 [0.32427844 0.2985581  0.30958439 0.27758913 0.31918506]
Precision 0.7642548624681645 [0.79253112 0.74576271 0.73146293 0.77304965 0.77846791]
F1 0.4365849844201898 [0.46024096 0.42640824 0.43504172 0.40849469 0.45273931]


# Iterations

## Function definitions

In [38]:
def prepare_dataset(features):
    ''' Returns a RF-ready set without one-hot encoded features of 'UNABLE','UNKNOWN','NOT APPLICABLE','OTHER'. '''
    
    X_train = train[features]
    X_train = pd.get_dummies(X_train, columns = X_train.select_dtypes(['object']).columns, drop_first = True)
    
    dummies_to_drop = X_train.columns[X_train.columns.str.contains("UNABLE|UNKNOWN|NOT APPLICABLE|OTHER")]
    X_train = X_train.loc[:, ~X_train.columns.isin(dummies_to_drop)]  
    return(X_train)

In [39]:
def rf_evaluator(X = X_train, y = y_train, cv = 5, n_estimators = 100, random_state = 0,
                 max_depth = None, class_weight = None, min_samples_leaf = 1, min_samples_split = 2):

    '''Runs a RF classifier with default values and returns a dataframe consists accuracy, recall, precision and f1 
    mean scores calculated by cross-validation.'''

    rf = RandomForestClassifier(n_estimators = n_estimators, 
                                random_state = random_state, 
                                max_depth = max_depth,
                                min_samples_leaf = min_samples_leaf, 
                                min_samples_split = min_samples_split,
                                class_weight = class_weight)
    rf.fit(X, y)
    
    metrics_table = []
    for metric in ['accuracy', 'recall', 'precision', 'f1']:
        metrics_table.append([metric, cross_val_score(rf, X, y, cv = cv, scoring = metric).mean()])
    
    metrics_table = pd.DataFrame(metrics_table, columns = ['metric', 'mean_score'])
    return(metrics_table)

In [40]:
def best_recall_tree_selector(n_estimators = 100, random_state = 0, class_weight = None, 
                              max_depth = None, min_samples_leaf = 1, min_samples_split = 2):
    
    '''Runs a RF classifier with default values and returns best performing tree object in terms of recall.'''
    
    rf = RandomForestClassifier(n_estimators = n_estimators,
                                random_state = random_state,
                                class_weight = class_weight,
                                max_depth = max_depth, 
                                min_samples_leaf = min_samples_leaf,
                                min_samples_split = min_samples_split)
    rf.fit(X_train, y_train)
   
    from sklearn.metrics import recall_score
    
    estimator_recall = []
    for current_tree in range(n_estimators):
        estimator_recall.append([current_tree, 
                                 recall_score(y_train, rf.estimators_[current_tree].predict(X_train))])

    estimator_recall = pd.DataFrame(estimator_recall, columns = ['tree_number', 'recall'])
    estimator_recall.sort_values(inplace = True, by = 'recall', ascending = False)

    best_recall_tree = rf.estimators_[estimator_recall.head(1)['tree_number'].values[0]]
    return(best_recall_tree)

## Features of Tony - Baseline

### All Features

In [42]:
# All important features

filter_all = ["FIRST_CRASH_TYPE","AGE","AIRBAG_DEPLOYED","INTERSECTION_RELATED_I","LANE_CNT","SEX",
             "TRAFFICWAY_TYPE","POSTED_SPEED_LIMIT","CRASH_HOUR","FIRST_CRASH_TYPE","INJURY", 
             'OCCUPANT_CNT', 'NUM_UNITS']
X_train = prepare_dataset(filter_all)

print(X_train.shape)
X_train.head()

(54388, 89)


Unnamed: 0,AGE,LANE_CNT,OCCUPANT_CNT,NUM_UNITS,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING,FIRST_CRASH_TYPE_ANIMAL.1,FIRST_CRASH_TYPE_FIXED OBJECT.1,FIRST_CRASH_TYPE_HEAD ON.1,FIRST_CRASH_TYPE_OVERTURNED.1,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.1,FIRST_CRASH_TYPE_PEDALCYCLIST.1,FIRST_CRASH_TYPE_PEDESTRIAN.1,FIRST_CRASH_TYPE_REAR END.1,FIRST_CRASH_TYPE_REAR TO FRONT.1,FIRST_CRASH_TYPE_REAR TO REAR.1,FIRST_CRASH_TYPE_REAR TO SIDE.1,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.1,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.1,FIRST_CRASH_TYPE_TRAIN.1,FIRST_CRASH_TYPE_TURNING.1,AIRBAG_DEPLOYED_NOT DEPLOYED,INTERSECTION_RELATED_I_Y,SEX_M,SEX_X,TRAFFICWAY_TYPE_CENTER TURN LANE,TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN (NOT RAISED),TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN BARRIER,TRAFFICWAY_TYPE_DRIVEWAY,"TRAFFICWAY_TYPE_FIVE POINT, OR MORE",TRAFFICWAY_TYPE_FOUR WAY,TRAFFICWAY_TYPE_L-INTERSECTION,TRAFFICWAY_TYPE_NOT DIVIDED,TRAFFICWAY_TYPE_NOT REPORTED,TRAFFICWAY_TYPE_ONE-WAY,TRAFFICWAY_TYPE_PARKING LOT,TRAFFICWAY_TYPE_RAMP,TRAFFICWAY_TYPE_ROUNDABOUT,TRAFFICWAY_TYPE_T-INTERSECTION,TRAFFICWAY_TYPE_TRAFFIC ROUTE,TRAFFICWAY_TYPE_Y-INTERSECTION,POSTED_SPEED_LIMIT_LOW_SPEED,CRASH_HOUR_Early_morning,CRASH_HOUR_Morning,CRASH_HOUR_Night,FIRST_CRASH_TYPE_ANIMAL.2,FIRST_CRASH_TYPE_FIXED OBJECT.2,FIRST_CRASH_TYPE_HEAD ON.2,FIRST_CRASH_TYPE_OVERTURNED.2,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.2,FIRST_CRASH_TYPE_PEDALCYCLIST.2,FIRST_CRASH_TYPE_PEDESTRIAN.2,FIRST_CRASH_TYPE_REAR END.2,FIRST_CRASH_TYPE_REAR TO FRONT.2,FIRST_CRASH_TYPE_REAR TO REAR.2,FIRST_CRASH_TYPE_REAR TO SIDE.2,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.2,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.2,FIRST_CRASH_TYPE_TRAIN.2,FIRST_CRASH_TYPE_TURNING.2,FIRST_CRASH_TYPE_ANIMAL.3,FIRST_CRASH_TYPE_FIXED OBJECT.3,FIRST_CRASH_TYPE_HEAD ON.3,FIRST_CRASH_TYPE_OVERTURNED.3,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.3,FIRST_CRASH_TYPE_PEDALCYCLIST.3,FIRST_CRASH_TYPE_PEDESTRIAN.3,FIRST_CRASH_TYPE_REAR END.3,FIRST_CRASH_TYPE_REAR TO FRONT.3,FIRST_CRASH_TYPE_REAR TO REAR.3,FIRST_CRASH_TYPE_REAR TO SIDE.3,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.3,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.3,FIRST_CRASH_TYPE_TRAIN.3,FIRST_CRASH_TYPE_TURNING.3,INJURY_NOT INJURED
29244,26.0,4,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
22376,51.0,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
49790,73.0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32552,38.0,2,1,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1957,30.0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [77]:
# Baseline
rf_evaluator()

Unnamed: 0,metric,mean_score
0,accuracy,0.914558
1,recall,0.305839
2,precision,0.764255
3,f1,0.436585


### OCCUPANT_CNT

The number of people in the unit, as determined by the reporting officer

In [78]:
# All important features except OCCUPANT_CNT

filter_list = ["FIRST_CRASH_TYPE","AGE","AIRBAG_DEPLOYED","INTERSECTION_RELATED_I","LANE_CNT","SEX",
             "TRAFFICWAY_TYPE","POSTED_SPEED_LIMIT","CRASH_HOUR","FIRST_CRASH_TYPE","INJURY",'NUM_UNITS']

X_train = prepare_dataset(filter_list)

print(X_train.shape)
X_train.head()

(54388, 88)


Unnamed: 0,AGE,LANE_CNT,NUM_UNITS,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING,FIRST_CRASH_TYPE_ANIMAL.1,FIRST_CRASH_TYPE_FIXED OBJECT.1,FIRST_CRASH_TYPE_HEAD ON.1,FIRST_CRASH_TYPE_OVERTURNED.1,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.1,FIRST_CRASH_TYPE_PEDALCYCLIST.1,FIRST_CRASH_TYPE_PEDESTRIAN.1,FIRST_CRASH_TYPE_REAR END.1,FIRST_CRASH_TYPE_REAR TO FRONT.1,FIRST_CRASH_TYPE_REAR TO REAR.1,FIRST_CRASH_TYPE_REAR TO SIDE.1,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.1,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.1,FIRST_CRASH_TYPE_TRAIN.1,FIRST_CRASH_TYPE_TURNING.1,AIRBAG_DEPLOYED_NOT DEPLOYED,INTERSECTION_RELATED_I_Y,SEX_M,SEX_X,TRAFFICWAY_TYPE_CENTER TURN LANE,TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN (NOT RAISED),TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN BARRIER,TRAFFICWAY_TYPE_DRIVEWAY,"TRAFFICWAY_TYPE_FIVE POINT, OR MORE",TRAFFICWAY_TYPE_FOUR WAY,TRAFFICWAY_TYPE_L-INTERSECTION,TRAFFICWAY_TYPE_NOT DIVIDED,TRAFFICWAY_TYPE_NOT REPORTED,TRAFFICWAY_TYPE_ONE-WAY,TRAFFICWAY_TYPE_PARKING LOT,TRAFFICWAY_TYPE_RAMP,TRAFFICWAY_TYPE_ROUNDABOUT,TRAFFICWAY_TYPE_T-INTERSECTION,TRAFFICWAY_TYPE_TRAFFIC ROUTE,TRAFFICWAY_TYPE_Y-INTERSECTION,POSTED_SPEED_LIMIT_LOW_SPEED,CRASH_HOUR_Early_morning,CRASH_HOUR_Morning,CRASH_HOUR_Night,FIRST_CRASH_TYPE_ANIMAL.2,FIRST_CRASH_TYPE_FIXED OBJECT.2,FIRST_CRASH_TYPE_HEAD ON.2,FIRST_CRASH_TYPE_OVERTURNED.2,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.2,FIRST_CRASH_TYPE_PEDALCYCLIST.2,FIRST_CRASH_TYPE_PEDESTRIAN.2,FIRST_CRASH_TYPE_REAR END.2,FIRST_CRASH_TYPE_REAR TO FRONT.2,FIRST_CRASH_TYPE_REAR TO REAR.2,FIRST_CRASH_TYPE_REAR TO SIDE.2,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.2,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.2,FIRST_CRASH_TYPE_TRAIN.2,FIRST_CRASH_TYPE_TURNING.2,FIRST_CRASH_TYPE_ANIMAL.3,FIRST_CRASH_TYPE_FIXED OBJECT.3,FIRST_CRASH_TYPE_HEAD ON.3,FIRST_CRASH_TYPE_OVERTURNED.3,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.3,FIRST_CRASH_TYPE_PEDALCYCLIST.3,FIRST_CRASH_TYPE_PEDESTRIAN.3,FIRST_CRASH_TYPE_REAR END.3,FIRST_CRASH_TYPE_REAR TO FRONT.3,FIRST_CRASH_TYPE_REAR TO REAR.3,FIRST_CRASH_TYPE_REAR TO SIDE.3,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.3,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.3,FIRST_CRASH_TYPE_TRAIN.3,FIRST_CRASH_TYPE_TURNING.3,INJURY_NOT INJURED
29244,26.0,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
22376,51.0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
49790,73.0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32552,38.0,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1957,30.0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [79]:
# Baseline
rf_evaluator()

Unnamed: 0,metric,mean_score
0,accuracy,0.914558
1,recall,0.305839
2,precision,0.764255
3,f1,0.436585


In [100]:
# Group the OCCUPANT_CNT and injury

df = train_new[:]
df2 = df.groupby(['OCCUPANT_CNT', 'INJURY']).size()

In [101]:
df2

OCCUPANT_CNT  INJURY     
0             INJURED         1332
              NOT INJURED     6926
1             INJURED         3229
              NOT INJURED    34875
2             INJURED          890
              NOT INJURED     4707
3             INJURED          287
              NOT INJURED     1245
4             INJURED          104
              NOT INJURED      504
5             INJURED           33
              NOT INJURED      149
6             INJURED           13
              NOT INJURED       52
7             INJURED            3
              NOT INJURED       14
8             NOT INJURED        9
9             NOT INJURED        5
10            NOT INJURED        3
11            INJURED            1
              NOT INJURED        2
12            NOT INJURED        1
13            NOT INJURED        1
20            NOT INJURED        1
33            NOT INJURED        1
39            NOT INJURED        1
dtype: int64

**Important Findings**

- OCCUPANT_CNT = 0: Getting injure = 16.12%

- OCCUPANT_CNT = 1: Getting injure = 8.47%

- OCCUPANT_CNT = 2: Getting injure = 15.90%

- **OCCUPANT_CNT = 3: Getting injure = 18.73%**

- OCCUPANT_CNT = 4: Getting injure = 17.10%

- OCCUPANT_CNT = 4: Getting injure = 18.13%

### Num_UNIT

Number of units involved in the crash

In [81]:
# All important features except NUM_UNITS

filter_list=["FIRST_CRASH_TYPE","AGE","AIRBAG_DEPLOYED","INTERSECTION_RELATED_I","LANE_CNT","SEX",
             "TRAFFICWAY_TYPE","POSTED_SPEED_LIMIT","CRASH_HOUR","FIRST_CRASH_TYPE","INJURY", 
             'OCCUPANT_CNT']

X_train = prepare_dataset(filter_list)

print(X_train.shape)
X_train.head()

(54388, 88)


Unnamed: 0,AGE,LANE_CNT,OCCUPANT_CNT,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING,FIRST_CRASH_TYPE_ANIMAL.1,FIRST_CRASH_TYPE_FIXED OBJECT.1,FIRST_CRASH_TYPE_HEAD ON.1,FIRST_CRASH_TYPE_OVERTURNED.1,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.1,FIRST_CRASH_TYPE_PEDALCYCLIST.1,FIRST_CRASH_TYPE_PEDESTRIAN.1,FIRST_CRASH_TYPE_REAR END.1,FIRST_CRASH_TYPE_REAR TO FRONT.1,FIRST_CRASH_TYPE_REAR TO REAR.1,FIRST_CRASH_TYPE_REAR TO SIDE.1,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.1,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.1,FIRST_CRASH_TYPE_TRAIN.1,FIRST_CRASH_TYPE_TURNING.1,AIRBAG_DEPLOYED_NOT DEPLOYED,INTERSECTION_RELATED_I_Y,SEX_M,SEX_X,TRAFFICWAY_TYPE_CENTER TURN LANE,TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN (NOT RAISED),TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN BARRIER,TRAFFICWAY_TYPE_DRIVEWAY,"TRAFFICWAY_TYPE_FIVE POINT, OR MORE",TRAFFICWAY_TYPE_FOUR WAY,TRAFFICWAY_TYPE_L-INTERSECTION,TRAFFICWAY_TYPE_NOT DIVIDED,TRAFFICWAY_TYPE_NOT REPORTED,TRAFFICWAY_TYPE_ONE-WAY,TRAFFICWAY_TYPE_PARKING LOT,TRAFFICWAY_TYPE_RAMP,TRAFFICWAY_TYPE_ROUNDABOUT,TRAFFICWAY_TYPE_T-INTERSECTION,TRAFFICWAY_TYPE_TRAFFIC ROUTE,TRAFFICWAY_TYPE_Y-INTERSECTION,POSTED_SPEED_LIMIT_LOW_SPEED,CRASH_HOUR_Early_morning,CRASH_HOUR_Morning,CRASH_HOUR_Night,FIRST_CRASH_TYPE_ANIMAL.2,FIRST_CRASH_TYPE_FIXED OBJECT.2,FIRST_CRASH_TYPE_HEAD ON.2,FIRST_CRASH_TYPE_OVERTURNED.2,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.2,FIRST_CRASH_TYPE_PEDALCYCLIST.2,FIRST_CRASH_TYPE_PEDESTRIAN.2,FIRST_CRASH_TYPE_REAR END.2,FIRST_CRASH_TYPE_REAR TO FRONT.2,FIRST_CRASH_TYPE_REAR TO REAR.2,FIRST_CRASH_TYPE_REAR TO SIDE.2,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.2,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.2,FIRST_CRASH_TYPE_TRAIN.2,FIRST_CRASH_TYPE_TURNING.2,FIRST_CRASH_TYPE_ANIMAL.3,FIRST_CRASH_TYPE_FIXED OBJECT.3,FIRST_CRASH_TYPE_HEAD ON.3,FIRST_CRASH_TYPE_OVERTURNED.3,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.3,FIRST_CRASH_TYPE_PEDALCYCLIST.3,FIRST_CRASH_TYPE_PEDESTRIAN.3,FIRST_CRASH_TYPE_REAR END.3,FIRST_CRASH_TYPE_REAR TO FRONT.3,FIRST_CRASH_TYPE_REAR TO REAR.3,FIRST_CRASH_TYPE_REAR TO SIDE.3,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.3,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.3,FIRST_CRASH_TYPE_TRAIN.3,FIRST_CRASH_TYPE_TURNING.3,INJURY_NOT INJURED
29244,26.0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
22376,51.0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
49790,73.0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32552,38.0,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1957,30.0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [82]:
# Baseline
rf_evaluator()

Unnamed: 0,metric,mean_score
0,accuracy,0.914558
1,recall,0.305839
2,precision,0.764255
3,f1,0.436585


In [97]:
# Group the NUM_UNITS and injury

df = train_new[:]
df2 = df.groupby(['NUM_UNITS', 'INJURY']).size()

In [98]:
df2

NUM_UNITS  INJURY     
1          INJURED          196
           NOT INJURED     1360
2          INJURED         4203
           NOT INJURED    41932
3          INJURED          917
           NOT INJURED     3854
4          INJURED          338
           NOT INJURED      983
5          INJURED           97
           NOT INJURED      223
6          INJURED           90
           NOT INJURED       54
7          INJURED           35
           NOT INJURED       47
8          INJURED            7
           NOT INJURED       19
9          NOT INJURED        8
10         NOT INJURED       10
11         NOT INJURED        6
12         INJURED            9
dtype: int64

### First Crash Type

In [83]:
# All important features except FIRST CRASH TYPE

filter_list=["AGE","AIRBAG_DEPLOYED","INTERSECTION_RELATED_I","LANE_CNT","SEX",
             "TRAFFICWAY_TYPE","POSTED_SPEED_LIMIT","CRASH_HOUR","FIRST_CRASH_TYPE","INJURY", 
             'OCCUPANT_CNT', 'NUM_UNITS']

X_train = prepare_dataset(filter_list)

print(X_train.shape)
X_train.head()

(54388, 44)


Unnamed: 0,AGE,LANE_CNT,OCCUPANT_CNT,NUM_UNITS,AIRBAG_DEPLOYED_NOT DEPLOYED,INTERSECTION_RELATED_I_Y,SEX_M,SEX_X,TRAFFICWAY_TYPE_CENTER TURN LANE,TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN (NOT RAISED),TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN BARRIER,TRAFFICWAY_TYPE_DRIVEWAY,"TRAFFICWAY_TYPE_FIVE POINT, OR MORE",TRAFFICWAY_TYPE_FOUR WAY,TRAFFICWAY_TYPE_L-INTERSECTION,TRAFFICWAY_TYPE_NOT DIVIDED,TRAFFICWAY_TYPE_NOT REPORTED,TRAFFICWAY_TYPE_ONE-WAY,TRAFFICWAY_TYPE_PARKING LOT,TRAFFICWAY_TYPE_RAMP,TRAFFICWAY_TYPE_ROUNDABOUT,TRAFFICWAY_TYPE_T-INTERSECTION,TRAFFICWAY_TYPE_TRAFFIC ROUTE,TRAFFICWAY_TYPE_Y-INTERSECTION,POSTED_SPEED_LIMIT_LOW_SPEED,CRASH_HOUR_Early_morning,CRASH_HOUR_Morning,CRASH_HOUR_Night,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING,INJURY_NOT INJURED
29244,26.0,4,1,2,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
22376,51.0,3,1,2,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
49790,73.0,2,1,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32552,38.0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1957,30.0,2,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [84]:
# Baseline
rf_evaluator()

Unnamed: 0,metric,mean_score
0,accuracy,0.914558
1,recall,0.305839
2,precision,0.764255
3,f1,0.436585


In [85]:
# Group the crash type and injury

df = train_new[:]
df2 = df.groupby(['FIRST_CRASH_TYPE', 'INJURY']).size()


In [86]:
df2

FIRST_CRASH_TYPE              INJURY     
ANGLE                         INJURED         1052
                              NOT INJURED     5051
ANIMAL                        NOT INJURED       27
FIXED OBJECT                  INJURED          203
                              NOT INJURED     1261
HEAD ON                       INJURED          128
                              NOT INJURED      357
OTHER NONCOLLISION            INJURED           18
                              NOT INJURED       80
OTHER OBJECT                  INJURED           33
                              NOT INJURED      286
OVERTURNED                    INJURED            5
                              NOT INJURED       12
PARKED MOTOR VEHICLE          INJURED          470
                              NOT INJURED    12476
PEDALCYCLIST                  INJURED          502
                              NOT INJURED      318
PEDESTRIAN                    INJURED         1109
                              NOT INJURE

**Important Findings**

- Crashes involving PEDALCYCLIST, and PEDESTRIAN lead to having more injuries than non-injuries

- Apart from above, ANGLE, HEAD-ON, OVERTURNED, and TURNING have a high probablities of getting injured from all cases

### AGE

In [87]:
# All important features except AGE

filter_list=["FIRST_CRASH_TYPE", "AIRBAG_DEPLOYED","INTERSECTION_RELATED_I","LANE_CNT","SEX",
             "TRAFFICWAY_TYPE","POSTED_SPEED_LIMIT","CRASH_HOUR","FIRST_CRASH_TYPE","INJURY", 
             'OCCUPANT_CNT', 'NUM_UNITS']

X_train = prepare_dataset(filter_list)

print(X_train.shape)
X_train.head()

(54388, 88)


Unnamed: 0,LANE_CNT,OCCUPANT_CNT,NUM_UNITS,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING,FIRST_CRASH_TYPE_ANIMAL.1,FIRST_CRASH_TYPE_FIXED OBJECT.1,FIRST_CRASH_TYPE_HEAD ON.1,FIRST_CRASH_TYPE_OVERTURNED.1,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.1,FIRST_CRASH_TYPE_PEDALCYCLIST.1,FIRST_CRASH_TYPE_PEDESTRIAN.1,FIRST_CRASH_TYPE_REAR END.1,FIRST_CRASH_TYPE_REAR TO FRONT.1,FIRST_CRASH_TYPE_REAR TO REAR.1,FIRST_CRASH_TYPE_REAR TO SIDE.1,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.1,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.1,FIRST_CRASH_TYPE_TRAIN.1,FIRST_CRASH_TYPE_TURNING.1,AIRBAG_DEPLOYED_NOT DEPLOYED,INTERSECTION_RELATED_I_Y,SEX_M,SEX_X,TRAFFICWAY_TYPE_CENTER TURN LANE,TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN (NOT RAISED),TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN BARRIER,TRAFFICWAY_TYPE_DRIVEWAY,"TRAFFICWAY_TYPE_FIVE POINT, OR MORE",TRAFFICWAY_TYPE_FOUR WAY,TRAFFICWAY_TYPE_L-INTERSECTION,TRAFFICWAY_TYPE_NOT DIVIDED,TRAFFICWAY_TYPE_NOT REPORTED,TRAFFICWAY_TYPE_ONE-WAY,TRAFFICWAY_TYPE_PARKING LOT,TRAFFICWAY_TYPE_RAMP,TRAFFICWAY_TYPE_ROUNDABOUT,TRAFFICWAY_TYPE_T-INTERSECTION,TRAFFICWAY_TYPE_TRAFFIC ROUTE,TRAFFICWAY_TYPE_Y-INTERSECTION,POSTED_SPEED_LIMIT_LOW_SPEED,CRASH_HOUR_Early_morning,CRASH_HOUR_Morning,CRASH_HOUR_Night,FIRST_CRASH_TYPE_ANIMAL.2,FIRST_CRASH_TYPE_FIXED OBJECT.2,FIRST_CRASH_TYPE_HEAD ON.2,FIRST_CRASH_TYPE_OVERTURNED.2,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.2,FIRST_CRASH_TYPE_PEDALCYCLIST.2,FIRST_CRASH_TYPE_PEDESTRIAN.2,FIRST_CRASH_TYPE_REAR END.2,FIRST_CRASH_TYPE_REAR TO FRONT.2,FIRST_CRASH_TYPE_REAR TO REAR.2,FIRST_CRASH_TYPE_REAR TO SIDE.2,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.2,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.2,FIRST_CRASH_TYPE_TRAIN.2,FIRST_CRASH_TYPE_TURNING.2,FIRST_CRASH_TYPE_ANIMAL.3,FIRST_CRASH_TYPE_FIXED OBJECT.3,FIRST_CRASH_TYPE_HEAD ON.3,FIRST_CRASH_TYPE_OVERTURNED.3,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.3,FIRST_CRASH_TYPE_PEDALCYCLIST.3,FIRST_CRASH_TYPE_PEDESTRIAN.3,FIRST_CRASH_TYPE_REAR END.3,FIRST_CRASH_TYPE_REAR TO FRONT.3,FIRST_CRASH_TYPE_REAR TO REAR.3,FIRST_CRASH_TYPE_REAR TO SIDE.3,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.3,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.3,FIRST_CRASH_TYPE_TRAIN.3,FIRST_CRASH_TYPE_TURNING.3,INJURY_NOT INJURED
29244,4,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
22376,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
49790,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32552,2,1,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1957,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [88]:
# Baseline
rf_evaluator()

Unnamed: 0,metric,mean_score
0,accuracy,0.914558
1,recall,0.305839
2,precision,0.764255
3,f1,0.436585


In [89]:
# Binning the age

cut_labels = ['0-20', '20-30', '30-40', '40-50', '50-60', 'Elder']
cut_bins = [0, 20, 30, 40, 50, 60, 200]
train_new['AGE_BIN'] = pd.cut(train_new['AGE'], bins = cut_bins, labels = cut_labels)


In [90]:
# Compare the range of age with primary cause

pd.set_option('display.max_rows', 400)
df = train_new[:]
df.groupby(['AGE_BIN', 'PRIM_CONTRIBUTORY_CAUSE']).size().reset_index(name = 'count').sort_values(['AGE_BIN','count','PRIM_CONTRIBUTORY_CAUSE'], ascending = False)





Unnamed: 0,AGE_BIN,PRIM_CONTRIBUTORY_CAUSE,count
230,Elder,UNABLE TO DETERMINE,1265
213,Elder,FAILING TO YIELD RIGHT-OF-WAY,695
214,Elder,FOLLOWING TOO CLOSELY,517
218,Elder,IMPROPER OVERTAKING/PASSING,216
212,Elder,FAILING TO REDUCE SPEED TO AVOID CRASH,214
221,Elder,NOT APPLICABLE,193
216,Elder,IMPROPER BACKING,190
217,Elder,IMPROPER LANE USAGE,183
219,Elder,IMPROPER TURNING/NO SIGNAL,182
207,Elder,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,139


In [91]:
# Add the secondary cause back

train_new['SEC_CONTRIBUTORY_CAUSE'] = train['SEC_CONTRIBUTORY_CAUSE']

pd.set_option('display.max_rows', 400)
df = train_new[:]
df.groupby(['AGE_BIN', 'SEC_CONTRIBUTORY_CAUSE']).size().reset_index(name = 'count').sort_values(['AGE_BIN','count','SEC_CONTRIBUTORY_CAUSE'], ascending = False)



Unnamed: 0,AGE_BIN,SEC_CONTRIBUTORY_CAUSE,count
226,Elder,NOT APPLICABLE,1771
236,Elder,UNABLE TO DETERMINE,1437
217,Elder,FAILING TO REDUCE SPEED TO AVOID CRASH,215
218,Elder,FAILING TO YIELD RIGHT-OF-WAY,177
212,Elder,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,142
219,Elder,FOLLOWING TOO CLOSELY,120
239,Elder,WEATHER,64
223,Elder,IMPROPER OVERTAKING/PASSING,62
222,Elder,IMPROPER LANE USAGE,60
224,Elder,IMPROPER TURNING/NO SIGNAL,55


**Important Findings**

- FAILING TO YIELD RIGHT-OF-WAY is the main reason of crashes for age 0-20, 20-30 because FAILING TO REDUCE SPEED TO AVOID CRASH

- FOLLOWING TOO CLOSELY is the main reason of crashes for age 30-40, 40-50, 50-60, and 60 + because FAILING TO REDUCE SPEED TO AVOID CRASH

- For teenager (age 0-20), another reason is because lack of DRIVING SKILLS/KNOWLEDGE/EXPERIENCE

### SEX

In [92]:
# All important features except SEX

filter_list=["FIRST_CRASH_TYPE","AGE","AIRBAG_DEPLOYED","INTERSECTION_RELATED_I","LANE_CNT",
             "TRAFFICWAY_TYPE","POSTED_SPEED_LIMIT","CRASH_HOUR","FIRST_CRASH_TYPE","INJURY", 
             'OCCUPANT_CNT', 'NUM_UNITS']

X_train = prepare_dataset(filter_list)

print(X_train.shape)
X_train.head()

(54388, 87)


Unnamed: 0,AGE,LANE_CNT,OCCUPANT_CNT,NUM_UNITS,FIRST_CRASH_TYPE_ANIMAL,FIRST_CRASH_TYPE_FIXED OBJECT,FIRST_CRASH_TYPE_HEAD ON,FIRST_CRASH_TYPE_OVERTURNED,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE,FIRST_CRASH_TYPE_PEDALCYCLIST,FIRST_CRASH_TYPE_PEDESTRIAN,FIRST_CRASH_TYPE_REAR END,FIRST_CRASH_TYPE_REAR TO FRONT,FIRST_CRASH_TYPE_REAR TO REAR,FIRST_CRASH_TYPE_REAR TO SIDE,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION,FIRST_CRASH_TYPE_TRAIN,FIRST_CRASH_TYPE_TURNING,FIRST_CRASH_TYPE_ANIMAL.1,FIRST_CRASH_TYPE_FIXED OBJECT.1,FIRST_CRASH_TYPE_HEAD ON.1,FIRST_CRASH_TYPE_OVERTURNED.1,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.1,FIRST_CRASH_TYPE_PEDALCYCLIST.1,FIRST_CRASH_TYPE_PEDESTRIAN.1,FIRST_CRASH_TYPE_REAR END.1,FIRST_CRASH_TYPE_REAR TO FRONT.1,FIRST_CRASH_TYPE_REAR TO REAR.1,FIRST_CRASH_TYPE_REAR TO SIDE.1,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.1,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.1,FIRST_CRASH_TYPE_TRAIN.1,FIRST_CRASH_TYPE_TURNING.1,AIRBAG_DEPLOYED_NOT DEPLOYED,INTERSECTION_RELATED_I_Y,TRAFFICWAY_TYPE_CENTER TURN LANE,TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN (NOT RAISED),TRAFFICWAY_TYPE_DIVIDED - W/MEDIAN BARRIER,TRAFFICWAY_TYPE_DRIVEWAY,"TRAFFICWAY_TYPE_FIVE POINT, OR MORE",TRAFFICWAY_TYPE_FOUR WAY,TRAFFICWAY_TYPE_L-INTERSECTION,TRAFFICWAY_TYPE_NOT DIVIDED,TRAFFICWAY_TYPE_NOT REPORTED,TRAFFICWAY_TYPE_ONE-WAY,TRAFFICWAY_TYPE_PARKING LOT,TRAFFICWAY_TYPE_RAMP,TRAFFICWAY_TYPE_ROUNDABOUT,TRAFFICWAY_TYPE_T-INTERSECTION,TRAFFICWAY_TYPE_TRAFFIC ROUTE,TRAFFICWAY_TYPE_Y-INTERSECTION,POSTED_SPEED_LIMIT_LOW_SPEED,CRASH_HOUR_Early_morning,CRASH_HOUR_Morning,CRASH_HOUR_Night,FIRST_CRASH_TYPE_ANIMAL.2,FIRST_CRASH_TYPE_FIXED OBJECT.2,FIRST_CRASH_TYPE_HEAD ON.2,FIRST_CRASH_TYPE_OVERTURNED.2,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.2,FIRST_CRASH_TYPE_PEDALCYCLIST.2,FIRST_CRASH_TYPE_PEDESTRIAN.2,FIRST_CRASH_TYPE_REAR END.2,FIRST_CRASH_TYPE_REAR TO FRONT.2,FIRST_CRASH_TYPE_REAR TO REAR.2,FIRST_CRASH_TYPE_REAR TO SIDE.2,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.2,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.2,FIRST_CRASH_TYPE_TRAIN.2,FIRST_CRASH_TYPE_TURNING.2,FIRST_CRASH_TYPE_ANIMAL.3,FIRST_CRASH_TYPE_FIXED OBJECT.3,FIRST_CRASH_TYPE_HEAD ON.3,FIRST_CRASH_TYPE_OVERTURNED.3,FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE.3,FIRST_CRASH_TYPE_PEDALCYCLIST.3,FIRST_CRASH_TYPE_PEDESTRIAN.3,FIRST_CRASH_TYPE_REAR END.3,FIRST_CRASH_TYPE_REAR TO FRONT.3,FIRST_CRASH_TYPE_REAR TO REAR.3,FIRST_CRASH_TYPE_REAR TO SIDE.3,FIRST_CRASH_TYPE_SIDESWIPE OPPOSITE DIRECTION.3,FIRST_CRASH_TYPE_SIDESWIPE SAME DIRECTION.3,FIRST_CRASH_TYPE_TRAIN.3,FIRST_CRASH_TYPE_TURNING.3,INJURY_NOT INJURED
29244,26.0,4,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
22376,51.0,3,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
49790,73.0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
32552,38.0,2,1,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1957,30.0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


In [93]:
# Baseline
rf_evaluator()

Unnamed: 0,metric,mean_score
0,accuracy,0.914558
1,recall,0.305839
2,precision,0.764255
3,f1,0.436585


In [94]:
# Compare sex with primary cause

pd.set_option('display.max_rows', 200)
df = train_new[:]
df.groupby(['SEX', 'PRIM_CONTRIBUTORY_CAUSE']).size().reset_index(name = 'count').sort_values(['SEX','count','PRIM_CONTRIBUTORY_CAUSE'], ascending = False)


Unnamed: 0,SEX,PRIM_CONTRIBUTORY_CAUSE,count
142,X,UNABLE TO DETERMINE,2743
135,X,NOT APPLICABLE,324
129,X,FOLLOWING TOO CLOSELY,201
133,X,IMPROPER OVERTAKING/PASSING,186
128,X,FAILING TO YIELD RIGHT-OF-WAY,163
132,X,IMPROPER LANE USAGE,133
127,X,FAILING TO REDUCE SPEED TO AVOID CRASH,131
131,X,IMPROPER BACKING,104
122,X,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,77
136,X,"OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELE...",75


**Important Findings**

- Primary cause is mostly identical for both females and males

## Partial Dependence

In [95]:
fig = plt.figure(figsize = (5, 5))
features = ['NUM_UNITS', 'OCCUPANT_CNT']

plot_partial_dependence(rf, features = features, # column numbers of plots we want to show
                        X = X_train,
                        fig = fig) # number of values to plot on x axis

plt.show()

ValueError: feature_names should not contain duplicates.

<Figure size 360x360 with 0 Axes>