In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import load_model, save_model
import keras_tuner as kt

from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot

In [3]:
# Load in our dataset

# REPLACE THIS WITH SQL CODE TO CALL IN FROM PGADMIN
# SQL ALCHEMY... LOAD DATA AND TURN INTO DATAFRAME

data = pd.read_csv('Motor Vehicle Collisions with KSI Data - 4326.csv')

In [4]:
# Look at the colmn names

column_names = data.columns.tolist()

print(column_names)

['_id', 'ACCNUM', 'YEAR', 'DATE', 'TIME', 'STREET1', 'STREET2', 'OFFSET', 'ROAD_CLASS', 'DISTRICT', 'WARDNUM', 'LOCCOORD', 'ACCLOC', 'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND', 'ACCLASS', 'IMPACTYPE', 'INVTYPE', 'INVAGE', 'INJURY', 'FATAL_NO', 'INITDIR', 'VEHTYPE', 'MANOEUVER', 'DRIVACT', 'DRIVCOND', 'PEDTYPE', 'PEDACT', 'PEDCOND', 'CYCLISTYPE', 'CYCACT', 'CYCCOND', 'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'EMERG_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL', 'DISABILITY', 'HOOD_158', 'NEIGHBOURHOOD_158', 'HOOD_140', 'NEIGHBOURHOOD_140', 'DIVISION', 'geometry']


In [5]:
# Setup a function for binning

def bin_column (feature, bin_size):
    
    value_counts = feature.value_counts()
    replace_list = list(value_counts.iloc[bin_size:].index)

    for item in replace_list:
        feature.replace({item: "Other"}, inplace=True)

In [6]:
def create_model_var(hp, input_dim, min_units, max_units, step_units):
    nn_model = tf.keras.models.Sequential()

    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=min_units,
        max_value=max_units,
        step=step_units), activation=activation, input_dim=input_dim))

    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=min_units,
            max_value=max_units,
            step=step_units),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [7]:
# Look at all columns

pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,_id,ACCNUM,YEAR,DATE,TIME,STREET1,STREET2,OFFSET,ROAD_CLASS,DISTRICT,WARDNUM,LOCCOORD,ACCLOC,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INVAGE,INJURY,FATAL_NO,INITDIR,VEHTYPE,MANOEUVER,DRIVACT,DRIVCOND,PEDTYPE,PEDACT,PEDCOND,CYCLISTYPE,CYCACT,CYCCOND,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,EMERG_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,DIVISION,geometry
0,1,892658,2006,2006-03-11,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4.0,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,,,South,"Automobile, Station Wagon",Turning Left,Failed to Yield Right of Way,Unknown,,,,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,"{'type': 'MultiPoint', 'coordinates': [[-79.45..."
1,2,892658,2006,2006-03-11,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,4.0,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,,North,Other,,,,Vehicle turns left while ped crosses with ROW ...,Crossing with right of way,Unknown,,,,Yes,,Yes,,,,,,,Yes,,,,88,High Park North,88,High Park North (88),D11,"{'type': 'MultiPoint', 'coordinates': [[-79.45..."
2,3,892810,2006,2006-03-11,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25.0,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,,East,Motorcycle,Turning Right,Disobeyed Traffic Control,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,"{'type': 'MultiPoint', 'coordinates': [[-79.19..."
3,4,893184,2006,2006-01-01,236,WOODBINE AVE,O CONNOR DR,,Major Arterial,Toronto and East York,19.0,Intersection,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,,,,,,,,,,,,,,,Yes,,,,,Yes,Yes,Yes,,Yes,,60,Woodbine-Lumsden,60,Woodbine-Lumsden (60),D55,"{'type': 'MultiPoint', 'coordinates': [[-79.31..."
4,5,892810,2006,2006-03-11,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,25.0,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,,,South,"Automobile, Station Wagon",Going Ahead,Driving Properly,Unknown,,,,,,,,,Yes,Yes,,,,,,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,"{'type': 'MultiPoint', 'coordinates': [[-79.19..."


In [8]:
# Make a list of columns to drop that won't help machine learning

drop_columns = ['_id', 'ACCNUM', 'YEAR', 'DATE', 'WARDNUM', 'STREET1', 'STREET2', 'DISTRICT', 'NEIGHBOURHOOD_158', 'NEIGHBOURHOOD_140', 'DIVISION', 'geometry', 'FATAL_NO', 'PEDTYPE', 'PEDACT', 'PEDCOND', 'CYCLISTYPE', 'CYCACT', 'CYCCOND', 'TIME', 'OFFSET', 'HOOD_158', 'HOOD_140']

In [9]:
# Get rid of those columns

data_refined = data.drop(columns = drop_columns)
data_refined.head()

Unnamed: 0,ROAD_CLASS,LOCCOORD,ACCLOC,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INVAGE,INJURY,INITDIR,VEHTYPE,MANOEUVER,DRIVACT,DRIVCOND,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,EMERG_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY
0,Major Arterial,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,,South,"Automobile, Station Wagon",Turning Left,Failed to Yield Right of Way,Unknown,Yes,,Yes,,,,,,,Yes,,,
1,Major Arterial,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,North,Other,,,,Yes,,Yes,,,,,,,Yes,,,
2,Major Arterial,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,East,Motorcycle,Turning Right,Disobeyed Traffic Control,Unknown,,,Yes,Yes,,,,,,Yes,Yes,,
3,Major Arterial,Intersection,Intersection Related,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,,,,,,,,Yes,,,,,Yes,Yes,Yes,,Yes,
4,Major Arterial,Intersection,At Intersection,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,,South,"Automobile, Station Wagon",Going Ahead,Driving Properly,Unknown,,,Yes,Yes,,,,,,Yes,Yes,,


In [10]:
# Determine the number of unique values in each column.

data_refined.nunique()

ROAD_CLASS       11
LOCCOORD          8
ACCLOC           10
TRAFFCTL         11
VISIBILITY        9
LIGHT             9
RDSFCOND         10
ACCLASS           4
IMPACTYPE        11
INVTYPE          20
INVAGE           21
INJURY            5
INITDIR           6
VEHTYPE          32
MANOEUVER        17
DRIVACT          14
DRIVCOND         11
PEDESTRIAN        2
CYCLIST           2
AUTOMOBILE        2
MOTORCYCLE        2
TRUCK             2
TRSN_CITY_VEH     2
EMERG_VEH         2
PASSENGER         2
SPEEDING          2
AG_DRIV           2
REDLIGHT          2
ALCOHOL           2
DISABILITY        2
dtype: int64

<hr>

# Start the binning process now, one feature at a time

## Road Classification -- ROAD_CLASS

In [11]:
# Look at road classification values

# It looks like we should bin down to Expressway and then include 'None' in the 'Other' category.

data_refined['ROAD_CLASS'].value_counts()

Major Arterial         12956
Minor Arterial          2840
Collector                996
Local                    841
None                     376
Expressway               132
Other                     25
Laneway                   11
Expressway Ramp            9
Pending                    7
Major Arterial Ramp        1
Name: ROAD_CLASS, dtype: int64

In [12]:
# Pass the road classification to the function and replace the 'None'

bin_column(data_refined['ROAD_CLASS'], 6)

data_refined['ROAD_CLASS'] = data_refined['ROAD_CLASS'].replace('None', 'Other')

In [13]:
# Check on final bins for road classification

data_refined['ROAD_CLASS'].value_counts()

Major Arterial    12956
Minor Arterial     2840
Collector           996
Local               841
Other               429
Expressway          132
Name: ROAD_CLASS, dtype: int64

## Location Coordinates -- LOCCOORD

In [14]:
# Look at Location Coordinates values

# It looks like we should bin down to None and then include 'None' in the 'Other' category.

data_refined['LOCCOORD'].value_counts()

Intersection                           11965
Mid-Block                               6113
None                                      95
Mid-Block (Abnormal)                       8
Exit Ramp Westbound                        5
Exit Ramp Southbound                       3
Park, Private Property, Public Lane        3
Entrance Ramp Westbound                    2
Name: LOCCOORD, dtype: int64

In [15]:
# Pass the Location Coordinates to the function and replace the 'None'

bin_column(data_refined['LOCCOORD'], 3)

data_refined['LOCCOORD'] = data_refined['LOCCOORD'].replace('None', 'Other')

In [16]:
# Check on final bins for Location Coordinates

data_refined['LOCCOORD'].value_counts()

Intersection    11965
Mid-Block        6113
Other             116
Name: LOCCOORD, dtype: int64

## Accident Location --  ACCLOC

In [17]:
# Look at Accident Location values

# It looks like we should bin down to At/Near Private Drive and then include 'None' in the 'Other' category.

data_refined['ACCLOC'].value_counts()

At Intersection          8689
None                     5450
Non Intersection         2423
Intersection Related     1202
At/Near Private Drive     379
Overpass or Bridge         17
Laneway                    14
Private Driveway           13
Underpass or Tunnel         6
Trail                       1
Name: ACCLOC, dtype: int64

In [18]:
# Pass the Accident Location to the function and replace the 'None'

bin_column(data_refined['ACCLOC'], 5)

data_refined['ACCLOC'] = data_refined['ACCLOC'].replace('None', 'Other')

In [19]:
# Check on final bins for Accident Location

data_refined['ACCLOC'].value_counts()

At Intersection          8689
Other                    5501
Non Intersection         2423
Intersection Related     1202
At/Near Private Drive     379
Name: ACCLOC, dtype: int64

## Traffic Control -- TRAFFCTL

In [20]:
# Look at Traffic Control values

# It looks like we should bin down to Traffic Controller 

data_refined['TRAFFCTL'].value_counts()

No Control              8791
Traffic Signal          7637
Stop Sign               1380
Pedestrian Crossover     198
Traffic Controller       108
None                      34
Yield Sign                21
Streetcar (Stop for)      16
Traffic Gate               5
School Guard               2
Police Control             2
Name: TRAFFCTL, dtype: int64

In [21]:
# Pass the Traffic Control to the function 

bin_column(data_refined['TRAFFCTL'], 5)

In [22]:
# Check on final bins for Traffic Control

data_refined['TRAFFCTL'].value_counts()

No Control              8791
Traffic Signal          7637
Stop Sign               1380
Pedestrian Crossover     198
Traffic Controller       108
Other                     80
Name: TRAFFCTL, dtype: int64

## Visibility -- VISIBILITY

In [23]:
# Look at Visibility values

# It looks like we should bin down to Drifting Snow

data_refined['VISIBILITY'].value_counts()

Clear                     15719
Rain                       1879
Snow                        351
Other                        97
Fog, Mist, Smoke, Dust       50
Freezing Rain                47
Drifting Snow                21
None                         20
Strong wind                  10
Name: VISIBILITY, dtype: int64

In [24]:
# Pass the Visibility to the function 

bin_column(data_refined['VISIBILITY'], 7)

In [25]:
# Check on final bins for Visibility

data_refined['VISIBILITY'].value_counts()

Clear                     15719
Rain                       1879
Snow                        351
Other                       127
Fog, Mist, Smoke, Dust       50
Freezing Rain                47
Drifting Snow                21
Name: VISIBILITY, dtype: int64

## Light -- LIGHT

In [26]:
# Look at Light values

# It looks like we should drop the six 'Other' and then combine the artificial and non-artificials that are related

data_refined['LIGHT'].value_counts()

Daylight                10388
Dark                     3687
Dark, artificial         3302
Dusk                      240
Dusk, artificial          219
Daylight, artificial      141
Dawn                      110
Dawn, artificial          101
Other                       6
Name: LIGHT, dtype: int64

In [27]:
# Combine the artificals with regular value.

data_refined['LIGHT'] = data_refined['LIGHT'].replace('Dark, artificial', 'Dark')
data_refined['LIGHT'] = data_refined['LIGHT'].replace('Dusk, artificial', 'Dusk')
data_refined['LIGHT'] = data_refined['LIGHT'].replace('Daylight, artificial', 'Daylight')
data_refined['LIGHT'] = data_refined['LIGHT'].replace('Dawn, artificial', 'Dawn')

# Remove the 'Other' from datatset

data_refined = data_refined[data_refined['LIGHT'].isin(['Dark', 'Dusk', 'Daylight', 'Dawn'])]

In [28]:
# Check on final bins for Light

data_refined['LIGHT'].value_counts()

Daylight    10529
Dark         6989
Dusk          459
Dawn          211
Name: LIGHT, dtype: int64

## Road conditions  -- RDSFCOND 

In [29]:
# Look at Road contidtions values

# It looks like we should bin down to Packed Snow

data_refined['RDSFCOND'].value_counts()

Dry                     14599
Wet                      3021
Loose Snow                169
Other                     141
Slush                     102
Ice                        77
Packed Snow                44
None                       23
Loose Sand or Gravel       11
Spilled liquid              1
Name: RDSFCOND, dtype: int64

In [30]:
# Pass the Road contidtions to the function 

bin_column(data_refined['RDSFCOND'], 7)

In [31]:
# Check on final bins for Road contidtions

data_refined['RDSFCOND'].value_counts()

Dry            14599
Wet             3021
Other            176
Loose Snow       169
Slush            102
Ice               77
Packed Snow       44
Name: RDSFCOND, dtype: int64

## Impact type -- IMPACTYPE

In [32]:
# Look at Impact type values

# It looks like we should combine the two SMV (single motor vehicle) categories and then combine 'none' and 'other'

data_refined['IMPACTYPE'].value_counts()

Pedestrian Collisions     7293
Turning Movement          2790
Cyclist Collisions        1795
Rear End                  1746
SMV Other                 1460
Angle                     1283
Approaching                928
Sideswipe                  506
Other                      193
SMV Unattended Vehicle     190
None                         4
Name: IMPACTYPE, dtype: int64

In [33]:
# Combine the related columns

data_refined['IMPACTYPE'] = data_refined['IMPACTYPE'].replace('SMV Unattended Vehicle', 'SMV Other')
data_refined['IMPACTYPE'] = data_refined['IMPACTYPE'].replace('None', 'Other')

In [34]:
# Check on final bins for Impact type

data_refined['IMPACTYPE'].value_counts()

Pedestrian Collisions    7293
Turning Movement         2790
Cyclist Collisions       1795
Rear End                 1746
SMV Other                1650
Angle                    1283
Approaching               928
Sideswipe                 506
Other                     197
Name: IMPACTYPE, dtype: int64

## Involved Type -- INVTYPE

In [35]:
# Look at Involved Type values

# It looks like we should bin down to 'Wheelchair'

data_refined['INVTYPE'].value_counts()

Driver                  8273
Pedestrian              3111
Passenger               2767
Vehicle Owner           1636
Cyclist                  784
Motorcycle Driver        696
Truck Driver             346
Other Property Owner     257
Other                    186
Motorcycle Passenger      39
Moped Driver              30
Driver - Not Hit          17
Wheelchair                17
None                      16
In-Line Skater             5
Cyclist Passenger          3
Trailer Owner              2
Pedestrian - Not Hit       1
Witness                    1
Moped Passenger            1
Name: INVTYPE, dtype: int64

In [36]:
# Pass the Involved Type to the function 

bin_column(data_refined['INVTYPE'], 13)

In [37]:
# Check on final bins for Involved Type

data_refined['INVTYPE'].value_counts()

Driver                  8273
Pedestrian              3111
Passenger               2767
Vehicle Owner           1636
Cyclist                  784
Motorcycle Driver        696
Truck Driver             346
Other Property Owner     257
Other                    215
Motorcycle Passenger      39
Moped Driver              30
Driver - Not Hit          17
Wheelchair                17
Name: INVTYPE, dtype: int64

## Involved age -- INVAGE

In [38]:
# Look at Involved age values

# It looks like we should bin under-10 and bin over-85

data_refined['INVAGE'].value_counts()

unknown     2608
20 to 24    1712
25 to 29    1637
30 to 34    1384
35 to 39    1312
50 to 54    1301
40 to 44    1274
45 to 49    1239
55 to 59    1098
60 to 64     876
15 to 19     852
65 to 69     681
70 to 74     529
75 to 79     434
80 to 84     336
10 to 14     249
85 to 89     212
5 to 9       199
0 to 4       177
90 to 94      63
Over 95       15
Name: INVAGE, dtype: int64

In [39]:
# Combine columns

data_refined['INVAGE'] = data_refined['INVAGE'].replace('5 to 9', 'Under 15')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('0 to 4', 'Under 15')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('10 to 14', 'Under 15')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('90 to 94', 'Over 85')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('Over 95', 'Over 85')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('85 to 89', 'Over 85')

In [40]:
# Check on final bins for Involved age

data_refined['INVAGE'].value_counts()

unknown     2608
20 to 24    1712
25 to 29    1637
30 to 34    1384
35 to 39    1312
50 to 54    1301
40 to 44    1274
45 to 49    1239
55 to 59    1098
60 to 64     876
15 to 19     852
65 to 69     681
Under 15     625
70 to 74     529
75 to 79     434
80 to 84     336
Over 85      290
Name: INVAGE, dtype: int64

## Injury severity -- INJURY

In [41]:
# Look at Injury severity values

# It looks like we should drop this completely as it overlaps with our target variable

data_refined['INJURY'].value_counts()

None       8564
Major      6150
Minor      1422
Minimal    1123
Fatal       929
Name: INJURY, dtype: int64

In [42]:
data_refined = data_refined.drop(columns = ['INJURY'])

## Direction of travel -- INITDIR

In [43]:
# Look at Direction of travel values

# It looks like we should combine none and unknown

data_refined['INITDIR'].value_counts()

None       5051
East       3258
West       3197
South      3104
North      3068
Unknown     510
Name: INITDIR, dtype: int64

In [44]:
# COmbine columns

data_refined['INITDIR'] = data_refined['INITDIR'].replace('Unknown', 'None')

In [45]:
# Check on final bins for Direction of travel

data_refined['INITDIR'].value_counts()

None     5561
East     3258
West     3197
South    3104
North    3068
Name: INITDIR, dtype: int64

## Vehicle Type -- VEHTYPE

In [46]:
# Look at Vehicle Type values

# It looks like we should start by binning down to Truck tank and combining Other and None, then reassess

data_refined['VEHTYPE'].value_counts()

Automobile, Station Wagon           7484
Other                               4750
None                                3227
Bicycle                              780
Motorcycle                           698
Municipal Transit Bus (TTC)          272
Pick Up Truck                        238
Truck - Open                         179
Passenger Van                        132
Delivery Van                          85
Truck - Closed (Blazer, etc)          66
Street Car                            48
Truck - Dump                          42
Truck-Tractor                         40
Taxi                                  28
Moped                                 24
Truck (other)                         19
Bus (Other) (Go Bus, Gray Coach)      17
Intercity Bus                         14
Truck - Tank                          11
Tow Truck                              7
Police Vehicle                         6
School Bus                             6
Construction Equipment                 4
Fire Vehicle    

In [47]:
# Pass the Vehicle Type to the function and combine none/other

bin_column(data_refined['VEHTYPE'], 20)

data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('None', 'Other')

In [48]:
# Check on bins for Vehicle Type again

# Let's combine the bus-related, van-related and truck-related

data_refined['VEHTYPE'].value_counts()

Other                               8011
Automobile, Station Wagon           7484
Bicycle                              780
Motorcycle                           698
Municipal Transit Bus (TTC)          272
Pick Up Truck                        238
Truck - Open                         179
Passenger Van                        132
Delivery Van                          85
Truck - Closed (Blazer, etc)          66
Street Car                            48
Truck - Dump                          42
Truck-Tractor                         40
Taxi                                  28
Moped                                 24
Truck (other)                         19
Bus (Other) (Go Bus, Gray Coach)      17
Intercity Bus                         14
Truck - Tank                          11
Name: VEHTYPE, dtype: int64

In [49]:
# Combine columns

# Bus
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Municipal Transit Bus (TTC)', 'Bus')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Bus (Other) (Go Bus, Gray Coach)', 'Bus')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Intercity Bus', 'Bus')

# Truck
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Truck - Open', 'Truck')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Truck - Closed (Blazer, etc)', 'Truck')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Truck - Dump', 'Truck')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Truck-Tractor', 'Truck')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Truck (other)', 'Truck')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Truck - Tank', 'Truck')

# Van
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Passenger Van', 'Van')
data_refined['VEHTYPE'] = data_refined['VEHTYPE'].replace('Delivery Van', 'Van')


In [50]:
# Check on final bins for Vehicle Type

data_refined['VEHTYPE'].value_counts()

Other                        8011
Automobile, Station Wagon    7484
Bicycle                       780
Motorcycle                    698
Truck                         357
Bus                           303
Pick Up Truck                 238
Van                           217
Street Car                     48
Taxi                           28
Moped                          24
Name: VEHTYPE, dtype: int64

## The type of driving manoeuvre -- MANOEUVER

In [51]:
# Look at driving manoeuvre values

# It looks like we should combine other, none and unknown and then put disabled in that category

data_refined['MANOEUVER'].value_counts()

None                                   7658
Going Ahead                            6266
Turning Left                           1785
Stopped                                 620
Turning Right                           476
Slowing or Stopping                     282
Changing Lanes                          216
Parked                                  183
Other                                   181
Reversing                               122
Unknown                                 122
Making U Turn                           106
Overtaking                               91
Pulling Away from Shoulder or Curb       40
Pulling Onto Shoulder or towardCurb      18
Merging                                  18
Disabled                                  4
Name: MANOEUVER, dtype: int64

In [52]:
# Combine columns

data_refined['MANOEUVER'] = data_refined['MANOEUVER'].replace('None', 'Other')
data_refined['MANOEUVER'] = data_refined['MANOEUVER'].replace('Unknown', 'Other')
data_refined['MANOEUVER'] = data_refined['MANOEUVER'].replace('Disabled', 'Other')

In [53]:
# Check on final bins for driving manoeuvre

data_refined['MANOEUVER'].value_counts()

Other                                  7965
Going Ahead                            6266
Turning Left                           1785
Stopped                                 620
Turning Right                           476
Slowing or Stopping                     282
Changing Lanes                          216
Parked                                  183
Reversing                               122
Making U Turn                           106
Overtaking                               91
Pulling Away from Shoulder or Curb       40
Pulling Onto Shoulder or towardCurb      18
Merging                                  18
Name: MANOEUVER, dtype: int64

## Apparent driver action -- DRIVACT

In [54]:
# Look at Apparent driver action values

# It looks like we should bin down to Improper Passing and then combine other and none

data_refined['DRIVACT'].value_counts()

None                            8949
Driving Properly                4221
Failed to Yield Right of Way    1540
Lost control                     975
Improper Turn                    573
Other                            503
Disobeyed Traffic Control        475
Following too Close              251
Exceeding Speed Limit            246
Speed too Fast For Condition     208
Improper Lane Change             122
Improper Passing                 112
Wrong Way on One Way Road          9
Speed too Slow                     4
Name: DRIVACT, dtype: int64

In [55]:
# Pass the Apparent driver action to the function and combine none/other

bin_column(data_refined['DRIVACT'], 12)

data_refined['DRIVACT'] = data_refined['DRIVACT'].replace('None', 'Other')

In [56]:
# Check on final bins for Apparent driver action

data_refined['DRIVACT'].value_counts()

Other                           9465
Driving Properly                4221
Failed to Yield Right of Way    1540
Lost control                     975
Improper Turn                    573
Disobeyed Traffic Control        475
Following too Close              251
Exceeding Speed Limit            246
Speed too Fast For Condition     208
Improper Lane Change             122
Improper Passing                 112
Name: DRIVACT, dtype: int64

## Driver condition -- DRIVCOND

In [57]:
# Look at Driver condition values

# It looks like we should combine none and normal, as well as unkown and other, and then alcohol impared

data_refined['DRIVCOND'].value_counts()

None                                  8952
Normal                                5846
Inattentive                           1581
Unknown                               1099
Medical or Physical Disability         177
Had Been Drinking                      163
Ability Impaired, Alcohol Over .08     126
Ability Impaired, Alcohol              121
Other                                   52
Fatigue                                 51
Ability Impaired, Drugs                 20
Name: DRIVCOND, dtype: int64

In [58]:
# Combine columns

data_refined['DRIVCOND'] = data_refined['DRIVCOND'].replace('None', 'Normal')
data_refined['DRIVCOND'] = data_refined['DRIVCOND'].replace('Unknown', 'Other')
data_refined['DRIVCOND'] = data_refined['DRIVCOND'].replace('Ability Impaired, Alcohol Over .08', 'Ability Impaired, Alcohol')


In [59]:
# Check on final bins for Driver condition

data_refined['DRIVCOND'].value_counts()

Normal                            14798
Inattentive                        1581
Other                              1151
Ability Impaired, Alcohol           247
Medical or Physical Disability      177
Had Been Drinking                   163
Fatigue                              51
Ability Impaired, Drugs              20
Name: DRIVCOND, dtype: int64

<hr>

# Binary variables conversion

In [60]:
# Make function for binary conversion

def binary_columns (feature):
    feature.replace({'Yes': 1}, inplace=True)
    feature.replace({'None': 0}, inplace=True)


In [61]:
# Make a list of binary variables

binary = ['PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'EMERG_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL', 'DISABILITY']

In [62]:
# Pass each binary variable for conversion

for variable in binary:
    binary_columns(data_refined[variable])

In [63]:
# Check one of the binary columns to see if worked

data_refined['PEDESTRIAN'].value_counts()

0    10836
1     7352
Name: PEDESTRIAN, dtype: int64

In [64]:
# Check to see if any binary variables are skewed heavily

for variable in binary:
    print(variable)
    print(data_refined[variable].value_counts())
    print('-'*12)

PEDESTRIAN
0    10836
1     7352
Name: PEDESTRIAN, dtype: int64
------------
CYCLIST
0    16282
1     1906
Name: CYCLIST, dtype: int64
------------
AUTOMOBILE
1    16548
0     1640
Name: AUTOMOBILE, dtype: int64
------------
MOTORCYCLE
0    16603
1     1585
Name: MOTORCYCLE, dtype: int64
------------
TRUCK
0    17066
1     1122
Name: TRUCK, dtype: int64
------------
TRSN_CITY_VEH
0    17082
1     1106
Name: TRSN_CITY_VEH, dtype: int64
------------
EMERG_VEH
0    18145
1       43
Name: EMERG_VEH, dtype: int64
------------
PASSENGER
0    11282
1     6906
Name: PASSENGER, dtype: int64
------------
SPEEDING
0    15613
1     2575
Name: SPEEDING, dtype: int64
------------
AG_DRIV
1    9458
0    8730
Name: AG_DRIV, dtype: int64
------------
REDLIGHT
0    16668
1     1520
Name: REDLIGHT, dtype: int64
------------
ALCOHOL
0    17400
1      788
Name: ALCOHOL, dtype: int64
------------
DISABILITY
0    17702
1      486
Name: DISABILITY, dtype: int64
------------


# Make Target variable ready

In [65]:
# Look at accident classification values

data_refined['ACCLASS'].value_counts()

Non-Fatal Injury        15597
Fatal                    2569
Property Damage Only       17
None                        5
Name: ACCLASS, dtype: int64

In [66]:
# Remove the 22 that are not fatal or non-fatal

data_refined = data_refined[data_refined['ACCLASS'].isin(['Non-Fatal Injury', 'Fatal'])]

In [67]:
# Make target variable a binary result with Fatal = 1

data_refined['ACCLASS'] = data_refined['ACCLASS'].replace('Fatal', 1)
data_refined['ACCLASS'] = data_refined['ACCLASS'].replace('Non-Fatal Injury', 0)

In [68]:
# Look at accident classification values

data_refined['ACCLASS'].value_counts()

0    15597
1     2569
Name: ACCLASS, dtype: int64

<hr>

# Use XGBoost on feature analysis for all features

In [69]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_refined)
dummies

Unnamed: 0,ACCLASS,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,EMERG_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,ROAD_CLASS_Collector,ROAD_CLASS_Expressway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,LOCCOORD_Intersection,LOCCOORD_Mid-Block,LOCCOORD_Other,ACCLOC_At Intersection,ACCLOC_At/Near Private Drive,ACCLOC_Intersection Related,ACCLOC_Non Intersection,ACCLOC_Other,TRAFFCTL_No Control,TRAFFCTL_Other,TRAFFCTL_Pedestrian Crossover,TRAFFCTL_Stop Sign,TRAFFCTL_Traffic Controller,TRAFFCTL_Traffic Signal,VISIBILITY_Clear,VISIBILITY_Drifting Snow,"VISIBILITY_Fog, Mist, Smoke, Dust",VISIBILITY_Freezing Rain,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,LIGHT_Dark,LIGHT_Dawn,LIGHT_Daylight,LIGHT_Dusk,RDSFCOND_Dry,RDSFCOND_Ice,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Packed Snow,RDSFCOND_Slush,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement,INVTYPE_Cyclist,INVTYPE_Driver,INVTYPE_Driver - Not Hit,INVTYPE_Moped Driver,INVTYPE_Motorcycle Driver,INVTYPE_Motorcycle Passenger,INVTYPE_Other,INVTYPE_Other Property Owner,INVTYPE_Passenger,INVTYPE_Pedestrian,INVTYPE_Truck Driver,INVTYPE_Vehicle Owner,INVTYPE_Wheelchair,INVAGE_15 to 19,INVAGE_20 to 24,INVAGE_25 to 29,INVAGE_30 to 34,INVAGE_35 to 39,INVAGE_40 to 44,INVAGE_45 to 49,INVAGE_50 to 54,INVAGE_55 to 59,INVAGE_60 to 64,INVAGE_65 to 69,INVAGE_70 to 74,INVAGE_75 to 79,INVAGE_80 to 84,INVAGE_Over 85,INVAGE_Under 15,INVAGE_unknown,INITDIR_East,INITDIR_None,INITDIR_North,INITDIR_South,INITDIR_West,"VEHTYPE_Automobile, Station Wagon",VEHTYPE_Bicycle,VEHTYPE_Bus,VEHTYPE_Moped,VEHTYPE_Motorcycle,VEHTYPE_Other,VEHTYPE_Pick Up Truck,VEHTYPE_Street Car,VEHTYPE_Taxi,VEHTYPE_Truck,VEHTYPE_Van,MANOEUVER_Changing Lanes,MANOEUVER_Going Ahead,MANOEUVER_Making U Turn,MANOEUVER_Merging,MANOEUVER_Other,MANOEUVER_Overtaking,MANOEUVER_Parked,MANOEUVER_Pulling Away from Shoulder or Curb,MANOEUVER_Pulling Onto Shoulder or towardCurb,MANOEUVER_Reversing,MANOEUVER_Slowing or Stopping,MANOEUVER_Stopped,MANOEUVER_Turning Left,MANOEUVER_Turning Right,DRIVACT_Disobeyed Traffic Control,DRIVACT_Driving Properly,DRIVACT_Exceeding Speed Limit,DRIVACT_Failed to Yield Right of Way,DRIVACT_Following too Close,DRIVACT_Improper Lane Change,DRIVACT_Improper Passing,DRIVACT_Improper Turn,DRIVACT_Lost control,DRIVACT_Other,DRIVACT_Speed too Fast For Condition,"DRIVCOND_Ability Impaired, Alcohol","DRIVCOND_Ability Impaired, Drugs",DRIVCOND_Fatigue,DRIVCOND_Had Been Drinking,DRIVCOND_Inattentive,DRIVCOND_Medical or Physical Disability,DRIVCOND_Normal,DRIVCOND_Other
0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2,1,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
4,1,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
18190,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
18191,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
18192,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [70]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [71]:
# Use XGBoost to check feature importance

model = XGBClassifier()
model.fit(X, y)


In [72]:
# Get feature importances
importances = model.feature_importances_

# Get the list of feature names
feature_names = dummies.drop(columns='ACCLASS').columns.tolist()

# Combine feature names and their corresponding importances
feature_importance = list(zip(feature_names, importances))

# Sort features by importance in descending order
feature_importance.sort(key=lambda x: x[1], reverse=True)

# Print the list of features and their importance scores
for feature, importance in feature_importance:
    print(f"{feature}, Importance: {round(importance, 2)}")

IMPACTYPE_Pedestrian Collisions, Importance: 0.03999999910593033
ROAD_CLASS_Expressway, Importance: 0.029999999329447746
TRUCK, Importance: 0.029999999329447746
DRIVCOND_Other, Importance: 0.019999999552965164
INVAGE_Over 85, Importance: 0.019999999552965164
SPEEDING, Importance: 0.019999999552965164
INVTYPE_Other, Importance: 0.019999999552965164
INVTYPE_Motorcycle Driver, Importance: 0.019999999552965164
CYCLIST, Importance: 0.019999999552965164
VISIBILITY_Other, Importance: 0.019999999552965164
IMPACTYPE_Rear End, Importance: 0.019999999552965164
RDSFCOND_Other, Importance: 0.009999999776482582
ROAD_CLASS_Other, Importance: 0.009999999776482582
VEHTYPE_Van, Importance: 0.009999999776482582
INVAGE_unknown, Importance: 0.009999999776482582
VISIBILITY_Freezing Rain, Importance: 0.009999999776482582
INVAGE_75 to 79, Importance: 0.009999999776482582
ALCOHOL, Importance: 0.009999999776482582
TRAFFCTL_Traffic Signal, Importance: 0.009999999776482582
INVAGE_80 to 84, Importance: 0.009999999

# List of the features with minimal importance to model

DRIVCOND_Inattentive, Importance: 0.0

DRIVCOND_Normal, Importance: 0.0    

DRIVCOND_Ability Impaired, Drugs, Importance: 0.0

DRIVCOND_Medical or Physical Disability, Importance: 0.0   

DRIVCOND_Fatigue, Importance: 0.0


    
DRIVACT_Driving Properly, Importance: 0.0

DRIVACT_Following too Close, Importance: 0.0

DRIVACT_Other, Importance: 0.0

DRIVACT_Disobeyed Traffic Control, Importance: 0.0

DRIVACT_Improper Turn, Importance: 0.0  

DRIVACT_Improper Passing, Importance: 0.0

DRIVACT_Improper Lane Change, Importance: 0.0    

    
INITDIR_West, Importance: 0.0

INITDIR_South, Importance: 0.0

INITDIR_East, Importance: 0.0

INITDIR_North, Importance: 0.0  


INVAGE_55 to 59, Importance: 0.0

INVAGE_20 to 24, Importance: 0.0

INVAGE_45 to 49, Importance: 0.0

INVAGE_50 to 54, Importance: 0.0

INVAGE_30 to 34, Importance: 0.0

INVAGE_25 to 29, Importance: 0.0 

INVAGE_Under 15, Importance: 0.0  

    
RDSFCOND_Slush, Importance: 0.0

RDSFCOND_Ice, Importance: 0.0   

RDSFCOND_Packed Snow, Importance: 0.0  

    
MANOEUVER_Turning Right, Importance: 0.0

MANOEUVER_Turning Left, Importance: 0.0  

MANOEUVER_Changing Lanes, Importance: 0.0

MANOEUVER_Overtaking, Importance: 0.0    

MANOEUVER_Making U Turn, Importance: 0.0

MANOEUVER_Pulling Away from Shoulder or Curb, Importance: 0.0  

MANOEUVER_Pulling Onto Shoulder or towardCurb, Importance: 0.0

MANOEUVER_Merging, Importance: 0.0

    
TRAFFCTL_Pedestrian Crossover, Importance: 0.0

    
VEHTYPE_Other, Importance: 0.0

VEHTYPE_Truck, Importance: 0.0

VEHTYPE_Taxi, Importance: 0.0  

VEHTYPE_Bicycle, Importance: 0.0

VEHTYPE_Street Car, Importance: 0.0    

VEHTYPE_Moped, Importance: 0.0

    
INVTYPE_Driver, Importance: 0.0

INVTYPE_Vehicle Owner, Importance: 0.0   

INVTYPE_Truck Driver, Importance: 0.0

INVTYPE_Wheelchair, Importance: 0.0

INVTYPE_Driver - Not Hit, Importance: 0.0

INVTYPE_Moped Driver, Importance: 0.0

INVTYPE_Motorcycle Passenger, Importance: 0.0    

    
EMERG_VEH, Importance: 0.0


VISIBILITY_Drifting Snow, Importance: 0.0

VISIBILITY_Fog, Mist, Smoke, Dust, Importance: 0.0



## Driver Condition Revisit

In [73]:
# Look at Driver condition values

# With five of the values having minimal impact, let's drop it.

data_refined['DRIVCOND'].value_counts()

Normal                            14780
Inattentive                        1579
Other                              1149
Ability Impaired, Alcohol           247
Medical or Physical Disability      177
Had Been Drinking                   163
Fatigue                              51
Ability Impaired, Drugs              20
Name: DRIVCOND, dtype: int64

In [74]:
data_refined = data_refined.drop(columns = ['DRIVCOND'])

## Driver action revisit

In [75]:
# Look at Driver action values

# With seven of the values having minimal impact, let's drop it.

data_refined['DRIVACT'].value_counts()

Other                           9455
Driving Properly                4213
Failed to Yield Right of Way    1539
Lost control                     973
Improper Turn                    573
Disobeyed Traffic Control        475
Following too Close              251
Exceeding Speed Limit            246
Speed too Fast For Condition     207
Improper Lane Change             122
Improper Passing                 112
Name: DRIVACT, dtype: int64

In [76]:
data_refined = data_refined.drop(columns = ['DRIVACT'])

## Initial direction revisit

In [77]:
# Look at Driver action values

# With four of the values having minimal impact, let's drop it.

data_refined['INITDIR'].value_counts()

None     5556
East     3252
West     3194
South    3100
North    3064
Name: INITDIR, dtype: int64

In [78]:
# Drop the column

data_refined = data_refined.drop(columns = ['INITDIR'])

## Road conditions revisit -- RDSFCOND

In [79]:
# Look at Road conditions values

# With the bottom three values having minimal impact, let's re-bin.

data_refined['RDSFCOND'].value_counts()

Dry            14585
Wet             3013
Other            176
Loose Snow       169
Slush            102
Ice               77
Packed Snow       44
Name: RDSFCOND, dtype: int64

In [80]:
# Pass the Road contidtions to the function 

bin_column(data_refined['RDSFCOND'], 4)

In [81]:
# Another look

data_refined['RDSFCOND'].value_counts()

Dry           14585
Wet            3013
Other           399
Loose Snow      169
Name: RDSFCOND, dtype: int64

## Revisit Manouvre MANOEUVER

In [82]:
# Look at Manouvre values

# With the bottom five values having minimal impact, let's re-bin.
# And then let's combine turning right with turning left, as they had minimal impact

data_refined['MANOEUVER'].value_counts()

Other                                  7957
Going Ahead                            6254
Turning Left                           1784
Stopped                                 620
Turning Right                           476
Slowing or Stopping                     282
Changing Lanes                          215
Parked                                  183
Reversing                               122
Making U Turn                           106
Overtaking                               91
Pulling Away from Shoulder or Curb       40
Pulling Onto Shoulder or towardCurb      18
Merging                                  18
Name: MANOEUVER, dtype: int64

In [83]:
# Pass the Manouvre to the function and then combine turn columns

bin_column(data_refined['MANOEUVER'], 9)

data_refined['MANOEUVER'] = data_refined['MANOEUVER'].replace('Turning Left', 'Turning')
data_refined['MANOEUVER'] = data_refined['MANOEUVER'].replace('Turning Right', 'Turning')

In [84]:
# Check values again

data_refined['MANOEUVER'].value_counts()

Other                  8230
Going Ahead            6254
Turning                2260
Stopped                 620
Slowing or Stopping     282
Changing Lanes          215
Parked                  183
Reversing               122
Name: MANOEUVER, dtype: int64

## Revisit Vehicle Type -- VEHTYPE

In [85]:
# Look at Vehicle Type

# With the bottom three values having minimal impact, let's re-bin.

data_refined['VEHTYPE'].value_counts()

Other                        8002
Automobile, Station Wagon    7476
Bicycle                       780
Motorcycle                    698
Truck                         357
Bus                           301
Pick Up Truck                 235
Van                           217
Street Car                     48
Taxi                           28
Moped                          24
Name: VEHTYPE, dtype: int64

In [86]:
# Pass the Vehicle Type to the function

bin_column(data_refined['VEHTYPE'], 8)

In [87]:
# Look at Vehicle Type values again

data_refined['VEHTYPE'].value_counts()

Other                        8102
Automobile, Station Wagon    7476
Bicycle                       780
Motorcycle                    698
Truck                         357
Bus                           301
Pick Up Truck                 235
Van                           217
Name: VEHTYPE, dtype: int64

## Revisit involved type -- INVTYPE

In [88]:
# Look at involved type

# With the bottom four values having minimal impact, let's re-bin.

data_refined['INVTYPE'].value_counts()

Driver                  8261
Pedestrian              3106
Passenger               2766
Vehicle Owner           1636
Cyclist                  784
Motorcycle Driver        696
Truck Driver             346
Other Property Owner     257
Other                    211
Motorcycle Passenger      39
Moped Driver              30
Driver - Not Hit          17
Wheelchair                17
Name: INVTYPE, dtype: int64

In [89]:
# Pass the involved type to the function

bin_column(data_refined['INVTYPE'], 9)

In [90]:
# Check again
data_refined['INVTYPE'].value_counts()

Driver                  8261
Pedestrian              3106
Passenger               2766
Vehicle Owner           1636
Cyclist                  784
Motorcycle Driver        696
Truck Driver             346
Other                    314
Other Property Owner     257
Name: INVTYPE, dtype: int64

## Revisit visibilty -- VISIBILITY

In [91]:
# Look at visibilty

# Leave it alone, as freezing rain scored well for importance

data_refined['VISIBILITY'].value_counts()

Clear                     15705
Rain                       1871
Snow                        351
Other                       121
Fog, Mist, Smoke, Dust       50
Freezing Rain                47
Drifting Snow                21
Name: VISIBILITY, dtype: int64

## Revisit Emergency vehicle ivolvement and drop it as binary

In [92]:
# Look at values for EMERG_VEH

data_refined['EMERG_VEH'].value_counts()

0    18123
1       43
Name: EMERG_VEH, dtype: int64

In [93]:
# Drop the column

data_refined = data_refined.drop(columns = ['EMERG_VEH'])

<hr>

# Machine Learning Time

In [94]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_refined)
dummies

Unnamed: 0,ACCLASS,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,ROAD_CLASS_Collector,ROAD_CLASS_Expressway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,LOCCOORD_Intersection,LOCCOORD_Mid-Block,LOCCOORD_Other,ACCLOC_At Intersection,ACCLOC_At/Near Private Drive,ACCLOC_Intersection Related,ACCLOC_Non Intersection,ACCLOC_Other,TRAFFCTL_No Control,TRAFFCTL_Other,TRAFFCTL_Pedestrian Crossover,TRAFFCTL_Stop Sign,TRAFFCTL_Traffic Controller,TRAFFCTL_Traffic Signal,VISIBILITY_Clear,VISIBILITY_Drifting Snow,"VISIBILITY_Fog, Mist, Smoke, Dust",VISIBILITY_Freezing Rain,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,LIGHT_Dark,LIGHT_Dawn,LIGHT_Daylight,LIGHT_Dusk,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement,INVTYPE_Cyclist,INVTYPE_Driver,INVTYPE_Motorcycle Driver,INVTYPE_Other,INVTYPE_Other Property Owner,INVTYPE_Passenger,INVTYPE_Pedestrian,INVTYPE_Truck Driver,INVTYPE_Vehicle Owner,INVAGE_15 to 19,INVAGE_20 to 24,INVAGE_25 to 29,INVAGE_30 to 34,INVAGE_35 to 39,INVAGE_40 to 44,INVAGE_45 to 49,INVAGE_50 to 54,INVAGE_55 to 59,INVAGE_60 to 64,INVAGE_65 to 69,INVAGE_70 to 74,INVAGE_75 to 79,INVAGE_80 to 84,INVAGE_Over 85,INVAGE_Under 15,INVAGE_unknown,"VEHTYPE_Automobile, Station Wagon",VEHTYPE_Bicycle,VEHTYPE_Bus,VEHTYPE_Motorcycle,VEHTYPE_Other,VEHTYPE_Pick Up Truck,VEHTYPE_Truck,VEHTYPE_Van,MANOEUVER_Changing Lanes,MANOEUVER_Going Ahead,MANOEUVER_Other,MANOEUVER_Parked,MANOEUVER_Reversing,MANOEUVER_Slowing or Stopping,MANOEUVER_Stopped,MANOEUVER_Turning
0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
2,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
18190,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
18191,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
18192,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [95]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [96]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [97]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [98]:
# Find input dimension length
len(X_train[0])

98

In [99]:
tuner = kt.Hyperband(
    lambda hp: create_model_var(hp, input_dim=len(X_train[0]), min_units=1, max_units=len(X_train[0]), step_units=1),
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    project_name='First Go')

INFO:tensorflow:Reloading Tuner from .\First Go\tuner0.json


In [101]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 10s]
val_accuracy: 0.8608542680740356

Best val_accuracy So Far: 0.8703214526176453
Total elapsed time: 00h 03m 52s
INFO:tensorflow:Oracle triggered exit


'First Go' Model based on all features returns a best accuracy of 0.8703214526176453

In [100]:
data_refined.nunique()

ROAD_CLASS        6
LOCCOORD          3
ACCLOC            5
TRAFFCTL          6
VISIBILITY        7
LIGHT             4
RDSFCOND          4
ACCLASS           2
IMPACTYPE         9
INVTYPE           9
INVAGE           17
VEHTYPE           8
MANOEUVER         8
PEDESTRIAN        2
CYCLIST           2
AUTOMOBILE        2
MOTORCYCLE        2
TRUCK             2
TRSN_CITY_VEH     2
PASSENGER         2
SPEEDING          2
AG_DRIV           2
REDLIGHT          2
ALCOHOL           2
DISABILITY        2
dtype: int64

# Let's run XGBoost again

In [101]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_refined)
dummies

Unnamed: 0,ACCLASS,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,ROAD_CLASS_Collector,ROAD_CLASS_Expressway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,LOCCOORD_Intersection,LOCCOORD_Mid-Block,LOCCOORD_Other,ACCLOC_At Intersection,ACCLOC_At/Near Private Drive,ACCLOC_Intersection Related,ACCLOC_Non Intersection,ACCLOC_Other,TRAFFCTL_No Control,TRAFFCTL_Other,TRAFFCTL_Pedestrian Crossover,TRAFFCTL_Stop Sign,TRAFFCTL_Traffic Controller,TRAFFCTL_Traffic Signal,VISIBILITY_Clear,VISIBILITY_Drifting Snow,"VISIBILITY_Fog, Mist, Smoke, Dust",VISIBILITY_Freezing Rain,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,LIGHT_Dark,LIGHT_Dawn,LIGHT_Daylight,LIGHT_Dusk,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement,INVTYPE_Cyclist,INVTYPE_Driver,INVTYPE_Motorcycle Driver,INVTYPE_Other,INVTYPE_Other Property Owner,INVTYPE_Passenger,INVTYPE_Pedestrian,INVTYPE_Truck Driver,INVTYPE_Vehicle Owner,INVAGE_15 to 19,INVAGE_20 to 24,INVAGE_25 to 29,INVAGE_30 to 34,INVAGE_35 to 39,INVAGE_40 to 44,INVAGE_45 to 49,INVAGE_50 to 54,INVAGE_55 to 59,INVAGE_60 to 64,INVAGE_65 to 69,INVAGE_70 to 74,INVAGE_75 to 79,INVAGE_80 to 84,INVAGE_Over 85,INVAGE_Under 15,INVAGE_unknown,"VEHTYPE_Automobile, Station Wagon",VEHTYPE_Bicycle,VEHTYPE_Bus,VEHTYPE_Motorcycle,VEHTYPE_Other,VEHTYPE_Pick Up Truck,VEHTYPE_Truck,VEHTYPE_Van,MANOEUVER_Changing Lanes,MANOEUVER_Going Ahead,MANOEUVER_Other,MANOEUVER_Parked,MANOEUVER_Reversing,MANOEUVER_Slowing or Stopping,MANOEUVER_Stopped,MANOEUVER_Turning
0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
2,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
18190,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
18191,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
18192,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [102]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [103]:
# Use XGBoost to check feature importance

model = XGBClassifier()
model.fit(X, y)

In [104]:
# Get feature importances
importances = model.feature_importances_

# Get the list of feature names
feature_names = dummies.drop(columns='ACCLASS').columns.tolist()

# Combine feature names and their corresponding importances
feature_importance = list(zip(feature_names, importances))

# Sort features by importance in descending order
feature_importance.sort(key=lambda x: x[1], reverse=True)

# Print the list of features and their importance scores
for feature, importance in feature_importance:
    print(f"{feature}, Importance: {round(importance, 2)}")

IMPACTYPE_Pedestrian Collisions, Importance: 0.05000000074505806
TRUCK, Importance: 0.029999999329447746
INVAGE_Over 85, Importance: 0.019999999552965164
SPEEDING, Importance: 0.019999999552965164
CYCLIST, Importance: 0.019999999552965164
VEHTYPE_Van, Importance: 0.019999999552965164
IMPACTYPE_Rear End, Importance: 0.019999999552965164
VISIBILITY_Other, Importance: 0.019999999552965164
ROAD_CLASS_Expressway, Importance: 0.019999999552965164
ROAD_CLASS_Other, Importance: 0.019999999552965164
VISIBILITY_Snow, Importance: 0.019999999552965164
TRAFFCTL_Other, Importance: 0.019999999552965164
REDLIGHT, Importance: 0.019999999552965164
INVAGE_80 to 84, Importance: 0.019999999552965164
ALCOHOL, Importance: 0.009999999776482582
TRAFFCTL_Traffic Signal, Importance: 0.009999999776482582
DISABILITY, Importance: 0.009999999776482582
ACCLOC_At/Near Private Drive, Importance: 0.009999999776482582
LIGHT_Daylight, Importance: 0.009999999776482582
VISIBILITY_Clear, Importance: 0.009999999776482582
INVA

## Revisit involved type again

In [105]:
# Look at involved type

# With low scores across the board, we can drop this feature.

data_refined['INVTYPE'].value_counts()

Driver                  8261
Pedestrian              3106
Passenger               2766
Vehicle Owner           1636
Cyclist                  784
Motorcycle Driver        696
Truck Driver             346
Other                    314
Other Property Owner     257
Name: INVTYPE, dtype: int64

In [106]:
#Remove the column

data_refined = data_refined.drop(columns = ['INVTYPE'])

## Revisit visibility again

In [107]:
# Look at visibilty

# Re-bin to other

data_refined['VISIBILITY'].value_counts()

Clear                     15705
Rain                       1871
Snow                        351
Other                       121
Fog, Mist, Smoke, Dust       50
Freezing Rain                47
Drifting Snow                21
Name: VISIBILITY, dtype: int64

In [108]:
# Pass the involved type to the function

bin_column(data_refined['VISIBILITY'], 4)

## Revisit vehichle type again

In [109]:
# Look at Vehicle Type

# Most of these scored low, let's remove

data_refined['VEHTYPE'].value_counts()

Other                        8102
Automobile, Station Wagon    7476
Bicycle                       780
Motorcycle                    698
Truck                         357
Bus                           301
Pick Up Truck                 235
Van                           217
Name: VEHTYPE, dtype: int64

In [110]:
#Remove the column

data_refined = data_refined.drop(columns = ['VEHTYPE'])

## Revisit age again

In [111]:
# Look at Involved age values

# Let's bin further by decade

data_refined['INVAGE'].value_counts()

unknown     2608
20 to 24    1707
25 to 29    1635
30 to 34    1383
35 to 39    1309
50 to 54    1300
40 to 44    1272
45 to 49    1236
55 to 59    1097
60 to 64     876
15 to 19     850
65 to 69     681
Under 15     625
70 to 74     528
75 to 79     434
80 to 84     335
Over 85      290
Name: INVAGE, dtype: int64

In [112]:
# Combine the columns

data_refined['INVAGE'] = data_refined['INVAGE'].replace('20 to 24', '20s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('25 to 29', '20s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('30 to 34', '30s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('35 to 39', '30s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('40 to 44', '40s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('45 to 49', '40s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('50 to 54', '50s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('55 to 59', '50s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('60 to 64', '60s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('65 to 69', '60s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('70 to 74', '70s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('75 to 79', '70s')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('80 to 84', '80 and older')
data_refined['INVAGE'] = data_refined['INVAGE'].replace('Over 85', '80 and older')

In [113]:
# Check again

data_refined['INVAGE'].value_counts()

20s             3342
30s             2692
unknown         2608
40s             2508
50s             2397
60s             1557
70s              962
15 to 19         850
Under 15         625
80 and older     625
Name: INVAGE, dtype: int64

## Revisit Manouvre again

In [114]:
# Look at Manouvre values

# None of them scored high. Let's drop it

data_refined['MANOEUVER'].value_counts()

Other                  8230
Going Ahead            6254
Turning                2260
Stopped                 620
Slowing or Stopping     282
Changing Lanes          215
Parked                  183
Reversing               122
Name: MANOEUVER, dtype: int64

In [115]:
#Remove the column

data_refined = data_refined.drop(columns = ['MANOEUVER'])

<hr>

# Run Keras Tuner again

In [116]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_refined)
dummies

Unnamed: 0,ACCLASS,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,ROAD_CLASS_Collector,ROAD_CLASS_Expressway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,LOCCOORD_Intersection,LOCCOORD_Mid-Block,LOCCOORD_Other,ACCLOC_At Intersection,ACCLOC_At/Near Private Drive,ACCLOC_Intersection Related,ACCLOC_Non Intersection,ACCLOC_Other,TRAFFCTL_No Control,TRAFFCTL_Other,TRAFFCTL_Pedestrian Crossover,TRAFFCTL_Stop Sign,TRAFFCTL_Traffic Controller,TRAFFCTL_Traffic Signal,VISIBILITY_Clear,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,LIGHT_Dark,LIGHT_Dawn,LIGHT_Daylight,LIGHT_Dusk,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement,INVAGE_15 to 19,INVAGE_20s,INVAGE_30s,INVAGE_40s,INVAGE_50s,INVAGE_60s,INVAGE_70s,INVAGE_80 and older,INVAGE_Under 15,INVAGE_unknown
0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
18190,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
18191,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
18192,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [117]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [118]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [119]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [120]:
# Find input dimension length
len(X_train[0])

63

In [121]:
tuner = kt.Hyperband(
    lambda hp: create_model_var(hp, input_dim=len(X_train[0]), min_units=1, max_units=len(X_train[0]), step_units=1),
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    project_name='After XGBoost')

INFO:tensorflow:Reloading Tuner from .\After XGBoost\tuner0.json


In [125]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

INFO:tensorflow:Oracle triggered exit


<hr>

# One more time with XGBoost

In [122]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_refined)
dummies

Unnamed: 0,ACCLASS,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,ROAD_CLASS_Collector,ROAD_CLASS_Expressway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other,LOCCOORD_Intersection,LOCCOORD_Mid-Block,LOCCOORD_Other,ACCLOC_At Intersection,ACCLOC_At/Near Private Drive,ACCLOC_Intersection Related,ACCLOC_Non Intersection,ACCLOC_Other,TRAFFCTL_No Control,TRAFFCTL_Other,TRAFFCTL_Pedestrian Crossover,TRAFFCTL_Stop Sign,TRAFFCTL_Traffic Controller,TRAFFCTL_Traffic Signal,VISIBILITY_Clear,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,LIGHT_Dark,LIGHT_Dawn,LIGHT_Daylight,LIGHT_Dusk,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement,INVAGE_15 to 19,INVAGE_20s,INVAGE_30s,INVAGE_40s,INVAGE_50s,INVAGE_60s,INVAGE_70s,INVAGE_80 and older,INVAGE_Under 15,INVAGE_unknown
0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
18190,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
18191,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
18192,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [123]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [124]:
# Use XGBoost to check feature importance

model = XGBClassifier()
model.fit(X, y)

In [125]:
# Get feature importances
importances = model.feature_importances_

# Get the list of feature names
feature_names = dummies.drop(columns='ACCLASS').columns.tolist()

# Combine feature names and their corresponding importances
feature_importance = list(zip(feature_names, importances))

# Sort features by importance in descending order
feature_importance.sort(key=lambda x: x[1], reverse=True)

# Print the list of features and their importance scores
for feature, importance in feature_importance:
    print(f"{feature}, Importance: {round(importance, 2)}")

IMPACTYPE_Pedestrian Collisions, Importance: 0.07000000029802322
TRUCK, Importance: 0.03999999910593033
INVAGE_80 and older, Importance: 0.03999999910593033
SPEEDING, Importance: 0.029999999329447746
CYCLIST, Importance: 0.029999999329447746
ROAD_CLASS_Other, Importance: 0.019999999552965164
IMPACTYPE_Rear End, Importance: 0.019999999552965164
ACCLOC_Non Intersection, Importance: 0.019999999552965164
ALCOHOL, Importance: 0.019999999552965164
REDLIGHT, Importance: 0.019999999552965164
TRAFFCTL_Traffic Signal, Importance: 0.019999999552965164
INVAGE_70s, Importance: 0.019999999552965164
LIGHT_Daylight, Importance: 0.019999999552965164
TRAFFCTL_Traffic Controller, Importance: 0.019999999552965164
DISABILITY, Importance: 0.019999999552965164
ROAD_CLASS_Minor Arterial, Importance: 0.019999999552965164
IMPACTYPE_Approaching, Importance: 0.019999999552965164
AG_DRIV, Importance: 0.019999999552965164
VISIBILITY_Snow, Importance: 0.019999999552965164
PASSENGER, Importance: 0.019999999552965164


In [126]:
# Determine the number of unique values in each column.

data_refined.nunique()

ROAD_CLASS        6
LOCCOORD          3
ACCLOC            5
TRAFFCTL          6
VISIBILITY        4
LIGHT             4
RDSFCOND          4
ACCLASS           2
IMPACTYPE         9
INVAGE           10
PEDESTRIAN        2
CYCLIST           2
AUTOMOBILE        2
MOTORCYCLE        2
TRUCK             2
TRSN_CITY_VEH     2
PASSENGER         2
SPEEDING          2
AG_DRIV           2
REDLIGHT          2
ALCOHOL           2
DISABILITY        2
dtype: int64

<hr>

# Let's try just the most important features

In [127]:
data_important = data_refined[['IMPACTYPE', 'TRUCK', 'SPEEDING', 'CYCLIST', 'ROAD_CLASS', 'ALCOHOL', 'ACCLASS']]

In [128]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_important)
dummies

Unnamed: 0,TRUCK,SPEEDING,CYCLIST,ALCOHOL,ACCLASS,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement,ROAD_CLASS_Collector,ROAD_CLASS_Expressway,ROAD_CLASS_Local,ROAD_CLASS_Major Arterial,ROAD_CLASS_Minor Arterial,ROAD_CLASS_Other
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
3,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
18190,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
18191,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
18192,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [129]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [130]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [131]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [132]:
# Find input dimension length
len(X_train[0])

19

In [133]:
tuner = kt.Hyperband(
    lambda hp: create_model_var(hp, input_dim=len(X_train[0]), min_units=1, max_units=len(X_train[0]), step_units=1),
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    project_name='Most Important Only')

In [141]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 09s]
val_accuracy: 0.8639366030693054

Best val_accuracy So Far: 0.8659180998802185
Total elapsed time: 00h 03m 50s
INFO:tensorflow:Oracle triggered exit


# Reclassify Age 

In [134]:
# Look at Involved age values

# Let's convert to binary as 'Over 70', 'Teen/New Driver'

data_refined['INVAGE'].value_counts()

20s             3342
30s             2692
unknown         2608
40s             2508
50s             2397
60s             1557
70s              962
15 to 19         850
Under 15         625
80 and older     625
Name: INVAGE, dtype: int64

In [135]:
# Make a new column for Over 70

data_refined['OVER70'] = data_refined['INVAGE']

In [136]:
# Iterate the column and change to binary

for index, value in data_refined['OVER70'].items():
    if value == '70s':
        data_refined.at[index, 'OVER70'] = 1
    elif value == '80 and older':
        data_refined.at[index, 'OVER70'] = 1
    else:
        data_refined.at[index, 'OVER70'] = 0

In [137]:
# Check the column

data_refined['OVER70'].value_counts()

0    16579
1     1587
Name: OVER70, dtype: int64

In [138]:
# Make a new column for New Driver

data_refined['NEWDRIVER'] = data_refined['INVAGE']

In [139]:
for index, value in data_refined['NEWDRIVER'].items():
    if value == '15 to 19':
        data_refined.at[index, 'NEWDRIVER'] = 1
    else:
        data_refined.at[index, 'NEWDRIVER'] = 0

In [140]:
# Check the column

data_refined['NEWDRIVER'].value_counts()

0    17316
1      850
Name: NEWDRIVER, dtype: int64

In [141]:
# Drop the age column

data_refined = data_refined.drop(columns = ['INVAGE'])

In [142]:
# Determine the number of unique values in each column.

data_refined.nunique()

ROAD_CLASS       6
LOCCOORD         3
ACCLOC           5
TRAFFCTL         6
VISIBILITY       4
LIGHT            4
RDSFCOND         4
ACCLASS          2
IMPACTYPE        9
PEDESTRIAN       2
CYCLIST          2
AUTOMOBILE       2
MOTORCYCLE       2
TRUCK            2
TRSN_CITY_VEH    2
PASSENGER        2
SPEEDING         2
AG_DRIV          2
REDLIGHT         2
ALCOHOL          2
DISABILITY       2
OVER70           2
NEWDRIVER        2
dtype: int64

## Check Road Class again

In [143]:
# Look at road classification values

# Since OTHER had the most importance per XGBoost and Major Arterial has the least, let's drop it

data_refined['ROAD_CLASS'].value_counts()

Major Arterial    12929
Minor Arterial     2840
Collector           996
Local               841
Other               428
Expressway          132
Name: ROAD_CLASS, dtype: int64

In [144]:
# Drop the road classification column

data_refined = data_refined.drop(columns = ['ROAD_CLASS'])

In [145]:
data_refined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18166 entries, 0 to 18193
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   LOCCOORD       18166 non-null  object
 1   ACCLOC         18166 non-null  object
 2   TRAFFCTL       18166 non-null  object
 3   VISIBILITY     18166 non-null  object
 4   LIGHT          18166 non-null  object
 5   RDSFCOND       18166 non-null  object
 6   ACCLASS        18166 non-null  int64 
 7   IMPACTYPE      18166 non-null  object
 8   PEDESTRIAN     18166 non-null  int64 
 9   CYCLIST        18166 non-null  int64 
 10  AUTOMOBILE     18166 non-null  int64 
 11  MOTORCYCLE     18166 non-null  int64 
 12  TRUCK          18166 non-null  int64 
 13  TRSN_CITY_VEH  18166 non-null  int64 
 14  PASSENGER      18166 non-null  int64 
 15  SPEEDING       18166 non-null  int64 
 16  AG_DRIV        18166 non-null  int64 
 17  REDLIGHT       18166 non-null  int64 
 18  ALCOHOL        18166 non-n

In [146]:
data_refined['OVER70'] = data_refined['OVER70'].astype('int64')
data_refined['NEWDRIVER'] = data_refined['NEWDRIVER'].astype('int64')
data_refined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18166 entries, 0 to 18193
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   LOCCOORD       18166 non-null  object
 1   ACCLOC         18166 non-null  object
 2   TRAFFCTL       18166 non-null  object
 3   VISIBILITY     18166 non-null  object
 4   LIGHT          18166 non-null  object
 5   RDSFCOND       18166 non-null  object
 6   ACCLASS        18166 non-null  int64 
 7   IMPACTYPE      18166 non-null  object
 8   PEDESTRIAN     18166 non-null  int64 
 9   CYCLIST        18166 non-null  int64 
 10  AUTOMOBILE     18166 non-null  int64 
 11  MOTORCYCLE     18166 non-null  int64 
 12  TRUCK          18166 non-null  int64 
 13  TRSN_CITY_VEH  18166 non-null  int64 
 14  PASSENGER      18166 non-null  int64 
 15  SPEEDING       18166 non-null  int64 
 16  AG_DRIV        18166 non-null  int64 
 17  REDLIGHT       18166 non-null  int64 
 18  ALCOHOL        18166 non-n

<hr>

# Keras Tuner time again

In [147]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_refined)
dummies

Unnamed: 0,ACCLASS,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,OVER70,NEWDRIVER,LOCCOORD_Intersection,LOCCOORD_Mid-Block,LOCCOORD_Other,ACCLOC_At Intersection,ACCLOC_At/Near Private Drive,ACCLOC_Intersection Related,ACCLOC_Non Intersection,ACCLOC_Other,TRAFFCTL_No Control,TRAFFCTL_Other,TRAFFCTL_Pedestrian Crossover,TRAFFCTL_Stop Sign,TRAFFCTL_Traffic Controller,TRAFFCTL_Traffic Signal,VISIBILITY_Clear,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,LIGHT_Dark,LIGHT_Dawn,LIGHT_Daylight,LIGHT_Dusk,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement
0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18190,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18191,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18192,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [148]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [149]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [150]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [151]:
# Find input dimension length
len(X_train[0])

49

In [152]:
tuner = kt.Hyperband(
    lambda hp: create_model_var(hp, input_dim=len(X_train[0]), min_units=1, max_units=len(X_train[0]), step_units=1),
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    project_name='Age Re-Classified')

In [161]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 09s]
val_accuracy: 0.8560105562210083

Best val_accuracy So Far: 0.8918978571891785
Total elapsed time: 00h 03m 59s
INFO:tensorflow:Oracle triggered exit


<hr>

# Let's try Keras with the ideal dashboard at this time

In [153]:
data_refined.nunique()

LOCCOORD         3
ACCLOC           5
TRAFFCTL         6
VISIBILITY       4
LIGHT            4
RDSFCOND         4
ACCLASS          2
IMPACTYPE        9
PEDESTRIAN       2
CYCLIST          2
AUTOMOBILE       2
MOTORCYCLE       2
TRUCK            2
TRSN_CITY_VEH    2
PASSENGER        2
SPEEDING         2
AG_DRIV          2
REDLIGHT         2
ALCOHOL          2
DISABILITY       2
OVER70           2
NEWDRIVER        2
dtype: int64

In [154]:
data_ideal = data_refined[['VISIBILITY', 'RDSFCOND', 'SPEEDING', 'AG_DRIV', 'ALCOHOL', 'REDLIGHT', 'OVER70', 'NEWDRIVER', 'IMPACTYPE', 'ACCLASS']]

In [155]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_ideal)
dummies

Unnamed: 0,SPEEDING,AG_DRIV,ALCOHOL,REDLIGHT,OVER70,NEWDRIVER,ACCLASS,VISIBILITY_Clear,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement
0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,1,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
4,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18190,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18191,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18192,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [156]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [157]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [158]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [159]:
# Find input dimension length
len(X_train[0])

23

In [160]:
tuner = kt.Hyperband(
    lambda hp: create_model_var(hp, input_dim=len(X_train[0]), min_units=1, max_units=len(X_train[0]), step_units=1),
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    project_name='Ideal Dashboard')

In [170]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 08s]
val_accuracy: 0.8582122325897217

Best val_accuracy So Far: 0.8588727712631226
Total elapsed time: 00h 03m 50s
INFO:tensorflow:Oracle triggered exit


<hr>

# XGBoost on refined data again to see what driving

In [161]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_refined)
dummies

Unnamed: 0,ACCLASS,PEDESTRIAN,CYCLIST,AUTOMOBILE,MOTORCYCLE,TRUCK,TRSN_CITY_VEH,PASSENGER,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,OVER70,NEWDRIVER,LOCCOORD_Intersection,LOCCOORD_Mid-Block,LOCCOORD_Other,ACCLOC_At Intersection,ACCLOC_At/Near Private Drive,ACCLOC_Intersection Related,ACCLOC_Non Intersection,ACCLOC_Other,TRAFFCTL_No Control,TRAFFCTL_Other,TRAFFCTL_Pedestrian Crossover,TRAFFCTL_Stop Sign,TRAFFCTL_Traffic Controller,TRAFFCTL_Traffic Signal,VISIBILITY_Clear,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,LIGHT_Dark,LIGHT_Dawn,LIGHT_Daylight,LIGHT_Dusk,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Angle,IMPACTYPE_Approaching,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Sideswipe,IMPACTYPE_Turning Movement
0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
4,1,0,0,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18190,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18191,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
18192,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [162]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [163]:
# Use XGBoost to check feature importance

model = XGBClassifier()
model.fit(X, y)

In [164]:
# Get feature importances
importances = model.feature_importances_

# Get the list of feature names
feature_names = dummies.drop(columns='ACCLASS').columns.tolist()

# Combine feature names and their corresponding importances
feature_importance = list(zip(feature_names, importances))

# Sort features by importance in descending order
feature_importance.sort(key=lambda x: x[1], reverse=True)

# Print the list of features and their importance scores
for feature, importance in feature_importance:
    print(f"{feature}, Importance: {round(importance, 2)}")

IMPACTYPE_Pedestrian Collisions, Importance: 0.09000000357627869
TRUCK, Importance: 0.05000000074505806
OVER70, Importance: 0.03999999910593033
SPEEDING, Importance: 0.03999999910593033
IMPACTYPE_Rear End, Importance: 0.029999999329447746
CYCLIST, Importance: 0.029999999329447746
REDLIGHT, Importance: 0.019999999552965164
VISIBILITY_Snow, Importance: 0.019999999552965164
ACCLOC_Non Intersection, Importance: 0.019999999552965164
TRAFFCTL_Traffic Signal, Importance: 0.019999999552965164
TRAFFCTL_Traffic Controller, Importance: 0.019999999552965164
ALCOHOL, Importance: 0.019999999552965164
AG_DRIV, Importance: 0.019999999552965164
ACCLOC_At Intersection, Importance: 0.019999999552965164
IMPACTYPE_Approaching, Importance: 0.019999999552965164
TRAFFCTL_No Control, Importance: 0.019999999552965164
PASSENGER, Importance: 0.019999999552965164
IMPACTYPE_Turning Movement, Importance: 0.019999999552965164
IMPACTYPE_SMV Other, Importance: 0.019999999552965164
DISABILITY, Importance: 0.019999999552

## Revisit Impact type

In [165]:
# Look at Impact type values

# Let's re-bin to SMV (single motor vehicle)

data_refined['IMPACTYPE'].value_counts()

Pedestrian Collisions    7285
Turning Movement         2788
Cyclist Collisions       1795
Rear End                 1744
SMV Other                1642
Angle                    1283
Approaching               928
Sideswipe                 506
Other                     195
Name: IMPACTYPE, dtype: int64

In [166]:
# Call binning function

bin_column(data_refined['IMPACTYPE'], 5)

In [167]:
# Check again

data_refined['IMPACTYPE'].value_counts()

Pedestrian Collisions    7285
Other                    2912
Turning Movement         2788
Cyclist Collisions       1795
Rear End                 1744
SMV Other                1642
Name: IMPACTYPE, dtype: int64

<hr>

# Keras Tuner, one more time with ideal dashboard and trimmed IMPACTYPE

In [168]:
data_idealv2 = data_refined[['VISIBILITY', 'RDSFCOND', 'SPEEDING', 'AG_DRIV', 'ALCOHOL', 'REDLIGHT', 'OVER70', 'NEWDRIVER', 'IMPACTYPE', 'ACCLASS']]

In [169]:
# Convert categorical data to numeric with `pd.get_dummies`
dummies = pd.get_dummies(data_idealv2)
dummies

Unnamed: 0,SPEEDING,AG_DRIV,ALCOHOL,REDLIGHT,OVER70,NEWDRIVER,ACCLASS,VISIBILITY_Clear,VISIBILITY_Other,VISIBILITY_Rain,VISIBILITY_Snow,RDSFCOND_Dry,RDSFCOND_Loose Snow,RDSFCOND_Other,RDSFCOND_Wet,IMPACTYPE_Cyclist Collisions,IMPACTYPE_Other,IMPACTYPE_Pedestrian Collisions,IMPACTYPE_Rear End,IMPACTYPE_SMV Other,IMPACTYPE_Turning Movement
0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0
2,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,1,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0
4,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18189,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
18190,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
18191,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0
18192,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0


In [170]:
# Split our preprocessed data into our features and target arrays
y = dummies['ACCLASS'].values
X = dummies.drop(columns='ACCLASS').values

In [171]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [172]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [173]:
# Find input dimension length
len(X_train[0])

20

In [174]:
tuner = kt.Hyperband(
    lambda hp: create_model_var(hp, input_dim=len(X_train[0]), min_units=1, max_units=len(X_train[0]), step_units=1),
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    project_name='Big Finish')

In [175]:
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 00m 08s]
val_accuracy: 0.8522677421569824

Best val_accuracy So Far: 0.8535887002944946
Total elapsed time: 00h 04m 02s
INFO:tensorflow:Oracle triggered exit


In [177]:
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'relu',
 'first_units': 20,
 'num_layers': 6,
 'units_0': 17,
 'units_1': 11,
 'units_2': 18,
 'units_3': 12,
 'units_4': 13,
 'units_5': 14,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

<hr>

# Manually construct the best model for the ideal parameters

In [178]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train[0])
hidden_nodes_first = 4
hidden_nodes_second = 18
hidden_nodes_third = 6
hidden_nodes_fourth = 11
hidden_nodes_fifth = 5
hidden_nodes_sixth = 8


nn = tf.keras.models.Sequential()

activation = 'tanh'

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_first, input_dim = input_features, activation = activation))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_second, activation = activation))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_third, activation = activation))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_fourth, activation = activation))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_fifth, activation = activation))
nn.add(tf.keras.layers.Dense(units=hidden_nodes_sixth, activation = activation))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 4)                 84        
                                                                 
 dense_4 (Dense)             (None, 18)                90        
                                                                 
 dense_5 (Dense)             (None, 6)                 114       
                                                                 
 dense_6 (Dense)             (None, 11)                77        
                                                                 
 dense_7 (Dense)             (None, 5)                 60        
                                                                 
 dense_8 (Dense)             (None, 8)                 48        
                                                                 
 dense_9 (Dense)             (None, 1)                

In [179]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [180]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [181]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

142/142 - 0s - loss: 0.3944 - accuracy: 0.8501 - 153ms/epoch - 1ms/step
Loss: 0.3944050967693329, Accuracy: 0.850066065788269


In [184]:
save_model(nn, 'crash_model.h5')
nn.save('crash_model.keras')

  save_model(nn, 'crash_model.h5')


In [185]:
data_idealv2

Unnamed: 0,VISIBILITY,RDSFCOND,SPEEDING,AG_DRIV,ALCOHOL,REDLIGHT,OVER70,NEWDRIVER,IMPACTYPE,ACCLASS
0,Clear,Dry,0,1,0,0,0,0,Pedestrian Collisions,1
1,Clear,Dry,0,1,0,0,0,0,Pedestrian Collisions,1
2,Clear,Dry,0,1,0,1,0,0,Turning Movement,1
3,Clear,Wet,1,1,1,0,0,0,Other,0
4,Clear,Dry,0,1,0,1,0,0,Turning Movement,1
...,...,...,...,...,...,...,...,...,...,...
18189,Clear,Wet,0,1,0,0,0,0,Pedestrian Collisions,0
18190,Clear,Wet,0,1,0,0,0,0,Pedestrian Collisions,0
18191,Clear,Wet,0,1,0,0,0,0,Pedestrian Collisions,0
18192,Rain,Wet,0,0,0,0,0,0,Pedestrian Collisions,0


In [186]:
data_idealv2['IMPACTYPE'].value_counts()

Pedestrian Collisions    7285
Other                    2912
Turning Movement         2788
Cyclist Collisions       1795
Rear End                 1744
SMV Other                1642
Name: IMPACTYPE, dtype: int64

In [187]:
data_idealv2['VISIBILITY'].value_counts()

Clear    15705
Rain      1871
Snow       351
Other      239
Name: VISIBILITY, dtype: int64

In [188]:
data_idealv2['RDSFCOND'].value_counts()

Dry           14585
Wet            3013
Other           399
Loose Snow      169
Name: RDSFCOND, dtype: int64

In [192]:
print(X[0])
print(y)

[0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0]
[1 1 1 ... 0 0 0]


In [194]:
dummies.columns.to_list()

['SPEEDING',
 'AG_DRIV',
 'ALCOHOL',
 'REDLIGHT',
 'OVER70',
 'NEWDRIVER',
 'ACCLASS',
 'VISIBILITY_Clear',
 'VISIBILITY_Other',
 'VISIBILITY_Rain',
 'VISIBILITY_Snow',
 'RDSFCOND_Dry',
 'RDSFCOND_Loose Snow',
 'RDSFCOND_Other',
 'RDSFCOND_Wet',
 'IMPACTYPE_Cyclist Collisions',
 'IMPACTYPE_Other',
 'IMPACTYPE_Pedestrian Collisions',
 'IMPACTYPE_Rear End',
 'IMPACTYPE_SMV Other',
 'IMPACTYPE_Turning Movement']

In [195]:
data_idealv2.to_csv('data_for_model.csv')

In [196]:
data_idealv2.nunique()

VISIBILITY    4
RDSFCOND      4
SPEEDING      2
AG_DRIV       2
ALCOHOL       2
REDLIGHT      2
OVER70        2
NEWDRIVER     2
IMPACTYPE     6
ACCLASS       2
dtype: int64

<hr>

# Summary

We can maintain 86% accuracy by using iterations of XGBoost to cull the features from our data and check it using Keras Tuner to maintain a model that is strong.

The final feature set for the dashboard would be 

VISIBILITY - dropdown

RDSFCOND - dropdown

IMPACTYPE - dropdown

SPEEDING - binary

AG_DRIV - binary

ALCOHOL - binary

REDLIGHT - binary

OVER70 - binary

NEWDRIVER - binary (this is age 15 to 19)

All to predict 
ACCLASS - Fatal or non-fatal with 86% accuracy