In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
from sklearn.utils import shuffle

In [2]:
data_path = r'/Users/alexandraszenti/Documents/GitHub Projects/ARRBHack-2018/_data'

In [10]:
df = pd.read_csv(os.path.join(data_path,'road_crash_locations.csv'))

In [11]:
df.columns

Index(['Crash_Ref_Number', 'Crash_Severity', 'Crash_Year', 'Crash_Month',
       'Crash_Day_Of_Week', 'Crash_Hour', 'Crash_Nature', 'Crash_Type',
       'Crash_Longitude_GDA94', 'Crash_Latitude_GDA94', 'Crash_Street',
       'Crash_Street_Intersecting', 'Loc_Suburb', 'Loc_Local_Government_Area',
       'Loc_Post_Code', 'Loc_Police_Division', 'Loc_Police_District',
       'Loc_Police_Region', 'Loc_Queensland_Transport_Region',
       'Loc_Main_Roads_Region', 'Loc_ABS_Statistical_Area_2',
       'Loc_ABS_Statistical_Area_3', 'Loc_ABS_Statistical_Area_4',
       'Loc_ABS_Remoteness', 'Loc_State_Electorate', 'Loc_Federal_Electorate',
       'Crash_Controlling_Authority', 'Crash_Roadway_Feature',
       'Crash_Traffic_Control', 'Crash_Speed_Limit',
       'Crash_Road_Surface_Condition', 'Crash_Atmospheric_Condition',
       'Crash_Lighting_Condition', 'Crash_Road_Horiz_Align',
       'Crash_Road_Vert_Align', 'Crash_DCA_Code', 'Crash_DCA_Description',
       'Crash_DCA_Group_Description', 'C

In [12]:
cleaned = df[['Crash_Severity', 'Crash_Month', 'Crash_Day_Of_Week', 'Crash_Hour',
#              'Crash_Longitude_GDA94', 'Crash_Latitude_GDA94',
             'Loc_Suburb',
             'Loc_ABS_Remoteness',
             'Crash_Roadway_Feature',
             'Crash_Street', 'Crash_Street_Intersecting',
             'Crash_Speed_Limit',
             'Crash_Road_Surface_Condition', 'Crash_Atmospheric_Condition',
             'Crash_Lighting_Condition',
             'Crash_Road_Horiz_Align', 'Crash_Road_Vert_Align',
             'Count_Unit_Bicycle'
             ]]

In [13]:
cleaned.head()

Unnamed: 0,Crash_Severity,Crash_Month,Crash_Day_Of_Week,Crash_Hour,Loc_Suburb,Loc_ABS_Remoteness,Crash_Roadway_Feature,Crash_Street,Crash_Street_Intersecting,Crash_Speed_Limit,Crash_Road_Surface_Condition,Crash_Atmospheric_Condition,Crash_Lighting_Condition,Crash_Road_Horiz_Align,Crash_Road_Vert_Align,Count_Unit_Bicycle
0,Property damage only,January,Monday,22,Herston,Major cities,Intersection - T-Junction,Bowen Bridge Rd,Herston Rd,60 km/h,Unknown,Clear,Darkness - Lighted,Curved - view open,Level,0.0
1,Property damage only,January,Monday,2,Caboolture,Major cities,No Roadway Feature,Pumicestone Rd,,100 - 110 km/h,Sealed - Wet,Clear,Darkness - Not lighted,Straight,Level,0.0
2,Medical treatment,January,Monday,14,Karalee,Major cities,No Roadway Feature,Lyndon Way,,0 - 50 km/h,Sealed - Dry,Clear,Daylight,Straight,Level,0.0
3,Minor injury,January,Monday,0,Dakabin,Major cities,Intersection - T-Junction,Alma Rd,Thompson Rd,60 km/h,Sealed - Dry,Clear,Darkness - Not lighted,Curved - view obscured,Level,0.0
4,Medical treatment,January,Monday,15,Kallangur,Major cities,Intersection - Cross,Duffield Rd,Redcliffe Rd,60 km/h,Sealed - Dry,Clear,Daylight,Straight,Level,0.0


In [14]:
cleaned = cleaned.fillna('')

In [15]:
isCrashRoadSurfaceConditionNotUnknown = cleaned['Crash_Road_Surface_Condition'] != 'Unknown'
isCrashAtmosphericConditionNotUnknown = cleaned['Crash_Atmospheric_Condition'] != 'Unknown'
isCrashLightingConditionNotUnknown = cleaned['Crash_Lighting_Condition'] != 'Unknown'
cleaned = cleaned[isCrashRoadSurfaceConditionNotUnknown & isCrashAtmosphericConditionNotUnknown & isCrashLightingConditionNotUnknown]

In [16]:
cleaned['Crash_Road_Vert_Align'].value_counts()

Level      229755
Grade       50406
Crest       15252
Dip         10412
Unknown        14
Name: Crash_Road_Vert_Align, dtype: int64

In [17]:
isCrashRoadVertAvail = cleaned['Crash_Road_Vert_Align'] != 'Unknown'
isCrashRoadHorizAvail = cleaned['Crash_Road_Horiz_Align'] != 'Unknown'
cleaned = cleaned[isCrashRoadVertAvail & isCrashRoadHorizAvail]

In [18]:
isMajorCities = cleaned['Loc_ABS_Remoteness'] == 'Major cities'
cleaned = cleaned[isMajorCities]
cleaned.shape

(182491, 16)

In [19]:
cleaned['Cyclist_FSI'] = cleaned.apply(lambda row: (row.Crash_Severity in ['Fatal', 'Hospitalisation'] and row.Count_Unit_Bicycle > 0), axis=1)


General cleaning until this point, remapping after


In [20]:
cleaned['isCitySpeed'] = cleaned.apply(lambda row: (row.Crash_Speed_Limit in ['60 km/h', '0 - 50 km/h']), axis=1)

In [21]:
cleaned['Crash_Lighting_Condition'] = cleaned['Crash_Lighting_Condition'].replace(['Darkness - Lighted', 'Darkness - Not lighted'], 'Darkness')

In [22]:
cleaned['combined_street'] = cleaned.apply(lambda row: ("%s - %s - %s" % (row.Loc_Suburb, row.Crash_Street, row.Crash_Street_Intersecting) ), axis=1)

In [23]:
cleaned.columns

Index(['Crash_Severity', 'Crash_Month', 'Crash_Day_Of_Week', 'Crash_Hour',
       'Loc_Suburb', 'Loc_ABS_Remoteness', 'Crash_Roadway_Feature',
       'Crash_Street', 'Crash_Street_Intersecting', 'Crash_Speed_Limit',
       'Crash_Road_Surface_Condition', 'Crash_Atmospheric_Condition',
       'Crash_Lighting_Condition', 'Crash_Road_Horiz_Align',
       'Crash_Road_Vert_Align', 'Count_Unit_Bicycle', 'Cyclist_FSI',
       'isCitySpeed', 'combined_street'],
      dtype='object')

In [24]:
cleaned.corr()

Unnamed: 0,Crash_Hour,Cyclist_FSI,isCitySpeed
Crash_Hour,1.0,-0.029247,0.024281
Cyclist_FSI,-0.029247,1.0,0.047595
isCitySpeed,0.024281,0.047595,1.0


In [29]:
cleaned['Crash_Lighting_Condition'].value_counts()

Daylight     125376
Darkness      46627
Dawn/Dusk     10488
Name: Crash_Lighting_Condition, dtype: int64

In [27]:
shuffle(cleaned).to_csv(os.path.join(data_path,'cyclist_any_address_align.csv'),
                          index=False,
                         columns=['Crash_Month', 'Crash_Day_Of_Week', 'Crash_Hour',
                                 'combined_street', 'isCitySpeed',
                                 'Crash_Road_Surface_Condition', 'Crash_Atmospheric_Condition',
                                 'Crash_Lighting_Condition',
                                 'Crash_Road_Horiz_Align', 'Crash_Road_Vert_Align',
                                 'Cyclist_FSI'
                                 ])