In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import keplergl
#from keplergl import KeplerGL
import matplotlib as plt
import plotly.express as px
import dtale 
import h3



In [2]:
#reading in data
collisions = pd.read_csv('All_Collisions.csv')
ksi = pd.read_csv('KSI.csv')

# All Collisions

In [89]:
collisions.head()

Unnamed: 0,X,Y,OBJECTID,EventUniqueId,OccurrenceDate,Month,Day_of_Week,Year,Hour,Division,Atom,Neighbourhood,Fatalities,Injury_Collisions,FTR_Collisions,PD_Collisions,Longitude,Latitude,ObjectId2
0,0.0,0.0,401,GO-20148000204,2014/01/03 05:00:00+00,January,Friday,2014,8,NSA,NSA,NSA,0,NO,NO,YES,0.0,0.0,1
1,-8833635.0,5414591.0,402,GO-20148000205,2014/01/03 05:00:00+00,January,Friday,2014,12,D54/D55,68,North Riverdale (68),0,NO,NO,YES,-79.353893,43.670491,2
2,0.0,0.0,403,GO-20148000206,2014/01/03 05:00:00+00,January,Friday,2014,8,NSA,NSA,NSA,0,NO,NO,YES,0.0,0.0,3
3,-8838998.0,5413446.0,404,GO-20148000208,2014/01/03 05:00:00+00,January,Friday,2014,11,D52,79,University (79),0,NO,NO,YES,-79.40207,43.663052,4
4,-8840999.0,5434235.0,405,GO-20148000209,2014/01/03 05:00:00+00,January,Friday,2014,13,D32,50,Newtonbrook East (50),0,NO,NO,YES,-79.420043,43.798,5


In [90]:
#converting column headers to lowercase
collisions.columns = [s.strip().replace(' ','_').lower() for s in collisions.columns]

In [91]:
#shape of dataframe
collisions.shape

(553780, 19)

In [92]:
#checking column names
collisions.columns

Index(['x', 'y', 'objectid', 'eventuniqueid', 'occurrencedate', 'month',
       'day_of_week', 'year', 'hour', 'division', 'atom', 'neighbourhood',
       'fatalities', 'injury_collisions', 'ftr_collisions', 'pd_collisions',
       'longitude', 'latitude', 'objectid2'],
      dtype='object')

In [93]:
#dropping unneeded columns
collisions = collisions.drop(columns=['x','y','objectid','atom','objectid2','occurrencedate'])
collisions.head()

Unnamed: 0,eventuniqueid,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude
0,GO-20148000204,January,Friday,2014,8,NSA,NSA,0,NO,NO,YES,0.0,0.0
1,GO-20148000205,January,Friday,2014,12,D54/D55,North Riverdale (68),0,NO,NO,YES,-79.353893,43.670491
2,GO-20148000206,January,Friday,2014,8,NSA,NSA,0,NO,NO,YES,0.0,0.0
3,GO-20148000208,January,Friday,2014,11,D52,University (79),0,NO,NO,YES,-79.40207,43.663052
4,GO-20148000209,January,Friday,2014,13,D32,Newtonbrook East (50),0,NO,NO,YES,-79.420043,43.798


In [94]:
#we will only take data from year 2016 onwards, since that is the time period of our analysis
collisions.drop(collisions[collisions['year']<2016].index,inplace=True)

In [95]:
#checking for missing values
#no missing values in the dataset
collisions.isna().sum()

eventuniqueid        0
month                0
day_of_week          0
year                 0
hour                 0
division             0
neighbourhood        0
fatalities           0
injury_collisions    0
ftr_collisions       0
pd_collisions        0
longitude            0
latitude             0
dtype: int64

In [96]:
collisions.describe()

Unnamed: 0,year,hour,fatalities,longitude,latitude
count,421934.0,421934.0,421934.0,421934.0,421934.0
mean,2018.474209,13.440192,0.000927,-67.690507,37.271687
std,1.791188,4.979152,0.031197,28.145074,15.497216
min,2016.0,0.0,0.0,-80.044952,0.0
25%,2017.0,10.0,0.0,-79.448265,43.647187
50%,2018.0,14.0,0.0,-79.375178,43.697201
75%,2020.0,17.0,0.0,-79.267238,43.755535
max,2022.0,23.0,3.0,0.0,44.435809


In [97]:
#checking how many collision rows have latitude and longitude = 0. i.e. are missing
collisions.latitude.where(collisions.longitude==0.000000).count()

62192

In [98]:
#dropping rows with missing latitudes and longitudes
collisions.drop(collisions[collisions['latitude']==0].index,inplace=True)

In [99]:
#checking if rows were dropped from dataframe
collisions.shape

(359742, 13)

In [100]:
collisions.head()

Unnamed: 0,eventuniqueid,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude
131003,GO-20168000055,January,Friday,2016,16,D42,LAmoreaux (117),0,NO,NO,YES,-79.319271,43.795851
131004,GO-20168000056,January,Friday,2016,17,D54/D55,Woodbine Corridor (64),0,YES,NO,NO,-79.320851,43.683944
131005,GO-20168000057,January,Friday,2016,19,D41,Kennedy Park (124),0,NO,YES,NO,-79.251653,43.732248
131006,GO-20168000058,January,Friday,2016,3,D52,Bay Street Corridor (76),0,NO,NO,YES,-79.385522,43.646053
131007,GO-20168000059,January,Friday,2016,4,D41,Birchcliffe-Cliffside (122),0,NO,NO,YES,-79.28402,43.68081


In [101]:
#resetting the index because dropping the rows messed up the index
collisions.reset_index(inplace=True,drop=True)
collisions.head()

Unnamed: 0,eventuniqueid,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude
0,GO-20168000055,January,Friday,2016,16,D42,LAmoreaux (117),0,NO,NO,YES,-79.319271,43.795851
1,GO-20168000056,January,Friday,2016,17,D54/D55,Woodbine Corridor (64),0,YES,NO,NO,-79.320851,43.683944
2,GO-20168000057,January,Friday,2016,19,D41,Kennedy Park (124),0,NO,YES,NO,-79.251653,43.732248
3,GO-20168000058,January,Friday,2016,3,D52,Bay Street Corridor (76),0,NO,NO,YES,-79.385522,43.646053
4,GO-20168000059,January,Friday,2016,4,D41,Birchcliffe-Cliffside (122),0,NO,NO,YES,-79.28402,43.68081


In [102]:
#creating a column to identify whether a collision resulted in fatalities. this will work as a flag

#first define function to create the labels for the new column
def fatal_collision_indicator(num_fatalities):
    if num_fatalities==0:
        return 'NO'
    elif num_fatalities>0:
        return 'YES'

#applying this function on the dataframe to create a new fatal_collision column
collisions['fatal_collisions'] = collisions['fatalities'].apply(fatal_collision_indicator)

In [103]:
#checking the values for the new columns created
collisions.fatal_collisions.value_counts()

NO     359366
YES       376
Name: fatal_collisions, dtype: int64

In [104]:
#looking at data types
collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359742 entries, 0 to 359741
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   eventuniqueid      359742 non-null  object 
 1   month              359742 non-null  object 
 2   day_of_week        359742 non-null  object 
 3   year               359742 non-null  int64  
 4   hour               359742 non-null  int64  
 5   division           359742 non-null  object 
 6   neighbourhood      359742 non-null  object 
 7   fatalities         359742 non-null  int64  
 8   injury_collisions  359742 non-null  object 
 9   ftr_collisions     359742 non-null  object 
 10  pd_collisions      359742 non-null  object 
 11  longitude          359742 non-null  float64
 12  latitude           359742 non-null  float64
 13  fatal_collisions   359742 non-null  object 
dtypes: float64(2), int64(3), object(9)
memory usage: 38.4+ MB


In [105]:
#we want to convert the latitude and longitude values to h3 hex cells

#first, we need to combine the latitude and longitude values into a coordinates column
collisions['coordinates'] = collisions['latitude'].astype('str') + ',' + collisions['longitude'].astype('str')

#then we define the function to get h3 hex cells from coordinates data
def get_hex_cell(coordinates):
    a = float(coordinates.split(',')[0])
    b = float(coordinates.split(',')[1])
    return h3.latlng_to_cell(a,b,res=10)

#finally, create a new column for hex cells by applying the function on coordinates column
collisions['hex_cell'] = collisions['coordinates'].apply(get_hex_cell)

In [106]:
#number of unique hex cells created
collisions.hex_cell.nunique()

17135

In [107]:
#creating a column for times of day from the hour column
collisions['time_of_day'] = collisions['hour'].apply(lambda x: 'morning' if 5<=x<12 else 'afternoon' if 12<=x<17 else 'evening' if 17<=x<21 else 'night')

In [108]:
collisions.head()

Unnamed: 0,eventuniqueid,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude,fatal_collisions,coordinates,hex_cell,time_of_day
0,GO-20168000055,January,Friday,2016,16,D42,LAmoreaux (117),0,NO,NO,YES,-79.319271,43.795851,NO,"43.79585129,-79.31927061",8b2b9bd7364afff,afternoon
1,GO-20168000056,January,Friday,2016,17,D54/D55,Woodbine Corridor (64),0,YES,NO,NO,-79.320851,43.683944,NO,"43.68394391,-79.32085134",8b2b9bc66305fff,evening
2,GO-20168000057,January,Friday,2016,19,D41,Kennedy Park (124),0,NO,YES,NO,-79.251653,43.732248,NO,"43.73224777,-79.25165261",8b2b9bd4c4ddfff,evening
3,GO-20168000058,January,Friday,2016,3,D52,Bay Street Corridor (76),0,NO,NO,YES,-79.385522,43.646053,NO,"43.64605338,-79.38552201",8b2b9bc4601cfff,night
4,GO-20168000059,January,Friday,2016,4,D41,Birchcliffe-Cliffside (122),0,NO,NO,YES,-79.28402,43.68081,NO,"43.68081049,-79.28402002",8b2b9bc65c81fff,night


In [109]:
collisions.describe()

Unnamed: 0,year,hour,fatalities,longitude,latitude
count,359742.0,359742.0,359742.0,359742.0,359742.0
mean,2018.414981,13.483744,0.001067,-79.392805,43.715196
std,1.805922,4.977226,0.033495,0.10177,0.0561
min,2016.0,0.0,0.0,-80.044952,43.302551
25%,2017.0,10.0,0.0,-79.464519,43.665824
50%,2018.0,14.0,0.0,-79.392814,43.714087
75%,2020.0,17.0,0.0,-79.315292,43.761509
max,2022.0,23.0,3.0,-78.495607,44.435809


In [110]:
#checking for skews 
#fatalities is the only column that is skewed. this is fixed by adding the fatal_collision boolean column earlier
collisions.skew()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



year           0.360431
hour          -0.417705
fatalities    33.637088
longitude     -0.011680
latitude       0.151831
dtype: float64

# KSI Data

In [3]:
ksi.head()

Unnamed: 0,X,Y,INDEX_,ACCNUM,YEAR,DATE,TIME,STREET1,STREET2,OFFSET,...,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,DIVISION,ObjectId
0,-8844611.0,5412414.0,3387730,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,...,Yes,,,,88,High Park North,88,High Park North (88),D11,1
1,-8844611.0,5412414.0,3387731,892658.0,2006,2006/03/11 05:00:00+00,852,BLOOR ST W,DUNDAS ST W,,...,Yes,,,,88,High Park North,88,High Park North (88),D11,2
2,-8816480.0,5434843.0,3388101,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,...,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,3
3,-8829728.0,5419071.0,3389067,893184.0,2006,2006/01/01 05:00:00+00,236,WOODBINE AVE,O CONNOR DR,,...,Yes,,Yes,,60,Woodbine-Lumsden,60,Woodbine-Lumsden (60),D55,4
4,-8816480.0,5434843.0,3388102,892810.0,2006,2006/03/11 05:00:00+00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,...,Yes,Yes,,,146,Malvern East,132,Malvern (132),D42,5


In [4]:
ksi.columns

Index(['X', 'Y', 'INDEX_', 'ACCNUM', 'YEAR', 'DATE', 'TIME', 'STREET1',
       'STREET2', 'OFFSET', 'ROAD_CLASS', 'DISTRICT', 'WARDNUM', 'LATITUDE',
       'LONGITUDE', 'LOCCOORD', 'ACCLOC', 'TRAFFCTL', 'VISIBILITY', 'LIGHT',
       'RDSFCOND', 'ACCLASS', 'IMPACTYPE', 'INVTYPE', 'INVAGE', 'INJURY',
       'FATAL_NO', 'INITDIR', 'VEHTYPE', 'MANOEUVER', 'DRIVACT', 'DRIVCOND',
       'PEDTYPE', 'PEDACT', 'PEDCOND', 'CYCLISTYPE', 'CYCACT', 'CYCCOND',
       'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK',
       'TRSN_CITY_VEH', 'EMERG_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV',
       'REDLIGHT', 'ALCOHOL', 'DISABILITY', 'HOOD_158', 'NEIGHBOURHOOD_158',
       'HOOD_140', 'NEIGHBOURHOOD_140', 'DIVISION', 'ObjectId'],
      dtype='object')

In [5]:
#convert column headers to lowercase, strip whitespace, replace spaces with underscore
ksi.columns = [s.strip().replace(' ','_').lower() for s in ksi.columns]

In [6]:
#renaming index column
ksi.rename(columns={'index_':'index'},inplace=True)

In [7]:
#dropping unneccesary columns
ksi = ksi.drop(columns = ['x','y','accnum','offset','fatal_no','initdir','hood_158','hood_140','neighbourhood_140','wardnum','objectid'])

In [8]:
#dropping rows with year before 2016
ksi.drop(ksi[ksi['year']<2016].index,inplace=True)

In [9]:
#resetting the index because dropping the rows messed up the index
ksi.reset_index(inplace=True,drop=True)
ksi.head()

Unnamed: 0,index,year,date,time,street1,street2,road_class,district,latitude,longitude,...,trsn_city_veh,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,neighbourhood_158,division
0,80566288,2016,2016/08/01 04:00:00+00,156,MORNINGSIDE AVE,HALFWAY AVE,Minor Arterial,Scarborough,43.817847,-79.213687,...,Yes,,Yes,,Yes,,,,Malvern East,D42
1,80632072,2016,2016/08/02 04:00:00+00,1745,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,...,,,,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51
2,80632073,2016,2016/08/02 04:00:00+00,1745,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,...,,,,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51
3,80632074,2016,2016/08/02 04:00:00+00,2112,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,...,,,Yes,,Yes,,,,Harbourfront-CityPlace,D52
4,80632075,2016,2016/08/02 04:00:00+00,2112,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,...,,,Yes,,Yes,,,,Harbourfront-CityPlace,D52


In [10]:
ksi.shape

(5962, 46)

In [11]:
#checking for duplicate rows
ksi.duplicated().sum()

0

In [12]:
#checking null values
ksi.isnull().sum()

index                   0
year                    0
date                    0
time                    0
street1                 0
street2               607
road_class             55
district               96
latitude                0
longitude               0
loccoord                5
accloc                 10
traffctl                5
visibility             20
light                   0
rdsfcond               25
acclass                 5
impactype               4
invtype                 9
invage                  0
injury                 12
vehtype              2244
manoeuver            2220
drivact              2600
drivcond             2601
pedtype              4852
pedact               4820
pedcond              4813
cyclistype           5691
cycact               5685
cyccond              5685
pedestrian           3437
cyclist              5351
automobile            554
motorcycle           5222
truck                5617
trsn_city_veh        5622
emerg_veh            5932
passenger   

In [13]:
#filling null values with N/A
ksi.fillna('N/A',inplace=True)

In [14]:
#adding a column called intersection
ksi['intersection'] = ksi['street1'].str.cat(ksi['street2'],sep=',')

In [15]:
#converting date column to datetime data type
ksi.date = ksi.date.astype('datetime64')

In [16]:
#only showing date in the date column, not the time
ksi['date'] = ksi['date'].dt.date

In [17]:
#rounding the time column to the nearest hour segment
import math
ksi['time'] = [t // 100 + (1 if t % 100 >= 30 else 0) for t in ksi['time']]

In [18]:
#creating a column for times of day based on the hour column
ksi['time_of_day'] = ksi['time'].apply(lambda x: 'morning' if 5<=x<12 else 'afternoon' if 12<=x<17 else 'evening' if 17<=x<21 else 'night')

In [27]:
#we want to convert the latitude and longitude values to h3 hex cells

#drop the columns if they exist
ksi.drop(columns=['coordinates','hex_cell'])

#first, we need to combine the latitude and longitude values into a coordinates column
ksi['coordinates'] = ksi['latitude'].astype('str') + ',' + ksi['longitude'].astype('str')

#then we define the function to get h3 hex cells from coordinates data
def get_hex_cell(coordinates):
    a = float(coordinates.split(',')[0])
    b = float(coordinates.split(',')[1])
    return h3.latlng_to_cell(a,b,res=10)

#finally, create a new column for hex cells by applying the function on coordinates column
ksi['hex_cell'] = ksi['coordinates'].apply(get_hex_cell)

In [19]:
ksi.head()

Unnamed: 0,index,year,date,time,street1,street2,road_class,district,latitude,longitude,...,passenger,speeding,ag_driv,redlight,alcohol,disability,neighbourhood_158,division,intersection,time_of_day
0,80566288,2016,2016-08-01,2,MORNINGSIDE AVE,HALFWAY AVE,Minor Arterial,Scarborough,43.817847,-79.213687,...,Yes,,Yes,,,,Malvern East,D42,"MORNINGSIDE AVE,HALFWAY AVE",night
1,80632072,2016,2016-08-02,18,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,...,,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51,"FRONT ST E,YONGE ST",evening
2,80632073,2016,2016-08-02,18,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,...,,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51,"FRONT ST E,YONGE ST",evening
3,80632074,2016,2016-08-02,21,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,...,Yes,,Yes,,,,Harbourfront-CityPlace,D52,"BLUE JAYS WAY,NAVY WHARF CRT",night
4,80632075,2016,2016-08-02,21,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,...,Yes,,Yes,,,,Harbourfront-CityPlace,D52,"BLUE JAYS WAY,NAVY WHARF CRT",night


In [31]:
ksi.index.duplicated().sum()

0

In [20]:
ksi.describe()

Unnamed: 0,index,year,time,latitude,longitude
count,5962.0,5962.0,5962.0,5962.0,5962.0
mean,81146010.0,2018.653304,14.235156,43.711171,-79.396253
std,383215.0,1.954197,6.378966,0.057434,0.106259
min,80000080.0,2016.0,0.0,43.589678,-79.633502
25%,80784030.0,2017.0,10.0,43.659791,-79.476459
50%,81134140.0,2018.0,15.0,43.706267,-79.399108
75%,81509600.0,2020.0,19.0,43.757796,-79.318286
max,81706060.0,2022.0,24.0,43.84878,-79.122974


In [32]:
#checking for skews
ksi.skew()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



index       -0.101749
year         0.278481
time        -0.558438
latitude     0.214206
longitude    0.073801
dtype: float64