In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import keplergl
#from keplergl import KeplerGL
import matplotlib as plt
import plotly.express as px
import dtale 
import h3



In [117]:
#reading in data
collisions = gpd.read_file('All_Collisions.geojson')
ksi = gpd.read_file('KSI.geojson')

# All Collisions

In [72]:
collisions.head()

Unnamed: 0,OBJECTID,EventUniqueId,OccurrenceDate,Month,Day_of_Week,Year,Hour,Division,Atom,Neighbourhood,Fatalities,Injury_Collisions,FTR_Collisions,PD_Collisions,Longitude,Latitude,ObjectId2,geometry
0,401,GO-20148000204,2014-01-03 05:00:00+00:00,January,Friday,2014,8,NSA,NSA,NSA,0,NO,NO,YES,0.0,0.0,1,POINT (0.00000 0.00000)
1,402,GO-20148000205,2014-01-03 05:00:00+00:00,January,Friday,2014,12,D54/D55,68,North Riverdale (68),0,NO,NO,YES,-79.353893,43.670491,2,POINT (-79.35389 43.67049)
2,403,GO-20148000206,2014-01-03 05:00:00+00:00,January,Friday,2014,8,NSA,NSA,NSA,0,NO,NO,YES,0.0,0.0,3,POINT (0.00000 0.00000)
3,404,GO-20148000208,2014-01-03 05:00:00+00:00,January,Friday,2014,11,D52,79,University (79),0,NO,NO,YES,-79.40207,43.663052,4,POINT (-79.40207 43.66305)
4,405,GO-20148000209,2014-01-03 05:00:00+00:00,January,Friday,2014,13,D32,50,Newtonbrook East (50),0,NO,NO,YES,-79.420043,43.798,5,POINT (-79.42004 43.79800)


In [73]:
#converting column headers to lowercase
collisions.columns = [s.strip().replace(' ','_').lower() for s in collisions.columns]

In [74]:
#shape of dataframe
collisions.shape

(553780, 18)

In [75]:
#checking column names
collisions.columns

Index(['objectid', 'eventuniqueid', 'occurrencedate', 'month', 'day_of_week',
       'year', 'hour', 'division', 'atom', 'neighbourhood', 'fatalities',
       'injury_collisions', 'ftr_collisions', 'pd_collisions', 'longitude',
       'latitude', 'objectid2', 'geometry'],
      dtype='object')

In [76]:
#dropping unneeded columns
collisions = collisions.drop(columns=['objectid','atom','objectid2','occurrencedate'])
collisions.head()

Unnamed: 0,eventuniqueid,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude,geometry
0,GO-20148000204,January,Friday,2014,8,NSA,NSA,0,NO,NO,YES,0.0,0.0,POINT (0.00000 0.00000)
1,GO-20148000205,January,Friday,2014,12,D54/D55,North Riverdale (68),0,NO,NO,YES,-79.353893,43.670491,POINT (-79.35389 43.67049)
2,GO-20148000206,January,Friday,2014,8,NSA,NSA,0,NO,NO,YES,0.0,0.0,POINT (0.00000 0.00000)
3,GO-20148000208,January,Friday,2014,11,D52,University (79),0,NO,NO,YES,-79.40207,43.663052,POINT (-79.40207 43.66305)
4,GO-20148000209,January,Friday,2014,13,D32,Newtonbrook East (50),0,NO,NO,YES,-79.420043,43.798,POINT (-79.42004 43.79800)


In [77]:
#we will only take data from year 2016 onwards, since that is the time period of our analysis
collisions.drop(collisions[collisions['year']<2016].index,inplace=True)

In [78]:
#checking for missing values
#no missing values in the dataset
collisions.isna().sum()

eventuniqueid        0
month                0
day_of_week          0
year                 0
hour                 0
division             0
neighbourhood        0
fatalities           0
injury_collisions    0
ftr_collisions       0
pd_collisions        0
longitude            0
latitude             0
geometry             0
dtype: int64

In [79]:
collisions.describe()

Unnamed: 0,year,hour,fatalities,longitude,latitude
count,421934.0,421934.0,421934.0,421934.0,421934.0
mean,2018.474209,13.440192,0.000927,-67.690507,37.271687
std,1.791188,4.979152,0.031197,28.145074,15.497216
min,2016.0,0.0,0.0,-80.044952,0.0
25%,2017.0,10.0,0.0,-79.448265,43.647187
50%,2018.0,14.0,0.0,-79.375178,43.697201
75%,2020.0,17.0,0.0,-79.267238,43.755535
max,2022.0,23.0,3.0,0.0,44.435809


In [80]:
#checking how many collision rows have latitude and longitude = 0. i.e. are missing
collisions.latitude.where(collisions.longitude==0.000000).count()

62192

In [81]:
#dropping rows with missing latitudes and longitudes
collisions.drop(collisions[collisions['latitude']==0].index,inplace=True)

In [82]:
#checking if rows were dropped from dataframe
collisions.shape

(359742, 14)

In [83]:
collisions.head()

Unnamed: 0,eventuniqueid,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude,geometry
131003,GO-20168000055,January,Friday,2016,16,D42,LAmoreaux (117),0,NO,NO,YES,-79.319271,43.795851,POINT (-79.31927 43.79585)
131004,GO-20168000056,January,Friday,2016,17,D54/D55,Woodbine Corridor (64),0,YES,NO,NO,-79.320851,43.683944,POINT (-79.32085 43.68394)
131005,GO-20168000057,January,Friday,2016,19,D41,Kennedy Park (124),0,NO,YES,NO,-79.251653,43.732248,POINT (-79.25165 43.73225)
131006,GO-20168000058,January,Friday,2016,3,D52,Bay Street Corridor (76),0,NO,NO,YES,-79.385522,43.646053,POINT (-79.38552 43.64605)
131007,GO-20168000059,January,Friday,2016,4,D41,Birchcliffe-Cliffside (122),0,NO,NO,YES,-79.28402,43.68081,POINT (-79.28402 43.68081)


In [84]:
#resetting the index because dropping the rows messed up the index
collisions.reset_index(inplace=True,drop=True)
collisions.head()

Unnamed: 0,eventuniqueid,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude,geometry
0,GO-20168000055,January,Friday,2016,16,D42,LAmoreaux (117),0,NO,NO,YES,-79.319271,43.795851,POINT (-79.31927 43.79585)
1,GO-20168000056,January,Friday,2016,17,D54/D55,Woodbine Corridor (64),0,YES,NO,NO,-79.320851,43.683944,POINT (-79.32085 43.68394)
2,GO-20168000057,January,Friday,2016,19,D41,Kennedy Park (124),0,NO,YES,NO,-79.251653,43.732248,POINT (-79.25165 43.73225)
3,GO-20168000058,January,Friday,2016,3,D52,Bay Street Corridor (76),0,NO,NO,YES,-79.385522,43.646053,POINT (-79.38552 43.64605)
4,GO-20168000059,January,Friday,2016,4,D41,Birchcliffe-Cliffside (122),0,NO,NO,YES,-79.28402,43.68081,POINT (-79.28402 43.68081)


In [85]:
#setting eventuniqueid as index
collisions.set_index('eventuniqueid',inplace=True)

In [86]:
#creating a column to identify whether a collision resulted in fatalities. this will work as a flag

#first define function to create the labels for the new column
def fatal_collision_indicator(num_fatalities):
    if num_fatalities==0:
        return 'NO'
    elif num_fatalities>0:
        return 'YES'

#applying this function on the dataframe to create a new fatal_collision column
collisions['fatal_collisions'] = collisions['fatalities'].apply(fatal_collision_indicator)

In [87]:
#checking the values for the new columns created
collisions.fatal_collisions.value_counts()

NO     359366
YES       376
Name: fatal_collisions, dtype: int64

In [88]:
#looking at data types
collisions.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 359742 entries, GO-20168000055 to GO-20228022870
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   month              359742 non-null  object  
 1   day_of_week        359742 non-null  object  
 2   year               359742 non-null  int64   
 3   hour               359742 non-null  int64   
 4   division           359742 non-null  object  
 5   neighbourhood      359742 non-null  object  
 6   fatalities         359742 non-null  int64   
 7   injury_collisions  359742 non-null  object  
 8   ftr_collisions     359742 non-null  object  
 9   pd_collisions      359742 non-null  object  
 10  longitude          359742 non-null  float64 
 11  latitude           359742 non-null  float64 
 12  geometry           359742 non-null  geometry
 13  fatal_collisions   359742 non-null  object  
dtypes: float64(2), geometry(1), int64(3), object(8)
memory usage

In [89]:
#we want to convert the latitude and longitude values to h3 hex cells

#first, we need to combine the latitude and longitude values into a coordinates column
#collisions['coordinates'] = collisions['latitude'].astype('str') + ',' + collisions['longitude'].astype('str')

#then we define the function to get h3 hex cells from coordinates data
#def get_hex_cell(coordinates):
    a = float(coordinates.split(',')[0])
    b = float(coordinates.split(',')[1])
    return h3.latlng_to_cell(a,b,res=10)

#finally, create a new column for hex cells by applying the function on coordinates column
#collisions['hex_cell'] = collisions['coordinates'].apply(get_hex_cell)

IndentationError: unexpected indent (2758631492.py, line 8)

In [90]:
#number of unique hex cells created
#collisions.hex_cell.nunique()

In [91]:
#creating a column for times of day from the hour column
collisions['time_of_day'] = collisions['hour'].apply(lambda x: 'morning' if 5<=x<12 else 'afternoon' if 12<=x<17 else 'evening' if 17<=x<21 else 'night')

In [92]:
collisions.head()

Unnamed: 0_level_0,month,day_of_week,year,hour,division,neighbourhood,fatalities,injury_collisions,ftr_collisions,pd_collisions,longitude,latitude,geometry,fatal_collisions,time_of_day
eventuniqueid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
GO-20168000055,January,Friday,2016,16,D42,LAmoreaux (117),0,NO,NO,YES,-79.319271,43.795851,POINT (-79.31927 43.79585),NO,afternoon
GO-20168000056,January,Friday,2016,17,D54/D55,Woodbine Corridor (64),0,YES,NO,NO,-79.320851,43.683944,POINT (-79.32085 43.68394),NO,evening
GO-20168000057,January,Friday,2016,19,D41,Kennedy Park (124),0,NO,YES,NO,-79.251653,43.732248,POINT (-79.25165 43.73225),NO,evening
GO-20168000058,January,Friday,2016,3,D52,Bay Street Corridor (76),0,NO,NO,YES,-79.385522,43.646053,POINT (-79.38552 43.64605),NO,night
GO-20168000059,January,Friday,2016,4,D41,Birchcliffe-Cliffside (122),0,NO,NO,YES,-79.28402,43.68081,POINT (-79.28402 43.68081),NO,night


In [93]:
collisions.describe()

Unnamed: 0,year,hour,fatalities,longitude,latitude
count,359742.0,359742.0,359742.0,359742.0,359742.0
mean,2018.414981,13.483744,0.001067,-79.392805,43.715196
std,1.805922,4.977226,0.033495,0.10177,0.0561
min,2016.0,0.0,0.0,-80.044952,43.302551
25%,2017.0,10.0,0.0,-79.464519,43.665824
50%,2018.0,14.0,0.0,-79.392814,43.714087
75%,2020.0,17.0,0.0,-79.315292,43.761509
max,2022.0,23.0,3.0,-78.495607,44.435809


In [94]:
#checking for skews 
#fatalities is the only column that is skewed. this is fixed by adding the fatal_collision boolean column earlier
collisions.fatalities.skew()

33.63708848648862

In [96]:
#adding a column prefix for columns in the dataset
collisions.columns = ['collisions_' + s if s != 'geometry' else s for s in collisions.columns]

In [97]:
collisions.to_file('collisions_processed.geojson')

# KSI Data

In [118]:
ksi.head()

Unnamed: 0,INDEX_,ACCNUM,YEAR,DATE,TIME,STREET1,STREET2,OFFSET,ROAD_CLASS,DISTRICT,...,REDLIGHT,ALCOHOL,DISABILITY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,DIVISION,ObjectId,geometry
0,3387730,892658,2006,2006-03-11 05:00:00+00:00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,...,,,,88,High Park North,88,High Park North (88),D11,1,POINT (-79.45249 43.65635)
1,3387731,892658,2006,2006-03-11 05:00:00+00:00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,...,,,,88,High Park North,88,High Park North (88),D11,2,POINT (-79.45249 43.65635)
2,3388101,892810,2006,2006-03-11 05:00:00+00:00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,...,Yes,,,146,Malvern East,132,Malvern (132),D42,3,POINT (-79.19979 43.80194)
3,3389067,893184,2006,2006-01-01 05:00:00+00:00,236,WOODBINE AVE,O CONNOR DR,,Major Arterial,Toronto and East York,...,,Yes,,60,Woodbine-Lumsden,60,Woodbine-Lumsden (60),D55,4,POINT (-79.31880 43.69960)
4,3388102,892810,2006,2006-03-11 05:00:00+00:00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,...,Yes,,,146,Malvern East,132,Malvern (132),D42,5,POINT (-79.19979 43.80194)


In [119]:
ksi.columns

Index(['INDEX_', 'ACCNUM', 'YEAR', 'DATE', 'TIME', 'STREET1', 'STREET2',
       'OFFSET', 'ROAD_CLASS', 'DISTRICT', 'WARDNUM', 'LATITUDE', 'LONGITUDE',
       'LOCCOORD', 'ACCLOC', 'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND',
       'ACCLASS', 'IMPACTYPE', 'INVTYPE', 'INVAGE', 'INJURY', 'FATAL_NO',
       'INITDIR', 'VEHTYPE', 'MANOEUVER', 'DRIVACT', 'DRIVCOND', 'PEDTYPE',
       'PEDACT', 'PEDCOND', 'CYCLISTYPE', 'CYCACT', 'CYCCOND', 'PEDESTRIAN',
       'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH',
       'EMERG_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL',
       'DISABILITY', 'HOOD_158', 'NEIGHBOURHOOD_158', 'HOOD_140',
       'NEIGHBOURHOOD_140', 'DIVISION', 'ObjectId', 'geometry'],
      dtype='object')

In [120]:
#convert column headers to lowercase, strip whitespace, replace spaces with underscore
ksi.columns = [s.strip().replace(' ','_').lower() for s in ksi.columns]

In [121]:
#renaming index column
ksi.rename(columns={'index_':'index'},inplace=True)

In [122]:
#dropping unneccesary columns
ksi = ksi.drop(columns = ['accnum','offset','fatal_no','initdir','hood_158','hood_140','neighbourhood_140','wardnum','objectid'])

In [123]:
#dropping rows with year before 2016
ksi.drop(ksi[ksi['year']<2016].index,inplace=True)

In [124]:
#resetting the index because dropping the rows messed up the index
ksi.reset_index(inplace=True,drop=True)
ksi.head()

Unnamed: 0,index,year,date,time,street1,street2,road_class,district,latitude,longitude,...,emerg_veh,passenger,speeding,ag_driv,redlight,alcohol,disability,neighbourhood_158,division,geometry
0,80566288,2016,2016-08-01 04:00:00+00:00,156,MORNINGSIDE AVE,HALFWAY AVE,Minor Arterial,Scarborough,43.817847,-79.213687,...,,Yes,,Yes,,,,Malvern East,D42,POINT (-79.21369 43.81785)
1,80632072,2016,2016-08-02 04:00:00+00:00,1745,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,...,,,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51,POINT (-79.37693 43.64681)
2,80632073,2016,2016-08-02 04:00:00+00:00,1745,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,...,,,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51,POINT (-79.37693 43.64681)
3,80632074,2016,2016-08-02 04:00:00+00:00,2112,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,...,,Yes,,Yes,,,,Harbourfront-CityPlace,D52,POINT (-79.39186 43.64157)
4,80632075,2016,2016-08-02 04:00:00+00:00,2112,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,...,,Yes,,Yes,,,,Harbourfront-CityPlace,D52,POINT (-79.39186 43.64157)


In [125]:
#setting accnum as index
ksi.set_index('index',inplace=True)

In [126]:
ksi.shape

(5962, 46)

In [127]:
#checking for duplicate rows
ksi.duplicated().sum()

115

In [12]:
#checking null values
ksi.isnull().sum()

index                   0
year                    0
date                    0
time                    0
street1                 0
street2               607
road_class             55
district               96
latitude                0
longitude               0
loccoord                5
accloc                 10
traffctl                5
visibility             20
light                   0
rdsfcond               25
acclass                 5
impactype               4
invtype                 9
invage                  0
injury                 12
vehtype              2244
manoeuver            2220
drivact              2600
drivcond             2601
pedtype              4852
pedact               4820
pedcond              4813
cyclistype           5691
cycact               5685
cyccond              5685
pedestrian           3437
cyclist              5351
automobile            554
motorcycle           5222
truck                5617
trsn_city_veh        5622
emerg_veh            5932
passenger   

In [128]:
#filling null values with N/A
ksi.fillna('N/A',inplace=True)

In [129]:
#adding a column called intersection
ksi['intersection'] = ksi['street1'].str.cat(ksi['street2'],sep=',')

In [134]:
#converting date column to datetime data type
#ksi.date = ksi.date.astype('datetime64')

In [135]:
#only showing date in the date column, not the time
ksi['date'] = ksi['date'].dt.date

In [138]:
#converting time to integer data type
ksi['time'] = ksi['time'].astype('int')

In [139]:
#rounding the time column to the nearest hour segment
import math
ksi['time'] = [t // 100 + (1 if t % 100 >= 30 else 0) for t in ksi['time']]

In [140]:
#creating a column for times of day based on the hour column
ksi['time_of_day'] = ksi['time'].apply(lambda x: 'morning' if 5<=x<12 else 'afternoon' if 12<=x<17 else 'evening' if 17<=x<21 else 'night')

In [142]:
#we want to convert the latitude and longitude values to h3 hex cells

#drop the columns if they exist
#ksi.drop(columns=['coordinates','hex_cell'])

#first, we need to combine the latitude and longitude values into a coordinates column
#ksi['coordinates'] = ksi['latitude'].astype('str') + ',' + ksi['longitude'].astype('str')

#then we define the function to get h3 hex cells from coordinates data
#def get_hex_cell(coordinates):
    a = float(coordinates.split(',')[0])
    b = float(coordinates.split(',')[1])
    return h3.latlng_to_cell(a,b,res=10)

#finally, create a new column for hex cells by applying the function on coordinates column
#ksi['hex_cell'] = ksi['coordinates'].apply(get_hex_cell)

IndentationError: unexpected indent (2608546461.py, line 11)

In [143]:
ksi.head()

Unnamed: 0_level_0,year,date,time,street1,street2,road_class,district,latitude,longitude,loccoord,...,speeding,ag_driv,redlight,alcohol,disability,neighbourhood_158,division,geometry,intersection,time_of_day
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80566288,2016,2016-08-01,2,MORNINGSIDE AVE,HALFWAY AVE,Minor Arterial,Scarborough,43.817847,-79.213687,Intersection,...,,Yes,,,,Malvern East,D42,POINT (-79.21369 43.81785),"MORNINGSIDE AVE,HALFWAY AVE",night
80632072,2016,2016-08-02,18,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,Intersection,...,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51,POINT (-79.37693 43.64681),"FRONT ST E,YONGE ST",evening
80632073,2016,2016-08-02,18,FRONT ST E,YONGE ST,Major Arterial,Toronto and East York,43.646811,-79.376933,Intersection,...,,Yes,,,,St Lawrence-East Bayfront-The Islands,D51,POINT (-79.37693 43.64681),"FRONT ST E,YONGE ST",evening
80632074,2016,2016-08-02,21,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,Intersection,...,,Yes,,,,Harbourfront-CityPlace,D52,POINT (-79.39186 43.64157),"BLUE JAYS WAY,NAVY WHARF CRT",night
80632075,2016,2016-08-02,21,BLUE JAYS WAY,NAVY WHARF CRT,Minor Arterial,Toronto and East York,43.64157,-79.391864,Intersection,...,,Yes,,,,Harbourfront-CityPlace,D52,POINT (-79.39186 43.64157),"BLUE JAYS WAY,NAVY WHARF CRT",night


In [144]:
ksi.index.duplicated().sum()

0

In [146]:
ksi.describe()

Unnamed: 0,year,time,latitude,longitude
count,5962.0,5962.0,5962.0,5962.0
mean,2018.653304,14.235156,43.711171,-79.396253
std,1.954197,6.378966,0.057434,0.106259
min,2016.0,0.0,43.589678,-79.633502
25%,2017.0,10.0,43.659791,-79.476459
50%,2018.0,15.0,43.706267,-79.399108
75%,2020.0,19.0,43.757796,-79.318286
max,2022.0,24.0,43.84878,-79.122974


In [150]:
#adding a column prefix for columns in the dataset
ksi.columns = ['ksi_' + s if s != 'geometry' else s for s in ksi.columns]

In [153]:
ksi.to_csv('ksi_processed.csv')