In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
# Import and read clean_KSI.csv
clean_ksi_df = pd.read_csv("../Preprocessing/clean_KSI.csv")
clean_ksi_df.head()

Unnamed: 0,Accident Number,Year,Date,Month,Day,Hour,Time,Weekday,Is_Weekend,Is_Holiday,...,Passenger Involved,Speeding Related,Aggressive and Distracted Driving,Red Light Related,Alcohol Related,Medical or Physical Disability Related,Police Division,Neighbourhood ID,Neighbourhood,Object ID
0,892658,2006,2006-03-11 05:00:00+00:00,3,11,8,Morning,Saturday,1,0,...,0,0,1,0,0,0,11,88,High Park North,1
1,892658,2006,2006-03-11 05:00:00+00:00,3,11,8,Morning,Saturday,1,0,...,0,0,1,0,0,0,11,88,High Park North,2
2,892810,2006,2006-03-11 05:00:00+00:00,3,11,9,Morning,Saturday,1,0,...,0,0,1,1,0,0,42,131,Rouge,3
3,892810,2006,2006-03-11 05:00:00+00:00,3,11,9,Morning,Saturday,1,0,...,0,0,1,1,0,0,42,131,Rouge,4
4,892682,2006,2006-03-12 05:00:00+00:00,3,12,2,Night,Sunday,1,0,...,0,0,0,0,1,0,41,138,Eglinton East,5


In [3]:
# Check Columns
clean_ksi_df.columns

Index(['Accident Number', 'Year', 'Date', 'Month', 'Day', 'Hour', 'Time',
       'Weekday', 'Is_Weekend', 'Is_Holiday', 'Holiday', 'Season',
       'Road Classification', 'City District', 'Latitude', 'Longitude',
       'Location Coordinate', 'Traffic Control', 'Environment Condition',
       'Light Condition', 'Road Surface Condition',
       'Classification of Accident', 'Initial Impact Type', 'Involvement Type',
       'Age of Involved Party', 'Severity of Injury', 'Type of Vehicle',
       'Vehicle Manouever', 'Pedestrian Involved', 'Cyclists Involved',
       'Driver Involved', 'Motorcyclist Involved', 'Truck Driver Involved',
       'Transit or City Vehicle Involved', 'Emergency Vehicle Involved',
       'Passenger Involved', 'Speeding Related',
       'Aggressive and Distracted Driving', 'Red Light Related',
       'Alcohol Related', 'Medical or Physical Disability Related',
       'Police Division', 'Neighbourhood ID', 'Neighbourhood', 'Object ID'],
      dtype='object')

In [4]:
# Drop non-beneficial columns
clean_ksi_df = clean_ksi_df.drop(columns=['Accident Number', 'Date', 'Latitude', 'Longitude', 'Neighbourhood', 'Object ID', 'Holiday', 'Weekday'])

In [9]:
# Drop null columns where all values are null
clean_ksi_df = clean_ksi_df.dropna(axis='columns', how='all')

# Drop null rows
clean_ksi_df = clean_ksi_df.dropna()

# Remove the '<Null>' City District values
null = clean_ksi_df['City District'] != '<Null>'
clean_ksi_df = clean_ksi_df.loc[null]

clean_ksi_df.reset_index(inplace=True, drop=True)
clean_ksi_df.head()

Unnamed: 0,Year,Month,Day,Hour,Time,Is_Weekend,Is_Holiday,Season,Road Classification,City District,...,Transit or City Vehicle Involved,Emergency Vehicle Involved,Passenger Involved,Speeding Related,Aggressive and Distracted Driving,Red Light Related,Alcohol Related,Medical or Physical Disability Related,Police Division,Neighbourhood ID
0,2006,3,11,8,Morning,1,0,Spring,Major Arterial,Toronto and East York,...,0,0,0,0,1,0,0,0,11,88
1,2006,3,11,8,Morning,1,0,Spring,Major Arterial,Toronto and East York,...,0,0,0,0,1,0,0,0,11,88
2,2006,3,11,9,Morning,1,0,Spring,Major Arterial,Scarborough,...,0,0,0,0,1,1,0,0,42,131
3,2006,3,11,9,Morning,1,0,Spring,Major Arterial,Scarborough,...,0,0,0,0,1,1,0,0,42,131
4,2006,3,12,2,Night,1,0,Spring,Major Arterial,Scarborough,...,0,0,0,0,0,0,1,0,41,138


In [17]:
# Target Variable = Classification of Accident
# Create Features
X = clean_ksi_df.drop(columns='Classification of Accident')
X = pd.get_dummies(X)

# Create Target

y = clean_ksi_df['Classification of Accident']

In [18]:
X.describe()

Unnamed: 0,Month,Day,Hour,Is_Weekend,Is_Holiday,Pedestrian Involved,Cyclists Involved,Driver Involved,Motorcyclist Involved,Truck Driver Involved,...,Vehicle Manouever_Going Ahead,Vehicle Manouever_Not Applicable,Vehicle Manouever_Other,Vehicle Manouever_Parked,Vehicle Manouever_Reversing,Vehicle Manouever_Slowing or Stopping,Vehicle Manouever_Stopped,Vehicle Manouever_Turning Left,Vehicle Manouever_Turning Right,Vehicle Manouever_Unknown
count,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,...,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0
mean,6.7897,15.598959,13.23883,0.272744,0.02554,0.405347,0.10509,0.908428,0.081225,0.061965,...,0.337042,0.429691,0.025899,0.010348,0.006759,0.014774,0.033914,0.096896,0.0256,0.007058
std,3.29152,8.85486,6.299307,0.445383,0.157763,0.490974,0.306679,0.28843,0.273188,0.2411,...,0.472713,0.495047,0.158838,0.101198,0.081936,0.120649,0.181012,0.295825,0.157942,0.083716
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,8.0,9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,16.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,10.0,23.0,18.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.0,31.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# Check the balance of our target values
y.value_counts()

Non-Fatal    14424
Fatal         2295
Name: Classification of Accident, dtype: int64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Oversampling

## Naive Random Oversampling

In [22]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

ModuleNotFoundError: No module named 'imblearn'