In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [3]:
# Import and read clean_KSI.csv
clean_ksi_df = pd.read_csv("../Preprocessing/clean_KSI.csv")
clean_ksi_df.head()

Unnamed: 0,Accident Number,Year,Date,Month,Day,Hour,Time,Weekday,Is_Weekend,Is_Holiday,...,Passenger Involved,Speeding Related,Aggressive and Distracted Driving,Red Light Related,Alcohol Related,Medical or Physical Disability Related,Police Division,Neighbourhood ID,Neighbourhood,Object ID
0,892658,2006,2006-03-11 05:00:00+00:00,3,11,8,Morning,Saturday,1,0,...,0,0,1,0,0,0,11,88,High Park North,1
1,892658,2006,2006-03-11 05:00:00+00:00,3,11,8,Morning,Saturday,1,0,...,0,0,1,0,0,0,11,88,High Park North,2
2,892810,2006,2006-03-11 05:00:00+00:00,3,11,9,Morning,Saturday,1,0,...,0,0,1,1,0,0,42,131,Rouge,3
3,892810,2006,2006-03-11 05:00:00+00:00,3,11,9,Morning,Saturday,1,0,...,0,0,1,1,0,0,42,131,Rouge,4
4,892682,2006,2006-03-12 05:00:00+00:00,3,12,2,Night,Sunday,1,0,...,0,0,0,0,1,0,41,138,Eglinton East,5


In [4]:
# Check Columns
clean_ksi_df.columns

Index(['Accident Number', 'Year', 'Date', 'Month', 'Day', 'Hour', 'Time',
       'Weekday', 'Is_Weekend', 'Is_Holiday', 'Holiday', 'Season',
       'Road Classification', 'City District', 'Latitude', 'Longitude',
       'Location Coordinate', 'Traffic Control', 'Environment Condition',
       'Light Condition', 'Road Surface Condition',
       'Classification of Accident', 'Initial Impact Type', 'Involvement Type',
       'Age of Involved Party', 'Severity of Injury', 'Type of Vehicle',
       'Vehicle Manouever', 'Pedestrian Involved', 'Cyclists Involved',
       'Driver Involved', 'Motorcyclist Involved', 'Truck Driver Involved',
       'Transit or City Vehicle Involved', 'Emergency Vehicle Involved',
       'Passenger Involved', 'Speeding Related',
       'Aggressive and Distracted Driving', 'Red Light Related',
       'Alcohol Related', 'Medical or Physical Disability Related',
       'Police Division', 'Neighbourhood ID', 'Neighbourhood', 'Object ID'],
      dtype='object')

In [5]:
# Drop non-beneficial columns
clean_ksi_df = clean_ksi_df.drop(columns=['Accident Number', 'Date', 'Latitude', 'Longitude', 'Neighbourhood', 'Object ID', 'Holiday', 'Weekday'])

In [6]:
# Drop null columns where all values are null
clean_ksi_df = clean_ksi_df.dropna(axis='columns', how='all')

# Drop null rows
clean_ksi_df = clean_ksi_df.dropna()

# Remove the '<Null>' City District values
null = clean_ksi_df['City District'] != '<Null>'
clean_ksi_df = clean_ksi_df.loc[null]

clean_ksi_df.reset_index(inplace=True, drop=True)
clean_ksi_df.head()

Unnamed: 0,Year,Month,Day,Hour,Time,Is_Weekend,Is_Holiday,Season,Road Classification,City District,...,Transit or City Vehicle Involved,Emergency Vehicle Involved,Passenger Involved,Speeding Related,Aggressive and Distracted Driving,Red Light Related,Alcohol Related,Medical or Physical Disability Related,Police Division,Neighbourhood ID
0,2006,3,11,8,Morning,1,0,Spring,Major Arterial,Toronto and East York,...,0,0,0,0,1,0,0,0,11,88
1,2006,3,11,8,Morning,1,0,Spring,Major Arterial,Toronto and East York,...,0,0,0,0,1,0,0,0,11,88
2,2006,3,11,9,Morning,1,0,Spring,Major Arterial,Scarborough,...,0,0,0,0,1,1,0,0,42,131
3,2006,3,11,9,Morning,1,0,Spring,Major Arterial,Scarborough,...,0,0,0,0,1,1,0,0,42,131
4,2006,3,12,2,Night,1,0,Spring,Major Arterial,Scarborough,...,0,0,0,0,0,0,1,0,41,138


In [7]:
# Target Variable = Classification of Accident
# Create Features
X = clean_ksi_df.drop(columns='Classification of Accident')
X = pd.get_dummies(X)

# Create Target

y = clean_ksi_df['Classification of Accident']

In [8]:
X.describe()

Unnamed: 0,Year,Month,Day,Hour,Is_Weekend,Is_Holiday,Pedestrian Involved,Cyclists Involved,Driver Involved,Motorcyclist Involved,...,Vehicle Manouever_Going Ahead,Vehicle Manouever_Not Applicable,Vehicle Manouever_Other,Vehicle Manouever_Parked,Vehicle Manouever_Reversing,Vehicle Manouever_Slowing or Stopping,Vehicle Manouever_Stopped,Vehicle Manouever_Turning Left,Vehicle Manouever_Turning Right,Vehicle Manouever_Unknown
count,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,...,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0,16719.0
mean,2012.194509,6.7897,15.598959,13.23883,0.272744,0.02554,0.405347,0.10509,0.908428,0.081225,...,0.337042,0.429691,0.025899,0.010348,0.006759,0.014774,0.033914,0.096896,0.0256,0.007058
std,4.216105,3.29152,8.85486,6.299307,0.445383,0.157763,0.490974,0.306679,0.28843,0.273188,...,0.472713,0.495047,0.158838,0.101198,0.081936,0.120649,0.181012,0.295825,0.157942,0.083716
min,2006.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2008.0,4.0,8.0,9.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2012.0,7.0,16.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2016.0,10.0,23.0,18.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2020.0,12.0,31.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Check the balance of our target values
y.value_counts()

Non-Fatal    14424
Fatal         2295
Name: Classification of Accident, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Oversampling

### Naive Random Oversampling

In [11]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Non-Fatal': 10817, 'Fatal': 10817})

In [12]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [13]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7263070982300752

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 419,  154],
       [1005, 2602]], dtype=int64)

In [15]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Fatal       0.29      0.73      0.72      0.42      0.73      0.53       573
  Non-Fatal       0.94      0.72      0.73      0.82      0.73      0.53      3607

avg / total       0.86      0.72      0.73      0.76      0.73      0.53      4180



### SMOTE Oversampling

In [16]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({'Non-Fatal': 10817, 'Fatal': 10817})

In [17]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [18]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6753046117908217

In [19]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 287,  286],
       [ 542, 3065]], dtype=int64)

In [20]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Fatal       0.35      0.50      0.85      0.41      0.65      0.41       573
  Non-Fatal       0.91      0.85      0.50      0.88      0.65      0.44      3607

avg / total       0.84      0.80      0.55      0.82      0.65      0.44      4180



# Undersampling

In [21]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Fatal': 1722, 'Non-Fatal': 1722})

In [22]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [23]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5247470620197009

In [24]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 568,    5],
       [3397,  210]], dtype=int64)

In [25]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Fatal       0.14      0.99      0.06      0.25      0.24      0.06       573
  Non-Fatal       0.98      0.06      0.99      0.11      0.24      0.05      3607

avg / total       0.86      0.19      0.86      0.13      0.24      0.05      4180



# Combination (Over and Under) Sampling

In [26]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'Fatal': 14376, 'Non-Fatal': 12307})

In [27]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [28]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6784468923380029

In [29]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 313,  260],
       [ 683, 2924]], dtype=int64)

In [30]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Fatal       0.31      0.55      0.81      0.40      0.67      0.43       573
  Non-Fatal       0.92      0.81      0.55      0.86      0.67      0.45      3607

avg / total       0.84      0.77      0.58      0.80      0.67      0.45      4180



# Ensemble Learners

### Balanced Random Forest Classifier

In [31]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

classifier = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [32]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
classifier.fit(X_train, y_train) 

# Make Prediction
y_pred = classifier.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8675290096675505

In [33]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 503,   70],
       [ 515, 3092]], dtype=int64)

In [34]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Fatal       0.49      0.88      0.86      0.63      0.87      0.75       573
  Non-Fatal       0.98      0.86      0.88      0.91      0.87      0.75      3607

avg / total       0.91      0.86      0.88      0.88      0.87      0.75      4180



In [35]:
# List the features sorted in descending order by feature importance
sorted(zip(classifier.feature_importances_, X.columns), reverse=True)

[(0.11640346845450629, 'Severity of Injury_Fatal'),
 (0.08611159849571752, 'Severity of Injury_Major'),
 (0.0529629251252707, 'Neighbourhood ID'),
 (0.047686739758221845, 'Day'),
 (0.04468094029660609, 'Year'),
 (0.04391201213926802, 'Hour'),
 (0.03784483474181595, 'Police Division'),
 (0.035578182905065384, 'Month'),
 (0.01692969970659132, 'Speeding Related'),
 (0.013930538665272408, 'Severity of Injury_None'),
 (0.0125337094400059, 'Truck Driver Involved'),
 (0.01246429335410087, 'Passenger Involved'),
 (0.011669393215408734, 'Involvement Type_Pedestrian'),
 (0.011654791562818598, 'Pedestrian Involved'),
 (0.011502860853538181, 'Initial Impact Type_Pedestrian Collisions'),
 (0.011236157714403821, 'Aggressive and Distracted Driving'),
 (0.010478117843138622, 'Initial Impact Type_Rear End'),
 (0.010393970232260675, 'Is_Weekend'),
 (0.009898104726965886, 'City District_Toronto and East York'),
 (0.00960756222142897, 'Season_Summer'),
 (0.009512271497445933, 'Traffic Control_No Control')

### Easy Ensemble AdaBoost Classifier

In [36]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
classifier = EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [37]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

classifier.fit(X_train, y_train) 

y_pred = classifier.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.7802319612194826

In [38]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 436,  137],
       [ 723, 2884]], dtype=int64)

In [39]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      Fatal       0.38      0.76      0.80      0.50      0.78      0.61       573
  Non-Fatal       0.95      0.80      0.76      0.87      0.78      0.61      3607

avg / total       0.88      0.79      0.77      0.82      0.78      0.61      4180

