In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler

In [2]:
#Pick whether you wante the data file prior to encoding or after based on whether you want to sample the data before running the model.

##------------------------------------------------##

#import the preprocessed and encoded data:

#Local to Local
# accidents = pd.read_csv('Colab_ETL_FULL_Encoded.csv')

#G-Drive to Colabs
# from google.colab import drive
# drive.mount('/content/drive')
# path = '/content/drive/MyDrive/ML_data_and_code/Colab_ETL_FULL_Encoded.csv'
# accidents = pd.read_csv(path)

##------------------------------------------------##

#import the preprocessed but NOT ENCODED data (for modifying prior to encoding) 

#Local to Local:
accidents = pd.read_csv('colab_ETL_FULL_not_encoded.csv')

#G-Drive to Colabs
# from google.colab import drive
# drive.mount('/content/drive')
# path = '/content/drive/MyDrive/ML_data_and_code/Colab_ETL_FULL_not_Encoded.csv'
# accidents = pd.read_csv(path)

In [3]:
# checking to see what the input column types are
# pd.options.display.max_rows = None
# accidents.dtypes

In [4]:
#Date/Time features were left as number in case we wanted to create subsets of data based on these condtions before they were converted to objects and encoded. Here they are finally converted to object in preparation for encoding. 

# dropped_feature = ''

accidents['Year'] = accidents['Year'].astype(object)
accidents['Month'] = accidents['Month'].astype(object)
accidents['Day'] = accidents['Day'].astype(object)
accidents['Hour'] = accidents['Hour'].astype(object)
# # data = data.drop(columns = [dropped_feature], axis=1)

In [5]:
#optional palce to subselect based on state for training and testing specific to a state.

# accidents = accidents[accidents['State'] == 'AL']


In [6]:
#confirming the layout and size of the accidents df
# accidents

In [7]:
#creating a sampled subset of the 2.9M rows of data for ease of testig the code and proving functionality of the ML model. 
sample_size = 50000
accidents = accidents.sample(n = sample_size, replace=True, random_state=1)
accidents

Unnamed: 0,Severity,Side,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),...,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
128037,2,0,AZ,Unknown,Unknown,Unknown,9-10,Calm,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
491755,2,0,NJ,30F:39F,80-89%,28-30,6-7,Calm,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
2568076,2,0,MD,70F:79F,80-89%,28-30,4-5,Calm,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
491263,2,0,CA,50F:59F,90-100%,28-30,0-1,SW,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
836489,2,1,SC,50F:59F,20-29%,30-32,9-10,Variable,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246255,2,0,MD,40F:49F,60-69%,30-32,9-10,NW,10-19,0-1,...,0,1,0,0,0,0,0,0,0,0
2766330,4,1,CO,50F:59F,50-59%,30-32,9-10,W,10-19,0-1,...,0,0,0,0,0,0,0,0,0,0
1031261,3,0,GA,80F:89F,40-49%,28-30,9-10,Variable,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
1523602,2,1,UT,30F:39F,90-100%,28-30,9-10,S,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#Saving the pre-encoded sample data set

# output_data_file = "sample_data_not_encoded.csv"
# accidents.to_csv(output_data_file, index=False)

In [9]:
#Encoding Obj columns using OneHotEncoder:

#Identifying the 'object' columns to encode

type_cat = accidents.dtypes[accidents.dtypes == 'object'].index.tolist()
type_cat


['State',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Precipitation(in)',
 'Day/Night',
 'Year',
 'Month',
 'Day',
 'Hour']

In [10]:
#Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(accidents[type_cat].astype(str)))



In [11]:
# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(type_cat)
encode_df.head()



Unnamed: 0,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,State_GA,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#resetting the index on the sample to match the indices for the sample and the enocoded df so that they are both numbered 1-250,000.
accidents2 = accidents.reset_index(drop=True)

In [13]:
#visualizing accdients2 to confirm 
accidents2

Unnamed: 0,Severity,Side,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),...,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
0,2,0,AZ,Unknown,Unknown,Unknown,9-10,Calm,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,NJ,30F:39F,80-89%,28-30,6-7,Calm,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
2,2,0,MD,70F:79F,80-89%,28-30,4-5,Calm,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
3,2,0,CA,50F:59F,90-100%,28-30,0-1,SW,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
4,2,1,SC,50F:59F,20-29%,30-32,9-10,Variable,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,MD,40F:49F,60-69%,30-32,9-10,NW,10-19,0-1,...,0,1,0,0,0,0,0,0,0,0
49996,4,1,CO,50F:59F,50-59%,30-32,9-10,W,10-19,0-1,...,0,0,0,0,0,0,0,0,0,0
49997,3,0,GA,80F:89F,40-49%,28-30,9-10,Variable,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0
49998,2,1,UT,30F:39F,90-100%,28-30,9-10,S,0-9,0-1,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Merge one-hot encoded features and drop the originals
accidents = accidents2.merge(encode_df, left_index=True, right_index=True)
accidents

Unnamed: 0,Severity,Side,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,2,0,AZ,Unknown,Unknown,Unknown,9-10,Calm,0-9,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,NJ,30F:39F,80-89%,28-30,6-7,Calm,0-9,0-1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,MD,70F:79F,80-89%,28-30,4-5,Calm,0-9,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,0,CA,50F:59F,90-100%,28-30,0-1,SW,0-9,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,1,SC,50F:59F,20-29%,30-32,9-10,Variable,0-9,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,MD,40F:49F,60-69%,30-32,9-10,NW,10-19,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49996,4,1,CO,50F:59F,50-59%,30-32,9-10,W,10-19,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,3,0,GA,80F:89F,40-49%,28-30,9-10,Variable,0-9,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,2,1,UT,30F:39F,90-100%,28-30,9-10,S,0-9,0-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#Dropping the original columns after encoding
accidents = accidents.drop(type_cat,1)
accidents

Unnamed: 0,Severity,Side,Weather_Condition_Wind,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,2,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2,1,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49996,4,1,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,3,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,2,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
#Saving the final sampled and encoded dataset that will be used in the ML model.
# output_data_file = "Accidents_100K_withCities_Preprocessed_Encoded.csv"
# accidents.to_csv(output_data_file, index=False)


Moving the encoded dataset into the ML model:

In [17]:
# Define features set
X = accidents.copy()
X = X.drop("Severity", axis=1)
X.head()

Unnamed: 0,Side,Weather_Condition_Wind,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Define target vector
y = accidents["Severity"].values

In [19]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

SMOTEENN Combination Oversampling/Undersampling:

In [20]:
# Use the SMOTEENN technique to perform combination of oversampling and undersampling on the data

# Count the resampled classes
smote_enn = SMOTEENN(random_state=0)
X_SMOTEENNresampled, y_SMOTEENNresampled = smote_enn.fit_resample(X, y)
Counter(y_SMOTEENNresampled)

Counter({1: 36646, 2: 8810, 3: 33685, 4: 36639})

In [22]:
#Run the model
SMOTEENN_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=200,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
SMOTEENN_classifier.fit(X_SMOTEENNresampled, y_SMOTEENNresampled)

GradientBoostingClassifier(learning_rate=0.5, max_features=200, random_state=0)

In [23]:
# Make Prediction
SMOTEENN_predictions = SMOTEENN_classifier.predict(X_SMOTEENNresampled)
pd.DataFrame({"Prediction": SMOTEENN_predictions, "Actual": y_SMOTEENNresampled}).head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [24]:
# Calculating the accuracy score
SMOTEENN_acc_score = accuracy_score(y_SMOTEENNresampled, SMOTEENN_predictions)
print(f"Accuracy Score : {SMOTEENN_acc_score}")

Accuracy Score : 0.8986612541026084


In [25]:
# Generate the confusion matrix
cm = confusion_matrix(y_SMOTEENNresampled, SMOTEENN_predictions)
cm

array([[36597,     7,    34,     8],
       [  153,  7267,  1278,   112],
       [  692,   654, 27625,  4714],
       [  457,   297,  3327, 32558]])

In [26]:
# Print the imbalanced classification report for SMOTEENN resampling
SMOTEENN_classreport = classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions)
print(classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.97      1.00      0.98      0.98      0.99      0.98     36646
          2       0.88      0.82      0.99      0.85      0.90      0.80      8810
          3       0.86      0.82      0.94      0.84      0.88      0.76     33685
          4       0.87      0.89      0.94      0.88      0.91      0.83     36639

avg / total       0.90      0.90      0.96      0.90      0.93      0.86    115780

