In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler



In [None]:
#importing mostly pre-processed data from the Accidents_ETL in LOCAL 
#data = pd.read_csv('Accidents_Preprocessed1.csv')

In [None]:
#importing mostly pre-processed data from the Accidents_ETL in COLAB
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = '/content/drive/MyDrive/ML_data_and_code/Accidents_Preprocessed1.csv'
data = pd.read_csv(path)

In [None]:
#Date/Time features were left as number in case we wanted to create subsets of data based on these condtions before they were converted to objects and encoded. Here they are finally converted to object in preparation for encoding. 
#Cities are also dropped to reduce the processing power required to handle an additional 11,000 columns of data. 

# dropped_feature = ['County']

data['Year'] = data['Year'].astype(object)
data['Month'] = data['Month'].astype(object)
data['Day'] = data['Day'].astype(object)
data['Hour'] = data['Hour'].astype(object)
data = data.drop(columns = ['City'], axis=1)
# data = data.drop(columns = [dropped_feature], axis=1)
data = data.drop(columns = ['County'], axis=1)

In [None]:
data['State'].value_counts()

CA    730706
FL    263275
TX    226619
NY    126138
NC    122785
SC    120460
OR     98350
PA     89745
VA     89730
MN     79710
IL     77623
GA     69536
MI     67073
AZ     61704
TN     55493
MD     52742
NJ     50212
LA     50102
WA     49455
OH     47836
UT     46897
CO     37271
OK     35104
AL     33287
MO     28674
MA     25120
IN     24988
CT     24054
KY     15296
WI     14668
NE     13677
IA      9491
RI      8314
NV      7812
KS      7444
NH      6250
MS      5419
DC      5158
AR      4726
DE      4625
ID      4165
NM      3973
WV      3421
MT      3333
ME      2266
VT       555
ND       463
WY       419
SD       217
Name: State, dtype: int64

In [None]:
#optional palce to subselect based on state for training and testing specific to a state.
state_name = 'AL'


data = data[data[state_name] == 'AL']


KeyError: ignored

In [None]:
#creating the accidents df based on the imported and processed data.abs
accidents = data.copy()

In [None]:
#confirming the layout and size of the accidents df
# accidents

In [None]:
#creating a sampled subset of the 2.9M rows of data for ease of testig the code and proving functionality of the ML model. 
# sample_size = 50000
# accidents = accidents.sample(n = sample_size, replace=True, random_state=1)
# accidents

In [None]:
#Saving the pre-encoded sample data set

# output_data_file = "sample_data_not_encoded.csv"
# accidents.to_csv(output_data_file, index=False)

In [None]:
#Encoding Obj columns using OneHotEncoder:

#Identifying the 'object' columns to encode

type_cat = accidents.dtypes[accidents.dtypes == 'object'].index.tolist()
type_cat


['State',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Precipitation(in)',
 'Day/Night',
 'Year',
 'Month',
 'Day',
 'Hour']

In [None]:
#Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(accidents[type_cat].astype(str)))



In [None]:
# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(type_cat)
encode_df.head()



Unnamed: 0,State_AL,Temperature(F)_0F-9F,Temperature(F)_100F:109F,Temperature(F)_10F:19F,Temperature(F)_20F:29F,Temperature(F)_30F:39F,Temperature(F)_40F:49F,Temperature(F)_50F:59F,Temperature(F)_60F:69F,Temperature(F)_70F:79F,Temperature(F)_80F:89F,Temperature(F)_90F:99F,Temperature(F)_Unknown,Humidity(%)_0-9%,Humidity(%)_10-19%,Humidity(%)_20-29%,Humidity(%)_30-39%,Humidity(%)_40-49%,Humidity(%)_50-59%,Humidity(%)_60-69%,Humidity(%)_70-79%,Humidity(%)_80-89%,Humidity(%)_90-100%,Humidity(%)_Unknown,Pressure(in)_0-2,Pressure(in)_28-30,Pressure(in)_30-32,Pressure(in)_Unknown,Visibility(mi)_0-1,Visibility(mi)_1-2,Visibility(mi)_2-3,Visibility(mi)_3-4,Visibility(mi)_4-5,Visibility(mi)_5-6,Visibility(mi)_6-7,Visibility(mi)_7-8,Visibility(mi)_8-9,Visibility(mi)_9-10,Wind_Direction_Calm,Wind_Direction_E,...,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_3,Day_30,Day_31,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Hour_0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_2,Hour_20,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#resetting the index on the sample to match the indices for the sample and the enocoded df so that they are both numbered 1-250,000.
accidents2 = accidents.reset_index(drop=True)

In [None]:
#visualizing accdients2 to confirm 
accidents2

Unnamed: 0,Severity,Side,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Day/Night,Year,Month,Day,Hour,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,Weather_Condition_Mostly_Cloudy,Weather_Condition_Partly_Cloudy,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
0,2,0,AL,80F:89F,40-49%,28-30,9-10,NW,0-9,0-1,Day,2019,8,15,12,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,AL,50F:59F,60-69%,28-30,9-10,NE,0-9,0-1,Night,2019,10,24,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,0,AL,80F:89F,30-39%,30-32,9-10,N,10-19,0-1,Day,2017,9,7,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,1,AL,90F:99F,40-49%,28-30,9-10,Calm,0-9,0-1,Day,2018,6,26,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,2,0,AL,80F:89F,50-59%,28-30,9-10,NW,0-9,0-1,Day,2019,5,17,10,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33282,2,0,AL,70F:79F,90-100%,28-30,9-10,NW,0-9,0-1,Night,2020,9,17,23,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33283,2,0,AL,80F:89F,30-39%,28-30,9-10,S,0-9,0-1,Day,2019,5,16,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
33284,4,0,AL,50F:59F,70-79%,28-30,9-10,NE,0-9,0-1,Night,2020,11,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
33285,2,0,AL,70F:79F,70-79%,30-32,9-10,Calm,0-9,0-1,Night,2018,5,9,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
#Merge one-hot encoded features and drop the originals
accidents = accidents2.merge(encode_df, left_index=True, right_index=True)
accidents

Unnamed: 0,Severity,Side,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Day/Night,Year,Month,Day,Hour,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,...,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_3,Day_30,Day_31,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Hour_0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_2,Hour_20,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,2,0,AL,80F:89F,40-49%,28-30,9-10,NW,0-9,0-1,Day,2019,8,15,12,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,AL,50F:59F,60-69%,28-30,9-10,NE,0-9,0-1,Night,2019,10,24,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,AL,80F:89F,30-39%,30-32,9-10,N,10-19,0-1,Day,2017,9,7,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1,AL,90F:99F,40-49%,28-30,9-10,Calm,0-9,0-1,Day,2018,6,26,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,AL,80F:89F,50-59%,28-30,9-10,NW,0-9,0-1,Day,2019,5,17,10,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33282,2,0,AL,70F:79F,90-100%,28-30,9-10,NW,0-9,0-1,Night,2020,9,17,23,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33283,2,0,AL,80F:89F,30-39%,28-30,9-10,S,0-9,0-1,Day,2019,5,16,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33284,4,0,AL,50F:59F,70-79%,28-30,9-10,NE,0-9,0-1,Night,2020,11,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33285,2,0,AL,70F:79F,70-79%,30-32,9-10,Calm,0-9,0-1,Night,2018,5,9,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Dropping the original columns after encoding
accidents = accidents.drop(type_cat,1)
accidents

Unnamed: 0,Severity,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,Weather_Condition_Mostly_Cloudy,Weather_Condition_Partly_Cloudy,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix,State_AL,...,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_3,Day_30,Day_31,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Hour_0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_2,Hour_20,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,2,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33282,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33283,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33284,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33285,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Saving the final sampled and encoded dataset that will be used in the ML model.
# output_data_file = "Accidents_100K_withCities_Preprocessed_Encoded.csv"
#accidents.to_csv('/content/drive/MyDrive/ML_data_and_code/AL_Encoded.csv', index=False)


Moving the encoded dataset into the ML model:

In [None]:
# Define features set
X = accidents.copy()
X = X.drop("Severity", axis=1)
X.head()

Unnamed: 0,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Weather_Condition_Cloudy,Weather_Condition_Dust,Weather_Condition_Dust_Whirls,Weather_Condition_Fair,Weather_Condition_Fog,Weather_Condition_Funnel_Cloud,Weather_Condition_Hail,Weather_Condition_Heavy_Rain,Weather_Condition_Heavy_Snow,Weather_Condition_Light_Rain,Weather_Condition_Light_Snow,Weather_Condition_Mist,Weather_Condition_Mostly_Cloudy,Weather_Condition_Partly_Cloudy,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix,State_AL,Temperature(F)_0F-9F,...,Day_23,Day_24,Day_25,Day_26,Day_27,Day_28,Day_29,Day_3,Day_30,Day_31,Day_4,Day_5,Day_6,Day_7,Day_8,Day_9,Hour_0,Hour_1,Hour_10,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_2,Hour_20,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Define target vector
y = accidents["Severity"].values

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:

#Testing to see which learning rate would provide the best outcomes for the ML model:

# Create a classifier object
# learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
# for learning_rate in learning_rates:
#     classifier = GradientBoostingClassifier(n_estimators=100,
#                                             learning_rate=learning_rate,
#                                             max_features=5,
#                                             max_depth=3,
#                                             random_state=0)

#     # Fit the model
#     classifier.fit(X_train_scaled, y_train)
#     print("Learning rate: ", learning_rate)

#     # Score the model
#     print("Accuracy score (training): {0:.3f}".format(
#         classifier.score(
#             X_train_scaled,
#             y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(
#         classifier.score(
#             X_test_scaled,
#             y_test)))
#     print()

In [None]:
#finding the ideal number of features

#Call and Fit the model 
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit_transform(X_train, y_train)

#identify the number of features that have importance above the mean importance level.
selected_feat = X_train.columns[(sel.get_support())]
print(len(selected_feat))

#show the important features
print(selected_feat)




80
Index(['Side', 'Junction', 'Traffic_Signal', 'Weather_Condition_Cloudy',
       'Weather_Condition_Fair', 'Weather_Condition_Light_Rain',
       'Weather_Condition_Mostly_Cloudy', 'Weather_Condition_Partly_Cloudy',
       'Temperature(F)_40F:49F', 'Temperature(F)_50F:59F',
       'Temperature(F)_60F:69F', 'Temperature(F)_70F:79F',
       'Temperature(F)_80F:89F', 'Humidity(%)_30-39%', 'Humidity(%)_40-49%',
       'Humidity(%)_50-59%', 'Humidity(%)_60-69%', 'Humidity(%)_70-79%',
       'Humidity(%)_80-89%', 'Humidity(%)_90-100%', 'Pressure(in)_28-30',
       'Pressure(in)_30-32', 'Visibility(mi)_9-10', 'Wind_Direction_Calm',
       'Wind_Direction_E', 'Wind_Direction_N', 'Wind_Direction_NE',
       'Wind_Direction_NW', 'Wind_Direction_S', 'Wind_Direction_SE',
       'Wind_Direction_SW', 'Wind_Direction_Variable', 'Wind_Direction_W',
       'Wind_Speed(mph)_0-9', 'Wind_Speed(mph)_10-19', 'Day/Night_Day',
       'Day/Night_Night', 'Year_2017', 'Year_2018', 'Year_2019', 'Year_2020',
   

In [None]:
calc_num = len(selected_feat)
manual_num = 100
feature_num = manual_num

In [None]:
#Run the model
GB_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
GB_classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.5, loss='deviance', max_depth=3,
                           max_features=100, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
pd.options.display.max_rows = 200
import pandas as pd
feature_importances = pd.DataFrame(GB_classifier.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
Hour_9,0.344487
Day_31,0.1834599
Month_4,0.1799813
Temperature(F)_70F:79F,0.1277817
Traffic_Signal,0.06552168
Side,0.04592742
Junction,0.004800962
Temperature(F)_20F:29F,0.004220663
Pressure(in)_30-32,0.004161962
Year_2020,0.003789511


In [None]:
# Make Prediction
GB_predictions = GB_classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": GB_predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,2,2
1,2,4
2,2,2
3,2,2
4,2,2
5,2,4
6,2,3
7,2,2
8,2,3
9,2,2


In [None]:
# Calculating the accuracy score
GB_acc_score = accuracy_score(y_test, GB_predictions)
print(f"Accuracy Score : {GB_acc_score}")

Accuracy Score : 0.6988704638308099


In [None]:
# Generate the confusion matrix
GB_cm = confusion_matrix(y_test, GB_predictions)
GB_cm

array([[   0,   29,    9,    4],
       [   1, 5031,  771,  209],
       [   1, 1293,  778,   18],
       [   0,  111,   60,    7]])

In [None]:
# Print the imbalanced classification report
RFM_classreport = classification_report_imbalanced(y_test, GB_predictions)
print(classification_report_imbalanced(y_test, GB_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.00      0.00      1.00      0.00      0.00      0.00        42
          2       0.78      0.84      0.38      0.81      0.56      0.33      6012
          3       0.48      0.37      0.87      0.42      0.57      0.31      2090
          4       0.03      0.04      0.97      0.03      0.20      0.03       178

avg / total       0.68      0.70      0.52      0.69      0.55      0.32      8322



Testing Resampling Methods To Maximize Accuracy

In [None]:
#Counting the distribution of target values to determine representation %'s. 
Counter(y)

Counter({1: 126, 2: 24023, 3: 8421, 4: 717})

Random Oversampling:

In [None]:
#Oversampling of lower represented variables with Random Oversampling

# implement random oversampling
ros = RandomOverSampler(random_state=1)
X_randomoversampled, y_randomoversampled = ros.fit_resample(X_train, y_train)

Counter(y_randomoversampled)



Counter({1: 18011, 2: 18011, 3: 18011, 4: 18011})

In [None]:
#Run the random oversampling model
ROS_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
ROS_classifier.fit(X_randomoversampled, y_randomoversampled)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.5, loss='deviance', max_depth=3,
                           max_features=100, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Make Prediction with random oversampling
ROS_predictions = ROS_classifier.predict(X_randomoversampled)
pd.DataFrame({"Prediction": ROS_predictions, "Actual": y_randomoversampled}).head(20)

Unnamed: 0,Prediction,Actual
0,2,2
1,3,3
2,3,3
3,2,2
4,3,3
5,3,3
6,2,2
7,3,2
8,4,2
9,2,2


In [None]:
# Calculating the accuracy score of random oversampling
ROS_acc_score = accuracy_score(y_randomoversampled, ROS_predictions)
print(f"Accuracy Score : {ROS_acc_score}")

Accuracy Score : 0.8184581644550553


In [None]:
# Generate the confusion matrix for random oversampling
cm = confusion_matrix(y_randomoversampled, ROS_predictions)
cm

array([[18011,     0,     0,     0],
       [  365, 11115,  4921,  1610],
       [  311,  2517, 13941,  1242],
       [   72,   658,  1383, 15898]])

In [None]:
# Print the imbalanced classification report for random oversampling
ROS_classreport = classification_report_imbalanced(y_randomoversampled, ROS_predictions)
print(classification_report_imbalanced(y_randomoversampled, ROS_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.96      1.00      0.99      0.98      0.99      0.99     18011
          2       0.78      0.62      0.94      0.69      0.76      0.56     18011
          3       0.69      0.77      0.88      0.73      0.83      0.68     18011
          4       0.85      0.88      0.95      0.86      0.91      0.83     18011

avg / total       0.82      0.82      0.94      0.82      0.87      0.76     72044



SMOTE Oversampling:

In [None]:
#Resampling with SMOTE Oversampling:
X_SMOTEresampled, y_SMOTEresampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_SMOTEresampled)



Counter({1: 18011, 2: 18011, 3: 18011, 4: 18011})

In [None]:
#Run the SMOTE oversampling model
SMOTE_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
SMOTE_classifier.fit(X_SMOTEresampled, y_SMOTEresampled)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.5, loss='deviance', max_depth=3,
                           max_features=100, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Make Prediction with SMOTE oversampling
SMOTE_predictions = SMOTE_classifier.predict(X_SMOTEresampled)
pd.DataFrame({"Prediction": SMOTE_predictions, "Actual": y_SMOTEresampled}).head(20)

Unnamed: 0,Prediction,Actual
0,2,2
1,2,3
2,3,3
3,2,2
4,3,3
5,2,3
6,2,2
7,2,2
8,2,2
9,2,2


In [None]:
# Calculating the accuracy score of SMOTE oversampling
SMOTE_acc_score = accuracy_score(y_SMOTEresampled, SMOTE_predictions)
print(f"Accuracy Score : {SMOTE_acc_score}")

Accuracy Score : 0.9043084781522402


In [None]:
# Generate the confusion matrix for SMOTE oversampling
SMOTE_cm = confusion_matrix(y_SMOTEresampled, SMOTE_predictions)
SMOTE_cm

array([[17970,    34,     7,     0],
       [   16, 15627,  2304,    64],
       [   54,  2846, 14394,   717],
       [    0,   518,   334, 17159]])

In [None]:
# Print the imbalanced classification report for SMOTE oversampling
SMOTE_classreport = classification_report_imbalanced(y_SMOTEresampled, SMOTE_predictions)
print(classification_report_imbalanced(y_SMOTEresampled, SMOTE_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      1.00      1.00      1.00      1.00      1.00     18011
          2       0.82      0.87      0.94      0.84      0.90      0.81     18011
          3       0.84      0.80      0.95      0.82      0.87      0.75     18011
          4       0.96      0.95      0.99      0.95      0.97      0.94     18011

avg / total       0.90      0.90      0.97      0.90      0.94      0.87     72044



Random Undersampling:

In [None]:
# Undersample the data using `RandomUnderSampler`
rus = RandomUnderSampler(random_state=1)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)
Counter(y_undersampled)



Counter({1: 84, 2: 84, 3: 84, 4: 84})

In [None]:
#Run the Random Undersampling model
RUS_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
RUS_classifier.fit(X_undersampled, y_undersampled)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.5, loss='deviance', max_depth=3,
                           max_features=100, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Make Prediction with random undersampling
RUS_predictions = RUS_classifier.predict(X_undersampled)
pd.DataFrame({"Prediction": RUS_predictions, "Actual": y_undersampled}).head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [None]:
# Calculating the accuracy score of random undersampling
RUS_acc_score = accuracy_score(y_undersampled, RUS_predictions)
print(f"Accuracy Score : {RUS_acc_score}")

Accuracy Score : 1.0


In [None]:
# Generate the confusion matrix for SMOTE oversampling
RUS_cm = confusion_matrix(y_undersampled, RUS_predictions)
RUS_cm

array([[84,  0,  0,  0],
       [ 0, 84,  0,  0],
       [ 0,  0, 84,  0],
       [ 0,  0,  0, 84]])

In [None]:
# Print the imbalanced classification report for Random Undersampling
RUS_classreport = classification_report_imbalanced(y_undersampled, RUS_predictions)
print(classification_report_imbalanced(y_undersampled, RUS_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       1.00      1.00      1.00      1.00      1.00      1.00        84
          2       1.00      1.00      1.00      1.00      1.00      1.00        84
          3       1.00      1.00      1.00      1.00      1.00      1.00        84
          4       1.00      1.00      1.00      1.00      1.00      1.00        84

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       336



SMOTEENN Combination Oversampling/Undersampling:

In [None]:
# Use the SMOTEENN technique to perform combination of oversampling and undersampling on the data

# Count the resampled classes
smote_enn = SMOTEENN(random_state=0)
X_SMOTEENNresampled, y_SMOTEENNresampled = smote_enn.fit_resample(X, y)
Counter(y_SMOTEENNresampled)



Counter({1: 23898, 2: 8024, 3: 19363, 4: 23221})

In [None]:
#Run the model
SMOTEENN_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
SMOTEENN_classifier.fit(X_SMOTEENNresampled, y_SMOTEENNresampled)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.5, loss='deviance', max_depth=3,
                           max_features=100, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [None]:
# Make Prediction
SMOTEENN_predictions = SMOTEENN_classifier.predict(X_SMOTEENNresampled)
pd.DataFrame({"Prediction": SMOTEENN_predictions, "Actual": y_SMOTEENNresampled}).head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,2,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,3,1


In [None]:
# Calculating the accuracy score
SMOTEENN_acc_score = accuracy_score(y_SMOTEENNresampled, SMOTEENN_predictions)
print(f"Accuracy Score : {SMOTEENN_acc_score}")

Accuracy Score : 0.9467693877003195


In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_SMOTEENNresampled, SMOTEENN_predictions)
cm

array([[23866,    11,    21,     0],
       [   33,  6730,  1228,    33],
       [  161,   594, 17503,  1105],
       [   11,   148,   621, 22441]])

In [None]:
# Print the imbalanced classification report for SMOTEENN resampling
SMOTEENN_classreport = classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions)
print(classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.99      1.00      1.00      1.00      1.00      0.99     23898
          2       0.90      0.84      0.99      0.87      0.91      0.82      8024
          3       0.90      0.90      0.97      0.90      0.93      0.87     19363
          4       0.95      0.97      0.98      0.96      0.97      0.94     23221

avg / total       0.95      0.95      0.98      0.95      0.96      0.93     74506



In [None]:
print(f'OVERVIEW:')
print(f'Dropped Feature(s): City, County')
print(f'State: {state_name}')
# print('----------------------')
# print(f'Sample Size: {sample_size}')
print(f'selected features: {feature_num}')
print(f'---------------------')
print(f'Gradient Boosted RFM Accuracy: {GB_acc_score}')
print(f'Gradient Boosted RFM Class Report:')
print(f'{RFM_classreport}')
print(f'Random Over Sampling Accuracy: {ROS_acc_score}')
print(f'Random Over Sampling Class Report:')
print(f'{ROS_classreport}')
print(f'SMOTE Over Sampling Accuracy: {SMOTE_acc_score}')
print(f'SMOTE Over Sampling Class Report:')
print(f'{SMOTE_classreport}')
print(f'Random Undersampling Accuracy: {RUS_acc_score}')
print(f'Random Undersampling Class Report:')
print(f'{RUS_classreport}')
print(f'SMOTEEENN Resampling Accuracy: {SMOTEENN_acc_score}')
print(f'SMOTEEENN Class Report:')
print(f'{SMOTEENN_classreport}')

OVERVIEW:
Dropped Feature(s): City, County
State: AL
selected features: 100
---------------------
Gradient Boosted RFM Accuracy: 0.6988704638308099
Gradient Boosted RFM Class Report:
                   pre       rec       spe        f1       geo       iba       sup

          1       0.00      0.00      1.00      0.00      0.00      0.00        42
          2       0.78      0.84      0.38      0.81      0.56      0.33      6012
          3       0.48      0.37      0.87      0.42      0.57      0.31      2090
          4       0.03      0.04      0.97      0.03      0.20      0.03       178

avg / total       0.68      0.70      0.52      0.69      0.55      0.32      8322

Random Over Sampling Accuracy: 0.8184581644550553
Random Over Sampling Class Report:
                   pre       rec       spe        f1       geo       iba       sup

          1       0.96      1.00      0.99      0.98      0.99      0.99     18011
          2       0.78      0.62      0.94      0.69      0.76  