In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Load the accident data from exactly ONE source.
#
# Two exports exist: one that is already one-hot encoded, and one that is
# preprocessed but NOT encoded. Pick the not-encoded export if you want to
# sample or modify the data before encoding; the cells below sample the rows
# and then encode the object columns, so that export is the default.
#
# This cell used to run all four read_csv calls in sequence, which meant only
# the last one had any effect and the google.colab import made the cell fail
# when run outside Colab.

# Set to True to load the already-encoded export instead.
USE_ENCODED_DATA = False

# Detect Google Colab: the google.colab package is only importable there.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

##------------------------------------------------##

if IN_COLAB:
    # G-Drive to Colab
    drive.mount('/content/drive')
    if USE_ENCODED_DATA:
        path = '/content/drive/MyDrive/ML_data_and_code/Colab_ETL_FULL_Encoded.csv'
    else:
        path = '/content/drive/MyDrive/ML_data_and_code/Colab_ETL_FULL_not_Encoded.csv'
else:
    # Local to local
    # NOTE(review): the local not-encoded file name is capitalised differently
    # from the Drive copy; both are kept exactly as they were -- confirm the
    # local name on a case-sensitive filesystem.
    if USE_ENCODED_DATA:
        path = 'Colab_ETL_FULL_Encoded.csv'
    else:
        path = 'colab_ETL_FULL_not_encoded.csv'

##------------------------------------------------##

accidents = pd.read_csv(path)

In [5]:
#Date/Time features were left as numbers in case we wanted to create subsets of the data based on these conditions before they were converted to objects and encoded. Here they are finally converted to objects in preparation for encoding.
#Cities are also dropped to reduce the processing power required to handle an additional 11,000 columns of data.

# dropped_feature = ''

# data['Year'] = data['Year'].astype(object)
# data['Month'] = data['Month'].astype(object)
# data['Day'] = data['Day'].astype(object)
# data['Hour'] = data['Hour'].astype(object)
# # data = data.drop(columns = [dropped_feature], axis=1)
# data = data.drop(columns = ['City'], axis=1)
# data = data.drop(columns = ['County'], axis=1)

In [8]:
#optional place to subselect a single state, for training and testing specific to that state.

#accidents = accidents[accidents['State'] == 'CA']


In [7]:
# Display the accidents DataFrame to confirm its layout and size
# (the notebook shows a truncated view of the first and last rows/columns).
accidents

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
0,2,0,"Greenville,SC","Greenville,SC",SC,70F:79F,50-59%,28-30,9-10,N,...,0,0,0,0,0,0,0,0,0,0
1,2,0,"Charlotte,NC","Mecklenburg,NC",NC,70F:79F,60-69%,28-30,9-10,Variable,...,0,0,0,0,0,0,0,0,0,0
2,2,0,"Los Gatos,CA","Santa Clara,CA",CA,50F:59F,70-79%,30-32,9-10,W,...,0,0,0,0,0,0,0,0,0,0
3,2,0,"Carson City,NV","Douglas,NV",NV,50F:59F,10-19%,30-32,9-10,SW,...,0,0,0,0,0,0,0,0,0,0
4,3,0,"Fort Lauderdale,FL","Broward,FL",FL,80F:89F,80-89%,28-30,9-10,SE,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2906376,2,1,"Houston,TX","Harris,TX",TX,80F:89F,60-69%,30-32,8-9,Variable,...,0,0,0,0,0,0,0,0,0,0
2906377,2,0,"Colton,CA","San Bernardino,CA",CA,40F:49F,70-79%,30-32,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
2906378,2,1,"Miami,FL","Miami-Dade,FL",FL,70F:79F,80-89%,28-30,9-10,NW,...,0,0,0,0,0,0,0,0,0,0
2906379,2,0,"Salt Lake City,UT","Salt Lake,UT",UT,20F:29F,80-89%,24-26,9-10,SE,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Draw a sample_size-row subset of the ~2.9M rows of data, for ease of testing
# the code and proving the functionality of the ML model.
# random_state=1 makes the draw repeatable.
# NOTE(review): replace=True samples WITH replacement, so the same source row
# can be drawn more than once -- confirm this is intended.
sample_size = 50000
accidents = accidents.sample(n = sample_size, replace=True, random_state=1)
accidents

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
128037,3,0,"San Francisco,CA","San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,...,0,0,0,0,0,0,0,0,0,0
491755,2,0,"Tillamook,OR","Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,...,0,0,0,0,0,0,0,0,0,1
2568076,2,0,"Rancho Cucamonga,CA","San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,...,0,0,0,0,0,0,0,0,0,0
491263,2,0,"La Puente,CA","Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
836489,2,0,"San Martin,CA","Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246255,2,0,"Mount Shasta,CA","Siskiyou,CA",CA,50F:59F,50-59%,26-28,9-10,Variable,...,0,0,0,0,0,0,0,0,0,0
2766330,2,0,"Leavenworth,WA","Chelan,WA",WA,30F:39F,70-79%,28-30,9-10,SE,...,0,0,0,0,0,0,0,0,0,0
1031261,2,0,"Charlotte,NC","Mecklenburg,NC",NC,80F:89F,40-49%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0
1523602,2,0,"Bradenton,FL","Manatee,FL",FL,80F:89F,70-79%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0


In [10]:
#Saving the pre-encoded sample data set

# output_data_file = "sample_data_not_encoded.csv"
# accidents.to_csv(output_data_file, index=False)

In [11]:
# One-hot encoding, step 1: collect the name of every column whose dtype is
# 'object'. These are the columns that get passed to OneHotEncoder.
type_cat = [
    column
    for column, dtype in accidents.dtypes.items()
    if dtype == 'object'
]
type_cat


['City',
 'County',
 'State',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Precipitation(in)',
 'Day/Night',
 'Year',
 'Month',
 'Day',
 'Hour']

In [12]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output`, and later
# releases removed the old name, so try the new spelling first and fall back to
# the old one on older installations (an unknown keyword raises TypeError).
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# Values are cast to str first so every encoded column holds a single type.
# The resulting frame gets pandas' default 0-based integer index.
encode_df = pd.DataFrame(enc.fit_transform(accidents[type_cat].astype(str)))



In [13]:
# Add the encoded variable names ("<original column>_<category>") to the dataframe.
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out; use the new method when it exists and fall
# back to the old one on older installations.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(type_cat)
else:
    encode_df.columns = enc.get_feature_names(type_cat)
encode_df.head()



Unnamed: 0,"City_Abbeville,SC","City_Aberdeen,MD","City_Aberdeen,NC","City_Abingdon,MD","City_Abingdon,VA","City_Abington,PA","City_Absecon,NJ","City_Acampo,CA","City_Accokeek,MD","City_Acme,PA",...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Reset the sample's index (discarding the old row labels) so that it matches
# the index of the encoded df: both are then numbered 0 to sample_size - 1,
# which the index-based merge further down relies on.
accidents2 = accidents.reset_index(drop=True)

In [15]:
# Display accidents2 to confirm the index was reset
accidents2

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
0,3,0,"San Francisco,CA","San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,...,0,0,0,0,0,0,0,0,0,0
1,2,0,"Tillamook,OR","Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,...,0,0,0,0,0,0,0,0,0,1
2,2,0,"Rancho Cucamonga,CA","San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,...,0,0,0,0,0,0,0,0,0,0
3,2,0,"La Puente,CA","Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
4,2,0,"San Martin,CA","Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,"Mount Shasta,CA","Siskiyou,CA",CA,50F:59F,50-59%,26-28,9-10,Variable,...,0,0,0,0,0,0,0,0,0,0
49996,2,0,"Leavenworth,WA","Chelan,WA",WA,30F:39F,70-79%,28-30,9-10,SE,...,0,0,0,0,0,0,0,0,0,0
49997,2,0,"Charlotte,NC","Mecklenburg,NC",NC,80F:89F,40-49%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0
49998,2,0,"Bradenton,FL","Manatee,FL",FL,80F:89F,70-79%,28-30,9-10,W,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Attach the one-hot encoded columns to the re-indexed sample, pairing rows by
# their index on both sides. The original object columns are still present
# after this step.
accidents = pd.merge(accidents2, encode_df, left_index=True, right_index=True)
accidents

Unnamed: 0,Severity,Side,City,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,3,0,"San Francisco,CA","San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,"Tillamook,OR","Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,0,"Rancho Cucamonga,CA","San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0,"La Puente,CA","Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,"San Martin,CA","Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,"Mount Shasta,CA","Siskiyou,CA",CA,50F:59F,50-59%,26-28,9-10,Variable,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,2,0,"Leavenworth,WA","Chelan,WA",WA,30F:39F,70-79%,28-30,9-10,SE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,2,0,"Charlotte,NC","Mecklenburg,NC",NC,80F:89F,40-49%,28-30,9-10,W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,2,0,"Bradenton,FL","Manatee,FL",FL,80F:89F,70-79%,28-30,9-10,W,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
# Drop the original (un-encoded) object columns now that their one-hot
# encoded counterparts have been merged in.
# `columns=` replaces the positional axis argument (`drop(type_cat, 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
accidents = accidents.drop(columns=type_cat)
accidents

Unnamed: 0,Severity,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,3,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2,0,0,0,0,0,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [18]:
#Saving the final sampled and encoded dataset that will be used in the ML model.
# output_data_file = "Accidents_100K_withCities_Preprocessed_Encoded.csv"
# accidents.to_csv(output_data_file, index=False)


Moving the encoded dataset into the ML model:

In [19]:
# Feature matrix: every column of the encoded data except the target, "Severity".
# drop() returns a new DataFrame, so `accidents` itself is left unchanged.
X = accidents.drop(columns="Severity")
X.head()

Unnamed: 0,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
# Define target vector: the "Severity" column as a NumPy array (.values),
# taken from the same frame that X was built from.
y = accidents["Severity"].values

In [21]:
# Hold out a test set (train_test_split's default proportions; random_state=1
# keeps the split repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Standardise the features. The scaler learns its statistics from the training
# split only and then applies them to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# X_scaler is the fitted scaler (fit returns the estimator itself).
X_scaler = scaler
X_test_scaled = X_scaler.transform(X_test)

SMOTEENN Combination Oversampling/Undersampling:

In [50]:
# Use the SMOTEENN technique to perform a combination of oversampling and
# undersampling on the data.
#
# Only the TRAINING split is resampled. Resampling the full X, y (as this cell
# did before) leaves the train/test split unused and lets rows that should be
# held out influence the synthetic samples, so no unbiased evaluation is
# possible afterwards. The unscaled X_train is used so the features match the
# unscaled frame this cell resampled before.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt --
# presumably it is slow on the wide one-hot matrix; resampling the training
# split alone also reduces the number of rows involved.
smote_enn = SMOTEENN(random_state=0)
X_SMOTEENNresampled, y_SMOTEENNresampled = smote_enn.fit_resample(X_train, y_train)

# Count the resampled classes
Counter(y_SMOTEENNresampled)

KeyboardInterrupt: 

In [None]:
#Run the model
# Gradient-boosting classifier that is fit on the SMOTEENN-resampled data.
# random_state=0 keeps the fit repeatable.
# NOTE(review): `feature_num` is not defined in any cell visible in this
# notebook; unless it is defined elsewhere, this cell raises NameError on a
# fresh kernel -- define it in a config cell or confirm where it comes from.
SMOTEENN_classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=feature_num,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
SMOTEENN_classifier.fit(X_SMOTEENNresampled, y_SMOTEENNresampled)

In [None]:
# Make Prediction
SMOTEENN_predictions = SMOTEENN_classifier.predict(X_SMOTEENNresampled)
pd.DataFrame({"Prediction": SMOTEENN_predictions, "Actual": y_SMOTEENNresampled}).head(20)

In [None]:
# Calculating the accuracy score
SMOTEENN_acc_score = accuracy_score(y_SMOTEENNresampled, SMOTEENN_predictions)
print(f"Accuracy Score : {SMOTEENN_acc_score}")

In [None]:
# Generate the confusion matrix
cm = confusion_matrix(y_SMOTEENNresampled, SMOTEENN_predictions)
cm

In [None]:
# Print the imbalanced classification report for SMOTEENN resampling
SMOTEENN_classreport = classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions)
print(classification_report_imbalanced(y_SMOTEENNresampled, SMOTEENN_predictions))