In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [25]:
#importing mostly pre-processed data from the Accidents_ETL 
data = pd.read_csv('Accidents_Preprocessed1.csv')

In [26]:
#Date/Time features were left as number in case we wanted to create subsets of data based on these condtions before they were converted to objects and encoded. Here they are finally converted to object in preparation for encoding. 
#Cities are also dropped to reduce the processing power required to handle an additional 11,000 columns of data. 

data['Year'] = data['Year'].astype(object)
data['Month'] = data['Month'].astype(object)
data['Day'] = data['Day'].astype(object)
data['Hour'] = data['Hour'].astype(object)
data = data.drop(columns = ['City'], axis=1)

In [27]:
#creating the accidents df based on the imported and processed data.abs
accidents = data.copy()

In [28]:
#confirming the layout and size of the accidents df
accidents

Unnamed: 0,Severity,Side,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
0,2,0,"Greenville,SC",SC,70F:79F,50-59%,28-30,9-10,N,0-9,...,0,0,0,0,0,0,0,0,0,0
1,2,0,"Mecklenburg,NC",NC,70F:79F,60-69%,28-30,9-10,Variable,0-9,...,0,0,0,0,0,0,0,0,0,0
2,2,0,"Santa Clara,CA",CA,50F:59F,70-79%,30-32,9-10,W,0-9,...,0,0,0,0,0,0,0,0,0,0
3,2,0,"Douglas,NV",NV,50F:59F,10-19%,30-32,9-10,SW,0-9,...,0,0,0,0,0,0,0,0,0,0
4,3,0,"Broward,FL",FL,80F:89F,80-89%,28-30,9-10,SE,10-19,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2906376,2,1,"Harris,TX",TX,80F:89F,60-69%,30-32,8-9,Variable,0-9,...,0,0,0,0,0,0,0,0,0,0
2906377,2,0,"San Bernardino,CA",CA,40F:49F,70-79%,30-32,9-10,Calm,0-9,...,0,0,0,0,0,0,0,0,0,0
2906378,2,1,"Miami-Dade,FL",FL,70F:79F,80-89%,28-30,9-10,NW,10-19,...,0,0,0,0,0,0,0,0,0,0
2906379,2,0,"Salt Lake,UT",UT,20F:29F,80-89%,24-26,9-10,SE,0-9,...,0,0,0,0,0,0,0,0,0,0


In [29]:
#optional palce to subselect based on state for training and testing specific to a state.

#data = data[data['State'] == 'CA']


In [30]:
#creating a sampled subset of the 2.9M rows of data for ease of testig the code and proving functionality of the ML model. 
accidents = accidents.sample(n = 250000, replace=True, random_state=1)
accidents

Unnamed: 0,Severity,Side,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
128037,3,0,"San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,0-9,...,0,0,0,0,0,0,0,0,0,0
491755,2,0,"Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,0-9,...,0,0,0,0,0,0,0,0,0,1
2568076,2,0,"San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,0-9,...,0,0,0,0,0,0,0,0,0,0
491263,2,0,"Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,0-9,...,0,0,0,0,0,0,0,0,0,0
836489,2,0,"Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,0-9,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2647740,3,0,"Ramsey,MN",MN,30F:39F,90-100%,28-30,0-1,E,0-9,...,0,0,0,0,0,0,0,0,0,0
2776815,3,0,"Howard,MD",MD,60F:69F,60-69%,28-30,9-10,SW,10-19,...,0,0,0,0,0,0,0,0,0,0
1884020,3,0,"Los Angeles,CA",CA,Unknown,Unknown,Unknown,9-10,Calm,0-9,...,0,0,0,0,0,0,0,0,0,0
2135167,2,1,"Fresno,CA",CA,50F:59F,70-79%,28-30,9-10,NW,10-19,...,0,0,0,0,0,0,0,0,0,0


In [31]:
output_data_file = "Del1_sample_data_not_encoded.csv"
accidents.to_csv(output_data_file, index=False)

In [32]:
#Encoding Obj columns using OneHotEncoder:

#Identifying the 'object' columns to encode

type_cat = accidents.dtypes[accidents.dtypes == 'object'].index.tolist()
type_cat


['County',
 'State',
 'Temperature(F)',
 'Humidity(%)',
 'Pressure(in)',
 'Visibility(mi)',
 'Wind_Direction',
 'Wind_Speed(mph)',
 'Precipitation(in)',
 'Day/Night',
 'Year',
 'Month',
 'Day',
 'Hour']

In [33]:
#Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(accidents[type_cat].astype(str)))



In [34]:
# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(type_cat)
encode_df.head()



Unnamed: 0,"County_Abbeville,SC","County_Acadia,LA","County_Accomack,VA","County_Ada,ID","County_Adair,IA","County_Adams,CO","County_Adams,IA","County_Adams,ID","County_Adams,IL","County_Adams,IN",...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [35]:
#resetting the index on the sample to match the indices for the sample and the enocoded df so that they are both numbered 1-250,000.
accidents2 = accidents.reset_index(drop=True)

In [36]:
#visualizing accdients2 to confirm 
accidents2

Unnamed: 0,Severity,Side,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),...,Weather_Condition_Rain,Weather_Condition_Shallow_Fog,Weather_Condition_Sleet,Weather_Condition_Smoke/Haze,Weather_Condition_Snow,Weather_Condition_Squalls,Weather_Condition_Thunder,Weather_Condition_Thunderstorm,Weather_Condition_Volcanic_Ash,Weather_Condition_Wintry_Mix
0,3,0,"San Francisco,CA",CA,50F:59F,50-59%,30-32,9-10,W,0-9,...,0,0,0,0,0,0,0,0,0,0
1,2,0,"Tillamook,OR",OR,30F:39F,90-100%,28-30,6-7,Calm,0-9,...,0,0,0,0,0,0,0,0,0,1
2,2,0,"San Bernardino,CA",CA,40F:49F,70-79%,28-30,9-10,N,0-9,...,0,0,0,0,0,0,0,0,0,0
3,2,0,"Los Angeles,CA",CA,60F:69F,70-79%,30-32,9-10,Calm,0-9,...,0,0,0,0,0,0,0,0,0,0
4,2,0,"Santa Clara,CA",CA,70F:79F,40-49%,28-30,9-10,Calm,0-9,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,3,0,"Ramsey,MN",MN,30F:39F,90-100%,28-30,0-1,E,0-9,...,0,0,0,0,0,0,0,0,0,0
249996,3,0,"Howard,MD",MD,60F:69F,60-69%,28-30,9-10,SW,10-19,...,0,0,0,0,0,0,0,0,0,0
249997,3,0,"Los Angeles,CA",CA,Unknown,Unknown,Unknown,9-10,Calm,0-9,...,0,0,0,0,0,0,0,0,0,0
249998,2,1,"Fresno,CA",CA,50F:59F,70-79%,28-30,9-10,NW,10-19,...,0,0,0,0,0,0,0,0,0,0


In [37]:
#Merge one-hot encoded features and drop the originals
accidents = accidents.merge(encode_df, left_index=True, right_index=True)
accidents

Unnamed: 0,Severity,Side,County,State,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
30,2,0,"Miami-Dade,FL",FL,Unknown,Unknown,28-30,9-10,SE,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,4,0,"Allegheny,PA",PA,40F:49F,90-100%,28-30,9-10,E,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,2,0,"Sacramento,CA",CA,50F:59F,60-69%,28-30,8-9,SW,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,2,0,"Citrus,FL",FL,60F:69F,90-100%,30-32,0-1,Calm,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,2,1,"San Diego,CA",CA,60F:69F,70-79%,28-30,9-10,Calm,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249974,2,1,"Berkeley,SC",SC,70F:79F,70-79%,30-32,9-10,NE,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
249980,3,0,"Ramsey,MN",MN,70F:79F,40-49%,30-32,9-10,NW,10-19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
249985,2,1,"Harris,TX",TX,70F:79F,10-19%,30-32,9-10,Variable,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
249986,2,0,"Denver,CO",CO,30F:39F,90-100%,22-24,2-3,N,0-9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
#Dropping the original columns after encoding
accidents = accidents.drop(type_cat,1)
accidents

Unnamed: 0,Severity,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
30,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,4,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,2,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249974,2,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
249980,3,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
249985,2,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
249986,2,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
#Saving the final sampled and encoded dataset that will be used in the ML model.
output_data_file = "Del1_Accidents_Preprocessed_Encoded_Sampled.csv"
accidents.to_csv(output_data_file, index=False)


In [40]:
#Moving the final dataset into the ML model:

# Define features set
X = accidents.copy()
X = X.drop("Severity", axis=1)
X.head()

Unnamed: 0,Side,Weather_Condition_Wind,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,...,Hour_21,Hour_22,Hour_23,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,Hour_9
30,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Define target vector
y = accidents["Severity"].values

In [42]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [43]:

#Testing to see which learning rate would provide the best outcomes for the ML model:

# Create a classifier object
# learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
# for learning_rate in learning_rates:
#     classifier = GradientBoostingClassifier(n_estimators=100,
#                                             learning_rate=learning_rate,
#                                             max_features=5,
#                                             max_depth=3,
#                                             random_state=0)

#     # Fit the model
#     classifier.fit(X_train_scaled, y_train)
#     print("Learning rate: ", learning_rate)

#     # Score the model
#     print("Accuracy score (training): {0:.3f}".format(
#         classifier.score(
#             X_train_scaled,
#             y_train)))
#     print("Accuracy score (validation): {0:.3f}".format(
#         classifier.score(
#             X_test_scaled,
#             y_test)))
#     print()

In [64]:
#finding the ideal number of features

#Call and Fit the model 
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit_transform(X_train, y_train)

#identify the number of features that have importance above the mean importance level.
selected_feat = X_train.columns[(sel.get_support())]
print(len(selected_feat))

#show the important features
print(selected_feat)




319
Index(['Side', 'Weather_Condition_Wind', 'Crossing', 'Give_Way', 'Junction',
       'Railway', 'Station', 'Stop', 'Traffic_Signal',
       'Weather_Condition_Cloudy',
       ...
       'Hour_21', 'Hour_22', 'Hour_23', 'Hour_3', 'Hour_4', 'Hour_5', 'Hour_6',
       'Hour_7', 'Hour_8', 'Hour_9'],
      dtype='object', length=319)


In [71]:
#Run the model
classifier = GradientBoostingClassifier(n_estimators=100,
                                        learning_rate=0.5,
                                        max_features=323,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(learning_rate=0.5, max_features=323, random_state=0)

In [72]:
# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,2,4
1,2,2
2,2,3
3,2,2
4,2,3
5,2,2
6,2,2
7,2,2
8,2,3
9,2,2


In [73]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7166482910694597


In [77]:
# outcome values order in sklearn
TP, FN, FP, TN = confusion_matrix(y_test, predictions, labels=[1,0]).reshape(-1)
print('Outcome values : n', TP, FN, FP, TN)

Outcome values : n 0 0 0 0


In [84]:
conmat = pd.DataFrame({'y_test': y_test, 'predictions': predictions}, columns=['y_test', 'predictions'])
conmat

Unnamed: 0,y_test,predictions
0,4,2
1,2,2
2,3,2
3,2,2
4,3,2
...,...,...
5437,2,1
5438,3,2
5439,3,2
5440,2,2


In [None]:
TP = np.where(conmat['y_test

In [78]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm


# cm_df = pd.DataFrame(
#     cm, index=["Actual 0", "Actual 1"],
#     columns=["Predicted 0", "Predicted 1"]
# )

# # Displaying results
# display(cm_df)

array([[   0,   49,    2,    0],
       [  14, 3848,   85,   52],
       [   3, 1112,   45,   17],
       [   0,  204,    4,    7]])

In [68]:
# Use the SMOTEENN technique to perform combination sampling on the data
# Count the resampled classes
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

NameError: name 'Counter' is not defined

In [None]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_resampled y_resampled)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)