In [None]:
# Import dependencies
import os
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine
# from config import username, password
import time
from pathlib import Path
import pickle

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.ensemble.forest import RandomForestClassifier
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Activation, Dense

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
from sklearn.datasets import make_blobs
import seaborn as sns
from collections import Counter

from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Import the data
file_path = "Resources/Cleaned_Accidents_Data.csv"
cleaned_df = pd.read_csv(file_path)
cleaned_df.head()

Unnamed: 0,Severity,Latitude,Longitude,State,Zipcode,Country,Temperature_F,Visibility_miles,Wind_Speed_mph,Precipitation_inches,...,Roundabout,Stop_Sign,Traffic,Traffic_Light,Turning_Loop,Year,Month,Time,Part_of_Week,Time_of_Day
0,3,39.865147,-84.058723,OH,45424,US,36.9,10.0,0.0,0.02,...,False,False,False,False,False,2016,February,5,Weekday,Early Morning
1,2,39.928059,-82.831184,OH,43068,US,37.9,10.0,0.0,0.0,...,False,False,False,False,False,2016,February,6,Weekday,Early Morning
2,2,39.063148,-84.032608,OH,45176,US,36.0,10.0,3.5,0.0,...,False,False,False,True,False,2016,February,6,Weekday,Early Morning
3,3,39.747753,-84.205582,OH,45417,US,35.1,9.0,4.6,0.0,...,False,False,False,False,False,2016,February,7,Weekday,Early Morning
4,2,39.627781,-84.188354,OH,45459,US,36.0,6.0,3.5,0.0,...,False,False,False,True,False,2016,February,7,Weekday,Early Morning


In [3]:
filtered_cleaned_df = cleaned_df.drop(['Country'], axis=1)

In [4]:
# Define target vector
X = filtered_cleaned_df.copy()
X = filtered_cleaned_df.drop('Severity', axis=1)
y = filtered_cleaned_df['Severity']
y[:5]

0    3
1    2
2    2
3    3
4    2
Name: Severity, dtype: int64

In [5]:
X.head()

Unnamed: 0,Latitude,Longitude,State,Zipcode,Temperature_F,Visibility_miles,Wind_Speed_mph,Precipitation_inches,Weather_Condition,Speed_Bump,...,Roundabout,Stop_Sign,Traffic,Traffic_Light,Turning_Loop,Year,Month,Time,Part_of_Week,Time_of_Day
0,39.865147,-84.058723,OH,45424,36.9,10.0,0.0,0.02,Light Rain,False,...,False,False,False,False,False,2016,February,5,Weekday,Early Morning
1,39.928059,-82.831184,OH,43068,37.9,10.0,0.0,0.0,Light Rain,False,...,False,False,False,False,False,2016,February,6,Weekday,Early Morning
2,39.063148,-84.032608,OH,45176,36.0,10.0,3.5,0.0,Overcast,False,...,False,False,False,True,False,2016,February,6,Weekday,Early Morning
3,39.747753,-84.205582,OH,45417,35.1,9.0,4.6,0.0,Mostly Cloudy,False,...,False,False,False,False,False,2016,February,7,Weekday,Early Morning
4,39.627781,-84.188354,OH,45459,36.0,6.0,3.5,0.0,Mostly Cloudy,False,...,False,False,False,True,False,2016,February,7,Weekday,Early Morning


In [6]:
X = pd.get_dummies(data=X, columns=['Time_of_Day', 
                                    'Part_of_Week',
                                    'Month',
                                    'Turning_Loop',
                                    'Traffic_Light',
                                    'Traffic',
                                    'Stop_Sign',
                                    'Roundabout',
                                    'Year',
                                    'Weather_Condition',
                                    'State', 
                                    'Cross_Walk', 
                                    'Speed_Bump', 
                                    'Yield_Sign', 
                                    'Intersection', 
                                    'Railway', 
                                    'No_Exit'])

X = X.dropna()
X.head()

Unnamed: 0,Latitude,Longitude,Zipcode,Temperature_F,Visibility_miles,Wind_Speed_mph,Precipitation_inches,Time,Time_of_Day_Afteroon,Time_of_Day_Early Morning,...,Speed_Bump_False,Speed_Bump_True,Yield_Sign_False,Yield_Sign_True,Intersection_False,Intersection_True,Railway_False,Railway_True,No_Exit_False,No_Exit_True
0,39.865147,-84.058723,45424,36.9,10.0,0.0,0.02,5,0,1,...,1,0,1,0,1,0,1,0,1,0
1,39.928059,-82.831184,43068,37.9,10.0,0.0,0.0,6,0,1,...,1,0,1,0,1,0,1,0,1,0
2,39.063148,-84.032608,45176,36.0,10.0,3.5,0.0,6,0,1,...,1,0,1,0,1,0,1,0,1,0
3,39.747753,-84.205582,45417,35.1,9.0,4.6,0.0,7,0,1,...,1,0,1,0,1,0,1,0,1,0
4,39.627781,-84.188354,45459,36.0,6.0,3.5,0.0,7,0,1,...,1,0,1,0,1,0,1,0,1,0


In [7]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, train_size=0.80)

In [8]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2740454, 229)
(685114, 229)
(2740454,)
(685114,)


In [9]:
# Creating a StandardScaler instance.
scaler = MinMaxScaler()

# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Random Forest Classifier

In [10]:
# Create a random forest classifier.
rf_model = RandomForestClassifier()
rf_model = RandomForestClassifier(n_estimators=100, random_state=78, verbose=3, max_depth = 10) 

In [11]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 500


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.7s remaining:    0.0s


building tree 2 of 500


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.2s remaining:    0.0s


building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500

building tree 335 of 500
building tree 336 of 500
building tree 337 of 500
building tree 338 of 500
building tree 339 of 500
building tree 340 of 500
building tree 341 of 500
building tree 342 of 500
building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500
building tree 353 of 500
building tree 354 of 500
building tree 355 of 500
building tree 356 of 500
building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500
building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 50.5min finished


In [12]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   49.3s finished


array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [None]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 1", "Actual 2", "Actual 3", "Actual 4"], 
    columns=["Predicted 1", "Predicted 2", "Predicted 3", "Predicted 4"])

In [14]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [15]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 1,144,5650,31,0
Actual 2,8,459108,4433,0
Actual 3,0,182502,11694,0
Actual 4,2,19475,2067,0


Accuracy Score: 0.6873980096743024
Classification Report


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.94      0.02      0.05      5825
           2       0.69      0.99      0.81    463549
           3       0.64      0.06      0.11    194196
           4       0.00      0.00      0.00     21544

    accuracy                           0.69    685114
   macro avg       0.57      0.27      0.24    685114
weighted avg       0.66      0.69      0.58    685114



In [16]:
# Print the feature importances from the Random Forest classifier
feature_names = X.columns
sorted(zip(rf_model.feature_importances_, feature_names), reverse=True)

[(0.13426497067806012, 'Traffic_Light_False'),
 (0.13305310468207307, 'Traffic_Light_True'),
 (0.07567367526433572, 'Zipcode'),
 (0.06299974737969805, 'Longitude'),
 (0.05454637818239672, 'Cross_Walk_False'),
 (0.05023848320791942, 'Part_of_Week_Weekend'),
 (0.04945675552148799, 'Cross_Walk_True'),
 (0.047105633256479484, 'Latitude'),
 (0.0456437719989327, 'Part_of_Week_Weekday'),
 (0.032060087626056, 'State_GA'),
 (0.029001863318609263, 'Year_2020'),
 (0.022361304286018183, 'Weather_Condition_Fair'),
 (0.021731458732919158, 'State_NC'),
 (0.020271525922860297, 'State_OR'),
 (0.015033942874905153, 'State_CA'),
 (0.012820385552632629, 'Time'),
 (0.012267888505511398, 'Year_2019'),
 (0.011300869810911482, 'Stop_Sign_True'),
 (0.011192511880857544, 'Stop_Sign_False'),
 (0.010734005579734803, 'State_SC'),
 (0.009115741605188509, 'State_MO'),
 (0.008921420821695392, 'Intersection_True'),
 (0.008841356193320299, 'Intersection_False'),
 (0.008775807655919208, 'Year_2018'),
 (0.007903442663542

In [None]:
print('Accuracy of random forest on training: ', rf_model.score(X_train_scaled, y_train))
print('Accuracy of random forest on testing: ', rf_model.score(X_test, y_test))

# rf_model.score(X_test, y_test)

In [None]:
rf_features = pd.Series(rf_model.feature_importances_,index=X.columns).sort_values(ascending=False)

# Creating a bar plot, displaying only the top k features
k=10
sns.barplot(x=rf_features[:10], y=rf_features.index[:k])
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

  _warn_prf(average, modifier, msg_start, len(result))


                   pre       rec       spe        f1       geo       iba       sup

          1       0.94      0.02      1.00      0.05      0.16      0.02      5825
          2       0.69      0.99      0.06      0.81      0.25      0.07    463549
          3       0.64      0.06      0.99      0.11      0.24      0.05    194196
          4       0.00      0.00      1.00      0.00      0.00      0.00     21544

avg / total       0.66      0.69      0.36      0.58      0.24      0.06    685114



# Logistic Regression Classifier

In [21]:
logistic_regression = LogisticRegression()

In [22]:
# Fit the classifier
logistic_regression.fit(X_train, y_train)

LogisticRegression()

In [None]:
# Calculate the score for the test data
print('Accuracy of logistic regression on training: ', logistic_regression.score(X_train_scaled, y_train))
print('Accuracy of logistic regression on testing: ', logistic_regression.score(X_test, y_test))

In [None]:
# Saving model to disk
pickle.dump(regressor, open('model_MinMaxScaler.pkl','wb'))

# Neural Network

In [53]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints_MinMaxScaler/weights.{epoch:02d}.hdf5"

In [None]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Create a callback that saves the model's weights every 1 epoch
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq=2740454)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train,callbacks=[cp_callback], epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 00001: saving model to checkpoints/weights.01.hdf5
Epoch 2/100
Epoch 00002: saving model to checkpoints/weights.02.hdf5
Epoch 3/100
Epoch 00003: saving model to checkpoints/weights.03.hdf5
Epoch 4/100
Epoch 00004: saving model to checkpoints/weights.04.hdf5
Epoch 5/100
Epoch 00005: saving model to checkpoints/weights.05.hdf5
Epoch 6/100
Epoch 00006: saving model to checkpoints/weights.06.hdf5
Epoch 7/100
Epoch 00007: saving model to checkpoints/weights.07.hdf5
Epoch 8/100
Epoch 00008: saving model to checkpoints/weights.08.hdf5
Epoch 9/100
Epoch 00009: saving model to checkpoints/weights.09.hdf5
Epoch 10/100
Epoch 00010: saving model to checkpoints/weights.10.hdf5
Epoch 11/100
Epoch 00011: saving model to checkpoints/weights.11.hdf5
Epoch 12/100
Epoch 00012: saving model to checkpoints/weights.12.hdf5
Epoch 13/100
Epoch 00013: saving model to checkpoints/weights.13.hdf5
Epoch 14/100
Epoch 00014: saving model to checkpoints/weights.14.hdf5
Epoch 15/100
Epoch 00015: sav

Epoch 59/100
Epoch 00059: saving model to checkpoints/weights.59.hdf5
Epoch 60/100
Epoch 00060: saving model to checkpoints/weights.60.hdf5
Epoch 61/100
Epoch 00061: saving model to checkpoints/weights.61.hdf5
Epoch 62/100
Epoch 00062: saving model to checkpoints/weights.62.hdf5
Epoch 63/100
Epoch 00063: saving model to checkpoints/weights.63.hdf5
Epoch 64/100
Epoch 00064: saving model to checkpoints/weights.64.hdf5
Epoch 65/100
Epoch 00065: saving model to checkpoints/weights.65.hdf5
Epoch 66/100
Epoch 00066: saving model to checkpoints/weights.66.hdf5
Epoch 67/100
Epoch 00067: saving model to checkpoints/weights.67.hdf5
Epoch 68/100
Epoch 00068: saving model to checkpoints/weights.68.hdf5
Epoch 69/100
Epoch 00069: saving model to checkpoints/weights.69.hdf5
Epoch 70/100
Epoch 00070: saving model to checkpoints/weights.70.hdf5
Epoch 71/100
Epoch 00071: saving model to checkpoints/weights.71.hdf5
Epoch 72/100
Epoch 00072: saving model to checkpoints/weights.72.hdf5
Epoch 73/100
Epoch 0

Epoch 88/100
Epoch 00088: saving model to checkpoints/weights.88.hdf5
Epoch 89/100
Epoch 00089: saving model to checkpoints/weights.89.hdf5
Epoch 90/100
Epoch 00090: saving model to checkpoints/weights.90.hdf5
Epoch 91/100
Epoch 00091: saving model to checkpoints/weights.91.hdf5
Epoch 92/100

In [None]:
# Make predictions
predicted = nn.predict(X_test_scaled)
predicted = enc.inverse_transform(predicted).flatten().tolist()
results = pd.DataFrame({
    "Actual": y_test.activity.values,
    "Predicted": predicted
})
results.head(10)

In [None]:
# Plotting graphs
df_plot = pd.DataFrame(fit_model.history, index=range(1,len(fit_model.history["loss"]) + 1))

df_plot.plot(y="loss").show()

In [None]:
# Plot the loss
df_plot.plot(y="accuracy")

In [None]:
# Plot a scatterplot of predictions on X_test vs actual y_test values
predictions = best_model.predict(X_test)
plt.scatter(predictions, y_test)

In [None]:
# Map of accidents, color code by State
sns.scatterplot(x='Longitude', y='Latitude', data=cleaned_df, hue='State', legend=False, s=30)
plt.show()

In [None]:
# Predict the classification of a new set of blob data
new_X, new_Y = make_blobs(n_samples=500, centers=2, n_features=2, random_state=78)
new_X_scaled = X_scaler.transform(new_X)
nn_model.predict_classes(new_X_scaled)

In [None]:
# Visualizing four severity classes
plt.scatter(X[:, 1], X[:, 2], X[:, 3], X[:, 4], c=y)

In [None]:
# Export our model to HDF5 file
nn_new.save("final_project_nn_MinMaxScaler.h5")

In [None]:
# Loading model to compare results
model_accident = pickle.load( open('model_MinMaxScaler.pkl','rb'))
print(model_accident.predict([[1.0]]))