# Accident Severity Probability Prediction in Seattle City
IBM Coursera Applied Data Science Capstone Project
September 2022

# 1. Problem Introduction
Analyzing and predicting accident severity requires data that describe the conditions under which accidents occur. Such data could include the road condition at the time of the accident, the weather, the light condition, the driver's condition, etc. From these data, a model can be built to predict the severity of an accident, should one occur, under particular road and driver conditions.

In [23]:
import pandas as pd
import numpy as np


# 2. Data Acquisition, Selection, and Cleaning

The dataset was acquired from https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv. The metadata for this dataset can be downloaded from https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Metadata.pdf.

In [None]:
# NOTE(review): this is a machine-specific local path; confirm it points at a
# copy of the collisions CSV before running on another machine.
csv_location = "file:///C:/Users/popac//date.csv"

# Load the table into a DataFrame and preview the first rows.
df = pd.read_csv(csv_location)
df.head()

In [None]:
# Display the column labels of the freshly loaded DataFrame.
df.columns

In [None]:
# Order the records by INCDATE, then show the first and the last INCDATE
# value of that ordering.
time = df.sort_values(by='INCDATE')
incdate_sorted = time['INCDATE']
print(incdate_sorted.head(1))
print(incdate_sorted.tail(1))

In [None]:
# Discard rows lacking either coordinate, then give the coordinate columns
# descriptive names.
df = (
    df.dropna(subset=["X", "Y"], axis=0)
      .rename(columns={'X': 'LONGITUDE', 'Y': 'LATITUDE'})
)
print(df.shape)
df.head()

In [None]:
# For each location-type column, print the per-category counts followed by
# the total number of non-missing entries.
for location_col in ('JUNCTIONTYPE', 'ADDRTYPE'):
    category_counts = df[location_col].value_counts()
    print(category_counts)
    print(category_counts.sum())

In [None]:
# Display the column labels again (X / Y were renamed to LONGITUDE / LATITUDE
# in the coordinate-cleaning cell).
df.columns

In [None]:
# Columns kept for modelling: coordinates first, the target (SEVERITYCODE) last.
model_columns = [
    'LATITUDE', 'LONGITUDE', 'ADDRTYPE', 'PERSONCOUNT', 'VEHCOUNT',
    'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
    'SPEEDING', 'SEVERITYCODE',
]
model_df = df[model_columns]
model_df.head()

In [None]:
# Boolean mask of missing cells; for every column print how many entries are
# missing (True) versus present (False).
missing_values = model_df.isnull()
for column_name, is_missing in missing_values.items():
    print(column_name)
    print(is_missing.value_counts())
    print("")

In [None]:
# Work on an independent copy so model_df is left untouched.
clean_df = model_df.copy()

# Rows without an ADDRTYPE are discarded; the index is rebuilt afterwards so
# that labels and positions coincide (0..n-1).
clean_df.dropna(subset=["ADDRTYPE"], axis=0, inplace=True)
clean_df.reset_index(drop=True, inplace=True)

# One DataFrame-level replace handles every column:
#   * indicator columns: 'Y'/'1' -> 1, 'N'/'0' -> 0, missing -> 0
#   * condition columns: missing -> 'Unknown'
# The condition columns were previously patched with
# clean_df[col].replace(..., inplace=True). That chained in-place call works
# on a temporary Series when pandas Copy-on-Write is active, which leaves
# clean_df unchanged, so the fill is done here instead.
clean_df.replace({'INATTENTIONIND' : {'Y' : int(1), np.nan : int(0)},
                  'SPEEDING' : {'Y' : int(1), np.nan : int(0)},
                  'UNDERINFL' : {'Y' : int(1), '1' : int(1),
                                 'N' : int(0), '0' : int(0),
                                 np.nan : int(0)},
                  'WEATHER' : {np.nan : 'Unknown'},
                  'ROADCOND' : {np.nan : 'Unknown'},
                  'LIGHTCOND' : {np.nan : 'Unknown'}}, inplace=True)

clean_df.shape

In [None]:
# Exploratory subset: the three condition features plus the severity target.
EDA_df = clean_df[['WEATHER', 'ROADCOND', 'LIGHTCOND', 'SEVERITYCODE']]

# For each condition feature in turn, drop the uninformative 'Unknown' and
# 'Other' rows and print the remaining category counts.
for feature in ('WEATHER', 'ROADCOND', 'LIGHTCOND'):
    uninformative = EDA_df[feature].isin(['Unknown', 'Other'])
    EDA_df = EDA_df[~uninformative]
    print(EDA_df[feature].value_counts(), "\n")

print(EDA_df.shape)

In [None]:
# Count accidents per (WEATHER, ROADCOND, LIGHTCOND, SEVERITYCODE) combination.
# The counts column is named explicitly while flattening the index: depending
# on the pandas version, value_counts() names its result either after the
# counted column or 'count', so renaming 'SEVERITYCODE' afterwards is not
# reliable and can leave the frame without an ACC_COUNTS column.
EDA_df = (
    EDA_df.groupby(['WEATHER', 'ROADCOND', 'LIGHTCOND'])['SEVERITYCODE']
          .value_counts()
          .reset_index(name='ACC_COUNTS')
)

In [None]:
# Ten most frequent condition combinations among severity-1 accidents.
top_sev1 = (
    EDA_df[EDA_df.SEVERITYCODE == 1]
    .sort_values(by=['ACC_COUNTS'], ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# Collapse the three condition columns into one readable label per row.
top_sev1['CONDITIONS'] = top_sev1[['WEATHER','ROADCOND','LIGHTCOND']].agg(', '.join, axis=1)
cond = top_sev1['CONDITIONS']

# Keep the label as the first column and drop the columns it was built from.
EDA_df1 = top_sev1.drop(labels=['WEATHER', 'ROADCOND', 'LIGHTCOND', 'SEVERITYCODE'], axis=1)
EDA_df1 = EDA_df1[['CONDITIONS'] + [c for c in EDA_df1.columns if c != 'CONDITIONS']]
EDA_df1

In [None]:
%matplotlib inline 

import matplotlib as mpl
import matplotlib.pyplot as plt

# Bar chart of the most frequent condition combinations for severity 1.
labels = EDA_df1['CONDITIONS'].to_list()

ax = EDA_df1.plot(kind='bar', stacked=False, width=0.8, figsize=(20,8), fontsize=14)

ax.set_title("Condition During Accident that Lead to Property Damage in Seattle City", size=16)
# One tick per row actually present: the table may hold fewer than 10 rows,
# and the number of tick positions must match the number of tick labels.
ax.set_xticks(np.arange(len(labels)))
ax.set_xticklabels(labels)

# Write the count above each bar. Bar heights come back as floats, so they
# are cast to int to print "1,234" rather than "1,234.0".
for p in ax.patches:
    height = p.get_height()
    ax.annotate("{:,}".format(int(height)), xy=(p.get_x() + p.get_width() / 2, height),
                xytext=(3, 3), textcoords="offset points",
                ha='center', va='bottom', fontsize=14)

# The per-bar annotations replace the y axis, so hide legend, ticks and spines.
ax.legend([])
ax.set_ylabel("Number of Accidents", size=14)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.get_yaxis().set_ticks([])

plt.show()

In [None]:
# Ten most frequent condition combinations among severity-2 accidents.
top_sev2 = (
    EDA_df[EDA_df.SEVERITYCODE == 2]
    .sort_values(by=['ACC_COUNTS'], ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# Collapse the three condition columns into one readable label per row.
top_sev2['CONDITIONS'] = top_sev2[['WEATHER','ROADCOND','LIGHTCOND']].agg(', '.join, axis=1)
cond = top_sev2['CONDITIONS']

# Keep the label as the first column and drop the columns it was built from.
EDA_df2 = top_sev2.drop(labels=['WEATHER', 'ROADCOND', 'LIGHTCOND', 'SEVERITYCODE'], axis=1)
EDA_df2 = EDA_df2[['CONDITIONS'] + [c for c in EDA_df2.columns if c != 'CONDITIONS']]
EDA_df2

In [None]:
# Bar chart of the most frequent condition combinations for severity 2.
labels = EDA_df2['CONDITIONS'].to_list()

ax = EDA_df2.plot(kind='bar', stacked=False, width=0.8, figsize=(20,8), fontsize=14)

ax.set_title("Condition During Accident that Lead to Injury in Seattle City", size=16)
# One tick per row actually present: the table may hold fewer than 10 rows,
# and the number of tick positions must match the number of tick labels.
ax.set_xticks(np.arange(len(labels)))
ax.set_xticklabels(labels)

# Write the count above each bar. Bar heights come back as floats, so they
# are cast to int to print "1,234" rather than "1,234.0".
for p in ax.patches:
    height = p.get_height()
    ax.annotate("{:,}".format(int(height)), xy=(p.get_x() + p.get_width() / 2, height),
                xytext=(3, 3), textcoords="offset points",
                ha='center', va='bottom', fontsize=14)

# The per-bar annotations replace the y axis, so hide legend, ticks and spines.
ax.legend([])
ax.set_ylabel("Number of Accidents", size=14)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.get_yaxis().set_ticks([])

plt.show()

In [None]:
# Report the table size and the distinct values of each categorical feature.
print(clean_df.shape)
for caption, column in [
    ("Unique values in feature 'ADDTYPE':", 'ADDRTYPE'),
    ("Unique values in feature 'WEATHER':", 'WEATHER'),
    ("Unique values in feature 'ROADCOND':", 'ROADCOND'),
    ("Unique values in feature 'LIGHTCOND':", 'LIGHTCOND'),
]:
    print(caption, clean_df[column].unique())

In [None]:
# Fold the 'Other' category into 'Unknown' for the three condition features,
# then list the distinct values that remain.
other_to_unknown = {
    condition_col: {'Other' : 'Unknown'}
    for condition_col in ('WEATHER', 'ROADCOND', 'LIGHTCOND')
}
clean_df.replace(other_to_unknown, inplace=True)

for caption, column in [
    ("Unique values in feature 'ADDTYPE':", 'ADDRTYPE'),
    ("Unique values in feature 'WEATHER':", 'WEATHER'),
    ("Unique values in feature 'ROADCOND':", 'ROADCOND'),
    ("Unique values in feature 'LIGHTCOND':", 'LIGHTCOND'),
]:
    print(caption, clean_df[column].unique())

In [None]:
# Predictor matrix: every column except the coordinates and the target.
# The previous positional slice (iloc[:, 1:-1]) kept LONGITUDE while dropping
# LATITUDE, and passed ADDRTYPE / WEATHER / ROADCOND / LIGHTCOND through as
# text, which the StandardScaler step cannot convert to numbers.
# The coordinates are carried in y instead so they stay available for mapping.
X = clean_df.drop(columns=['LATITUDE', 'LONGITUDE', 'SEVERITYCODE'])

# One-hot encode the text-valued categorical features as 0/1 integer columns.
X = pd.get_dummies(X, columns=['ADDRTYPE', 'WEATHER', 'ROADCOND', 'LIGHTCOND'], dtype=int)
print(X.head())
print(X.shape)

# Target plus coordinates; SEVERITYCODE is extracted as the label after the split.
y = clean_df[['LATITUDE', 'LONGITUDE', 'SEVERITYCODE']]
print(y.head())
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=9)
X_train, X_test, y_train_loc, y_test_loc = split_parts
print('Train set:', X_train.shape, y_train_loc.shape)
print('Test set:', X_test.shape, y_test_loc.shape)

In [None]:
# Pull the SEVERITYCODE labels out of the location-carrying target frames as arrays.
y_train, y_test = (
    target_frame['SEVERITYCODE'].values
    for target_frame in (y_train_loc, y_test_loc)
)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

In [None]:
from sklearn import preprocessing
# Fit the scaler on the training data only and reuse it for the test data.
# Fitting a second scaler on X_test (as before) standardises the two sets with
# different means/variances, so the model would see test features on a
# different scale from the one it was trained on.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape)
print(X_train[0:3])
print(X_test.shape)
print(X_test[0:3])

In [None]:
from sklearn.linear_model import LogisticRegression
# jaccard_similarity_score no longer exists in current scikit-learn releases;
# jaccard_score is its replacement.
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

# Fit a regularised logistic regression, then predict both hard labels and
# class probabilities for the held-out rows.
LR_Class = LogisticRegression(C=0.01, solver='lbfgs').fit(X_train,y_train)
yhat_LR = LR_Class.predict(X_test)
yhat_LR_proba = LR_Class.predict_proba(X_test)

# Support-weighted average over classes, matching the averaging used for the
# f1-score below.
# NOTE(review): the removed jaccard_similarity_score reported plain accuracy
# for this kind of target, so this number is not comparable with results
# produced by the old function -- confirm which metric the report should quote.
LR_accu = jaccard_score(y_test, yhat_LR, average='weighted')
LR_f1 = f1_score(y_test, yhat_LR, average='weighted')
LR_logloss = log_loss(y_test, yhat_LR_proba)

print("Jaccard similarity index = %.4f" % LR_accu)
print("f1-score = %.4f" % LR_f1)
print("Logaritmic Loss = %.4f" % LR_logloss)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square matrix of counts; rows are drawn as true labels and columns
        as predicted labels.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool
        When True, each row is divided by its total so cells show per-row
        fractions instead of raw counts.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used to shade the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples yields an all-zero row; dividing by 1
        # keeps that row at 0 instead of producing NaN (0/0).
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell text: two decimals for fractions, plain integers for counts.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
# Build the confusion matrix with class 2 listed first, then class 1.
class_order = [2, 1]
cnf_matrix = confusion_matrix(y_test, yhat_LR, labels=class_order)
np.set_printoptions(precision=3)

# Draw the raw (non-normalized) counts on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['injury=2','damage=1'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# First 100 predicted probability rows, with the two probability columns
# given descriptive names.
yhat_prob_df = (
    pd.DataFrame(yhat_LR_proba)
    .head(100)
    .rename(columns={0:'PROP_DAMAGE', 1:'INJURY'})
)
print(yhat_prob_df.shape)
yhat_prob_df.head()

In [None]:
# Recover the full cleaned records for the first 100 test rows, in test order.
y_test_df = pd.DataFrame(y_test_loc)
first_100_rows = y_test_df.index[:100]
y_map = clean_df.iloc[first_100_rows].copy()
print(y_map.shape)
y_map.head()

In [None]:
# Attach the predicted class probabilities to the mapped records. The values
# are copied positionally (.values), so row order must match yhat_prob_df.
for prob_col in ('PROP_DAMAGE', 'INJURY'):
    y_map[prob_col] = yhat_prob_df[prob_col].values
print(y_map.shape)
y_map.head()

In [None]:
# Column values as plain lists, one entry per mapped accident.
road_type = list(y_map.ADDRTYPE)
weather = list(y_map.WEATHER)
road_cond = list(y_map.ROADCOND)
light = list(y_map.LIGHTCOND)
damage = list(y_map.PROP_DAMAGE)
injury = list(y_map.INJURY)

# One popup string per accident: its conditions followed by the predicted
# probability of each severity class, shown as a percentage with 2 decimals.
labels = [
    "".join([
        "Road Type: ", str(rt), "; ",
        "Weather: ", str(wx), "; ",
        "Road Cond.: ", str(rc), "; ",
        "Light Cond.: ", str(lc), "; ",
        "Prop. Damage Prob.: ", str(round(p_damage*100, 2)), "%; ",
        "Injury Prob.: ", str(round(p_injury*100,2)), "%; ",
    ])
    for rt, wx, rc, lc, p_damage, p_injury
    in zip(road_type, weather, road_cond, light, damage, injury)
]

In [None]:
import folium
# Base map centred on the given latitude/longitude pair at zoom level 11.
map_center = [47.608013, -122.335167]
seattle_map = folium.Map(location=map_center, zoom_start=11)

In [None]:
# Appearance shared by every marker: small red circle with an orange fill.
marker_style = dict(
    radius=5,
    color='red',
    fill=True,
    fill_color='orange',
    fill_opacity=0.6,
)

# Place one circle marker per mapped accident; its popup shows the label text.
for latitude, longitude, popup_text in zip(y_map.LATITUDE, y_map.LONGITUDE, labels):
    marker = folium.features.CircleMarker(
        [latitude, longitude],
        popup=popup_text,
        **marker_style
    )
    marker.add_to(seattle_map)

# show map
seattle_map