#  IBM Data Science Capstone: Car Accident Severity

In [5]:
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import seaborn as sns

### Cleaning the data

In [6]:
csv = ' https://cocl.us/datascience_survey_data'
data = pd.read_csv('https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
data.shape

(194673, 38)

In [8]:
data.head()

Unnamed: 0,SEVERITYCODE,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,...,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,2,-122.323148,47.70314,1,1307,1307,3502005,Matched,Intersection,37475.0,...,Wet,Daylight,,,,10,Entering at angle,0,0,N
1,1,-122.347294,47.647172,2,52200,52200,2607959,Matched,Block,,...,Wet,Dark - Street Lights On,,6354039.0,,11,From same direction - both going straight - bo...,0,0,N
2,1,-122.33454,47.607871,3,26700,26700,1482393,Matched,Block,,...,Dry,Daylight,,4323031.0,,32,One parked--one moving,0,0,N
3,1,-122.334803,47.604803,4,1144,1144,3503937,Matched,Block,,...,Dry,Daylight,,,,23,From same direction - all others,0,0,N
4,2,-122.306426,47.545739,5,17700,17700,1807429,Matched,Intersection,34387.0,...,Wet,Daylight,,4028032.0,,10,Entering at angle,0,0,N


In [9]:
df = data.drop(columns = ['OBJECTID', 'SEVERITYCODE.1', 'REPORTNO', 'INCKEY', 'COLDETKEY', 
              'X', 'Y', 'STATUS','ADDRTYPE',
              'INTKEY', 'LOCATION', 'EXCEPTRSNCODE',
              'EXCEPTRSNDESC', 'SEVERITYDESC', 'INCDATE',
              'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
              'SDOT_COLDESC', 'PEDROWNOTGRNT', 'SDOTCOLNUM',
              'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY',
              'CROSSWALKKEY', 'HITPARKEDCAR', 'PEDCOUNT', 
              'PERSONCOUNT', 'VEHCOUNT', 'COLLISIONTYPE',
              'SPEEDING', 'INATTENTIONIND'])

df["WEATHER"] = df["WEATHER"].astype('category')
df["ROADCOND"] = df["ROADCOND"].astype('category')
df["LIGHTCOND"] = df["LIGHTCOND"].astype('category')

df["WEATHER_CODE"] = df["WEATHER"].cat.codes
df["ROADCOND_CODE"] = df["ROADCOND"].cat.codes
df["LIGHTCOND_CODE"] = df["LIGHTCOND"].cat.codes

df.head(5)

Unnamed: 0,SEVERITYCODE,PEDCYLCOUNT,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,WEATHER_CODE,ROADCOND_CODE,LIGHTCOND_CODE
0,2,0,N,Overcast,Wet,Daylight,4,8,5
1,1,0,0,Raining,Wet,Dark - Street Lights On,6,8,2
2,1,0,0,Overcast,Dry,Daylight,4,0,5
3,1,0,N,Clear,Dry,Daylight,1,0,5
4,2,0,0,Raining,Wet,Daylight,6,8,5


In [10]:
df['SEVERITYCODE'].value_counts()

1    136485
2     58188
Name: SEVERITYCODE, dtype: int64

In [11]:
df['WEATHER'].value_counts()

Clear                       111135
Raining                      33145
Overcast                     27714
Unknown                      15091
Snowing                        907
Other                          832
Fog/Smog/Smoke                 569
Sleet/Hail/Freezing Rain       113
Blowing Sand/Dirt               56
Severe Crosswind                25
Partly Cloudy                    5
Name: WEATHER, dtype: int64

In [12]:
df['ROADCOND'].value_counts()

Dry               124510
Wet                47474
Unknown            15078
Ice                 1209
Snow/Slush          1004
Other                132
Standing Water       115
Sand/Mud/Dirt         75
Oil                   64
Name: ROADCOND, dtype: int64

In [13]:
df['LIGHTCOND'].value_counts()

Daylight                    116137
Dark - Street Lights On      48507
Unknown                      13473
Dusk                          5902
Dawn                          2502
Dark - No Street Lights       1537
Dark - Street Lights Off      1199
Other                          235
Dark - Unknown Lighting         11
Name: LIGHTCOND, dtype: int64

In [14]:
from sklearn.utils import resample

df_majority = df[df.SEVERITYCODE==1]
df_minority = df[df.SEVERITYCODE==2]

df_majority_downsampled = resample(df_majority,
                                        replace=False,
                                        n_samples=58188,
                                        random_state=123)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

df_balanced.SEVERITYCODE.value_counts()

2    58188
1    58188
Name: SEVERITYCODE, dtype: int64

In [15]:
X = np.asarray(df_balanced[['WEATHER_CODE', 'ROADCOND_CODE', 'LIGHTCOND_CODE']])
X[0:5]

array([[ 6,  8,  2],
       [ 1,  0,  5],
       [10,  7,  8],
       [ 1,  0,  5],
       [ 1,  0,  5]], dtype=int8)

In [16]:
y = np.asarray(df_balanced['SEVERITYCODE'])
y [0:5]

array([1, 1, 1, 1, 1])

In [17]:
from sklearn import preprocessing

X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]



array([[ 1.15236718,  1.52797946, -1.21648407],
       [-0.67488   , -0.67084969,  0.42978835],
       [ 2.61416492,  1.25312582,  2.07606076],
       [-0.67488   , -0.67084969,  0.42978835],
       [-0.67488   , -0.67084969,  0.42978835]])

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (81463, 3) (81463,)
Test set: (34913, 3) (34913,)


### KNN Model

In [19]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [20]:
from sklearn.neighbors import KNeighborsClassifier
k = 23
knn = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)

knn_y_pred = knn.predict(X_test)
knn_y_pred[0:5]

array([2, 2, 1, 1, 2])

In [21]:
jaccard_similarity_score(y_test, knn_y_pred)

0.5640878755764328

In [22]:
f1_score(y_test, knn_y_pred, average='macro')

0.5393282758446943

### Decision Tree

In [24]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy', max_depth = 7)

dt.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [25]:
dt_y_pred = dt.predict(X_test)

In [26]:
jaccard_similarity_score(y_test, knn_y_pred)

0.5640878755764328

In [27]:
f1_score(y_test, dt_y_pred, average='macro')

0.5450597937389444

### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

LR = LogisticRegression(C=6, solver='liblinear').fit(X_train,y_train)

In [29]:
LR_y_pred = LR.predict(X_test)

In [30]:
LR_y_prob = LR.predict_proba(X_test)
log_loss(y_test, LR_y_prob)

0.6849535383198887

In [31]:
jaccard_similarity_score(y_test, LR_y_pred)

0.5260218256809784

In [32]:
f1_score(y_test, LR_y_pred, average='macro')

0.511602093963383

******

### Results

| Model              | Jaccard | F1-score | LogLoss |
|--------------------|---------|----------|---------|
| KNN                | 0.56    | 0.53     | NA      |
| Decision Tree      | 0.56    | 0.54     | NA      |
| LogisticRegression | 0.52    | 0.51     | 0.68    |