# 

# Data Science Capstone Project

<p>In this project we will analyze data about car accidents and will try to predict the severity of a new collision, if it's fatal or not</p>

### Importing libraries

In [35]:
# Pandas to handle dataset
import pandas as pd 
# Numpy to handle and operate through arrays
import numpy as np
# Matplotlib for data visualization
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


In [37]:
df_collisions = pd.read_csv("Data-Collisions.csv")

 ### Data understanding

In [38]:
# Useless columns: 'OBJECTID', 'X', 'Y', 'REPORTNO', 'INCKEY', 'COLDETKEY', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'INCDATE', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'SDOTCOLNUM', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY'

In [39]:
useless_columns = ['OBJECTID', 'X', 'Y', 'REPORTNO', 'STATUS', 'INCDTTM', 'INCKEY', 'COLDETKEY', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'INCDATE', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC', 'SDOTCOLNUM', 'ST_COLCODE', 'ST_COLDESC', 'SEGLANEKEY', 'CROSSWALKKEY']

df_collisions.drop(columns=useless_columns, inplace=True)

print(df_collisions.columns)

Index(['SEVERITYCODE', 'ADDRTYPE', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER',
       'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SPEEDING', 'HITPARKEDCAR'],
      dtype='object')


In [40]:
df_collisions.head()

Unnamed: 0,SEVERITYCODE,ADDRTYPE,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INATTENTIONIND,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SPEEDING,HITPARKEDCAR
0,2,Intersection,Angles,2,0,0,2,,N,Overcast,Wet,Daylight,,,N
1,1,Block,Sideswipe,2,0,0,2,,0,Raining,Wet,Dark - Street Lights On,,,N
2,1,Block,Parked Car,4,0,0,3,,0,Overcast,Dry,Daylight,,,N
3,1,Block,Other,3,0,0,3,,N,Clear,Dry,Daylight,,,N
4,2,Intersection,Angles,2,0,0,2,,0,Raining,Wet,Daylight,,,N


In [41]:
df_collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194673 entries, 0 to 194672
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   SEVERITYCODE    194673 non-null  int64 
 1   ADDRTYPE        192747 non-null  object
 2   COLLISIONTYPE   189769 non-null  object
 3   PERSONCOUNT     194673 non-null  int64 
 4   PEDCOUNT        194673 non-null  int64 
 5   PEDCYLCOUNT     194673 non-null  int64 
 6   VEHCOUNT        194673 non-null  int64 
 7   INATTENTIONIND  29805 non-null   object
 8   UNDERINFL       189789 non-null  object
 9   WEATHER         189592 non-null  object
 10  ROADCOND        189661 non-null  object
 11  LIGHTCOND       189503 non-null  object
 12  PEDROWNOTGRNT   4667 non-null    object
 13  SPEEDING        9333 non-null    object
 14  HITPARKEDCAR    194673 non-null  object
dtypes: int64(5), object(10)
memory usage: 22.3+ MB


In [42]:
df_collisions.isnull().sum()

SEVERITYCODE           0
ADDRTYPE            1926
COLLISIONTYPE       4904
PERSONCOUNT            0
PEDCOUNT               0
PEDCYLCOUNT            0
VEHCOUNT               0
INATTENTIONIND    164868
UNDERINFL           4884
WEATHER             5081
ROADCOND            5012
LIGHTCOND           5170
PEDROWNOTGRNT     190006
SPEEDING          185340
HITPARKEDCAR           0
dtype: int64

In [43]:
# Total observations: 194673
# Lets drop columns that have half of this value or more of missing values (97336+)
print('Inital total columns: ' + str(np.asarray(df_collisions.columns).size))
df_collisions.dropna(axis=1, thresh=97336, inplace=True)
print('Total columns after dropping: ' + str(np.asarray(df_collisions.columns).size))

Inital total columns: 15
Total columns after dropping: 12


In [44]:
df_collisions.isnull().sum()

SEVERITYCODE        0
ADDRTYPE         1926
COLLISIONTYPE    4904
PERSONCOUNT         0
PEDCOUNT            0
PEDCYLCOUNT         0
VEHCOUNT            0
UNDERINFL        4884
WEATHER          5081
ROADCOND         5012
LIGHTCOND        5170
HITPARKEDCAR        0
dtype: int64

<p>Now lets handle object variables, and perform some data transformation so we can check for correlations</p>

<p>We need to handle those missing values</p>

In [45]:
cols_with_missing_values = ['ADDRTYPE', 'COLLISIONTYPE', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND']

df_collisions[cols_with_missing_values].head()

Unnamed: 0,ADDRTYPE,COLLISIONTYPE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND
0,Intersection,Angles,N,Overcast,Wet,Daylight
1,Block,Sideswipe,0,Raining,Wet,Dark - Street Lights On
2,Block,Parked Car,0,Overcast,Dry,Daylight
3,Block,Other,N,Clear,Dry,Daylight
4,Intersection,Angles,0,Raining,Wet,Daylight


We can see that UNDERINFL has numeric and string values, so we need to handle it individually and carefully. Other columns has only string values.

In [46]:
df_collisions['UNDERINFL'] = df_collisions['UNDERINFL'].map({'0': 'No', '1': 'Yes', 'N': 'No', 'Y': 'Yes'})

In [47]:
df_collisions['ADDRTYPE'].fillna(df_collisions['ADDRTYPE'].mode, inplace=True)
df_collisions['COLLISIONTYPE'].fillna(df_collisions['COLLISIONTYPE'].mode, inplace=True)
df_collisions['UNDERINFL'].fillna(df_collisions['UNDERINFL'].mode, inplace=True)
df_collisions['WEATHER'].fillna(df_collisions['WEATHER'].mode, inplace=True)
df_collisions['ROADCOND'].fillna(df_collisions['ROADCOND'].mode, inplace=True)
df_collisions['LIGHTCOND'].fillna(df_collisions['LIGHTCOND'].mode, inplace=True)

In [48]:
df_collisions.isnull().sum()

SEVERITYCODE     0
ADDRTYPE         0
COLLISIONTYPE    0
PERSONCOUNT      0
PEDCOUNT         0
PEDCYLCOUNT      0
VEHCOUNT         0
UNDERINFL        0
WEATHER          0
ROADCOND         0
LIGHTCOND        0
HITPARKEDCAR     0
dtype: int64

In [49]:
objetct_columns = ['ADDRTYPE', 'COLLISIONTYPE', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'HITPARKEDCAR']
df_collisions = pd.get_dummies(df_collisions, prefix=['addrtype', 'collisiontype', 'underinfl', 'weather', 'roadcond', 'lightcond', 'hitparkedcar'], prefix_sep='_', columns=objetct_columns, drop_first=True)

@todo<p>We need to handle INCDTTM and create 1 new column: the time</p>

In [50]:
print(df_collisions.shape)
print(df_collisions.info())

(194673, 50)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194673 entries, 0 to 194672
Data columns (total 50 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   SEVERITYCODE                        194673 non-null  int64
 1   PERSONCOUNT                         194673 non-null  int64
 2   PEDCOUNT                            194673 non-null  int64
 3   PEDCYLCOUNT                         194673 non-null  int64
 4   VEHCOUNT                            194673 non-null  int64
 5   addrtype_Alley                      194673 non-null  uint8
 6   addrtype_Block                      194673 non-null  uint8
 7   addrtype_Intersection               194673 non-null  uint8
 8   collisiontype_Angles                194673 non-null  uint8
 9   collisiontype_Cycles                194673 non-null  uint8
 10  collisiontype_Head On               194673 non-null  uint8
 11  collisiontype_Left Turn             194

In [68]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize

In [64]:
# Dataset is imbalanced, so lets grab samples and balance that
# df_collisions.SEVERITYCODE.value_counts()

# df_collisions = df_collisions.sample(n=58188, weights='SEVERITYCODE')
# df_collisions.SEVERITYCODE.value_counts()

1    33439
2    24749
Name: SEVERITYCODE, dtype: int64

In [69]:
X = df_collisions.drop('SEVERITYCODE', axis=1)
X = normalize(X)

y = df_collisions[['SEVERITYCODE']]

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [70]:
# ML models

# Logistic regression
lr_model = LogisticRegression(class_weight='SEVERITYCODE')
lr_model.fit(x_train, y_train)
print(lr_model.score(x_test, y_test))

0.7004639972503867
