# 

# Data Science Capstone Project

<p>In this project we will analyze data about car accidents and try to predict the severity of a new collision, i.e. whether it is fatal or not.</p>

## Importing libraries

In [1]:
# Pandas to handle dataset
import pandas as pd 

# Numpy to handle and operate through arrays
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Removing warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


In [3]:
# Read the collisions dataset from the remote CSV file into a DataFrame
DATA_URL = "https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv"
df_collisions = pd.read_csv(DATA_URL)

 ## Data understanding

### First, we need to remove useless columns to make our DataFrame smaller

In [4]:
# These columns are used only for identification or location, so they don't have any impact on our predictions.
# For more info about these columns, check the Metadata.PDF
useless_columns = [
    'OBJECTID', 'X', 'Y', 'REPORTNO', 'STATUS', 'INCDTTM',
    'INCKEY', 'COLDETKEY', 'INTKEY', 'LOCATION',
    'EXCEPTRSNCODE', 'EXCEPTRSNDESC',
    'SEVERITYCODE.1', 'SEVERITYDESC',
    'INCDATE', 'JUNCTIONTYPE',
    'SDOT_COLCODE', 'SDOT_COLDESC', 'SDOTCOLNUM',
    'ST_COLCODE', 'ST_COLDESC',
    'SEGLANEKEY', 'CROSSWALKKEY',
]

# Keep only the remaining columns (the result is bound back to the same name)
df_collisions = df_collisions.drop(columns=useless_columns)

# Show which columns are left
print(df_collisions.columns)

Index(['SEVERITYCODE', 'ADDRTYPE', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INATTENTIONIND', 'UNDERINFL', 'WEATHER',
       'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SPEEDING', 'HITPARKEDCAR'],
      dtype='object')


Let's check the first five rows in the dataframe and check the type of each column

In [5]:
df_collisions.head()

Unnamed: 0,SEVERITYCODE,ADDRTYPE,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INATTENTIONIND,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SPEEDING,HITPARKEDCAR
0,2,Intersection,Angles,2,0,0,2,,N,Overcast,Wet,Daylight,,,N
1,1,Block,Sideswipe,2,0,0,2,,0,Raining,Wet,Dark - Street Lights On,,,N
2,1,Block,Parked Car,4,0,0,3,,0,Overcast,Dry,Daylight,,,N
3,1,Block,Other,3,0,0,3,,N,Clear,Dry,Daylight,,,N
4,2,Intersection,Angles,2,0,0,2,,0,Raining,Wet,Daylight,,,N


In [6]:
df_collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194673 entries, 0 to 194672
Data columns (total 15 columns):
SEVERITYCODE      194673 non-null int64
ADDRTYPE          192747 non-null object
COLLISIONTYPE     189769 non-null object
PERSONCOUNT       194673 non-null int64
PEDCOUNT          194673 non-null int64
PEDCYLCOUNT       194673 non-null int64
VEHCOUNT          194673 non-null int64
INATTENTIONIND    29805 non-null object
UNDERINFL         189789 non-null object
WEATHER           189592 non-null object
ROADCOND          189661 non-null object
LIGHTCOND         189503 non-null object
PEDROWNOTGRNT     4667 non-null object
SPEEDING          9333 non-null object
HITPARKEDCAR      194673 non-null object
dtypes: int64(5), object(10)
memory usage: 22.3+ MB


### Missing values

Now, let's check our dataframe for missing values and decide how we'll handle them

In [7]:
df_collisions.isnull().sum()

SEVERITYCODE           0
ADDRTYPE            1926
COLLISIONTYPE       4904
PERSONCOUNT            0
PEDCOUNT               0
PEDCYLCOUNT            0
VEHCOUNT               0
INATTENTIONIND    164868
UNDERINFL           4884
WEATHER             5081
ROADCOND            5012
LIGHTCOND           5170
PEDROWNOTGRNT     190006
SPEEDING          185340
HITPARKEDCAR           0
dtype: int64

We can drop columns with 50% or more missing values

In [8]:
# Drop every column in which 50% or more of the rows are missing.
# `thresh` is the minimum number of NON-missing values a column must have to be kept,
# so "strictly more than half of the rows" is computed from the current row count
# instead of being hard-coded for one particular dataset size.
print('Inital total columns: ' + str(df_collisions.shape[1]))

min_non_null = len(df_collisions) // 2 + 1
df_collisions = df_collisions.dropna(axis=1, thresh=min_non_null)

print('Total columns after dropping: ' + str(df_collisions.shape[1]))

Inital total columns: 15
Total columns after dropping: 12


<p>Since the remaining columns with missing values are categorical, we can fill those missing values with the most frequent value (the mode) of each column</p>

In [9]:
# Columns that still report missing values after the sparse columns were dropped
cols_with_missing_values = [
    'ADDRTYPE',
    'COLLISIONTYPE',
    'UNDERINFL',
    'WEATHER',
    'ROADCOND',
    'LIGHTCOND',
]

# Preview the first five rows of just those columns
df_collisions.loc[:, cols_with_missing_values].head()

Unnamed: 0,ADDRTYPE,COLLISIONTYPE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND
0,Intersection,Angles,N,Overcast,Wet,Daylight
1,Block,Sideswipe,0,Raining,Wet,Dark - Street Lights On
2,Block,Parked Car,0,Overcast,Dry,Daylight
3,Block,Other,N,Clear,Dry,Daylight
4,Intersection,Angles,0,Raining,Wet,Daylight


We can see that the UNDERINFL column mixes two encodings ('0'/'1' and 'N'/'Y'), so we need to handle it individually. We map '0' and 'N' to 'No', and '1' and 'Y' to 'Yes'.

In [10]:
# Normalise the two encodings used in UNDERINFL ('0'/'1' and 'N'/'Y') to 'No'/'Yes'.
# Series.map turns every value that is not a key of the dict into NaN, so 'No' and 'Yes'
# are mapped to themselves: re-running this cell then leaves already-converted values
# intact instead of wiping the whole column.
underinfl_labels = {
    '0': 'No', 'N': 'No', 'No': 'No',
    '1': 'Yes', 'Y': 'Yes', 'Yes': 'Yes',
}
df_collisions['UNDERINFL'] = df_collisions['UNDERINFL'].map(underinfl_labels)

Now let's fill those missing values with the most frequent value of each column

In [11]:
# Fill the gaps in each categorical column with that column's most frequent value (its mode).
# The filled Series is assigned back to the frame explicitly: calling
# fillna(..., inplace=True) on a `df_collisions[col]` selection is chained assignment,
# which does not update the frame once pandas Copy-on-Write is active (and the
# corresponding warning is silenced by the warnings filter at the top of this notebook).
for col in cols_with_missing_values:
    most_frequent = df_collisions[col].mode()[0]
    df_collisions[col] = df_collisions[col].fillna(most_frequent)

## Data Wrangling

#### Now, let's use one-hot encoding (via get_dummies) to handle categorical values

In [12]:
df_collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194673 entries, 0 to 194672
Data columns (total 12 columns):
SEVERITYCODE     194673 non-null int64
ADDRTYPE         194673 non-null object
COLLISIONTYPE    194673 non-null object
PERSONCOUNT      194673 non-null int64
PEDCOUNT         194673 non-null int64
PEDCYLCOUNT      194673 non-null int64
VEHCOUNT         194673 non-null int64
UNDERINFL        194673 non-null object
WEATHER          194673 non-null object
ROADCOND         194673 non-null object
LIGHTCOND        194673 non-null object
HITPARKEDCAR     194673 non-null object
dtypes: int64(5), object(7)
memory usage: 17.8+ MB


#####  One-hot Encoding

In [13]:
#ADDRTYPE
df_collisions.ADDRTYPE.value_counts()

Block           128852
Intersection     65070
Alley              751
Name: ADDRTYPE, dtype: int64

In [14]:
#COLLISIONTYPE
df_collisions.COLLISIONTYPE.value_counts()

Parked Car    52891
Angles        34674
Rear Ended    34090
Other         23703
Sideswipe     18609
Left Turn     13703
Pedestrian     6608
Cycles         5415
Right Turn     2956
Head On        2024
Name: COLLISIONTYPE, dtype: int64

In [15]:
#UNDERINFL
df_collisions.UNDERINFL.value_counts()

No     185552
Yes      9121
Name: UNDERINFL, dtype: int64

In [16]:
#HITPARKEDCAR
df_collisions.HITPARKEDCAR.value_counts()

N    187457
Y      7216
Name: HITPARKEDCAR, dtype: int64

Since there is no natural ordering among the values of these columns, we can use get_dummies on the columns above

In [17]:
# One-hot encode the unordered categorical columns. Every dummy column is named
# '<lower-cased source column>_<category>', and the first category of each source
# column is dropped (drop_first=True).
object_columns = ['ADDRTYPE', 'COLLISIONTYPE', 'UNDERINFL', 'HITPARKEDCAR']
dummy_prefixes = [name.lower() for name in object_columns]

df_collisions = pd.get_dummies(
    df_collisions,
    columns=object_columns,
    prefix=dummy_prefixes,
    prefix_sep='_',
    drop_first=True,
)

##### Label encoding

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
#WEATHER
df_collisions.WEATHER.value_counts()

Clear                       116216
Raining                      33145
Overcast                     27714
Unknown                      15091
Snowing                        907
Other                          832
Fog/Smog/Smoke                 569
Sleet/Hail/Freezing Rain       113
Blowing Sand/Dirt               56
Severe Crosswind                25
Partly Cloudy                    5
Name: WEATHER, dtype: int64

In [20]:
#ROADCOND
df_collisions.ROADCOND.value_counts()

Dry               129522
Wet                47474
Unknown            15078
Ice                 1209
Snow/Slush          1004
Other                132
Standing Water       115
Sand/Mud/Dirt         75
Oil                   64
Name: ROADCOND, dtype: int64

In [21]:
#LIGHTCOND
df_collisions.LIGHTCOND.value_counts()

Daylight                    121307
Dark - Street Lights On      48507
Unknown                      13473
Dusk                          5902
Dawn                          2502
Dark - No Street Lights       1537
Dark - Street Lights Off      1199
Other                          235
Dark - Unknown Lighting         11
Name: LIGHTCOND, dtype: int64

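We treat the values in these columns as having some ordering, so let's label encode these 3 columns (note that `LabelEncoder` assigns the integer codes in alphabetical order of the labels):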

In [22]:
# Integer-encode the three condition columns, keeping a separate fitted encoder
# for each column under its own name.
le_weather, le_roadcond, le_lightcond = LabelEncoder(), LabelEncoder(), LabelEncoder()

for column, encoder in (
    ('WEATHER', le_weather),
    ('ROADCOND', le_roadcond),
    ('LIGHTCOND', le_lightcond),
):
    # fit_transform learns the labels of the column and returns their integer codes
    df_collisions[column] = encoder.fit_transform(df_collisions[column])

In [23]:
print(df_collisions.shape)

(194673, 21)


## Exploratory Data Analysis

All our columns have discrete values

In [24]:
df_collisions.corr()

Unnamed: 0,SEVERITYCODE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,WEATHER,ROADCOND,LIGHTCOND,addrtype_Block,addrtype_Intersection,...,collisiontype_Head On,collisiontype_Left Turn,collisiontype_Other,collisiontype_Parked Car,collisiontype_Pedestrian,collisiontype_Rear Ended,collisiontype_Right Turn,collisiontype_Sideswipe,underinfl_Yes,hitparkedcar_Y
SEVERITYCODE,1.0,0.130949,0.246338,0.214218,-0.054686,-0.105236,-0.044988,-0.061834,-0.195147,0.199089,...,0.02954,0.057692,-0.033384,-0.305388,0.245441,0.132313,-0.025194,-0.116638,0.044377,-0.101498
PERSONCOUNT,0.130949,1.0,-0.023464,-0.038809,0.380523,-0.055734,-0.023762,-0.042779,-0.063657,0.067153,...,0.021879,0.053115,-0.175667,-0.11033,-0.037342,0.139386,0.025008,0.042726,0.02242,-0.052766
PEDCOUNT,0.246338,-0.023464,1.0,-0.01692,-0.261285,-0.009703,0.007871,-0.042293,-0.145026,0.144936,...,-0.019212,-0.049346,-0.064159,-0.110281,0.946507,-0.084517,-0.022214,-0.059348,0.016592,-0.035264
PEDCYLCOUNT,0.214218,-0.038809,-0.01692,1.0,-0.253773,-0.051237,-0.047254,0.009893,-0.084272,0.085377,...,-0.01708,-0.044987,-0.062488,-0.102063,-0.031111,-0.077572,-0.020305,-0.054613,-0.016401,-0.030187
VEHCOUNT,-0.054686,0.380523,-0.261285,-0.253773,1.0,0.048249,0.040352,0.003758,0.076566,-0.073548,...,0.033492,0.05717,-0.280103,0.025913,-0.269287,0.220039,0.020448,0.076444,0.006609,0.048845
WEATHER,-0.105236,-0.055734,-0.009703,-0.051237,0.048249,1.0,0.752051,0.208585,0.089902,-0.091031,...,0.003898,-0.041033,-0.005886,0.167238,-0.008035,-0.039719,-0.021232,-0.044433,-0.03897,0.059543
ROADCOND,-0.044988,-0.023762,0.007871,-0.047254,0.040352,0.752051,1.0,0.02263,0.031271,-0.03123,...,0.015629,-0.013652,0.02991,0.03197,0.010298,-0.000423,-0.02021,-0.035632,-0.008955,0.018229
LIGHTCOND,-0.061834,-0.042779,-0.042293,0.009893,0.003758,0.208585,0.02263,1.0,0.054678,-0.054516,...,-0.021904,-0.032242,-0.119706,0.138715,-0.0426,0.002335,0.002952,0.002806,-0.218037,0.038753
addrtype_Block,-0.195147,-0.063657,-0.145026,-0.084272,0.076566,0.089902,0.031271,0.054678,1.0,-0.991394,...,0.025621,-0.293826,0.131802,0.359542,-0.150375,0.20914,-0.063094,0.087779,0.04141,0.129997
addrtype_Intersection,0.199089,0.067153,0.144936,0.085377,-0.073548,-0.091031,-0.03123,-0.054516,-0.991394,1.0,...,-0.025392,0.29691,-0.138722,-0.363621,0.150109,-0.20628,0.064288,-0.085962,-0.041827,-0.128877


In [25]:
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression
#from sklearn.preprocessing import normalize

In [26]:
# Dataset is imbalanced, so lets grab samples and balance that using undersampling technique
#df_collisions.SEVERITYCODE.value_counts()

#df_shuffled = df_collisions.sample(frac=1)

#df_severitycode_2 = df_shuffled.loc[df_shuffled['SEVERITYCODE'] == 2]
#df_severitycode_1 = df_shuffled.loc[df_shuffled['SEVERITYCODE'] == 1].sample(n=58188)

#df_collisions = pd.concat([df_severitycode_1, df_severitycode_2])
#df_collisions.SEVERITYCODE.value_counts()

In [27]:
#X = df_collisions.drop('SEVERITYCODE', axis=1)

#X = normalize(X)

#y = df_collisions[['SEVERITYCODE']]

#x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

### Modeling

In [28]:
# Logistic regression
#lr_model = LogisticRegression()
#lr_model.fit(x_train, y_train)

In [29]:
# Knearest-neighbors (KNN)
#scores = np.zeros(21)
#for k in range(1, 21):
#    knn_model = KNeighborsClassifier(n_neighbors=k)
#    knn_model.fit(x_train, y_train)
#    scores[k-1] = knn_model.score(x_test, y_test)
#best_knn_score = scores.max()
#best_k = scores.argmax()+1

### Evaluation

In [30]:
#print("Logistic regression score: " + str(lr_model.score(x_test, y_test)))
#print("KNN score: " + str(best_knn_score) + " with K=" + str(best_k)) 