# Project 5: Car crash data

## Modeling Notebook
This notebook is for modeling the car crash data.  It assumes that the previous notebooks have been run. 

## Problem Statement:


In [22]:
#imports 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [6]:
#read data
crash = pd.read_csv('./data/crash_data_modified.csv')
crash.head()

Unnamed: 0,Crash Time of Day,Collision Type,Surface Condition,Light,Traffic Control,Driver Substance Abuse,Driver At Fault,Driver Distracted By,Vehicle First Impact Location,Vehicle Second Impact Location,Vehicle Body Type,Vehicle Movement,Speed Limit,Parked Vehicle
0,Evening,SAME DIR REAR END,DRY,DAYLIGHT,STOP SIGN,No,No,No,SIX OCLOCK,SIX OCLOCK,PASSENGER CAR,STOPPED IN TRAFFIC LANE,25,No
1,Early Morning,SAME DIR REAR END,DRY,DAWN,TRAFFIC SIGNAL,No,No,No,SIX OCLOCK,SIX OCLOCK,PASSENGER CAR,STOPPED IN TRAFFIC LANE,40,No
2,Early Morning,SINGLE VEHICLE,DRY,DAYLIGHT,NO CONTROLS,No,No,No,ELEVEN OCLOCK,ELEVEN OCLOCK,POLICE VEHICLE/NON EMERGENCY,MOVING CONSTANT SPEED,35,No
3,Late Night,SINGLE VEHICLE,DRY,DARK LIGHTS ON,NO CONTROLS,No,No,No,TWELVE OCLOCK,TWELVE OCLOCK,POLICE VEHICLE/EMERGENCY,MOVING CONSTANT SPEED,35,No
4,Night,SAME DIR REAR END,DRY,DARK LIGHTS ON,NO CONTROLS,Yes,Yes,Yes,TWELVE OCLOCK,TWELVE OCLOCK,PASSENGER CAR,ACCELERATING,35,No




Index(['Crash Time of Day', 'Collision Type', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse', 'Driver At Fault',
       'Driver Distracted By', 'Vehicle First Impact Location',
       'Vehicle Second Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Parked Vehicle'],
      dtype='object')


In [17]:
columns_to_dummify = ['Crash Time of Day', 'Collision Type', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse',
       'Driver Distracted By', 'Vehicle First Impact Location',
       'Vehicle Second Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Parked Vehicle']

df = pd.get_dummies(crash,columns=columns_to_dummify, drop_first=True)

In [20]:
df['Driver At Fault'] = df['Driver At Fault'].map({'No':0, 'Yes':1})

In [21]:
df.head()

Unnamed: 0,Driver At Fault,Speed Limit,Crash Time of Day_Evening,Crash Time of Day_Late Night,Crash Time of Day_Morning,Crash Time of Day_Night,Crash Time of Day_Noon,Collision Type_ANGLE MEETS LEFT TURN,Collision Type_ANGLE MEETS RIGHT TURN,Collision Type_HEAD ON,...,Vehicle Movement_PARKED,Vehicle Movement_PARKING,Vehicle Movement_PASSING,Vehicle Movement_RIGHT TURN ON RED,Vehicle Movement_SKIDDING,Vehicle Movement_SLOWING OR STOPPING,Vehicle Movement_STARTING FROM LANE,Vehicle Movement_STARTING FROM PARKED,Vehicle Movement_STOPPED IN TRAFFIC LANE,Parked Vehicle_Yes
0,0,25,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,40,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,35,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,35,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,35,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Null Model

In [24]:
df['Driver At Fault'].value_counts(normalize = True)

0    0.556386
1    0.443614
Name: Driver At Fault, dtype: float64

For this data set 55.6% of the crashes were listed as the driver not at fault.  This means that if we assign the driver as never at fault we would have a baseline accuracy of 55.6%.  

## Simple Logistic Regression Model

In [25]:
X = df.drop(columns = 'Driver At Fault')
y = df['Driver At Fault']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=13424)

In [28]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train,y_train)

LogisticRegression(max_iter=1000)

In [29]:
logreg.score(X_train,y_train),logreg.score(X_test,y_test)

(0.8908028640830647, 0.89393217755763)

## seeing if the collision type matters

In [36]:
columns_to_dummify = ['Crash Time of Day', 'Surface Condition', 'Light',
       'Traffic Control', 'Driver Substance Abuse',
       'Driver Distracted By', 'Vehicle First Impact Location',
       'Vehicle Second Impact Location', 'Vehicle Body Type',
       'Vehicle Movement', 'Parked Vehicle']

df2 = pd.get_dummies(crash,columns=columns_to_dummify, drop_first=True).drop(columns = 'Collision Type')

In [37]:
df2['Driver At Fault'] = df2['Driver At Fault'].map({'No':0,'Yes':1})

In [39]:

X2 = df2.drop(columns = ['Driver At Fault'])
y2 = df2['Driver At Fault']

X2_train,X2_test, y2_train,y2_test = train_test_split(X2,y2, stratify=y2, random_state=13424)

In [41]:
logreg2 = LogisticRegression(max_iter=1000)
logreg2.fit(X2_train,y2_train)

LogisticRegression(max_iter=1000)

In [42]:
logreg2.score(X2_train,y2_train), logreg2.score(X2_test,y2_test)

(0.8492704843856669, 0.8515431510763956)

## coefficient interpretation

In [45]:
list(zip(df.columns, logreg.coef_[0]))

[('Driver At Fault', -0.013278099485928174),
 ('Speed Limit', -0.220162298281299),
 ('Crash Time of Day_Evening', -0.12934211306733132),
 ('Crash Time of Day_Late Night', -0.08205132775966555),
 ('Crash Time of Day_Morning', -0.18326635293665372),
 ('Crash Time of Day_Night', -0.13656029780615558),
 ('Crash Time of Day_Noon', 0.015654131943725797),
 ('Collision Type_ANGLE MEETS LEFT TURN', 0.06269348478167906),
 ('Collision Type_ANGLE MEETS RIGHT TURN', 0.6242983208532685),
 ('Collision Type_HEAD ON', -0.051043955167613814),
 ('Collision Type_HEAD ON LEFT TURN', 0.0693405179639358),
 ('Collision Type_OPPOSITE DIR BOTH LEFT TURN', 1.042194619470996),
 ('Collision Type_OPPOSITE DIRECTION SIDESWIPE', -0.5935722104788969),
 ('Collision Type_SAME DIR BOTH LEFT TURN', 2.9250665562398206),
 ('Collision Type_SAME DIR REAR END', 1.442061953321955),
 ('Collision Type_SAME DIR REND LEFT TURN', 1.8572175546048175),
 ('Collision Type_SAME DIR REND RIGHT TURN', -0.0033405897626106306),
 ('Collision 

In [46]:
coefs = {entry[0]:entry[1] for entry in list(zip(df.columns, logreg.coef_[0]))}

In [53]:
max(coefs.values())

4.728813672017089

In [54]:
min(coefs.values())

-3.752051525557381

In [61]:
strong_positive_coefs = {key:coefs[key]  for key in coefs.keys() if coefs[key] >1}
strong_positive_coefs

{'Collision Type_OPPOSITE DIR BOTH LEFT TURN': 1.042194619470996,
 'Collision Type_SAME DIR BOTH LEFT TURN': 2.9250665562398206,
 'Collision Type_SAME DIR REAR END': 1.442061953321955,
 'Collision Type_SAME DIR REND LEFT TURN': 1.8572175546048175,
 'Collision Type_SAME DIRECTION SIDESWIPE': 2.7811370314627206,
 'Traffic Control_YIELD SIGN': 3.085890746344072,
 'Driver Substance Abuse_Yes': 3.95688270823626,
 'Vehicle Body Type_VAN': 4.728813672017089,
 'Vehicle Movement_BACKING': 2.1326830222300277,
 'Vehicle Movement_DRIVERLESS MOVING VEH.': 1.5521166677991596,
 'Vehicle Movement_MAKING RIGHT TURN': 2.0073746388451013,
 'Vehicle Movement_PARKING': 1.6272277975273552,
 'Vehicle Movement_PASSING': 1.1653491443890838}

In [62]:
strong_negative_coefs = {key:coefs[key]  for key in coefs.keys() if coefs[key] < -1}
strong_negative_coefs

{'Vehicle First Impact Location_SEVEN OCLOCK': -2.9571760548578654,
 'Vehicle Second Impact Location_SEVEN OCLOCK': -1.1759664666152532,
 'Vehicle Movement_MAKING U TURN': -1.5906888977028817,
 'Vehicle Movement_NEGOTIATING A CURVE': -1.869539969406618,
 'Vehicle Movement_SKIDDING': -1.11890252937009,
 'Vehicle Movement_STARTING FROM PARKED': -3.752051525557381,
 'Vehicle Movement_STOPPED IN TRAFFIC LANE': -1.869539969406618}

In [63]:
mild_coefs = {key:coefs[key] for key in coefs.keys() if 0.5 < coefs[key]<=1 or -0.5>= coefs[key] > -1 }
mild_coefs

{'Collision Type_ANGLE MEETS RIGHT TURN': 0.6242983208532685,
 'Collision Type_OPPOSITE DIRECTION SIDESWIPE': -0.5935722104788969,
 'Collision Type_SAME DIRECTION RIGHT TURN': 0.7071807325504407,
 'Collision Type_STRAIGHT MOVEMENT ANGLE': 0.6069060189127644,
 'Surface Condition_ICE': -0.7644816202592588,
 'Surface Condition_SLUSH': 0.8577364921729975,
 'Traffic Control_NO CONTROLS': -0.6575032064181138,
 'Vehicle First Impact Location_NINE OCLOCK': 0.5039086481246337,
 'Vehicle First Impact Location_NON-COLLISION': 0.8478351583477418,
 'Vehicle First Impact Location_ONE OCLOCK': -0.6560164759089363,
 'Vehicle First Impact Location_THREE OCLOCK': 0.6927395413140754,
 'Vehicle First Impact Location_TWELVE OCLOCK': 0.6271148980318277,
 'Vehicle First Impact Location_TWO OCLOCK': 0.9496065934851255,
 'Vehicle Second Impact Location_ELEVEN OCLOCK': 0.5322032569753381,
 'Vehicle Second Impact Location_FIVE OCLOCK': 0.8449065030902546,
 'Vehicle Second Impact Location_TEN OCLOCK': 0.563103986