In [64]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [65]:
# Read dataset
df = pd.read_csv('data.csv')
df.head(10)

Unnamed: 0,Year,Crash_Level,Day,Police Report,Lighting Conditions,Municipality,Collision Type Descriptor,County Name,Road Descriptor,Weather Conditions,Traffic Control Device,Road Surface Conditions,Pedestrian Bicyclist Action,Number of Vehicles Involved
0,2014,1,5,Y,Daylight,WATERTOWN,OVERTAKING,JEFFERSON,Straight and Level,Clear,,Snow/Ice,Not Applicable,2
1,2014,2,5,Y,Daylight,WATERTOWN,REAR END,JEFFERSON,Straight and Level,Cloudy,Traffic Signal,Snow/Ice,Not Applicable,2
2,2014,2,6,Y,Daylight,WATERTOWN,OTHER,JEFFERSON,Straight and Grade,Clear,,Snow/Ice,Not Applicable,1
3,2014,1,5,Y,Daylight,WATERTOWN,RIGHT ANGLE,JEFFERSON,Straight and Level,Clear,Stop Sign,Snow/Ice,Not Applicable,2
4,2014,1,6,Y,Daylight,WATERTOWN,RIGHT ANGLE,JEFFERSON,Straight and Level,Cloudy,,Wet,Not Applicable,2
5,2014,1,6,Y,Dark-Road Unlighted,FRANKFORT,OTHER,HERKIMER,Straight and Level,Cloudy,,Snow/Ice,Not Applicable,1
6,2014,3,5,Y,Daylight,WARSAW,REAR END,WYOMING,Straight and Level,Clear,,Snow/Ice,Not Applicable,2
7,2014,1,5,Y,Daylight,ROTTERDAM,OTHER,SCHENECTADY,Curve and Level,Snow,,Snow/Ice,Not Applicable,1
8,2014,1,5,Y,Daylight,EAST AURORA,RIGHT ANGLE,ERIE,Straight and Level,Clear,,Dry,Not Applicable,2
9,2014,2,5,Y,Daylight,AURORA,OTHER,ERIE,Curve and Grade,Cloudy,,Snow/Ice,Not Applicable,1


In [66]:
# Convert data into numeric data
df['Day'] = df['Day'].replace(['Tuesday'],2)
df['Police Report'] = df['Police Report'].replace(['Y','N'],[1,0])
df['Lighting Conditions'] = df['Lighting Conditions'].replace(['Daylight','Dark-Road Lighted', 'Dark-Road Unlighted', 'Unknown', 'Dusk', 'Dawn'],[1,2,3,4,5,6])
df['Collision Type Descriptor'] = df['Collision Type Descriptor'].replace(['OTHER','REAR END', 'RIGHT ANGLE', 'OVERTAKING', 'LEFT TURN (3)', 'SIDESWIPE', 'LEFT TURN (0)', 'HEAD ON','RIGHT TURN (6)', 'RIGHT TURN (5)', 'Unknown'],[1,2,3,4,5,6,5,2,3,3,1])
df['Road Descriptor'] = df['Road Descriptor'].replace(['Straight and Level','Straight and Grade', 'Unknown', 'Curve and Level', 'Curve and Grade', 'Straight at Hill Crest', 'Curve at Hill Crest'],[1,2,3,4,5,6,7])
df['Weather Conditions'] = df['Weather Conditions'].replace(['Clear','Cloudy', 'Rain', 'Unknown', 'Snow', 'Sleet/Hail/Freezing Rain', 'Fog/Smog/Smoke', 'Other*'],[1,2,3,4,5,6,7,8])
df['Traffic Control Device'] = df['Traffic Control Device'].replace(['None','Traffic Signal', 'Stop Sign', 'No Passing Zone', 'Unknown', 'Other', 'Yield Sign', 'Construction Work Area', 'Flashing Light', 'Not Applicable', 'RR Crossing Gates', 'Police/Fire Emergency', 'Stopped School Bus-Red Lights Flashing', 'Officer/Guard', 'Maintenance Work Area', 'School Zone', 'RR Crossing Sign', 'Utility Work Area', 'RR Crossing Flashing Light'],[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19])
df['Road Surface Conditions'] = df['Road Surface Conditions'].replace(['Dry','Wet', 'Snow/Ice', 'Unknown', 'Slush', 'Other', 'Muddy', 'Flooded Water'],[1,2,3,4,5,6,7,8])
df['Pedestrian Bicyclist Action'] = df['Pedestrian Bicyclist Action'].replace(['Not Applicable','Crossing, With Signal', 'Crossing, No Signal or Crosswalk', 'Unknown', 'Riding/Walking/Skating Along Highway With Traffic', 'Crossing, Against Signal', 'Crossing, No Signal, Marked Crosswalk', 'Other Actions in Roadway', 'Not in Roadway (Indicate)', 'Riding/Walking/Skating Along Highway Against Traffic', 'Emerging from in Front of/Behind Parked Vehicle', 'Working in Roadway', 'Getting On/Off Vehicle Other than School Bus', 'Playing in Roadway', 'Going to/From Stopped School Bus', 'Pushing/Working On Car'],[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])


In [67]:
df.shape

(895916, 14)

In [68]:
# Change datatype
df['Police Report'].astype('int')
df['Lighting Conditions'].astype('int')
df['Collision Type Descriptor'].astype('int')
df['Road Descriptor'].astype('int')
df['Weather Conditions'].astype('int')
df['Traffic Control Device'].astype('int')
df['Road Surface Conditions'].astype('int')
df['Pedestrian Bicyclist Action'].astype('int')
df['Day'].astype('int')

0         5
1         5
2         6
3         5
4         6
         ..
895911    4
895912    4
895913    4
895914    4
895915    4
Name: Day, Length: 895916, dtype: int64

In [70]:
# Correlation matrix
df.corr()

Unnamed: 0,Year,Crash_Level,Police Report,Lighting Conditions,Collision Type Descriptor,Road Descriptor,Weather Conditions,Traffic Control Device,Road Surface Conditions,Pedestrian Bicyclist Action,Number of Vehicles Involved
Year,1.0,0.013685,0.014479,-0.00448,0.01791,-0.014585,-0.046525,-0.007106,-0.046542,-0.00308,0.014458
Crash_Level,0.013685,1.0,0.10814,-0.060486,-0.101349,-0.07968,-0.094148,0.002071,-0.075178,0.353584,0.01627
Police Report,0.014479,0.10814,1.0,-0.366653,0.082961,-0.200672,-0.296449,-0.362544,-0.465954,0.069167,0.01074
Lighting Conditions,-0.00448,-0.060486,-0.366653,1.0,-0.147141,0.192834,0.29651,0.215992,0.378083,-0.036533,-0.159176
Collision Type Descriptor,0.01791,-0.101349,0.082961,-0.147141,1.0,-0.118386,-0.080794,-0.015534,-0.087166,-0.175503,0.284372
Road Descriptor,-0.014585,-0.07968,-0.200672,0.192834,-0.118386,1.0,0.237201,0.18551,0.288999,-0.075413,-0.161896
Weather Conditions,-0.046525,-0.094148,-0.296449,0.29651,-0.080794,0.237201,1.0,0.205694,0.730953,-0.070247,-0.088797
Traffic Control Device,-0.007106,0.002071,-0.362544,0.215992,-0.015534,0.18551,0.205694,1.0,0.309549,-0.030327,0.014133
Road Surface Conditions,-0.046542,-0.075178,-0.465954,0.378083,-0.087166,0.288999,0.730953,0.309549,1.0,-0.067855,-0.069019
Pedestrian Bicyclist Action,-0.00308,0.353584,0.069167,-0.036533,-0.175503,-0.075413,-0.070247,-0.030327,-0.067855,1.0,-0.240978


In [71]:
# Accordings to the matrix remove highly correlated features
X = df[['Year' ,'Crash_Level', 'Police Report', 'Road Descriptor' ,'Road Surface Conditions' , 'Number of Vehicles Involved', 'Day']]
Y = df['Collision Type Descriptor']

In [None]:
#Logistic regression Model
model = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
#Make class Predictions with the model
predictions = model.predict_proba(X_test)
print('%s' % predictions[0])