In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Link to dataset: https://drive.google.com/file/d/1bwzKE5z5JEQE5AcQlKg9h3GUqTjkXSTs/view?usp=sharing

In [4]:
import pandas as pd
import sklearn as sk
import numpy as np

In [5]:
path="/content/drive/MyDrive/Accidents/Traffic_Crashes_-_Crashes.csv"
data = pd.read_csv(path)

In [6]:
df= data[['LATITUDE','LONGITUDE','WEATHER_CONDITION','LIGHTING_CONDITION','ROADWAY_SURFACE_COND','ROAD_DEFECT','TRAFFIC_CONTROL_DEVICE','POSTED_SPEED_LIMIT','MOST_SEVERE_INJURY']]

In [7]:
df = df.dropna()

Selecting the environmental attributes from the dataset

For each attribute considered, 
we logically fuse similar values and remove the rows whose values are ambiguous 




In [8]:
# ROAD_DEFECT 
df.ROAD_DEFECT.unique()

array(['NO DEFECTS', 'UNKNOWN', 'RUT, HOLES', 'SHOULDER DEFECT', 'OTHER',
       'WORN SURFACE', 'DEBRIS ON ROADWAY'], dtype=object)

In [9]:
# A rut is a depression or groove worn into a road or path by the travel of wheels
# We fuse 'rut,holes' and 'worn' to reduce number of attributes

df = df[df.ROADWAY_SURFACE_COND !='UNKNOWN']
df = df[df.ROADWAY_SURFACE_COND !='OTHER']
df.loc[df.ROADWAY_SURFACE_COND == 'RUT, HOLES','ROADWAY_SURFACE_COND'] = 'WORN SURFACE'

In [10]:
df.ROADWAY_SURFACE_COND.unique()

array(['DRY', 'WET', 'SNOW OR SLUSH', 'ICE', 'SAND, MUD, DIRT'],
      dtype=object)

In [11]:
# ROADWAY_SURFACE_CONDITION
df = df[df.ROADWAY_SURFACE_COND !='UNKNOWN']
df = df[df.ROADWAY_SURFACE_COND !='OTHER']

In [12]:
# WEATHER_CONDITION
df.WEATHER_CONDITION.unique()

array(['CLEAR', 'RAIN', 'CLOUDY/OVERCAST', 'UNKNOWN', 'SNOW',
       'SLEET/HAIL', 'FOG/SMOKE/HAZE', 'FREEZING RAIN/DRIZZLE', 'OTHER',
       'BLOWING SNOW', 'BLOWING SAND, SOIL, DIRT',
       'SEVERE CROSS WIND GATE'], dtype=object)

In [13]:
# Fuse all 'freezing conditions/cold' attributes
# Fuse all 'obstructing wind' attributes
df.loc[df.WEATHER_CONDITION.isin(['SLEET/HAIL','FREEZING RAIN/DRIZZLE']),'WEATHER_CONDITION'] = 'SNOW'
df.loc[df.WEATHER_CONDITION == 'BLOWING SNOW','WEATHER_CONDITION'] = 'BLOWING SAND, SOIL, DIRT'
df.loc[df.WEATHER_CONDITION == 'BLOWING SAND, SOIL, DIRT','WEATHER_CONDITION'] = 'BLOWING SAND, SOIL, DIRT'
df = df[df.WEATHER_CONDITION!='UNKNOWN']
df = df[df.WEATHER_CONDITION !='OTHER']

In [14]:
# LIGHTING_CONDITION
df.LIGHTING_CONDITION.unique()

array(['DAYLIGHT', 'DARKNESS, LIGHTED ROAD', 'DAWN', 'DARKNESS', 'DUSK',
       'UNKNOWN'], dtype=object)

In [15]:
df = df[df.LIGHTING_CONDITION !='UNKNOWN']
df = df[df.LIGHTING_CONDITION !='OTHER']

In [16]:
# TRAFFIC_CONTROL_DEVICE
df.TRAFFIC_CONTROL_DEVICE.unique()
# Fuse all sign boards
# Fuse all traffic lights and flashing signals into Traffic Light Signal

array(['STOP SIGN/FLASHER', 'TRAFFIC SIGNAL', 'NO CONTROLS',
       'PEDESTRIAN CROSSING SIGN', 'OTHER', 'UNKNOWN', 'YIELD',
       'OTHER REG. SIGN', 'LANE USE MARKING', 'POLICE/FLAGMAN',
       'RAILROAD CROSSING GATE', 'SCHOOL ZONE', 'OTHER RAILROAD CROSSING',
       'NO PASSING', 'RR CROSSING SIGN', 'BICYCLE CROSSING SIGN'],
      dtype=object)

In [17]:
df.loc[df.TRAFFIC_CONTROL_DEVICE.isin(['STOP SIGN/FLASHER','TRAFFIC SIGNAL','FLASHING CONTROL SIGNAL']),'TRAFFIC_CONTROL_DEVICE'] = 'TRAFFIC LIGHT SIGNAL'
df.loc[df.TRAFFIC_CONTROL_DEVICE.isin(['YIELD','OTHER WARNING SIGN','OTHER REG. SIGN','DELINEATORS','RAILROAD CROSSING GATE','PEDESTRIAN CROSSING SIGN','SCHOOL ZONE','BICYCLE CROSSING SIGN','NO PASSING','OTHER RAILROAD CROSSING' ,'RR CROSSING SIGN']),'TRAFFIC_CONTROL_DEVICE'] = 'VISUAL SIGN BOARD'
df = df[df.TRAFFIC_CONTROL_DEVICE!='UNKNOWN']
df = df[df.TRAFFIC_CONTROL_DEVICE!='OTHER']

In [18]:
df.isna().sum()

LATITUDE                  0
LONGITUDE                 0
WEATHER_CONDITION         0
LIGHTING_CONDITION        0
ROADWAY_SURFACE_COND      0
ROAD_DEFECT               0
TRAFFIC_CONTROL_DEVICE    0
POSTED_SPEED_LIMIT        0
MOST_SEVERE_INJURY        0
dtype: int64

In [19]:
df = pd.get_dummies(df, columns=["WEATHER_CONDITION",'LIGHTING_CONDITION','ROADWAY_SURFACE_COND','TRAFFIC_CONTROL_DEVICE','ROAD_DEFECT']) 

In [20]:
df = df.replace({'MOST_SEVERE_INJURY':{'NO INDICATION OF INJURY':1,'NONINCAPACITATING INJURY':2,'INCAPACITATING INJURY':3,'REPORTED, NOT EVIDENT':0,'FATAL':4}})

In [21]:
df.head()

Unnamed: 0,LATITUDE,LONGITUDE,POSTED_SPEED_LIMIT,MOST_SEVERE_INJURY,"WEATHER_CONDITION_BLOWING SAND, SOIL, DIRT",WEATHER_CONDITION_CLEAR,WEATHER_CONDITION_CLOUDY/OVERCAST,WEATHER_CONDITION_FOG/SMOKE/HAZE,WEATHER_CONDITION_RAIN,WEATHER_CONDITION_SEVERE CROSS WIND GATE,WEATHER_CONDITION_SNOW,LIGHTING_CONDITION_DARKNESS,"LIGHTING_CONDITION_DARKNESS, LIGHTED ROAD",LIGHTING_CONDITION_DAWN,LIGHTING_CONDITION_DAYLIGHT,LIGHTING_CONDITION_DUSK,ROADWAY_SURFACE_COND_DRY,ROADWAY_SURFACE_COND_ICE,"ROADWAY_SURFACE_COND_SAND, MUD, DIRT",ROADWAY_SURFACE_COND_SNOW OR SLUSH,ROADWAY_SURFACE_COND_WET,TRAFFIC_CONTROL_DEVICE_LANE USE MARKING,TRAFFIC_CONTROL_DEVICE_NO CONTROLS,TRAFFIC_CONTROL_DEVICE_POLICE/FLAGMAN,TRAFFIC_CONTROL_DEVICE_TRAFFIC LIGHT SIGNAL,TRAFFIC_CONTROL_DEVICE_VISUAL SIGN BOARD,ROAD_DEFECT_DEBRIS ON ROADWAY,ROAD_DEFECT_NO DEFECTS,ROAD_DEFECT_OTHER,"ROAD_DEFECT_RUT, HOLES",ROAD_DEFECT_SHOULDER DEFECT,ROAD_DEFECT_UNKNOWN,ROAD_DEFECT_WORN SURFACE
2,41.741804,-87.740954,35,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
3,41.741804,-87.740954,30,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
4,41.953647,-87.732082,35,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
5,41.958987,-87.933994,35,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
6,41.903825,-87.643286,30,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [22]:
# df.to_csv(r'/content/drive/MyDrive/Accidents/Model_Data.csv',index=False)

In [23]:
df.dropna(inplace=True)

In [24]:
data = df.sample(frac=1).reset_index(drop=True)

In [25]:
data=data.iloc[:,data.columns != "LATITUDE"]
data=data.iloc[:,data.columns != "LONGITUDE"]

In [27]:
data.head()

Unnamed: 0,POSTED_SPEED_LIMIT,MOST_SEVERE_INJURY,"WEATHER_CONDITION_BLOWING SAND, SOIL, DIRT",WEATHER_CONDITION_CLEAR,WEATHER_CONDITION_CLOUDY/OVERCAST,WEATHER_CONDITION_FOG/SMOKE/HAZE,WEATHER_CONDITION_RAIN,WEATHER_CONDITION_SEVERE CROSS WIND GATE,WEATHER_CONDITION_SNOW,LIGHTING_CONDITION_DARKNESS,"LIGHTING_CONDITION_DARKNESS, LIGHTED ROAD",LIGHTING_CONDITION_DAWN,LIGHTING_CONDITION_DAYLIGHT,LIGHTING_CONDITION_DUSK,ROADWAY_SURFACE_COND_DRY,ROADWAY_SURFACE_COND_ICE,"ROADWAY_SURFACE_COND_SAND, MUD, DIRT",ROADWAY_SURFACE_COND_SNOW OR SLUSH,ROADWAY_SURFACE_COND_WET,TRAFFIC_CONTROL_DEVICE_LANE USE MARKING,TRAFFIC_CONTROL_DEVICE_NO CONTROLS,TRAFFIC_CONTROL_DEVICE_POLICE/FLAGMAN,TRAFFIC_CONTROL_DEVICE_TRAFFIC LIGHT SIGNAL,TRAFFIC_CONTROL_DEVICE_VISUAL SIGN BOARD,ROAD_DEFECT_DEBRIS ON ROADWAY,ROAD_DEFECT_NO DEFECTS,ROAD_DEFECT_OTHER,"ROAD_DEFECT_RUT, HOLES",ROAD_DEFECT_SHOULDER DEFECT,ROAD_DEFECT_UNKNOWN,ROAD_DEFECT_WORN SURFACE
0,25,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
2,45,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
3,30,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
4,30,2,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0


In [28]:
n = len(data.columns)
X = data.iloc[:, data.columns != 'MOST_SEVERE_INJURY']
#Last Column 'MOST_SEVERE_INJURY' is output
# Y = data.iloc[:,-1:]
Y = data.iloc[:, data.columns == 'MOST_SEVERE_INJURY']


**Splitting train,test data at a 80:20 ratio**

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=0)

In [30]:
len(x_train)

318044

**Feature Scaling**

In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [32]:
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics

**Classification Models** 

**Decision Tree**

In [33]:
from sklearn import tree

DT = tree.DecisionTreeClassifier()
DT.fit(x_train,y_train.values.ravel())
predictions = DT.predict(x_test)

tree_confMat=metrics.confusion_matrix(y_test, predictions)
print(tree_confMat)

MAE_DT = metrics.mean_absolute_error(y_test, predictions)
acc_DT = metrics.accuracy_score(y_test, predictions)
MSE_DT = metrics.mean_squared_error(y_test, predictions)

[[    7  3403     3     1     0]
 [   38 68614    30    12     1]
 [    5  6005     2     1     0]
 [    2  1295     1     0     0]
 [    0    91     0     0     0]]


In [34]:
print(MSE_DT)
print(acc_DT)
print(MAE_DT)

0.19611122989271926
0.8630629724189106
0.15586522619511767


**Multilayer Perceptron Neural Network**

In [35]:
from sklearn.neural_network import MLPClassifier


NN = MLPClassifier(hidden_layer_sizes=(10,),solver='sgd', alpha=0.0001)
NN.fit(x_train, y_train.values.ravel())
predictions = NN.predict(x_test)
round(NN.score(x_train, y_train.values.ravel()), 4)

nn_confMat=metrics.confusion_matrix(y_test, predictions)
print(nn_confMat)

MAE_NN = metrics.mean_absolute_error(y_test, predictions)
MSE_NN = metrics.mean_squared_error(y_test, predictions)
acc_NN = metrics.accuracy_score(y_test, predictions)

[[    0  3414     0     0     0]
 [    0 68695     0     0     0]
 [    0  6013     0     0     0]
 [    0  1298     0     0     0]
 [    0    91     0     0     0]]


In [36]:
print(MAE_NN)
print(MSE_NN)
print(acc_NN)

0.15464526920803412
0.19416181408861666
0.8639685075021066


**Regression Models**

**Linear Regression**

In [38]:
!pip install mord

Collecting mord
  Downloading https://files.pythonhosted.org/packages/67/9d/c791c841501d9ff4ecb76b57f208dec6cf9f925109c59c995ddec80f9b32/mord-0.6.tar.gz
Building wheels for collected packages: mord
  Building wheel for mord (setup.py) ... [?25l[?25hdone
  Created wheel for mord: filename=mord-0.6-cp36-none-any.whl size=6007 sha256=9cb00cf26d8ed3419d964cdd9329ef5a25d647ddb2845e4a88c4df31511304c0
  Stored in directory: /root/.cache/pip/wheels/98/14/b2/244c2cec93a0c6edb29b488bd6b2710ded7e9d457033b86366
Successfully built mord
Installing collected packages: mord
Successfully installed mord-0.6


In [39]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from mord import LogisticAT

# instantiate models
model_linear = LinearRegression()

In [40]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.metrics import make_scorer
import numpy as np

# divide data into features matrix and target vector
features = data.iloc[:, data.columns != 'MOST_SEVERE_INJURY']
target = data.iloc[:, data.columns == 'MOST_SEVERE_INJURY']

MAE = make_scorer(mean_absolute_error)
MSE = make_scorer(mean_squared_error)

folds = 5

print('Mean absolute error:' )
MAE_linear = cross_val_score(model_linear,
    features,
    target,
    cv=folds,
    scoring=MAE)
print('Linear regression: ', np.mean(MAE_linear))

print('Mean absolute error:' )
MSE_linear = cross_val_score(model_linear,
    features,
    target,
    cv=folds,
    scoring=MSE)
print('Linear regression: ', np.mean(MSE_linear))

Mean absolute error:
Linear regression:  0.20811863412568438
Mean absolute error:
Linear regression:  0.1873538408273549


In [41]:
from sklearn.metrics import accuracy_score

def acc_fun(target_true, target_fit):
    target_fit = np.round(target_fit)
    target_fit.astype('int')
    return accuracy_score(target_true, target_fit)

acc = make_scorer(acc_fun)
folds = 5

print('Accuracy:' )
acc_linear = cross_val_score(model_linear,
    features,
    target,
    cv=folds,
    scoring=acc)
print('Linear regression: ', np.mean(acc_linear))

Accuracy:
Linear regression:  0.8662373759605589


In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from mord import LogisticAT

# instantiate models
model_linear = LinearRegression()

MAE_linear = cross_val_score(model_linear,features,target, cv=folds, scoring=MAE)
acc_linear = cross_val_score(model_linear, features, target, cv=folds, scoring=acc)
MSE_linear = cross_val_score(model_linear, features, target, cv=folds, scoring=MSE)

**Ordinal Regression**

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from mord import LogisticAT

In [None]:
model_ordinal = LogisticAT() 
model_ordinal.fit(x_train,y_train.values.ravel())
predictions = model_ordinal.predict(x_test)
acc_RF = metrics.accuracy_score(y_test, predictions)
MAE_RF = metrics.mean_absolute_error(y_test, predictions)
MSE_RF = metrics.mean_squared_error(y_test, predictions)

In [None]:
print(acc_RF)
print(MAE_RF)
print(MSE_RF)