In [49]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython.core.display import display, HTML
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.filterwarnings('ignore')

# Code from Web, increase width of Jupyter Notebook

In [50]:
display(HTML("<style>.container { width:90% !important; }</style>"))

In [51]:
#read in explored/modified csv from previous workbook
df = pd.read_csv('../data/interim/ukexplored.csv', index_col=0, low_memory=False)

In [52]:
df.head(2)#just checking to make sure it looks ok

Unnamed: 0,Accident_Index,1st_Road_Class,Accident_Severity,Day_of_Week,Junction_Control,Junction_Detail,Light_Conditions,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Road_Surface_Conditions,...,Junction_Location,Propulsion_Code,Sex_of_Driver,Towing_and_Articulation,Vehicle_Leaving_Carriageway,Vehicle_Manoeuvre,Vehicle_Reference,Vehicle_Type,Was_Vehicle_Left_Hand_Drive,Datetime
0,200501BS00002,B,Slight,Wednesday,Auto traffic signal,Crossroads,Darkness - lights lit,0.0,5.0,Dry,...,Leaving roundabout,Heavy oil,Male,No tow/articulation,Did not leave carriageway,Slowing or stopping,1,Bus or coach (17 or more pass seats),No,2005-01-05 17:36:00
1,200501BS00003,C,Slight,Thursday,Data missing or out of range,Not at junction or within 20 metres,Darkness - lights lit,0.0,0.0,Dry,...,Not at or within 20 metres of junction,Heavy oil,Male,No tow/articulation,Did not leave carriageway,Going ahead right-hand bend,1,Bus or coach (17 or more pass seats),No,2005-01-06 00:15:00


# Actions/Create Month and Hour of Day Variables

In [53]:


df['Datetime'] = pd.to_datetime(df['Datetime']) #convert to datetime
df['Hour_of_Accident'] = df['Datetime'].dt.hour #create new feature - Hour of day of accident(0-23)
df['Month_of_Accident'] = df['Datetime'].dt.month #create new feature - Month of Accident (1-12)
df['Hour_of_Accident'] = df['Hour_of_Accident'].astype(object) #make categorical
df['Month_of_Accident'] = df['Month_of_Accident'] .astype(object) #make categorical 
df.drop('Datetime', axis=1, inplace=True) #drop Datetime after extracting Month and Hour


In [54]:
df.head(4).T #transpose and look at all variables in df

Unnamed: 0,0,1,2,3
Accident_Index,200501BS00002,200501BS00003,200501BS00004,200501BS00005
1st_Road_Class,B,C,A,Unclassified
Accident_Severity,Slight,Slight,Slight,Slight
Day_of_Week,Wednesday,Thursday,Friday,Monday
Junction_Control,Auto traffic signal,Data missing or out of range,Data missing or out of range,Data missing or out of range
Junction_Detail,Crossroads,Not at junction or within 20 metres,Not at junction or within 20 metres,Not at junction or within 20 metres
Light_Conditions,Darkness - lights lit,Darkness - lights lit,Daylight,Darkness - lighting unknown
Pedestrian_Crossing-Human_Control,0,0,0,0
Pedestrian_Crossing-Physical_Facilities,5,0,0,0
Road_Surface_Conditions,Dry,Dry,Dry,Wet or damp


# Split into Explanatory 'X' and Target 'y'

In [55]:
y = df['Accident_Severity']
X = df.drop(columns = ['Accident_Severity'])

In [56]:
X.head(3).T #verify correct columns are in X

Unnamed: 0,0,1,2
Accident_Index,200501BS00002,200501BS00003,200501BS00004
1st_Road_Class,B,C,A
Day_of_Week,Wednesday,Thursday,Friday
Junction_Control,Auto traffic signal,Data missing or out of range,Data missing or out of range
Junction_Detail,Crossroads,Not at junction or within 20 metres,Not at junction or within 20 metres
Light_Conditions,Darkness - lights lit,Darkness - lights lit,Daylight
Pedestrian_Crossing-Human_Control,0,0,0
Pedestrian_Crossing-Physical_Facilities,5,0,0
Road_Surface_Conditions,Dry,Dry,Dry
Road_Type,Dual carriageway,Single carriageway,Single carriageway


In [57]:
#Select categorical columns
cats = X.select_dtypes(include=['object']).columns.to_list()#Select categorical columns
cats

['Accident_Index',
 '1st_Road_Class',
 'Day_of_Week',
 'Junction_Control',
 'Junction_Detail',
 'Light_Conditions',
 'Road_Surface_Conditions',
 'Road_Type',
 'Special_Conditions_at_Site',
 'Weather_Conditions',
 'Age_Band_of_Driver',
 'Journey_Purpose_of_Driver',
 'Junction_Location',
 'Propulsion_Code',
 'Sex_of_Driver',
 'Towing_and_Articulation',
 'Vehicle_Leaving_Carriageway',
 'Vehicle_Manoeuvre',
 'Vehicle_Type',
 'Was_Vehicle_Left_Hand_Drive',
 'Hour_of_Accident',
 'Month_of_Accident']

In [58]:
#setting up a Fresh index col

totrow = X.shape[0] # get total rows
ind= np.arange(1,totrow+1) #make array length of df that starts at 1
len(ind) == totrow #check to make sure these match
X['newindex'] = ind #create colums 'new index'
X = X.set_index(ind)#set new index to ind variable above
X.head(3).T #transpost to ame sure it looks correct

Unnamed: 0,1,2,3
Accident_Index,200501BS00002,200501BS00003,200501BS00004
1st_Road_Class,B,C,A
Day_of_Week,Wednesday,Thursday,Friday
Junction_Control,Auto traffic signal,Data missing or out of range,Data missing or out of range
Junction_Detail,Crossroads,Not at junction or within 20 metres,Not at junction or within 20 metres
Light_Conditions,Darkness - lights lit,Darkness - lights lit,Daylight
Pedestrian_Crossing-Human_Control,0,0,0
Pedestrian_Crossing-Physical_Facilities,5,0,0
Road_Surface_Conditions,Dry,Dry,Dry
Road_Type,Dual carriageway,Single carriageway,Single carriageway


In [59]:
X.index #verify it looks correct

Int64Index([      1,       2,       3,       4,       5,       6,       7,
                  8,       9,      10,
            ...
            2055868, 2055869, 2055870, 2055871, 2055872, 2055873, 2055874,
            2055875, 2055876, 2055877],
           dtype='int64', length=2055877)

In [60]:
start = pd.DataFrame(X.index) # set up new df based on created index above
start = start.set_index(0) #set index to first column of new df
start.head(8) #verify index starts at 1 and continues on

1
2
3
4
5
6
7
8


In [61]:
cats[1:] #list of categoricals - accident index

['1st_Road_Class',
 'Day_of_Week',
 'Junction_Control',
 'Junction_Detail',
 'Light_Conditions',
 'Road_Surface_Conditions',
 'Road_Type',
 'Special_Conditions_at_Site',
 'Weather_Conditions',
 'Age_Band_of_Driver',
 'Journey_Purpose_of_Driver',
 'Junction_Location',
 'Propulsion_Code',
 'Sex_of_Driver',
 'Towing_and_Articulation',
 'Vehicle_Leaving_Carriageway',
 'Vehicle_Manoeuvre',
 'Vehicle_Type',
 'Was_Vehicle_Left_Hand_Drive',
 'Hour_of_Accident',
 'Month_of_Accident']

In [62]:
#Iterate over columns to create dummy features
for i in cats[1:]:
    #print(i)
    interiumdf = pd.get_dummies(X[i], prefix=i)
    start= pd.merge(start, interiumdf, how ='left', left_index=True, right_index=True)
start#all categoricals in one df

Unnamed: 0_level_0,1st_Road_Class_A,1st_Road_Class_A(M),1st_Road_Class_B,1st_Road_Class_C,1st_Road_Class_Motorway,1st_Road_Class_Unclassified,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,...,Month_of_Accident_3,Month_of_Accident_4,Month_of_Accident_5,Month_of_Accident_6,Month_of_Accident_7,Month_of_Accident_8,Month_of_Accident_9,Month_of_Accident_10,Month_of_Accident_11,Month_of_Accident_12
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055873,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2055874,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2055875,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2055876,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [64]:
numcols = X.select_dtypes(include=['int64', 'float']).columns.to_list()#Select numerical columns
numcols

['Pedestrian_Crossing-Human_Control',
 'Pedestrian_Crossing-Physical_Facilities',
 'Speed_limit',
 'Age_of_Vehicle',
 'Engine_Capacity_.CC.',
 'Vehicle_Reference']

In [72]:
Xnum = X[numcols]#df from numerical columns
Xnum = Xnum.set_index(ind) # fix index 
Xnum.index


Int64Index([      1,       2,       3,       4,       5,       6,       7,
                  8,       9,      10,
            ...
            2055868, 2055869, 2055870, 2055871, 2055872, 2055873, 2055874,
            2055875, 2055876, 2055877],
           dtype='int64', length=2055877)

In [81]:
scaler=StandardScaler() #intantiate scaler
Xnum = pd.DataFrame(scaler.fit_transform(Xnum),columns = numcols) #scale - fit_transform the df of numerics and assign column name
X_num = Xnum.set_index(ind)
X_num.index
start.index

Int64Index([      1,       2,       3,       4,       5,       6,       7,
                  8,       9,      10,
            ...
            2055868, 2055869, 2055870, 2055871, 2055872, 2055873, 2055874,
            2055875, 2055876, 2055877],
           dtype='int64', name=0, length=2055877)

In [86]:
X_combined = pd.merge(X_num, start, how='left', left_index=True, right_index=True) #merge dataframes
X_combined

Unnamed: 0,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Speed_limit,Age_of_Vehicle,Engine_Capacity_.CC.,Vehicle_Reference,1st_Road_Class_A,1st_Road_Class_A(M),1st_Road_Class_B,1st_Road_Class_C,...,Month_of_Accident_3,Month_of_Accident_4,Month_of_Accident_5,Month_of_Accident_6,Month_of_Accident_7,Month_of_Accident_8,Month_of_Accident_9,Month_of_Accident_10,Month_of_Accident_11,Month_of_Accident_12
1,-0.070477,2.354227,-0.668060,-0.641979,3.495725,-0.712284,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.070477,-0.400577,-0.668060,-0.232590,3.513504,-0.712284,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,-0.070477,-0.400577,-0.668060,-0.437285,-0.115081,-0.712284,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.070477,-0.400577,-0.668060,0.790881,-1.050701,-0.712284,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,-0.070477,-0.400577,-0.668060,-1.051368,0.555521,-0.712284,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055873,-0.070477,-0.400577,1.385382,1.609659,-0.210087,-0.712284,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2055874,-0.070477,-0.400577,1.385382,-1.051368,-0.210087,0.575053,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2055875,-0.070477,-0.400577,2.069863,-1.051368,-0.210087,-0.712284,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2055876,-0.070477,-0.400577,0.016420,1.404964,-0.100080,-0.712284,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [76]:
Xlist  = Xcombined.columns.to_list() #check list of all columns
Xlist #check to make sure all columsn are in new df


['Pedestrian_Crossing-Human_Control',
 'Pedestrian_Crossing-Physical_Facilities',
 'Speed_limit',
 'Age_of_Vehicle',
 'Engine_Capacity_.CC.',
 'Vehicle_Reference',
 '1st_Road_Class_A',
 '1st_Road_Class_A(M)',
 '1st_Road_Class_B',
 '1st_Road_Class_C',
 '1st_Road_Class_Motorway',
 '1st_Road_Class_Unclassified',
 'Day_of_Week_Friday',
 'Day_of_Week_Monday',
 'Day_of_Week_Saturday',
 'Day_of_Week_Sunday',
 'Day_of_Week_Thursday',
 'Day_of_Week_Tuesday',
 'Day_of_Week_Wednesday',
 'Junction_Control_Authorised person',
 'Junction_Control_Auto traffic signal',
 'Junction_Control_Data missing or out of range',
 'Junction_Control_Give way or uncontrolled',
 'Junction_Control_Not at junction or within 20 metres',
 'Junction_Control_Stop sign',
 'Junction_Detail_Crossroads',
 'Junction_Detail_Data missing or out of range',
 'Junction_Detail_Mini-roundabout',
 'Junction_Detail_More than 4 arms (not roundabout)',
 'Junction_Detail_Not at junction or within 20 metres',
 'Junction_Detail_Other juncti

In [90]:
X = X_combined # set X equal to scaled/encoded/merged df
X

Unnamed: 0,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Speed_limit,Age_of_Vehicle,Engine_Capacity_.CC.,Vehicle_Reference,1st_Road_Class_A,1st_Road_Class_A(M),1st_Road_Class_B,1st_Road_Class_C,...,Month_of_Accident_3,Month_of_Accident_4,Month_of_Accident_5,Month_of_Accident_6,Month_of_Accident_7,Month_of_Accident_8,Month_of_Accident_9,Month_of_Accident_10,Month_of_Accident_11,Month_of_Accident_12
1,-0.070477,2.354227,-0.668060,-0.641979,3.495725,-0.712284,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.070477,-0.400577,-0.668060,-0.232590,3.513504,-0.712284,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,-0.070477,-0.400577,-0.668060,-0.437285,-0.115081,-0.712284,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.070477,-0.400577,-0.668060,0.790881,-1.050701,-0.712284,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,-0.070477,-0.400577,-0.668060,-1.051368,0.555521,-0.712284,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055873,-0.070477,-0.400577,1.385382,1.609659,-0.210087,-0.712284,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2055874,-0.070477,-0.400577,1.385382,-1.051368,-0.210087,0.575053,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2055875,-0.070477,-0.400577,2.069863,-1.051368,-0.210087,-0.712284,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2055876,-0.070477,-0.400577,0.016420,1.404964,-0.100080,-0.712284,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [106]:
y= y.set_index(ind)
y = pd.get_dummies(y) # get dummies for y
y.index

Int64Index([      1,       2,       3,       4,       5,       6,       7,
                  8,       9,      10,
            ...
            2055868, 2055869, 2055870, 2055871, 2055872, 2055873, 2055874,
            2055875, 2055876, 2055877],
           dtype='int64', length=2055877)

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) #train test split our df

In [105]:
y_train.shape #check shape

(1439113, 3)

# Simple Model Functionality Test  - Just Testing

In [107]:
from sklearn.ensemble import RandomForestClassifier #import Random Forest Classifer

In [108]:
clf = RandomForestClassifier(max_depth=5) #just testing instantiate Random Forest

In [109]:
clf.fit(X_train, y_train) #fit classifier

RandomForestClassifier(max_depth=5)

In [112]:
y_pred = clf.predict(X_test) #predict on test set

In [111]:
from sklearn import metrics

In [99]:
metrics.accuracy_score(y_test, y_pred) #evaluate accuracy

0.8575646438508084

In [113]:
X.to_csv('../data/interim/explantorydf.csv') #write df to csv for next step.
y.to_csv('../data/interim/targetdf.csv')

In [114]:
y

Unnamed: 0,Fatal,Serious,Slight
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
5,0,0,1
...,...,...,...
2055873,0,0,1
2055874,0,0,1
2055875,0,0,1
2055876,0,0,1
