In [22]:
import numpy as np
import sklearn as sk
import pandas as pd
import random

pd.set_option('display.float_format', lambda x: '%.3f' % x)

# For speed during testing load only small percentage of the data
p = 0.01  # 1% of the lines
skip_rows = lambda i: i > 0 and random.random() > p
accidents_df = pd.read_csv("./data/Accident_Information.csv",
                           skiprows=skip_rows)
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 3169: invalid start byte
vehicle_df = pd.read_csv("./data/Vehicle_Information.csv", encoding="latin1")

In [23]:
df = pd.merge(accidents_df, vehicle_df, on="Accident_Index", how="left")
del vehicle_df
del accidents_df
df

Unnamed: 0,Accident_Index,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,...,Skidding_and_Overturning,Towing_and_Articulation,Vehicle_Leaving_Carriageway,Vehicle_Location.Restricted_Lane,Vehicle_Manoeuvre,Vehicle_Reference,Vehicle_Type,Was_Vehicle_Left_Hand_Drive,X1st_Point_of_Impact,Year_y
0,200501BS70153,A,4,A,3220.000,Slight,,2005-04-13,Wednesday,1.000,...,,,,,,,,,,
1,200501BS70343,A,3216,,0.000,Slight,,2005-07-09,Saturday,1.000,...,,,,,,,,,,
2,200501BS70542,A,315,Unclassified,0.000,Slight,,2005-08-28,Sunday,1.000,...,,No tow/articulation,Did not leave carriageway,0.000,U-turn,1.000,Car,No,Offside,2005.000
3,200501BS70679,A,402,Unclassified,0.000,Slight,,2005-11-13,Sunday,1.000,...,,,,,,,,,,
4,200501BS70751,C,0,,0.000,Slight,,2005-11-25,Friday,2.000,...,,No tow/articulation,Did not leave carriageway,0.000,Going ahead other,2.000,Car,No,Front,2005.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26954,201797UD00702,A,77,,0.000,Slight,,2017-02-09,Thursday,1.000,...,,,,,,,,,,
26955,201797UD01210,A,719,Unclassified,0.000,Slight,,2017-10-26,Thursday,1.000,...,,,,,,,,,,
26956,201797UD70205,B,730,Unclassified,0.000,Serious,,2017-05-09,Tuesday,1.000,...,,,,,,,,,,
26957,201797UD71103,A,77,Unclassified,0.000,Serious,,2017-03-26,Sunday,1.000,...,,,,,,,,,,


In [24]:
df.dtypes

Accident_Index                                  object
1st_Road_Class                                  object
1st_Road_Number                                  int64
2nd_Road_Class                                  object
2nd_Road_Number                                float64
Accident_Severity                               object
Carriageway_Hazards                             object
Date                                            object
Day_of_Week                                     object
Did_Police_Officer_Attend_Scene_of_Accident    float64
Junction_Control                                object
Junction_Detail                                 object
Latitude                                       float64
Light_Conditions                                object
Local_Authority_(District)                      object
Local_Authority_(Highway)                       object
Location_Easting_OSGR                          float64
Location_Northing_OSGR                         float64
Longitude 

In [25]:
df.nunique()

Accident_Index                                 20346
1st_Road_Class                                     6
1st_Road_Number                                 2633
2nd_Road_Class                                     6
2nd_Road_Number                                 1734
Accident_Severity                                  3
Carriageway_Hazards                                6
Date                                            4667
Day_of_Week                                        7
Did_Police_Officer_Attend_Scene_of_Accident        3
Junction_Control                                   6
Junction_Detail                                   10
Latitude                                       20161
Light_Conditions                                   5
Local_Authority_(District)                       416
Local_Authority_(Highway)                        207
Location_Easting_OSGR                          16409
Location_Northing_OSGR                         16521
Longitude                                     

# Problems 
* Unclassified and NaN in the same columns -> we have to merge
* Categorical values and not 1 hot encoded 
* 

In [26]:
# Consolidate NaN
df.replace(
    [
        "Not known",
        "Data missing or out of range",
        "Unclassified"
    ],
    np.nan, inplace=True)
df.fillna(np.nan, inplace=True)
# Get percentage of NaN values for each column
na_percentage = df.isna().mean() * 100
na_percentage.sort_values(ascending=False, inplace=True)
na_percentage

Carriageway_Hazards                           98.338
Special_Conditions_at_Site                    97.663
Hit_Object_in_Carriageway                     96.999
Hit_Object_off_Carriageway                    93.427
Skidding_and_Overturning                      90.408
2nd_Road_Class                                80.634
Journey_Purpose_of_Driver                     55.469
Driver_IMD_Decile                             49.542
Junction_Control                              37.112
Age_of_Vehicle                                37.027
model                                         35.728
Driver_Home_Area_Type                         35.276
Engine_Capacity_.CC.                          33.863
Propulsion_Code                               33.180
Age_Band_of_Driver                            30.235
make                                          28.554
1st_Road_Class                                27.623
Sex_of_Driver                                 26.974
Was_Vehicle_Left_Hand_Drive                   

# NaN problems
Some columns have too much missing values so we will drop them

In [27]:
to_drop = na_percentage[na_percentage > 40]
to_drop

Carriageway_Hazards          98.338
Special_Conditions_at_Site   97.663
Hit_Object_in_Carriageway    96.999
Hit_Object_off_Carriageway   93.427
Skidding_and_Overturning     90.408
2nd_Road_Class               80.634
Journey_Purpose_of_Driver    55.469
Driver_IMD_Decile            49.542
dtype: float64

In [28]:
df.drop(columns=to_drop.index, inplace=True)

# TODO 
To have equal distribution of the classes add straticitiaon for leave out severity 1
DONE To drop accident index, road number, police force (that is info after the accident)
DONE Normalize time to <0, 1>
Normalize week to one-hot? 

DONE Join cars table on accident index


In [29]:
# Normalize time of day to (0,1)
min_in_day = 60 * 24
print(f"Dropping {df.Time.isna().sum()} rows with NaN")
df.dropna(subset=["Time"], inplace=True)

normalized_time = pd.to_datetime(df['Time'], format="%H:%M")
# (60*hours + minutes) / min_in_day
df["normalized_time"] = (normalized_time.dt.minute + normalized_time.dt.hour * 60) / min_in_day

df.drop(columns="Time", inplace=True)
# Check min and max values
df["normalized_time"].describe()

Dropping 3 rows with NaN


count   26956.000
mean        0.585
std         0.211
min         0.001
25%         0.431
50%         0.614
75%         0.740
max         0.999
Name: normalized_time, dtype: float64

In [30]:
# Normalize time of date to (0,1)
normalized_date = pd.to_datetime(df['Date'], format="%Y-%m-%d")
df["normalized_date"] = normalized_date.dt.dayofyear / 365
df.drop(columns="Date", inplace=True)
df["normalized_date"].describe()
# df.Date

count   26956.000
mean        0.511
std         0.284
min         0.003
25%         0.268
50%         0.512
75%         0.756
max         1.003
Name: normalized_date, dtype: float64

In [31]:
to_drop = [
    "Accident_Index",  # ID -> No predictive value
    # "1st_Road_Class",
    # "1st_Road_Number",
    # "2nd_Road_Number",
    # "Accident_Severity", # Target value 
    # "Day_of_Week",
    "Did_Police_Officer_Attend_Scene_of_Accident",  # Happened after the accident
    # "Junction_Control",
    # "Junction_Detail",
    "Latitude",  # Location data will be ignored
    # "Light_Conditions",
    # "Local_Authority_(District)",
    # "Local_Authority_(Highway)",
    "Location_Easting_OSGR",
    "Location_Northing_OSGR",
    "Longitude",
    "LSOA_of_Accident_Location",
    "Number_of_Casualties",  # Leaks info about accident severity
    "Number_of_Vehicles",  # Leaks info about accident severity
    # "Pedestrian_Crossing-Human_Control",
    # "Pedestrian_Crossing-Physical_Facilities",
    "Police_Force",
    # "Road_Surface_Conditions",
    # "Road_Type",
    # "Speed_limit",
    # "Urban_or_Rural_Area",
    # "Weather_Conditions",
    # "Year_x",
    "InScotland",  # We dont discriminate Scotland
    # "Age_Band_of_Driver",
    # "Age_of_Vehicle",
    # "Driver_Home_Area_Type",
    # "Engine_Capacity_.CC.",
    # "Junction_Location",
    # "make",
    # "model",
    # "Propulsion_Code",
    # "Sex_of_Driver",
    # "Towing_and_Articulation",
    # "Vehicle_Leaving_Carriageway",
    # "Vehicle_Location.Restricted_Lane",
    # "Vehicle_Manoeuvre",
    # "Vehicle_Reference",
    # "Vehicle_Type",
    # "Was_Vehicle_Left_Hand_Drive",
    # "X1st_Point_of_Impact",
    # "Year_y",
    # "minutes_since_midnight",
    # "normalized_time",
]

In [32]:
# Do one hot encoding
unique = df.nunique()
nominal_variables = [
    # "Age_of_Vehicle",
    "Vehicle_Type",
    "Vehicle_Manoeuvre",
    # "Year_x",
    # "Year_y",
    "Age_Band_of_Driver",
    "Vehicle_Location.Restricted_Lane",
    "Junction_Location",
    "Weather_Conditions",
    "Junction_Detail",
    "Vehicle_Leaving_Carriageway",
    "Speed_limit",
    "Vehicle_Reference",
    "Propulsion_Code",
    "Day_of_Week",
    "Towing_and_Articulation",
    "Road_Type",
    "Pedestrian_Crossing-Physical_Facilities",
    "X1st_Point_of_Impact",
    "Junction_Control",
    "1st_Road_Class",
    "Light_Conditions",
    "Road_Surface_Conditions",
    "Driver_Home_Area_Type",
    # "Accident_Severity",
    "Urban_or_Rural_Area",
    "Pedestrian_Crossing-Human_Control",
    "Sex_of_Driver",
    "Was_Vehicle_Left_Hand_Drive",
]
unique[unique < 50].sort_values(ascending=False)

Age_of_Vehicle                                 42
Vehicle_Manoeuvre                              18
Vehicle_Type                                   17
Vehicle_Reference                              15
Year_x                                         13
Number_of_Casualties                           13
Number_of_Vehicles                             12
Year_y                                         12
Age_Band_of_Driver                             11
Vehicle_Location.Restricted_Lane               10
Weather_Conditions                              9
Junction_Detail                                 9
Vehicle_Leaving_Carriageway                     9
Junction_Location                               9
Day_of_Week                                     7
Propulsion_Code                                 7
Towing_and_Articulation                         6
Pedestrian_Crossing-Physical_Facilities         6
Road_Type                                       6
Speed_limit                                     6


In [33]:
from sklearn.compose import ColumnTransformer  # applies transformers to columns
from sklearn.pipeline import Pipeline  # assemble several steps
from sklearn import set_config  # global scikit-learn configuration
from sklearn.impute import SimpleImputer  # replace missing values using a descriptive statistic (e.g. mean, median,...)
from sklearn.preprocessing import OrdinalEncoder  # encode categorical features as an integer array
from sklearn.preprocessing import OneHotEncoder  # encode categorical features as a one-hot numeric array
from sklearn.preprocessing import \
    StandardScaler  # standardize features by removing the mean and scaling to unit variance

# here we call the new API set_config to tell sklearn we want to output a pandas DF
set_config(transform_output="pandas")
# num_pipe = Pipeline([
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('scaler', StandardScaler())
# ])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(drop='if_binary',
                             sparse_output=False)),
])

transformers_list = [
    # ('numerical', num_pipe, numerical_variables),
    ('nominal', cat_pipe, nominal_variables),
    # ('ordinal', OrdinalEncoder(), ordinal_variables),
    # ('delete', 'drop', to_drop)
    #these would be removed by default, this is not necessary but better be explicit
]

# If True, ColumnTransformer.get_feature_names_out will prefix all feature names with the name of the transformer that generated that feature.
column_transformer = ColumnTransformer(transformers_list,
                                       verbose_feature_names_out=False,
                                       verbose=True)

transformed_raw_train = column_transformer.fit_transform(df)
transformed_raw_train.dtypes

[ColumnTransformer] ....... (1 of 1) Processing nominal, total=   0.8s


Vehicle_Type_Agricultural vehicle                    float64
Vehicle_Type_Bus or coach (17 or more pass seats)    float64
Vehicle_Type_Car                                     float64
Vehicle_Type_Goods 7.5 tonnes mgw and over           float64
Vehicle_Type_Goods over 3.5t. and under 7.5t         float64
                                                      ...   
Pedestrian_Crossing-Human_Control_0.0                float64
Pedestrian_Crossing-Human_Control_1.0                float64
Pedestrian_Crossing-Human_Control_2.0                float64
Sex_of_Driver_Male                                   float64
Was_Vehicle_Left_Hand_Drive_Yes                      float64
Length: 181, dtype: object