# UK Traffic Accidents Severity Classification
4IZ565 – Programming for Data Science in Python

Spring Semester 2023/2024 Coursework

Authors: Terka Lukešová, Jan Štipl

## Dataset Description
The dataset contains statistics on traffic accidents in the UK, including many attributes, such as the route category and hazards, the vehicles involved, and some information about the drivers. Our goal will be to predict the target attribute Accident Severity, whose values are Slight/Serious/Fatal.

Dataset source: https://www.kaggle.com/datasets/tsiaras/uk-road-safety-accidents-and-vehicles

## Exploratory Data Analysis

### Initial settings

In [None]:
import random
import numpy as np
import sklearn
import pandas as pd

# Reproducibility: a single seed shared by every randomised step in the notebook
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Display: show all columns and print floats with three decimal places
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

# Let scikit-learn transformers output pandas DataFrames
sklearn.set_config(transform_output="pandas")

### Load CSV tables and merge them 

In [None]:
# For speed during testing, load only a small random sample of the accident rows
p = 0.01  # keep roughly 1% of the lines


def skip_rows(i):
    """Tell ``read_csv`` whether to skip row ``i``; row 0 (the header) is never skipped."""
    return i > 0 and random.random() > p


# skip_rows = None  # uncomment to load the whole file

accidents_df = pd.read_csv(
    "./data/Accident_Information.csv",
    skiprows=skip_rows,
)
# latin1 is required here; with the default codec the read failed with
# "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 3169: invalid start byte"
vehicle_df = pd.read_csv(
    "./data/Vehicle_Information.csv",
    encoding="latin1",
)

In [None]:
# Inner join: keep only rows whose Accident_Index occurs in both tables
df = accidents_df.merge(vehicle_df, how="inner", on="Accident_Index")
# The source tables are not needed any more; drop the references to free memory
del accidents_df, vehicle_df

In [None]:
# Preview the merged table (the notebook displays the last expression of a cell)
df

In [None]:
# Data type of every column in the merged table
df.dtypes

In [None]:
# Number of distinct values in each object-dtype (string) column
df.select_dtypes(include='object').nunique()

### Problems
* Some columns contain both the string "Unclassified" and NaN -> we have to merge them into a single missing-value marker
* Categorical values are stored as strings and are not yet one-hot encoded

## Data preprocessing

### Separate the target class from attributes

In [None]:
# Split the merged table into the label column (y) and all remaining columns (X)
target_variable = 'Accident_Severity'
y = df[target_variable]
X = df.drop(columns=target_variable)
del df

### Consolidate string values Unknown/Unclassified to NaN

In [None]:
# Placeholder strings that are treated as missing values
missing_markers = [
    "Not known",
    "Data missing or out of range",
    "Unclassified",
]
X.replace(to_replace=missing_markers, value=np.nan, inplace=True)
# Fill whatever is still considered missing with NaN as well
X.fillna(value=np.nan, inplace=True)

### Train-test split
Our train-test ratio is 80:20. We make use of stratification because the target class is imbalanced.

In [None]:
from sklearn.model_selection import train_test_split

# 80:20 split, stratified on y and seeded with RANDOM_STATE
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
del X, y

### Drop columns with too many missing values
The columns are identified on the train set only; the same columns are then dropped from the test set as well.

In [None]:
# Get percentage of NaN values for each column
# Share of missing values per training column, in percent, largest first
na_percentage = (X_train.isna().mean() * 100).sort_values(ascending=False)
na_percentage

In [None]:
# Columns in which more than 40% of the training values are missing
to_drop = na_percentage.loc[na_percentage.gt(40)]
to_drop

In [None]:
# Remove the same columns from both splits so that they keep identical columns
for split in (X_train, X_test):
    split.drop(columns=to_drop.index, inplace=True)

### Transformers

In [None]:
# Derive calendar features from the raw 'Date' and 'Time' string columns.
def month_extractor(x):
    """Return the month number parsed from the 'Date' column (format YYYY-MM-DD) of ``x``."""
    return pd.to_datetime(x['Date'], format='%Y-%m-%d').dt.month


def hour_extractor(x):
    """Return the hour of day parsed from the 'Time' column (format HH:MM) of ``x``."""
    return pd.to_datetime(x['Time'], format='%H:%M').dt.hour


X_train['month'] = month_extractor(X_train)
X_test['month'] = month_extractor(X_test)

# Fixed: 'hour' used to be computed with month_extractor, which made it an exact
# copy of 'month' and left hour_extractor unused.
X_train['hour'] = hour_extractor(X_train)
X_test['hour'] = hour_extractor(X_test)

In [None]:
# Continuous features.
# 'Number_of_Casualties' and 'Number_of_Vehicles' are deliberately left out:
# they leak information about the accident severity.
numerical_variables = ['Speed_limit', 'Age_of_Vehicle', 'Engine_Capacity_.CC.']

# Unordered categorical features, assembled from thematic groups.
# The concatenation order below is the column order handed to the encoder.
_time_features = ['Day_of_Week', 'month', 'hour']

_road_and_environment_features = [
    'Carriageway_Hazards',
    'Junction_Detail',
    'Pedestrian_Crossing-Human_Control',
    'Pedestrian_Crossing-Physical_Facilities',
    'Road_Type',
    'Special_Conditions_at_Site',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
]

_vehicle_and_driver_features = [
    'Driver_Home_Area_Type',
    'Hit_Object_in_Carriageway',
    'Hit_Object_off_Carriageway',
    'Junction_Location',
    'make',
    'Propulsion_Code',
    'Sex_of_Driver',
    'Skidding_and_Overturning',
    'Towing_and_Articulation',
    'Vehicle_Leaving_Carriageway',
    'Vehicle_Location.Restricted_Lane',
    'Vehicle_Manoeuvre',
    'Vehicle_Type',
    'Was_Vehicle_Left_Hand_Drive',
    'X1st_Point_of_Impact',
]

nominal_variables = (
    _time_features
    + _road_and_environment_features
    + _vehicle_and_driver_features
)

# Ordered categorical features whose order is taken to be the lexicographic
# order of their labels.
# NOTE(review): confirm that sorting the labels as strings really yields the
# intended order for both columns.
ordinal_variables_auto = ['Age_Band_of_Driver', 'Driver_IMD_Decile']

In [None]:
# Ordinal features whose category order has to be spelled out by hand.
# Each list below enumerates one column's categories in the order used for encoding.
road_class_categories = ['Motorway', 'A(M)', 'A', 'B', 'C']

junction_control_categories = [
    'Not at junction or within 20 metres',
    'Authorised person',
    'Auto traffic signal',
    'Stop sign',
    'Give way or uncontrolled',
]

light_conditions_categories = [
    'Daylight',
    'Darkness - lights lit',
    'Darkness - lighting unknown',
    'Darkness - lights unlit',
    'Darkness - no lighting',
]

road_surface_conditions_categories = [
    'Dry',
    'Wet or damp',
    'Snow',
    'Frost or ice',
    'Flood over 3cm. deep',
]

# Column name -> ordered categories. Deriving both lists from one mapping keeps
# the i-th column paired with the i-th category list.
_manual_ordinal_levels = {
    '1st_Road_Class': road_class_categories,
    'Junction_Control': junction_control_categories,
    'Light_Conditions': light_conditions_categories,
    'Road_Surface_Conditions': road_surface_conditions_categories,
}

ordinal_variables_manual = list(_manual_ordinal_levels)
manual_categories = list(_manual_ordinal_levels.values())

In [None]:
# Number of distinct values of every categorical feature in the training set
categorical_columns = nominal_variables + ordinal_variables_auto + ordinal_variables_manual
X_train.filter(items=categorical_columns).nunique()

In [None]:
from sklearn.compose import ColumnTransformer  # applies transformers to columns
from sklearn.pipeline import Pipeline  # assemble several steps
from sklearn.impute import SimpleImputer  # replace missing values using a descriptive statistic (e.g. mean, median,...)
from sklearn.preprocessing import OrdinalEncoder  # encode categorical features as an integer array
from sklearn.preprocessing import OneHotEncoder  # encode categorical features as a one-hot numeric array
from sklearn.preprocessing import StandardScaler  # standardize features by removing the mean and scaling to unit variance

# Numerical features: fill gaps with the column mean, then standardise
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Nominal features: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising. Without it, any value that occurs only in a
# validation fold or in the test set (easy to hit with high-cardinality columns
# such as 'make') makes transform fail.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(drop='if_binary',
                             sparse_output=False,
                             handle_unknown='ignore')),
])

# Ordinal features whose order is inferred by the encoder (sorted categories)
ord_pipe_auto = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
])

# Ordinal features with an explicit category order; manual_categories must be
# aligned position by position with ordinal_variables_manual
ord_pipe_manual = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(categories=manual_categories)),
])

# (name, pipeline, columns) triples consumed by the ColumnTransformer
transformers_list = [
    ('numerical', num_pipe, numerical_variables),
    ('nominal', cat_pipe, nominal_variables),
    ('ordinal_auto', ord_pipe_auto, ordinal_variables_auto),
    ('ordinal_manual', ord_pipe_manual, ordinal_variables_manual),
]

column_transformer = ColumnTransformer(
    transformers_list,
    # Columns that are not listed in any transformer are discarded
    remainder='drop',
    # If True, get_feature_names_out would prefix every feature name with the
    # name of the transformer that generated it; keep the plain names instead
    verbose_feature_names_out=False,
    verbose=False,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the severity labels as integers; the mapping is learned from the training labels only
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
# Original class names; a label's position in this array is its integer code
label_encoder.classes_

## Modeling

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from joblib import parallel_backend

# Recursive feature elimination driven by a class-balanced decision tree:
# one feature is removed per step until 50 features are left
decision_tree = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=RANDOM_STATE,
)
feaure_selection = RFE(
    estimator=decision_tree,
    n_features_to_select=50,
    step=1,
    verbose=2,
)

### Logistic Regression

In [None]:
# Hyper-parameter grid for the logistic regression.
# 'elasticnet' needs an explicit l1_ratio, so it gets its own sub-grid; the
# total number of candidates (12) is the same as before.
regularisation_strengths = (0.1, 1, 10, 100)
param_grid_regression = [
    {
        'clf__penalty': ('l2', 'l1'),
        'clf__C': regularisation_strengths,
    },
    {
        'clf__penalty': ('elasticnet',),
        'clf__l1_ratio': (0.5,),
        'clf__C': regularisation_strengths,
    },
]

# Preprocessing -> feature selection -> classifier.
# solver='saga' is used because the default solver ('lbfgs') supports neither
# the 'l1' nor the 'elasticnet' penalty, so those candidates failed to fit.
classification_pipeline_regression = Pipeline([
    ('prep', column_transformer),
    ('select', feaure_selection),
    ('clf', LogisticRegression(solver='saga',
                               random_state=RANDOM_STATE,
                               class_weight='balanced',
                               verbose=2)),
])

# Cross-validated grid search scored by accuracy
search_acc_regression = GridSearchCV(
    estimator=classification_pipeline_regression,
    param_grid=param_grid_regression,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)

# Run the search with joblib's thread-based backend on all available cores
with parallel_backend('threading', n_jobs=-1):
    search_acc_regression.fit(X_train, y_train)

In [None]:
# Pipeline with the best parameter combination found by the grid search
search_acc_regression.best_estimator_

### Random Forest

In [None]:
# Hyper-parameter grid for the random forest: both split criteria combined with
# tree depths and minimum leaf sizes from 1 to 10
one_to_ten = tuple(range(1, 11))
param_grid_forest = {
    'clf__criterion': ('gini', 'entropy'),
    'clf__max_depth': one_to_ten,
    'clf__min_samples_leaf': one_to_ten,
}

# Preprocessing -> feature selection -> classifier
forest = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)
classification_pipeline_forest = Pipeline(steps=[
    ('prep', column_transformer),
    ('select', feaure_selection),
    ('clf', forest),
])

# Cross-validated grid search scored by accuracy
search_acc_forest = GridSearchCV(
    classification_pipeline_forest,
    param_grid_forest,
    scoring='accuracy',
)

# Run the search with joblib's thread-based backend on all available cores
with parallel_backend('threading', n_jobs=-1):
    search_acc_forest.fit(X_train, y_train)

In [None]:
# Pipeline with the best parameter combination found by the grid search
search_acc_forest.best_estimator_

## Results and Evaluation

### Logistic Regression

In [None]:
# Mean cross-validated accuracy of the best logistic regression candidate
search_acc_regression.best_score_

In [None]:
# Accuracy of the best logistic regression pipeline on the held-out test set
search_acc_regression.score(X_test, y_test)

### Random Forest

In [None]:
# Mean cross-validated accuracy of the best random forest candidate
search_acc_forest.best_score_

In [None]:
# Accuracy of the best random forest pipeline on the held-out test set
search_acc_forest.score(X_test, y_test)

### Conclusion
TODO
