# Exploratory Data Analysis

In [None]:
# Import necessary libraries and functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

from scripts import load_data, drop_unneeded_columns, plot_distribution, plot_feature_vs_target, handle_missing_values, plot_accidents_by_time, plot_accidents_by_month, plot_accidents_by_day_of_week


## 1. Load data

In [None]:
# Read the dataset through the project's load_data helper (imported from scripts)
data = load_data()

# Preview the first rows of the raw frame
data.head()

## 2. Data wrangling

### Overall data overview and contained features
### Conduct wrangling, fix missing values, and overall cleaning

In [None]:
# Dimensions of the raw dataset as (rows, columns)
data.shape

In [None]:
# Check the dtype and non-null count of each individual column
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numerical columns
data.describe()

In [None]:
# Build the null mask once and reuse it for both the table and the heatmap
null_mask = data.isnull()

# Per-column count of missing entries (largest first) and the same figure
# expressed as a percentage of all rows
missing_values = null_mask.sum().sort_values(ascending=False)
missing_percentage = missing_values.div(len(data)).mul(100)
missing_summary = pd.DataFrame({"Missing Values": missing_values, "Percentage": missing_percentage})
print(missing_summary)

# Visualise where the gaps sit: one row per record, one column per feature
sns.heatmap(null_mask, cbar=False, cmap="viridis")
plt.title("Missing Values Heatmap")
plt.show()


We're seeing quite a few variables with a significant share of missing values. We should drop them, as they are unlikely to help in predictive modelling, and any imputation would likely be highly sensitive to our assumptions.

In [None]:
# Columns to remove from the raw frame
columns_to_drop = ['Carriageway_Hazards', 'Special_Conditions_at_Site','2nd_Road_Class','1st_Road_Class','LSOA_of_Accident_Location']
# NOTE(review): the return value is discarded, so this call only has an effect
# if drop_unneeded_columns mutates `data` in place — confirm against the helper
# in scripts. If it returns a new frame instead, the result must be assigned.
drop_unneeded_columns(data, columns_to_drop)

## 3. Analysis & Visualisation

### Identifying columns to analyse

In [None]:
# List the column names currently present in `data`
data.columns

In [None]:
# Columns retained for the analysis
columns_to_analyse = [
    'Accident_Severity', 'Date', 'Day_of_Week',
    'Did_Police_Officer_Attend_Scene_of_Accident', 'Latitude', 'Longitude',
    'Light_Conditions', 'Local_Authority_(District)', 'Number_of_Casualties',
    'Road_Surface_Conditions', 'Road_Type', 'Speed_limit', 'Time',
    'Urban_or_Rural_Area', 'Weather_Conditions', 'Year',
]

# Reasons for leaving the other columns out:
# 1. 2nd_Road_Class: secondary road details are less relevant.
# 2. 1st_Road_Class: redundant given other road-related features.
# 3. LSOA_of_Accident_Location: too granular for analysis.
# 4. 2nd_Road_Number: road identifiers unlikely to predict severity.
# 5. Location_Easting_OSGR, Location_Northing_OSGR: geographic coordinates too granular.
# 6. InScotland: binary indicator; less relevant in its current form.

# Take an explicit copy: later cells rename and add columns on data_filtered,
# and doing that on a selection of `data` can trigger SettingWithCopyWarning.
# The copy also guarantees the raw frame is never touched by those edits.
data_filtered = data[columns_to_analyse].copy()

In [None]:
# Preview the first rows of the filtered frame
data_filtered.head()

In [None]:
# Missing values per column of the filtered frame, largest first
missing_values = data_filtered.isnull().sum().sort_values(ascending=False)
# Use the filtered frame's own row count as the denominator, so the percentages
# stay correct even if this cell is re-run after rows have been dropped from
# data_filtered (at which point len(data) would no longer match)
missing_percentage = (missing_values / len(data_filtered)) * 100
print(pd.DataFrame({"Missing Values": missing_values, "Percentage": missing_percentage}))

Let's call a function to drop rows with missing values: the features with the most missing values are now mostly of object type, so imputing them would not make much sense. Moreover, since we still have quite a bit of data after the drop, we should be able to proceed without significantly affecting prediction.

In [None]:
# Drop incomplete rows via the project helper and keep the returned frame.
# NOTE(review): presumably drop_na_columns=None means "consider every column"
# — confirm against handle_missing_values in scripts.
data_filtered = handle_missing_values(df=data_filtered, drop_na_columns=None)

### Target variable distribution

In [None]:
# List the distinct values that occur in the target column
target = data_filtered['Accident_Severity']
target_values = target.unique()
target_values

In [None]:
# Plot the distribution of the target variable (project helper from scripts)
plot_distribution(data_filtered, "Accident_Severity")

### Local authority analysis

Let's first look at prevalence by local authority district

In [None]:
# Shorten the district column name. rename() without inplace returns a new
# frame, which avoids mutating a frame earlier cells have already displayed;
# re-running is harmless because renaming an absent column is a no-op.
data_filtered = data_filtered.rename(columns={'Local_Authority_(District)': 'LA'})

# Accident counts per local authority, most affected first
LA_by_collisions = data_filtered['LA'].value_counts(ascending=False)

# Bar chart of the 30 authorities with the most recorded accidents
ax = LA_by_collisions.head(30).plot(kind='bar', figsize=(12, 5))
ax.set_title("Top 30 local authorities by number of accidents")
ax.set_xlabel("Local authority (district)")
ax.set_ylabel("Number of accidents")
plt.show()

Birmingham shows the highest number of accidents, followed by Leeds and Manchester.

### Investigating the effect of weather conditions

Weather conditions, including rain, have a strong likelihood of increasing accident prevalence and, by extension, severity.

In [None]:
# List the distinct values that occur in the weather column.
# Note: `target` / `target_values` are reused here for a feature column,
# not the modelling target.
target = data_filtered['Weather_Conditions']
target_values = target.unique()
target_values

In [None]:
# Plot the distribution of the weather variable (project helper from scripts)
plot_distribution(data_filtered, "Weather_Conditions")

Interestingly, most accidents happened with no adverse weather conditions. Let's now look at how the weather conditions vary with our target, to see whether severity could have been affected by weather conditions.

In [None]:
# Plot weather conditions against the target (accident severity) as a bar chart
plot_feature_vs_target(data_filtered, "Weather_Conditions", "Accident_Severity", kind = "bar")

For each severity level, most accidents happened in fine weather with no high winds. For all other weather conditions, accidents were more evenly distributed across severity levels, suggesting that weather conditions could have had an impact.

### Investigating the effect of road conditions

Perhaps weather conditions are more of a secondary variable influencing road conditions. Wet roads due to heavy rain, or icy roads due to snow, could have a significant effect on driving.

In [None]:
# List the distinct values that occur in the road surface column.
# Note: `target` / `target_values` are reused here for a feature column,
# not the modelling target.
target = data_filtered['Road_Surface_Conditions']
target_values = target.unique()
target_values

In [None]:
# Plot the distribution of the road surface variable (project helper from scripts)
plot_distribution(data_filtered, "Road_Surface_Conditions")

Most accidents happened on dry roads, but we're also seeing quite a few accidents on wet or damp roads, which aligns with our hypothesis.

In [None]:
# Plot road surface conditions against weather conditions as a bar chart
# (Weather_Conditions is passed in the helper's "target" position here)
plot_feature_vs_target(data_filtered, "Road_Surface_Conditions", "Weather_Conditions", kind = "bar")

Again, our hypothesis seems right here: wet weather conditions lead to wet roads, which affect driving.

In [None]:
# Plot road surface conditions against the target (accident severity) as a bar chart
plot_feature_vs_target(data_filtered, "Road_Surface_Conditions", "Accident_Severity", kind = "bar")

### Investigating the idea of date and time, and whether certain dates or times are associated with greater severity

In [None]:
# Inspect the raw Date column to see how dates are formatted
data_filtered['Date']


In [None]:
# Inspect the raw Time column to see how times are formatted
data_filtered['Time']

In [None]:
# Combine the separate Date and Time text columns into a single timestamp
# column. The string concatenation and the parsing happen in one expression,
# and assign() returns a new frame, so data_filtered never holds an
# intermediate string-typed DateTime column and is not mutated in place
# (in-place column assignment can trigger SettingWithCopyWarning).
data_filtered = data_filtered.assign(
    DateTime=pd.to_datetime(
        data_filtered['Date'] + ' ' + data_filtered['Time'],
        format="%d/%m/%Y %H:%M",  # day/month/year, 24-hour clock
    )
)
data_filtered.head()


In [None]:
# Temporal breakdowns drawn by the project helpers from scripts.
# NOTE(review): a later cell drops "Month" and "Hour" with errors="ignore",
# which suggests these helpers may add such columns to data_filtered — confirm.
plot_accidents_by_day_of_week(data_filtered)  # Plot by day of week
plot_accidents_by_month(data_filtered)  # Plot by month
plot_accidents_by_time(data_filtered)  # Plot by time of day

Accidents are more likely to happen on Fridays, in October and November, and around rush hour (8am, and 4pm to 5pm).

### Correlation of features with target

Let's have a look at the feature types within the current iteration of the dataset

In [None]:
data_filtered.dtypes

So, there is still quite a bit of pre-processing we must do to get this ready for modelling. First, let's produce a correlation heatmap to guide our feature selection; that way we can do a bit of pre-processing automatically, check which features to choose based on their correlation with our target (Accident Severity), and also pinpoint features with high multicollinearity.

In [None]:
# Drop the raw / derived date-time helper columns that should not enter the
# correlation. errors="ignore" because some of them may not exist in the frame.
irrelevant_cols = ["Date", "Time", "Month", "Year", "Hour"]
data_filtered = data_filtered.drop(columns=irrelevant_cols, errors="ignore")

# Encode target variable (Accident_Severity) on an ordinal scale.
# Guarded on dtype: if this cell is re-run, the column already holds numbers,
# and mapping those through the label dictionary again would turn every value
# into NaN (and the dropna below would then empty the frame).
severity_mapping = {"Slight": 0, "Serious": 1, "Fatal": 2}
if not pd.api.types.is_numeric_dtype(data_filtered["Accident_Severity"]):
    data_filtered["Accident_Severity"] = data_filtered["Accident_Severity"].map(severity_mapping)

# Handle missing values (drop rows with missing values).
# This must happen BEFORE label encoding: the encoder works on astype(str),
# which would otherwise turn a missing entry into the literal category "nan"
# and hide it from dropna.
data_filtered = data_filtered.dropna()

# Encode categorical variables as integer codes
categorical_columns = data_filtered.select_dtypes(include=["object"]).columns
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    data_filtered[col] = le.fit_transform(data_filtered[col].astype(str))  # Convert to string for consistency
    label_encoders[col] = le  # Save encoder for future use

# Correlate numeric columns only. The frame may still carry non-numeric
# columns (e.g. the DateTime timestamp), and how DataFrame.corr() treats those
# depends on the pandas version, so select the numeric ones explicitly.
numeric_features = data_filtered.select_dtypes(include="number")
correlation_matrix = numeric_features.corr()

# Draw correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap of Features")
plt.show()