In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from scipy.stats import normaltest
from sklearn.model_selection import train_test_split

from accident_risk.config import RAW_TRAIN_PATH

In [None]:
# read the raw training CSV; the configured path is prefixed with "../",
# i.e. it is looked up one directory above the current working directory
data = pd.read_csv(f"../{RAW_TRAIN_PATH}")

In [None]:
# shuffle=False keeps the rows in file order: the leading part becomes the
# training split and the trailing part the validation split.
# test_size=0.25 spells out the fraction scikit-learn uses when neither
# test_size nor train_size is given.
train, validation = train_test_split(data, test_size=0.25, shuffle=False)

In [None]:
# report how many rows ended up in each split
for label, frame in (("Train len: ", train), ("Validation len: ", validation)):
    print(label, len(frame))


# EDA

In [None]:
# overview of the training split: column names, dtypes and non-null counts
train.info()

## Single-variable exploration

In [None]:
# road_type is categorical: bar chart of how often each category occurs
road_type_counts = train["road_type"].value_counts()
road_type_counts.plot.bar()

In [None]:
# num_lanes is ordinal: value_counts() orders by frequency, so sort by the
# lane count itself to keep the bars in their natural order on the x-axis
train["num_lanes"].value_counts().sort_index().plot.bar()

In [None]:
# curvature is continuous with values in [0, 1]; show its summary statistics
curvature = train["curvature"]
curvature.describe()

In [None]:
# density of curvature on the training split only (like every other EDA cell),
# so the validation rows are not looked at during exploration;
# xlim restricts the x-axis to the feature's [0, 1] range
train["curvature"].plot.kde(xlim=(0, 1))


In [None]:
# discrete, few (5) distinct values
# sort by the speed limit itself (value_counts() orders by frequency) so the
# bars appear in increasing order of the limit
train["speed_limit"].value_counts().sort_index().plot.bar()

In [None]:
# lighting is categorical: frequency of each lighting condition
train["lighting"].value_counts().plot(kind="bar")

In [None]:
# weather is categorical: frequency of each weather condition
weather_counts = train["weather"].value_counts()
weather_counts.plot.bar()

In [None]:
# road_signs_present is binary: number of rows per flag value
train["road_signs_present"].value_counts().plot(kind="bar")


In [None]:
# public_road is binary: number of rows per flag value
public_road_counts = train["public_road"].value_counts()
public_road_counts.plot.bar()


In [None]:
# time_of_day is categorical: frequency of each time-of-day category
train["time_of_day"].value_counts().plot(kind="bar")

In [None]:
# holiday is binary: number of rows per flag value
holiday_counts = train["holiday"].value_counts()
holiday_counts.plot.bar()

In [None]:
# school_season is binary: number of rows per flag value
train["school_season"].value_counts().plot(kind="bar")

In [None]:
# counts/discrete, few distinct values (8)
# one bar per distinct count: a histogram with bins=7 over 8 distinct values
# would lump two of them into a single bin
train["num_reported_accidents"].value_counts().sort_index().plot.bar()

In [None]:
# exact number of rows for every reported-accident count, most frequent first
accident_counts = train["num_reported_accidents"].value_counts()
accident_counts

In [None]:
# accident_risk is the target: a risk probability in [0, 1]
accident_risk = train["accident_risk"]
accident_risk.describe()

In [None]:
# kernel density estimate of the target; visually it looks normally distributed
train["accident_risk"].plot(kind="kde")

In [None]:
# formal normality test (scipy.stats.normaltest) on the raw target
stat, p = normaltest(train["accident_risk"])

# if p-value < 0.05, we reject the null hypothesis that the data is normally distributed
# p is printed with 3 significant digits so very small values do not show up as 0.000
print(f'Stat={stat:.3f}, p={p:.3g}')

In [None]:
# double-check with a Q-Q plot: on large samples the normality test rejects
# even for deviations that are too small to matter in practice
fig, ax = plt.subplots()
st.probplot(train["accident_risk"], dist="norm", plot=ax)

# replace probplot's default labels and title
ax.set_title("Q-Q Plot")
ax.set_xlabel("Theoretical Quantiles")
ax.set_ylabel("Sample Quantiles")
plt.show()

# roughly normal, but bounded at the tails
# makes sense since data is in [0, 1]
# a logit transformation might be useful

In [None]:
# logit-transform the target: log(p / (1 - p)) maps (0, 1) onto the real line
epsilon = 1e-6  # clip margin so exact 0 or 1 cannot produce log(0) or a division by zero

# computed from train in one pass, so re-running the cell gives the same result
risk = np.clip(train["accident_risk"].values, epsilon, 1 - epsilon)
target = np.log(risk / (1 - risk))

In [None]:
# repeat the normality test on the logit-transformed target
stat, p = normaltest(target)

# if p-value < 0.05, we reject the null hypothesis that the data is normally distributed
# p is printed with 3 significant digits so very small values do not show up as 0.000
print(f'Stat={stat:.3f}, p={p:.3g}')

In [None]:
# Q-Q plot of the logit-transformed target against a normal distribution
fig, ax = plt.subplots()
st.probplot(target, dist="norm", plot=ax)

# replace probplot's default labels and title
ax.set_title("Q-Q Plot")
ax.set_xlabel("Theoretical Quantiles")
ax.set_ylabel("Sample Quantiles")
plt.show()

# tails are not normally distributed
# takeaway: be careful with models that are sensitive to extreme values

## Explore pairwise correlations 

In [None]:
# pairwise correlations between the numeric columns of the training split
# observations:
# - curvature, speed limit, and number of reported accidents seem to be
#   correlated with accident risk
# - there is also some correlation between curvature and the number of
#   reported accidents
correlations = train.corr(numeric_only=True)
sns.heatmap(
    correlations,
    annot=True,  # write the coefficient into every cell
    fmt=".2f",
    cmap="coolwarm",
    center=0,
)
plt.show()