# Road Accident Severity Prediction
## EDA

## Univariate EDA

In [None]:
# ============================================================
# 6. IMPORTANT UNIVARIATE EDA
#    (only plots that help with modelling intuition)
# ============================================================

# 6.1 Target distribution
# Bar chart of raw class counts, followed by the same breakdown printed as
# proportions (value_counts with normalize=True).
plt.figure()
sns.countplot(x=target_col, data=acc)
plt.title("Severity Distribution")
plt.show()

severity_share = acc[target_col].value_counts(normalize=True)
print(severity_share)

# 6.2 Key numeric features
# One histogram with a KDE overlay per numeric column of interest.
num_eda_cols = ["Number_of_Vehicles", "Number_of_Casualties",
                "Speed_limit", "veh_count", "veh_age_mean", "engine_mean"]

for col in num_eda_cols:
    # Skip names that are not columns of `acc` rather than raising KeyError
    # and aborting the remaining plots; this mirrors the guard used for the
    # categorical plots.
    if col not in acc.columns:
        continue
    plt.figure()
    # Missing values are dropped up front so only observed values are plotted.
    sns.histplot(acc[col].dropna(), kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

# 6.3 Key categorical features
# One count plot per categorical column; names missing from `acc` are skipped.
cat_eda_cols = [
    "Weather_Conditions",
    "Light_Conditions",
    "Road_Type",
    "Road_Surface_Conditions",
    "Urban_or_Rural_Area",
    "Carriageway_Hazards",
    "Special_Conditions_at_Site",
]

for col in cat_eda_cols:
    if col not in acc.columns:
        continue
    plt.figure(figsize=(9, 4))
    sns.countplot(x=col, data=acc)
    # Rotate the tick labels and right-align them under their ticks.
    plt.xticks(rotation=45, ha="right")
    plt.title(f"{col} Distribution")
    plt.tight_layout()
    plt.show()


## Bivariate Analysis

In [None]:
# ============================================================
# 7. IMPORTANT BIVARIATE EDA (Severity vs key factors)
# ============================================================

# helper to plot countplots with Severity hue
def plot_severity_vs(col):
    """Show a count plot of `col` with bars split by the target column.

    Reads the module-level `acc` frame and `target_col` name. When `col` is
    not a column of `acc`, nothing is drawn and the function returns silently.
    """
    if col not in acc.columns:
        return

    plt.figure(figsize=(9, 4))
    sns.countplot(x=col, hue=target_col, data=acc)
    plt.xticks(rotation=45, ha="right")
    plt.title(f"{col} vs {target_col}")
    plt.tight_layout()
    plt.show()

# Count plots: each categorical factor split by the target column.
for col in (
    "Weather_Conditions",
    "Light_Conditions",
    "Road_Type",
    "Road_Surface_Conditions",
    "Urban_or_Rural_Area",
    "Carriageway_Hazards",
    "Special_Conditions_at_Site",
):
    plot_severity_vs(col)

# Boxplots: severity vs numeric variables
# Each pair is (numeric column on the y-axis, figure title).
for y_col, box_title in (
    ("Speed_limit", "Speed Limit vs Severity"),
    ("Number_of_Vehicles", "Number of Vehicles vs Severity"),
    ("Number_of_Casualties", "Number of Casualties vs Severity"),
):
    plt.figure()
    sns.boxplot(x=target_col, y=y_col, data=acc)
    plt.title(box_title)
    plt.show()


## Multivariate EDA

In [None]:
# ============================================================
# 8. MULTIVARIATE EDA (ONLY A FEW USEFUL ONES)
# ============================================================

# 8.1 Numeric correlation heatmap
# Pairwise correlations among the selected columns, target column included.
sel_numeric = [
    "Number_of_Vehicles",
    "Number_of_Casualties",
    "Speed_limit",
    "Hour",
    "Month",
    target_col,
]

corr_matrix = acc[sel_numeric].corr()

plt.figure(figsize=(8, 6))
# Each cell is annotated with its coefficient, formatted to two decimals.
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Accident Numeric Feature Correlation")
plt.show()

# 8.2 Speed vs Light Conditions vs Severity
# Speed-limit boxplots per light condition, with one box per target value.
# Skipped entirely when `acc` has no Light_Conditions column.
if "Light_Conditions" in acc.columns:
    plt.figure(figsize=(9, 4))
    sns.boxplot(
        x="Light_Conditions",
        y="Speed_limit",
        hue=target_col,
        data=acc,
    )
    plt.xticks(rotation=45, ha="right")
    plt.title("Speed Limit by Light Conditions and Severity")
    plt.tight_layout()
    plt.show()

# 8.3 Hour vs Urban/Rural
# Count of rows per Hour value, with bars split by Urban_or_Rural_Area.
# Skipped entirely when `acc` has no Urban_or_Rural_Area column.
if "Urban_or_Rural_Area" in acc.columns:
    plt.figure(figsize=(9, 4))
    sns.countplot(
        x="Hour",
        hue="Urban_or_Rural_Area",
        data=acc,
    )
    plt.title("Hour of Day vs Urban/Rural Area")
    plt.tight_layout()
    plt.show()

# 8.4 Pairplot (small sample to keep it light)
# Keep only rows where the target and all three numeric columns are present.
pair_vars = ["Speed_limit", "Number_of_Vehicles", "Number_of_Casualties"]
pair_complete = acc[[target_col, *pair_vars]].dropna()

# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the available count.
# With at least 3000 complete rows this draws the same 3000-row sample as before.
sample_pair = pair_complete.sample(n=min(3000, len(pair_complete)),
                                   random_state=42)

sns.pairplot(sample_pair,
             vars=pair_vars,
             hue=target_col, diag_kind="kde", plot_kws={"alpha": 0.5, "s": 20})
# y > 1 places the title just above the top edge of the figure.
plt.suptitle("Pairplot of Key Numeric Features vs Severity", y=1.02)
plt.show()
