# Univariate EDA: Basic Analysis of Each Variable

Each variable is examined one at a time to build a basic understanding of its nature: its distribution, typical values, and outliers, among other characteristics. The data comes from the cleaned-data notebook.


# 2.1 Setup and Data Loading

In [None]:
# Setup and cleaned data loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the figures drawn in this notebook.
sns.set(style="whitegrid")

# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by a trusted step of this project.
cleaned_df = pd.read_pickle("../data/cleaned/crash_2018_cleaned.pkl")

print(f"Crash Data shape: {cleaned_df.shape}")
print("\nFirst few rows:")
# head must be *called*: the bare attribute `cleaned_df.head` only displays
# the bound method's repr, not the first rows of the crash data.
cleaned_df.head()

In [None]:
# Print the summary that `.info()` produces for cleaned_df, as a quick
# overview of the available variables before they are grouped by type.
cleaned_df.info()  # Basic info about variables

# 2.2 Identification and Grouping of the Variable Types

The variables of interest are grouped into continuous and categorical types for the analysis.


## Continuous Variables

In [None]:
# Continuous variables
# Column names treated as continuous; this list is also read by the summary
# statistics and box plot cells.
continuous_vars = [
    "Number Killed",
    "Number Serious Injuries",
    "Number Non-fatal Injuries",
    "Total Injuries",
    "Impact Speed Num",
    "Driver Age",
]


# Distribution (histogram) of each continuous variable, one figure per column
for col in continuous_vars:
    # Keep the figure/axes handles instead of discarding them, so every call
    # below targets this axes explicitly rather than pyplot's implicit
    # "current axes" state.
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.histplot(cleaned_df[col], bins=20, alpha=1.0, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

**Observations** 

In [None]:
# Descriptive statistics table for the continuous columns
print("Continuous Variables Summary Statistics:")
cleaned_df.loc[:, continuous_vars].describe()

**Determining quartiles and outliers**

In [None]:
# Box plot of each continuous variable with every observation overlaid,
# one figure per column
for col in continuous_vars:
    # Explicit figure/axes so both seaborn layers are drawn on the same axes
    # without relying on pyplot's implicit "current axes" state.
    fig, ax = plt.subplots(figsize=(10, 6))

    # Jittered raw observations (drawn before the box plot)
    sns.stripplot(
        y=cleaned_df[col],
        color="skyblue",
        alpha=0.5,
        size=5,
        jitter=True,
        ax=ax,
    )
    # Boxplot (no outlier dots): showfliers=False suppresses the box plot's
    # own flier markers, since the strip plot above already shows every
    # point, outliers included.
    sns.boxplot(
        y=cleaned_df[col],
        width=0.3,
        showcaps=True,
        showfliers=False,
        boxprops=dict(facecolor="none", edgecolor="black", linewidth=1.5),
        whiskerprops=dict(linewidth=1.5),
        capprops=dict(linewidth=1.5),
        medianprops=dict(color="red", linewidth=2),
        ax=ax,
    )

    ax.grid(False)
    # \u2014 is an em dash; written as an escape so the title cannot be
    # corrupted by a file-encoding mismatch again.
    ax.set_title(f"{col} \u2014 Distribution with Outliers", fontsize=14)
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

**Interpretations**

## Categorical Variables

In [None]:
# Categorical variables
# Column names treated as categorical in the frequency analysis below.
categorical_vars = [
    "Crash Severity",
    "Weekend",
    "Time of Day",
    "County",
    "Area Type",
    "Functional Class Recode",
    "Vehicle Type Recode",
    "Driver Gender Recode",
    "Driver License Validity",
    "Driver BAC",
    "BAC Available",
    "Crash Manner Recode",
    "Visibility Obstruction Recode",
    "Lighting Conditions Recode",
]


# Categorical variables distribution: one bar chart plus a printed count
# table per column
for col in categorical_vars:
    # Count once and reuse the result for both the chart and the printed
    # table. value_counts() leaves out missing values by default.
    counts = cleaned_df[col].value_counts()

    # Explicit figure/axes instead of pyplot's implicit "current axes" state.
    fig, ax = plt.subplots(figsize=(8, 5))
    counts.plot(kind="bar", ax=ax)
    ax.set_title(f"{col} Frequency")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

    print(f"Counts for {col}:")
    print(counts, "\n")