STEP 1: Imports + Load Dataset

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# Read the raw accident records from the CSV file in the working directory
csv_path = "US_Accidents_March23.csv"
df = pd.read_csv(csv_path)

# Show (rows, columns) of the freshly loaded frame
df.shape

(7728394, 46)

STEP 2: Remove High-Missingness Features

In [None]:
# Drop every column whose share of missing values exceeds the cutoff.

missing_threshold = 0.40  # 40%

# Per-column fraction of null entries (mean of the boolean null mask)
missing_pct = df.isna().mean()

# Strictly greater than the cutoff -> flagged for removal
high_missing_cols = [
    name for name, fraction in missing_pct.items() if fraction > missing_threshold
]
print("Columns removed due to missingness:", high_missing_cols)

# `df` is left untouched; the pruned frame is what the following steps work on
df_reduced = df.drop(columns=high_missing_cols)
df_reduced.shape


STEP 3: Remove Near-Constant (Low Variance) Features

In [None]:
# Count the distinct values of every column in a single call
# (nulls are not counted, matching the default of `Series.nunique`)
distinct_counts = df_reduced.nunique()

# A column with at most one distinct value cannot separate any rows
low_variance_cols = distinct_counts[distinct_counts <= 1].index.tolist()

print("Near-constant columns removed:", low_variance_cols)

df_reduced = df_reduced.drop(columns=low_variance_cols)
df_reduced.shape


STEP 4: Remove Duplicate Columns

In [None]:
# Find columns whose contents exactly repeat an earlier column.
#
# The frame is deliberately NOT transposed: transposing a mixed-dtype frame
# upcasts every value to `object`, so after transposing back no column is numeric
# any more and the `select_dtypes(include=[np.number])` calls in the later steps
# return no columns. Transposing millions of rows is also very memory-hungry.
duplicate_cols = []
columns_by_signature = {}  # (dtype, content hash) -> columns already kept

for col in df_reduced.columns:
    values = df_reduced[col]
    # Cheap fingerprint so the full element-wise comparison only runs on likely matches
    signature = (
        str(values.dtype),
        int(pd.util.hash_pandas_object(values, index=False).to_numpy().sum()),
    )
    kept_candidates = columns_by_signature.setdefault(signature, [])
    if any(values.equals(df_reduced[kept]) for kept in kept_candidates):
        # Same dtype and same values (nulls in matching positions count as equal)
        duplicate_cols.append(col)
    else:
        kept_candidates.append(col)

print("Duplicate columns removed:", duplicate_cols)

# Keep the first occurrence of each distinct column; order and dtypes are preserved
df_reduced = df_reduced.drop(columns=duplicate_cols)
df_reduced.shape


STEP 5: Remove Highly Collinear Features (|r| ≥ 0.85)

In [None]:
# Absolute pairwise correlations between the numeric columns
numeric_df = df_reduced.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr().abs()

# Keep only the strict upper triangle so every pair is inspected exactly once
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(strict_upper_mask)

# Flag a column when it correlates at >= 0.85 with any column positioned before it
exceeds_limit = (upper_tri >= 0.85).any(axis=0)
high_corr_cols = [name for name, flagged in exceeds_limit.items() if flagged]

print("High-correlation columns removed:", high_corr_cols)

df_reduced = df_reduced.drop(columns=high_corr_cols)
df_reduced.shape


STEP 6: Mutual Information (Classification Version)

In [None]:
target = "Severity"  # change if needed

# Separate the predictors from the label
X = df_reduced.drop(columns=[target])
y = df_reduced[target]

# Only numeric columns are scored. Nulls are replaced with 0, which the estimator
# then sees as a real observation.
# NOTE(review): confirm that 0 is an acceptable stand-in for every scored column.
X_numeric = X.select_dtypes(include=[np.number]).fillna(0)

# Fail early with an actionable message instead of an opaque estimator error
if X_numeric.shape[1] == 0:
    raise ValueError(
        "No numeric feature columns are available for mutual information scoring; "
        "check the dtypes of df_reduced."
    )

# The estimator adds small random noise to continuous features, so the seed is
# fixed to make the scores (and the top-20 cut taken from them) reproducible.
mi_scores = mutual_info_classif(X_numeric, y, random_state=42)
mi_series = pd.Series(mi_scores, index=X_numeric.columns).sort_values(ascending=False)

print("Mutual Information Scores:")
mi_series.head(20)


STEP 7: Select Top 20 Features

In [None]:
# Names of the first 20 entries of the score series, followed by the label column
top_features = mi_series.index[:20].tolist()
top_features += [target]

# Restrict the pruned frame to the selected features plus the label
df_final = df_reduced.loc[:, top_features]

print("Final selected features:")
df_final.columns.tolist()


STEP 8: Create Documentation Table

In [None]:
# Build the table over the ORIGINAL columns of `df`, so features dropped by the
# earlier pruning steps are documented too, each with the specific reason it was
# kept or removed. (Iterating `df_reduced.columns` would list only survivors and
# label unselected survivors as "removed" even though no pruning step dropped them.)
kept_cols = set(df_final.columns)
scored_cols = set(X_numeric.columns)


def _selection_reason(col):
    """Return a short explanation of why `col` was kept or dropped."""
    if col == target:
        return "Target variable"
    if col in top_features:
        return "High MI score"
    if col in high_missing_cols:
        return "Removed for missingness"
    if col in low_variance_cols:
        return "Removed for low variance"
    if col in duplicate_cols:
        return "Removed as duplicate column"
    if col in high_corr_cols:
        return "Removed for collinearity"
    if col in scored_cols:
        return "Low MI score"
    # Survived pruning but was excluded from scoring by the numeric-only filter
    return "Non-numeric; not scored by MI"


def _preprocessing_needed(col):
    """Return the preprocessing hint for `col` ("-" when none applies)."""
    if col in scored_cols and col in top_features:
        return "Scale"
    # Guard on membership first: dropped columns no longer exist in df_final
    if col in kept_cols and df_final[col].dtype == "object":
        return "One-hot encode"
    return "-"


doc_table = pd.DataFrame({
    "Feature": df.columns,
    "Kept?": ["Yes" if col in kept_cols else "No" for col in df.columns],
    "Reason": [_selection_reason(col) for col in df.columns],
    "Preprocessing Needed": [_preprocessing_needed(col) for col in df.columns],
})

doc_table.head(20)
