In [4]:
import kagglehub

path = kagglehub.dataset_download("sobhanmoosavi/us-accidents")
print(f"Dataset downloaded to: {path}")

Dataset downloaded to: C:\Users\Randy\.cache\kagglehub\datasets\sobhanmoosavi\us-accidents\versions\13


In [2]:
import pandas as pd
df = pd.read_csv("/Users/Randy/.cache/kagglehub/datasets/sobhanmoosavi/us-accidents/versions/13/US_Accidents_March23.csv")
df.head()

In [5]:
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity, NearestNeighbors

#Create GeoDataFrame from latitude and longitude for start locations
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Start_Lng, df.Start_Lat),crs="EPSG:4326")
gdf = gdf.to_crs(epsg=5070) # Convert to a projected coordinate system for accurate distance calculations
gdf["x"] = gdf.geometry.x
gdf["y"] = gdf.geometry.y

print("Data frame created")

#Multi-scale grid / tile IDs
'''
grid tiles produce compact neighborhood identifiers that capture local context and 
let you compute cell-level aggregates (counts, mean severity). Multi-scale tiles 
(fine + coarse) allow the model to see both micro and macro spatial structure
'''
def tile_id(x, y, scale):
    tile_x = (x // scale).astype(int)
    tile_y = (y // scale).astype(int)
    return tile_x.astype(str) + "_" + tile_y.astype(str)

gdf["cell_1km"] = tile_id(gdf["x"], gdf["y"], 1000)
gdf["cell_5km"] = tile_id(gdf["x"], gdf["y"], 5000)

print("Grid cells computed")

#KDE density (projected coords)
'''Gives measure of local accident concentration or density'''
import scipy.ndimage as ndi

# Fast gridded KDE (recommended)
bandwidth = 1000.0      # same units as projected coords (meters)
grid_size = 500.0       # cell size for histogram in meters (tweak for resolution/perf)

xs = gdf["x"].values
ys = gdf["y"].values

# define grid bounds with padding to avoid edge effects
pad = bandwidth * 3
xmin, xmax = xs.min() - pad, xs.max() + pad
ymin, ymax = ys.min() - pad, ys.max() + pad

nx = int(np.ceil((xmax - xmin) / grid_size))
ny = int(np.ceil((ymax - ymin) / grid_size))

H, xedges, yedges = np.histogram2d(xs, ys, bins=[nx, ny], range=[[xmin, xmax], [ymin, ymax]])

# smooth counts with gaussian filter (sigma in pixels = bandwidth / grid_size)
sigma = bandwidth / grid_size
H_smooth = ndi.gaussian_filter(H, sigma=sigma, mode="constant")

# map each point to its grid cell and assign smoothed density
ix = np.minimum(np.maximum(((xs - xmin) / grid_size).astype(int), 0), H_smooth.shape[0]-1)
iy = np.minimum(np.maximum(((ys - ymin) / grid_size).astype(int), 0), H_smooth.shape[1]-1)

kde_vals = H_smooth[ix, iy]

# optional: convert counts -> density per square meter (makes values comparable)
cell_area = grid_size * grid_size
kde_density = kde_vals / (cell_area)

gdf["kde_1km"] = kde_density

print("Gridded KDE computed (fast)")

# Alternative: k-NN based density estimate (also faster than full KDE for large N)
# from sklearn.neighbors import NearestNeighbors
# k = 50
# nbrs = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", n_jobs=-1).fit(np.column_stack([xs, ys]))
# distances, _ = nbrs.kneighbors(np.column_stack([xs, ys]))
# r_k = distances[:, -1]  # distance to k-th neighbor
# # approximate density: k / (n * area_of_circle)  -> proportional density
# kde_knn = k / ( (np.pi * (r_k**2)) * len(xs) )
# gdf["kde_1km_knn"] = kde_knn
# print("k-NN density computed (approx)")

#Simple cell-level aggregates. Compute count and mean severity per 1km cell
cell_stats = gdf.groupby("cell_1km").agg(cell1_count=("ID","count"), 
                                         cell1_mean_sev=("Severity","mean")).reset_index()
gdf = gdf.merge(cell_stats, on="cell_1km", how="left")

print("Cell-level aggregates computed")
#Numeric features and scaling
numeric_features = ["X","Y","kde_1km","cell1_count","cell1_mean_sev","Temperature(F)",
                    "Humidity(%)","Wind_Chill(F)","Pressure(in)","Visibility(mi)",
                    "Wind_Speed(mph)", "Precipitation(in)",]
for c in numeric_features:
    if c not in gdf.columns:
        gdf[c] = 0.0
gdf[numeric_features] = gdf[numeric_features].fillna(0.0)
scaler = StandardScaler()
gdf[[c + "_s" for c in numeric_features]] = scaler.fit_transform(gdf[numeric_features])

print("Numeric features scaled")

#Categorical features
categorical_features = ["Street","City","County","State","Zipcode","cell_1km","cell_5km",
                        "Wind_Direction","Weather_Condition","Bump","Crossing","Give_Way",
                        "Junction","No_Exit","Railway","Roundabout","Station","Stop",
                        "Traffic_Calming","Traffic_Signal","Turning_Loop","Sunrise_Sunset"]
for c in categorical_features:
    gdf[c] = gdf[c].fillna("__MISSING__").astype(str)
    gdf[c + "_idx"] = pd.factorize(gdf[c])[0]

print("Categorical features encoded")

scaled_numeric = [c + "_s" for c in numeric_features]
cat_idx_cols = [c + "_idx" for c in categorical_features]
features_df = pd.concat([gdf[scaled_numeric].reset_index(drop=True), gdf[cat_idx_cols].reset_index(drop=True)], axis=1)
features_df["Severity"] = gdf["Severity"].values
features_df["ID"] = gdf["ID"].values

features_df.head()

Data frame created
Grid cells computed
Gridded KDE computed (fast)
Cell-level aggregates computed
Numeric features scaled
Categorical features encoded


Unnamed: 0,X_s,Y_s,kde_1km_s,cell1_count_s,cell1_mean_sev_s,Temperature(F)_s,Humidity(%)_s,Wind_Chill(F)_s,Pressure(in)_s,Visibility(mi)_s,...,Railway_idx,Roundabout_idx,Station_idx,Stop_idx,Traffic_Calming_idx,Traffic_Signal_idx,Turning_Loop_idx,Sunrise_Sunset_idx,Severity,ID
0,0.0,0.0,-0.786993,-0.677196,1.330731,-1.127532,1.126448,-1.350621,0.166644,0.374487,...,0,0,0,0,0,0,0,0,3,A-1
1,0.0,0.0,-0.604228,-0.269113,1.351435,-1.079462,1.493372,-1.350621,0.159278,0.374487,...,0,0,0,0,0,0,0,0,2,A-2
2,0.0,0.0,-0.816209,-0.702846,-1.115318,-1.170795,1.493372,-0.309129,0.164188,0.374487,...,0,0,0,0,0,1,0,0,2,A-3
3,0.0,0.0,0.71123,0.302203,0.995366,-1.214059,1.330295,-0.381064,0.156822,0.039505,...,0,0,0,0,0,0,0,0,3,A-4
4,0.0,0.0,-0.338246,-0.441673,-0.861853,-1.170795,1.044909,-0.309129,0.159278,-0.96544,...,0,0,0,0,0,1,0,1,2,A-5
