In [4]:
import kagglehub

path = kagglehub.dataset_download("sobhanmoosavi/us-accidents")
print(f"Dataset downloaded to: {path}")

Dataset downloaded to: C:\Users\Randy\.cache\kagglehub\datasets\sobhanmoosavi\us-accidents\versions\13


In [4]:
import pandas as pd
df = pd.read_csv("/Users/Randy/.cache/kagglehub/datasets/sobhanmoosavi/us-accidents/versions/13/US_Accidents_March23.csv")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,Street,City,County,State,Zipcode,Country,Timezone,Airport_Code,Weather_Timestamp,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,Dayton,Montgomery,OH,45424,US,US/Eastern,KFFO,2016-02-08 05:58:00,36.9,,91.0,29.68,10.0,Calm,,0.02,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,Reynoldsburg,Franklin,OH,43068-3402,US,US/Eastern,KCMH,2016-02-08 05:51:00,37.9,,100.0,29.65,10.0,Calm,,0.0,Light Rain,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,Williamsburg,Clermont,OH,45176,US,US/Eastern,KI69,2016-02-08 06:56:00,36.0,33.3,100.0,29.67,10.0,SW,3.5,,Overcast,False,False,False,False,False,False,False,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,Dayton,Montgomery,OH,45417,US,US/Eastern,KDAY,2016-02-08 07:38:00,35.1,31.0,96.0,29.64,9.0,SW,4.6,,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,Dayton,Montgomery,OH,45459,US,US/Eastern,KMGY,2016-02-08 07:53:00,36.0,33.3,89.0,29.65,6.0,SW,3.5,,Mostly Cloudy,False,False,False,False,False,False,False,False,False,False,False,True,False,Day,Day,Day,Day


In [3]:
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KernelDensity, NearestNeighbors

#Create GeoDataFrame from latitude and longitude for start locations
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Start_Lng, df.Start_Lat),crs="EPSG:4326")
gdf = gdf.to_crs(epsg=5070) # Convert to a projected coordinate system for accurate distance calculations
gdf["x"] = gdf.geometry.x
gdf["y"] = gdf.geometry.y

print("Data frame created")

#Multi-scale grid / tile IDs
'''
grid tiles produce compact neighborhood identifiers that capture local context and 
let you compute cell-level aggregates (counts, mean severity). Multi-scale tiles 
(fine + coarse) allow the model to see both micro and macro spatial structure
'''
def tile_id(x, y, scale):
    tile_x = (x // scale).astype(int)
    tile_y = (y // scale).astype(int)
    return tile_x.astype(str) + "_" + tile_y.astype(str)

gdf["cell_1km"] = tile_id(gdf["x"], gdf["y"], 1000)
gdf["cell_5km"] = tile_id(gdf["x"], gdf["y"], 5000)

print("Grid cells computed")

#KDE density (projected coords)
'''Gives measure of local accident concentration or density'''
import scipy.ndimage as ndi

# Fast gridded KDE (recommended)
bandwidth = 1000.0      # same units as projected coords (meters)
grid_size = 500.0       # cell size for histogram in meters (tweak for resolution/perf)

xs = gdf["x"].values
ys = gdf["y"].values

# define grid bounds with padding to avoid edge effects
pad = bandwidth * 3
xmin, xmax = xs.min() - pad, xs.max() + pad
ymin, ymax = ys.min() - pad, ys.max() + pad

nx = int(np.ceil((xmax - xmin) / grid_size))
ny = int(np.ceil((ymax - ymin) / grid_size))

H, xedges, yedges = np.histogram2d(xs, ys, bins=[nx, ny], range=[[xmin, xmax], [ymin, ymax]])

# smooth counts with gaussian filter (sigma in pixels = bandwidth / grid_size)
sigma = bandwidth / grid_size
H_smooth = ndi.gaussian_filter(H, sigma=sigma, mode="constant")

# map each point to its grid cell and assign smoothed density
ix = np.minimum(np.maximum(((xs - xmin) / grid_size).astype(int), 0), H_smooth.shape[0]-1)
iy = np.minimum(np.maximum(((ys - ymin) / grid_size).astype(int), 0), H_smooth.shape[1]-1)

# add per-point grid indices and an id for the KDE grid cell
gdf["kde_ix"] = ix
gdf["kde_iy"] = iy
gdf["kde_cell_kdegrid"] = [f"{i}_{j}" for i, j in zip(ix, iy)]

kde_vals = H_smooth[ix, iy]

# optional: convert counts -> density per square meter (makes values comparable)
cell_area = grid_size * grid_size
kde_density = kde_vals / (cell_area)

gdf["kde_1km"] = kde_density
gdf["kde_density_m2"] = kde_density

# build a DataFrame with one row per KDE-grid cell (counts, density, center coords)
gi, gj = np.indices(H_smooth.shape)       # gi.shape == gj.shape == H_smooth.shape
gi_f = gi.ravel()
gj_f = gj.ravel()
counts_f = H_smooth.ravel()
x_centers = xmin + (gi_f + 0.5) * grid_size
y_centers = ymin + (gj_f + 0.5) * grid_size

kde_grid_df = pd.DataFrame({
    "kde_cell_kdegrid": [f"{i}_{j}" for i, j in zip(gi_f, gj_f)],
    "kde_grid_count": counts_f,
    "kde_grid_density_m2": counts_f / (cell_area),
    "kde_grid_x": x_centers,
    "kde_grid_y": y_centers
})

# merge grid-level statistics back to points (so each point gets its grid's aggregate values)
gdf = gdf.merge(
    kde_grid_df[["kde_cell_kdegrid", "kde_grid_count", "kde_grid_density_m2", "kde_grid_x", "kde_grid_y"]],
    on="kde_cell_kdegrid",
    how="left"
)

print("Gridded KDE computed (fast) and KDE-grid attributes merged into gdf")

#Simple cell-level aggregates. Compute count and mean severity per 1km cell
cell_stats = gdf.groupby("cell_1km").agg(cell1_count=("ID","count"), 
                                         cell1_mean_sev=("Severity","mean")).reset_index()
gdf = gdf.merge(cell_stats, on="cell_1km", how="left")

print("Cell-level aggregates computed")

gdf.head()

'''
geometry: shapely Point for each accident (original lat/lng converted to projected coords).
x, y: projected coordinates (meters) extracted from geometry.
cell_1km / cell_5km: coarse grid tile IDs (string "tilex_tiley") at 1 km and 5 km scales.
kde_ix / kde_iy: integer array indices of the KDE histogram cell (grid column/row) each point falls in.
kde_cell_kdegrid: string id for the KDE cell ("ix_iy") — links points to the gridded KDE row.
kde_1km / kde_density_m2: per-point smoothed KDE density (converted to density per m^2) assigned from the gridded KDE.
kde_grid_count: count (smoothed) for each KDE grid cell (one row per cell in kde_grid_df).
kde_grid_density_m2: per-cell density (counts / cell_area) for each KDE grid cell.
kde_grid_x / kde_grid_y: center coordinates (projected, meters) of each KDE grid cell.
cell1_count: per-1km-cell raw count (groupby on cell_1km).
cell1_mean_sev: per-1km-cell mean Severity.

'''
#Numeric features and scaling
# numeric_features = ["X","Y","kde_1km","cell1_count","cell1_mean_sev","Temperature(F)",
#                     "Humidity(%)","Wind_Chill(F)","Pressure(in)","Visibility(mi)",
#                     "Wind_Speed(mph)", "Precipitation(in)",]
# for c in numeric_features:
#     if c not in gdf.columns:
#         gdf[c] = 0.0
# gdf[numeric_features] = gdf[numeric_features].fillna(0.0)
# scaler = StandardScaler()
# gdf[[c + "_s" for c in numeric_features]] = scaler.fit_transform(gdf[numeric_features])

# print("Numeric features scaled")

# #Categorical features
# categorical_features = ["Street","City","County","State","Zipcode","cell_1km","cell_5km",
#                         "Wind_Direction","Weather_Condition","Bump","Crossing","Give_Way",
#                         "Junction","No_Exit","Railway","Roundabout","Station","Stop",
#                         "Traffic_Calming","Traffic_Signal","Turning_Loop","Sunrise_Sunset"]
# for c in categorical_features:
#     gdf[c] = gdf[c].fillna("__MISSING__").astype(str)
#     gdf[c + "_idx"] = pd.factorize(gdf[c])[0]

# print("Categorical features encoded")

# scaled_numeric = [c + "_s" for c in numeric_features]
# cat_idx_cols = [c + "_idx" for c in categorical_features]
# features_df = pd.concat([gdf[scaled_numeric].reset_index(drop=True), gdf[cat_idx_cols].reset_index(drop=True)], axis=1)
# features_df["Severity"] = gdf["Severity"].values
# features_df["ID"] = gdf["ID"].values

# features_df.head()

Data frame created
Grid cells computed
Gridded KDE computed (fast) and KDE-grid attributes merged into gdf
Cell-level aggregates computed


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,kde_iy,kde_cell_kdegrid,kde_1km,kde_density_m2,kde_grid_count,kde_grid_density_m2,kde_grid_x,kde_grid_y,cell1_count,cell1_mean_sev
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,3320,6732_3320,7e-06,7e-06,1.810593,7e-06,1009713.0,1935422.0,23,2.565217
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,3361,6936_3361,4e-05,4e-05,10.11348,4e-05,1111713.0,1955922.0,198,2.570707
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,3142,6759_3142,2e-06,2e-06,0.48335,2e-06,1023213.0,1846422.0,12,1.916667
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,3291,6711_3291,0.000279,0.000279,69.873731,0.000279,999213.0,1920922.0,443,2.476298
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,3264,6717_3264,8.9e-05,8.9e-05,22.196821,8.9e-05,1002213.0,1907422.0,124,1.983871
