In [1]:
import pandas as pd
import numpy as np

# Load the sampled dataset
df = pd.read_csv("US_Accidents_March23_sampled_1M.csv")

In [2]:
# 1. Parse datetimes
df["Start_Time"] = pd.to_datetime(df["Start_Time"], errors="coerce")
df["End_Time"]   = pd.to_datetime(df["End_Time"], errors="coerce")

In [3]:
# 2. Drop duplicates and rows with invalid times
df = df.drop_duplicates(subset="ID")
df = df.dropna(subset=["Start_Time", "End_Time"])

In [4]:
# 3. Drop rows missing critical location data
df = df.dropna(subset=["Start_Lat", "Start_Lng"])

In [5]:
# 4. Compute incident duration in minutes
df["Duration_Minutes"] = (df["End_Time"] - df["Start_Time"]).dt.total_seconds() / 60

In [6]:
# 5. Extract temporal features
df["Hour"]        = df["Start_Time"].dt.hour
df["DayOfWeek"]   = df["Start_Time"].dt.weekday
df["Month"]       = df["Start_Time"].dt.month
df["IsWeekend"]   = df["DayOfWeek"].isin([5,6]).astype(int)

In [7]:
# 6. Encode boolean traffic feature flags as integers
bool_cols = [
    "Roundabout",
    "Station",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
    "Turning_Loop"
]
for col in bool_cols:
    df[col] = df[col].astype(int)

In [8]:
# 7. Encode light condition as binary day/night
df["IsDay"] = (df["Sunrise_Sunset"] == "Day").astype(int)

In [9]:
# 8. Drop columns not used for modeling
drop_cols = [
    "ID",
    "Source",
    "Description",
    "Street",
    "Start_Time",
    "End_Time",
    "Sunrise_Sunset",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight"
]
df = df.drop(columns=drop_cols)

In [10]:
# 9. Handle missing values in numeric columns
#    Fill numeric NaNs with median
num_cols = df.select_dtypes(include="number").columns.tolist()
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [11]:
# 10. Final clean-up: remove any remaining rows with NaNs
df = df.dropna()

In [12]:
df['Severity'].value_counts()

Severity
2    764946
3    162587
4     24830
1      8502
Name: count, dtype: int64

In [13]:
# 11. stratify/drop rare severity classes if needed
# e.g., keep only severity levels 1-4
df = df[df["Severity"].isin([1,2,3,4])]

# Save cleaned dataset
df.to_csv("accidents_cleaned.csv", index=False)

In [14]:
print("Cleaned dataset shape:", df.shape)
df.head()

Cleaned dataset shape: (960865, 40)


Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),City,County,State,Zipcode,Country,Timezone,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Duration_Minutes,Hour,DayOfWeek,Month,IsWeekend,IsDay
0,1,26.7069,-80.11936,0.0,West Palm Beach,Palm Beach,FL,33417-4638,US,US/Eastern,...,0,0,1,0,60.0,9,4,4,0,1
1,2,38.781024,-121.26582,0.045,Roseville,Placer,CA,95678-1907,US,US/Pacific,...,1,0,0,0,103.133333,10,3,4,0,1
2,3,33.985249,-84.269348,0.0,Alpharetta,Fulton,GA,30022,US,US/Eastern,...,0,0,0,0,30.0,16,4,8,0,1
3,3,47.118706,-122.556908,0.0,Tacoma,Pierce,WA,98433,US,US/Pacific,...,0,0,0,0,33.733333,15,4,9,0,1
4,2,33.451355,-111.890343,0.0,Scottsdale,Maricopa,AZ,85256,US,US/Mountain,...,0,0,0,0,76.433333,16,0,6,0,1
