In [None]:
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
import time

import os
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Columns the rest of the notebook relies on: when and where each crash
# happened, the casualty counts, and the record id.
essential_columns = [
    'CRASH DATE', 'CRASH TIME', 'LATITUDE', 'LONGITUDE', 'BOROUGH',
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
    'COLLISION_ID'
]

# Parse only the needed columns instead of loading the whole file and then
# discarding most of it. `usecols` keeps the file's own column order, so the
# trailing selection restores the order of `essential_columns`.
df = pd.read_csv(
    "datasets/Motor_Vehicle_Collisions_-_Crashes.csv",
    usecols=essential_columns,
)[essential_columns]

In [None]:
# Keep only the crashes that have a value in every selected column, then
# preview the first rows of what is left.
df = df.dropna()
df.head()

In [None]:
# Row and column counts of the frame at this point in the notebook.
df.shape

In [None]:
# Merge the separate date and time strings into one timestamp, then derive the
# minute-resolution ISO string and the calendar fields used further down.
crash_timestamp = pd.to_datetime(
    df['CRASH DATE'] + ' ' + df['CRASH TIME'],
    format='%m/%d/%Y %H:%M',
)

df = df.assign(
    datetime=crash_timestamp,
    iso_datetime=crash_timestamp.dt.strftime('%Y-%m-%dT%H:%M'),
    year=crash_timestamp.dt.year,
    month=crash_timestamp.dt.month,
)

In [None]:
# Spot-check that the combined timestamp agrees with the raw date and time
# columns. Left as the cell's last expression so the notebook renders it as a
# table instead of plain printed text.
df[['CRASH DATE', 'CRASH TIME', 'datetime']].head()

In [None]:

def get_weather_data(lat, lon, date_time, timezone='America/New_York', retries=3, timeout=5):
    """Fetch the archived hourly weather for one location and moment.

    Requests the calendar day of ``date_time`` from the Open-Meteo archive
    endpoint and picks out the hourly record for the clock hour that contains
    ``date_time``.

    Args:
        lat: Latitude sent to the API.
        lon: Longitude sent to the API.
        date_time: Timestamp string laid out as ``YYYY-MM-DDTHH:MM``.
        timezone: Value forwarded as the API's ``timezone`` parameter.
        retries: Maximum number of request attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        A dict with the keys ``'temperature'``, ``'precipitation'`` and
        ``'weathercode'`` for the matching hour, or ``None`` when the response
        has no record for that hour or every attempt failed.
    """
    base_url = "https://archive-api.open-meteo.com/v1/archive"
    date_str = date_time[:10]  # YYYY-MM-DD

    # The hourly series is stamped on the full hour, so an event at 14:37 has
    # to be looked up as 14:00. Matching on the raw minutes only ever succeeds
    # for events that happened exactly on the hour.
    target_time = f"{date_str}T{date_time[11:13]}:00"

    params = {
        'latitude': lat,
        'longitude': lon,
        'start_date': date_str,
        'end_date': date_str,
        'hourly': 'temperature_2m,precipitation,weathercode',
        'timezone': timezone,
    }

    attempts_made = 0
    for attempt in range(retries):
        attempts_made = attempt + 1
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()

            hourly = data.get('hourly', {})
            times = hourly.get('time', [])
            if target_time in times:
                index = times.index(target_time)
                return {
                    'temperature': hourly['temperature_2m'][index],
                    'precipitation': hourly['precipitation'][index],
                    'weathercode': hourly['weathercode'][index],
                }
            # A valid response without the requested hour will not change on retry.
            return None
        except requests.exceptions.Timeout:
            print(f"Timeout occurred for lat: {lat}, lon: {lon}, datetime: {date_time} (attempt {attempt + 1}/{retries})")
        except requests.exceptions.HTTPError as e:
            # An HTTP error status is treated as final: stop retrying.
            print(f"HTTP error for lat: {lat}, lon: {lon}, datetime: {date_time}: {e}")
            break
        except requests.exceptions.RequestException as e:
            print(f"Request error for lat: {lat}, lon: {lon}, datetime: {date_time}: {e}")

        # Pause between attempts only; there is nothing to wait for after the last one.
        if attempt < retries - 1:
            time.sleep(1)

    print(f"Skipping lat: {lat}, lon: {lon}, datetime: {date_time} after {attempts_made} attempt(s).")
    return None


In [None]:
def map_weathercode_to_description(weathercode):
    """Translate a numeric weather code into a coarse condition label.

    Args:
        weathercode: Code to classify.

    Returns:
        One of 'Clear', 'Cloudy', 'Fog/Haze', 'Drizzle', 'Light Rain',
        'Heavy Rain', 'Light Snow', 'Heavy Snow' or 'Thunderstorm'; any code
        that is not listed below yields 'Unknown'.
    """
    code_groups = (
        ((0, 1), 'Clear'),
        ((2, 3), 'Cloudy'),
        ((45, 48), 'Fog/Haze'),
        ((51, 53, 55, 56, 57), 'Drizzle'),
        ((61, 80), 'Light Rain'),
        ((63, 65, 66, 67, 81, 82), 'Heavy Rain'),
        ((71, 85), 'Light Snow'),
        ((73, 75, 77, 86), 'Heavy Snow'),
        ((95, 96, 99), 'Thunderstorm'),
    )

    # First group containing the code wins; the groups do not overlap.
    for codes, label in code_groups:
        if weathercode in codes:
            return label
    return 'Unknown'


In [None]:
def _cap_rows_per_group(frame, by, cap, random_state):
    """Randomly keep at most ``cap`` rows from every ``by`` group of ``frame``."""
    if frame.empty:
        return frame

    # Iterating over the groups (instead of ``groupby(...).apply``) hands each
    # group over with all of its columns, so the grouping column is always
    # still present in the result.
    kept = [
        group.sample(n=min(len(group), cap), random_state=random_state)
        for _, group in frame.groupby(by)
    ]
    if not kept:
        return frame.iloc[0:0]
    return pd.concat(kept)


def build_yearly_sample(
    base_df,
    year,
    non_severe_frac=0.10,
    max_per_month=1000,
    max_per_borough=2000,
    random_state=42
):
    """Draw a capped sample of one year's crashes that keeps severe ones.

    A crash counts as severe when at least one person was injured or killed.
    All severe crashes and a random fraction of the remaining ones are pooled,
    the pool is capped per month and then per borough, and rows sharing the
    same latitude, longitude and timestamp are reduced to the first one.

    Args:
        base_df: Frame with at least the columns 'year', 'month', 'BOROUGH',
            'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
            'LATITUDE', 'LONGITUDE' and 'datetime'.
        year: Value of the 'year' column to sample.
        non_severe_frac: Fraction of the non-severe crashes to keep
            (values above 1.0 are treated as 1.0).
        max_per_month: Maximum number of rows kept for each month.
        max_per_borough: Maximum number of rows kept for each borough.
        random_state: Seed used for every random draw.

    Returns:
        The sampled rows as a DataFrame; an empty DataFrame when the year has
        no records.
    """
    year_df = base_df[base_df['year'] == year]
    if year_df.empty:
        print(f"No records for year {year}.")
        return pd.DataFrame()

    severe_mask = (year_df['NUMBER OF PERSONS INJURED'] > 0) | (year_df['NUMBER OF PERSONS KILLED'] > 0)
    severe_accidents = year_df[severe_mask]
    non_severe = year_df[~severe_mask]

    if not non_severe.empty:
        non_severe_sample = non_severe.sample(
            frac=min(non_severe_frac, 1.0),
            random_state=random_state
        )
    else:
        non_severe_sample = non_severe

    # Concatenate only the non-empty parts so an empty placeholder can never
    # influence the dtypes of the result.
    parts = [part for part in (severe_accidents, non_severe_sample) if not part.empty]
    sampled_df = pd.concat(parts) if parts else year_df.iloc[0:0].copy()

    sampled_df = _cap_rows_per_group(sampled_df, 'month', max_per_month, random_state)
    sampled_df = _cap_rows_per_group(sampled_df, 'BOROUGH', max_per_borough, random_state)

    sampled_df = sampled_df.drop_duplicates(subset=['LATITUDE', 'LONGITUDE', 'datetime'])

    print(f"Year {year}: sampled {sampled_df.shape[0]} rows.")
    return sampled_df


In [None]:
def enrich_with_weather(sample_df):
    """Attach weather observations to every row of a crash sample.

    Calls ``get_weather_data`` once per row with the row's 'LATITUDE',
    'LONGITUDE' and 'iso_datetime' values and stores the result in the new
    columns 'temperature', 'precipitation' and 'weather_condition'. Rows for
    which any of the three values is missing are dropped.

    Args:
        sample_df: Frame with 'LATITUDE', 'LONGITUDE' and 'iso_datetime'
            columns. It is not modified.

    Returns:
        A new DataFrame holding the enriched rows; ``sample_df`` itself when
        it is empty.
    """
    if sample_df.empty:
        return sample_df

    temperatures = []
    precipitations = []
    conditions = []

    rows = zip(sample_df['LATITUDE'], sample_df['LONGITUDE'], sample_df['iso_datetime'])
    for lat, lon, iso_datetime in tqdm(rows, total=sample_df.shape[0]):
        weather_data = get_weather_data(lat, lon, iso_datetime)
        if weather_data:
            temperatures.append(weather_data['temperature'])
            precipitations.append(weather_data['precipitation'])
            conditions.append(map_weathercode_to_description(weather_data['weathercode']))
        else:
            temperatures.append(np.nan)
            precipitations.append(np.nan)
            conditions.append(None)

    # Assign whole columns in one go: the numeric ones become float columns
    # (None turns into NaN) and the labels get a text-capable column from the
    # start, instead of writing strings into a float NaN column cell by cell.
    enriched = sample_df.copy()
    enriched['temperature'] = np.array(temperatures, dtype=float)
    enriched['precipitation'] = np.array(precipitations, dtype=float)
    enriched['weather_condition'] = conditions

    return enriched.dropna(subset=['temperature', 'precipitation', 'weather_condition'])

In [None]:
def sample_specific_years(
    base_df,
    years=range(2016, 2026),
    non_severe_frac=0.10,
    max_per_month=1000,
    max_per_borough=2000,
    random_state=42
):
    """Build one sample per requested year.

    Args:
        base_df: Frame with a 'year' column; passed on to
            ``build_yearly_sample``.
        years: Iterable of years to process.
        non_severe_frac: Forwarded to ``build_yearly_sample``.
        max_per_month: Forwarded to ``build_yearly_sample``.
        max_per_borough: Forwarded to ``build_yearly_sample``.
        random_state: Forwarded to ``build_yearly_sample``.

    Returns:
        Dict mapping each year to its sampled DataFrame. Years that are absent
        from ``base_df`` or whose sample is empty are left out.
    """
    # The set of available years does not change while looping, so compute it
    # once instead of rescanning the column for every requested year.
    available_years = set(base_df["year"].unique().tolist())

    year_to_df = {}

    for y in years:
        if y not in available_years:
            print(f"Year {y} not found in dataset, skipping.")
            continue

        print(f"\n Processing year: {y}")
        sampled_y = build_yearly_sample(
            base_df,
            year=y,
            non_severe_frac=non_severe_frac,
            max_per_month=max_per_month,
            max_per_borough=max_per_borough,
            random_state=random_state
        )

        if sampled_y.empty:
            print(f" No rows sampled for year {y}.")
            continue

        year_to_df[y] = sampled_y

    return year_to_df


In [None]:
# Draw the per-year samples once, then expose each year under its own name.
# A year without an entry in the mapping is bound to None.
yearly_samples = sample_specific_years(df)

(
    sample_2016, sample_2017, sample_2018, sample_2019, sample_2020,
    sample_2021, sample_2022, sample_2023, sample_2024, sample_2025,
) = (yearly_samples.get(year) for year in range(2016, 2026))


In [None]:
def process_single_year(base_df, year, output_dir="datasets"):
    """Sample one year, enrich it with weather data and save it as CSV.

    Args:
        base_df: Frame passed to ``build_yearly_sample``.
        year: Year to process.
        output_dir: Directory that receives ``enriched_<year>.csv``; created
            when missing.

    Returns:
        The enriched DataFrame. When it is empty, no file is written.
    """
    # Create the directory first so an unusable output location fails before
    # the slow enrichment step rather than after it.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Sampling {year}...")
    sampled_df = build_yearly_sample(base_df, year)

    print(f"Enriching {year}...")
    enriched_df = enrich_with_weather(sampled_df)

    # An empty result would be written as a CSV without usable content and
    # could replace a good file from an earlier run, so skip the write.
    if enriched_df.empty:
        print(f"No enriched rows for {year}; nothing was saved.")
        return enriched_df

    output_path = os.path.join(output_dir, f"enriched_{year}.csv")
    enriched_df.to_csv(output_path, index=False)

    print(f"Saved enriched dataset for {year} to {output_path}")
    return enriched_df

### The cells below should be run only once every 12 hours to avoid hitting API limits. Expect a long runtime: every sampled row triggers its own weather API request.

In [None]:
# Build the 2016 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2016)

In [None]:
# Build the 2017 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2017)

In [None]:
# Build the 2018 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2018)

In [None]:
# Build the 2019 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2019)

In [None]:
# Build the 2020 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2020)

In [None]:
# Build the 2021 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2021)

In [None]:
# Build the 2022 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2022)

In [None]:
# Build the 2023 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2023)

In [None]:
# Build the 2024 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2024)

In [None]:
# Build the 2025 sample, add weather to it and save the result under datasets/.
process_single_year(df, 2025)