In [13]:
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm

In [4]:
df = pd.read_csv("crashes.csv")

essential_columns = [
    'CRASH DATE', 'CRASH TIME', 'LATITUDE', 'LONGITUDE', 'BOROUGH',
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
    'COLLISION_ID'
]

df = df[essential_columns]

  df = pd.read_csv("crashes.csv")


In [5]:
df.dropna(inplace=True)
df.head(5)

Unnamed: 0,CRASH DATE,CRASH TIME,LATITUDE,LONGITUDE,BOROUGH,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,COLLISION_ID
2,11/01/2023,1:29,40.62179,-73.970024,BROOKLYN,1.0,0.0,0,0,0,0,1,0,4675373
9,09/11/2021,9:35,40.667202,-73.8665,BROOKLYN,0.0,0.0,0,0,0,0,0,0,4456314
10,12/14/2021,8:13,40.683304,-73.917274,BROOKLYN,0.0,0.0,0,0,0,0,0,0,4486609
13,12/14/2021,8:17,40.86816,-73.83148,BRONX,2.0,0.0,0,0,0,0,2,0,4486660
14,12/14/2021,21:10,40.67172,-73.8971,BROOKLYN,0.0,0.0,0,0,0,0,0,0,4487074


In [6]:
df['datetime'] = pd.to_datetime(df['CRASH DATE'] + ' ' + df['CRASH TIME'], format='%m/%d/%Y %H:%M')

df['iso_datetime'] = df['datetime'].dt.strftime('%Y-%m-%dT%H:%M')

In [7]:
print(df[['CRASH DATE', 'CRASH TIME', 'iso_datetime']].head())

    CRASH DATE CRASH TIME      iso_datetime
2   11/01/2023       1:29  2023-11-01T01:29
9   09/11/2021       9:35  2021-09-11T09:35
10  12/14/2021       8:13  2021-12-14T08:13
13  12/14/2021       8:17  2021-12-14T08:17
14  12/14/2021      21:10  2021-12-14T21:10


In [8]:
def get_weather_data(lat, lon, date_time, timezone='America/New_York'):
    base_url = "https://archive-api.open-meteo.com/v1/archive"
    date_str = date_time[:10]  # take YYYY-MM-DD from the ISO datetime for beter formatting
    hour_str = date_time[11:]  # take HH:MM for same reason

    params = {
        'latitude': lat,
        'longitude': lon,
        'start_date': date_str,
        'end_date': date_str,
        'hourly': 'temperature_2m,precipitation,weathercode',
        'timezone': timezone
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()

        # finding the weather for a specific hour
        target_time = f"{date_str}T{hour_str}"
        if 'hourly' in data and 'time' in data['hourly']:
            times = data['hourly']['time']
            if target_time in times:
                index = times.index(target_time)
                return {
                    'temperature': data['hourly']['temperature_2m'][index],
                    'precipitation': data['hourly']['precipitation'][index],
                    'weathercode': data['hourly']['weathercode'][index]
                }
        return None
    except requests.exceptions.RequestException as e:
        print(f"API request error: {e}")
        return None


In [9]:
def map_weathercode_to_description(weathercode):
    if weathercode in [0, 1]:
        return 'Clear'
    elif weathercode in [2, 3]:
        return 'Cloudy'
    elif weathercode in [45, 48]:
        return 'Fog/Haze'
    elif weathercode in [51, 53, 55, 56, 57]:
        return 'Drizzle'
    elif weathercode in [61, 80]:
        return 'Light Rain'
    elif weathercode in [63, 65, 66, 67, 81, 82]:
        return 'Heavy Rain'
    elif weathercode in [71, 85]:
        return 'Light Snow'
    elif weathercode in [73, 75, 77, 86]:
        return 'Heavy Snow'
    elif weathercode in [95, 96, 99]:
        return 'Thunderstorm'
    else:
        return 'Unknown'


In [10]:
df['temperature'] = None
df['precipitation'] = None
df['weather_condition'] = None

df_sampled = df.sample(500, random_state=42)
for index, row in tqdm(df_sampled.iterrows(), total=df_sampled.shape[0]):
    weather_data = get_weather_data(row['LATITUDE'], row['LONGITUDE'], row['iso_datetime'])
    if weather_data:
        df_sampled.at[index, 'temperature'] = weather_data['temperature']
        df_sampled.at[index, 'precipitation'] = weather_data['precipitation']
        df_sampled.at[index, 'weather_condition'] = map_weathercode_to_description(weather_data['weathercode'])


100%|██████████| 500/500 [05:17<00:00,  1.58it/s]


In [11]:
df_sampled.dropna(inplace=True)
df_sampled


Unnamed: 0,CRASH DATE,CRASH TIME,LATITUDE,LONGITUDE,BOROUGH,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,COLLISION_ID,datetime,iso_datetime,temperature,precipitation,weather_condition
740226,09/25/2018,20:00,40.831932,-73.906530,BRONX,0.0,0.0,0,0,0,0,0,0,3988010,2018-09-25 20:00:00,2018-09-25T20:00,21.6,0.7,Drizzle
559580,08/12/2019,22:00,40.618977,-73.951225,BROOKLYN,0.0,0.0,0,0,0,0,0,0,4190081,2019-08-12 22:00:00,2019-08-12T22:00,25.1,0.0,Cloudy
2193528,07/24/2025,15:00,40.555836,-74.212790,STATEN ISLAND,0.0,0.0,0,0,0,0,0,0,4830450,2025-07-24 15:00:00,2025-07-24T15:00,31.7,0.0,Cloudy
21510,12/24/2021,16:00,40.647590,-74.011420,BROOKLYN,1.0,0.0,1,0,0,0,0,0,4489420,2021-12-24 16:00:00,2021-12-24T16:00,6.1,0.0,Cloudy
1556371,03/08/2015,14:00,40.720557,-74.003510,MANHATTAN,0.0,0.0,0,0,0,0,0,0,3182326,2015-03-08 14:00:00,2015-03-08T14:00,6.1,0.0,Cloudy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249490,07/24/2016,14:00,40.781670,-73.830760,QUEENS,0.0,0.0,0,0,0,0,0,0,3487724,2016-07-24 14:00:00,2016-07-24T14:00,33.5,0.0,Cloudy
392321,09/10/2020,16:00,40.608593,-74.004860,BROOKLYN,1.0,0.0,0,0,0,0,1,0,4346648,2020-09-10 16:00:00,2020-09-10T16:00,26.0,0.1,Drizzle
875623,03/08/2018,22:00,40.846500,-73.848440,BRONX,0.0,0.0,0,0,0,0,0,0,3859592,2018-03-08 22:00:00,2018-03-08T22:00,0.8,0.1,Light Snow
1878918,09/02/2013,16:00,40.729186,-73.987249,MANHATTAN,0.0,0.0,0,0,0,0,0,0,15759,2013-09-02 16:00:00,2013-09-02T16:00,26.1,0.3,Drizzle


In [12]:
df_sampled.to_csv("random_crash_data_with_weather.csv", index=False)