# **Open Meteo API - Hourly Weather Data for NYC's Zip Codes in 2018**

## Import Packages

In [1]:
import time

import openmeteo_requests
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup
from retry_requests import retry

## NYC Zip Codes
Starting from the dataset (https://data.cityofnewyork.us/City-Government/Broadband-Adoption-and-Infrastructure-by-Zip-Code/qz5f-yx82) obtained from the same website from which the accident data were extracted, all zip codes associated with the city of New York were retrieved.

In [2]:
# Load the broadband dataset used as the source of New York City zip codes
nyc_zipcodes_data = pd.read_csv('Broadband_Adoption_and_Infrastructure_by_Zip_Code_20240209.csv')

# Keep every non-missing zip code once, in order of first appearance,
# and convert each value to a plain Python int
unique_zip_values = nyc_zipcodes_data['Zip Code'].dropna().drop_duplicates()
zip_codes = [int(zip_value) for zip_value in unique_zip_values]

## Retrieve Latitude and Longitude for each Zip Code
Using **web scraping** from the website https://www.zipdatamaps.com/{zip_code}, it was possible to extract the latitude and longitude of each zip code in New York City previously obtained.

In [3]:
# Seconds to wait for the website before giving up on a single zip code,
# so that one hung connection cannot stall the whole scrape
REQUEST_TIMEOUT_SECONDS = 30

# One record per successfully scraped zip code. The dataframe is built once
# after the loop instead of being grown with pd.concat on every iteration.
coordinate_records = []

# Loop through the zip codes in the list
for zip_code in zip_codes:
    url = f"https://www.zipdatamaps.com/{zip_code}"

    # A network failure on one zip code is reported and skipped rather than
    # aborting the scrape of all the remaining ones
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        print(f"Request failed for the zip code {zip_code}: {error}")
        continue

    soup = BeautifulSoup(page.content, "html.parser")
    table_cells = soup.find_all('td')

    # Index of the first cell mentioning "Coordinates"; the values are
    # expected in the cell that immediately follows it
    label_index = next(
        (index for index, cell in enumerate(table_cells) if "Coordinates" in str(cell)),
        None,
    )

    if label_index is None or label_index + 1 >= len(table_cells):
        print(f"Coordinates not found for the zip code {zip_code}")
        continue

    coordinates_text = table_cells[label_index + 1].text.strip()

    # Split the coordinates into latitude and longitude. ValueError covers both
    # a non-numeric value and a cell that does not hold exactly two values.
    try:
        lat, lon = map(float, coordinates_text.split(','))
    except ValueError:
        print(f"Coordinates could not be parsed for the zip code {zip_code}: {coordinates_text!r}")
        continue

    coordinate_records.append({"Zip Code": zip_code, "Latitude": lat, "Longitude": lon})

# Build the final dataframe in one step; the explicit column list keeps the
# expected columns even when no zip code could be scraped
result_df = pd.DataFrame(coordinate_records, columns=["Zip Code", "Latitude", "Longitude"])

# Print the final dataframe
print(result_df)


Coordinates not found for the zip code 83
Coordinates not found for the zip code 10047
Coordinates not found for the zip code 10048
Coordinates not found for the zip code 10096
Coordinates not found for the zip code 10097
Coordinates not found for the zip code 10196
    Zip Code   Latitude  Longitude
0      10001  40.750244 -73.997017
1      10002  40.713882 -73.985924
2      10003  40.731991 -73.988869
3      10004  40.694939 -74.016922
4      10005  40.706150 -74.008568
..       ...        ...        ...
237    11691  40.600616 -73.762558
238    11692  40.595150 -73.796173
239    11693  40.611607 -73.815712
240    11694  40.575130 -73.851662
241    11697  40.557446 -73.913467

[242 rows x 3 columns]


## Open Meteo - API Requests

The **Open Meteo API** (https://open-meteo.com/en/docs/historical-weather-api/) is an open API that, by taking as input the geographic coordinates of a location and a time interval, is capable of returning hourly weather conditions of choice for the specified location and period. 

However, the API enforces **rate limits** on a per-minute, hourly, and daily basis. For this reason, to extract the weather information (specifically, temperature at 2 meters above ground, relative humidity at 2 meters above ground, rainfall level, snow level, and percentage of cloud cover) for **242 different locations**, the list of zip codes was divided into two parts and two separate requests were made *90 seconds apart*, so as not to exceed the per-minute limit.

In [3]:
# Extract the coordinate columns of the scraped dataframe as plain lists
latitudes = result_df['Latitude'].tolist()
longitudes = result_df['Longitude'].tolist()

# Cut both lists at the same position; when the number of locations is odd,
# the first part receives the extra element
split_point = (len(latitudes) + 1) // 2
first_part = slice(None, split_point)
second_part = slice(split_point, None)

subsets_lat = [latitudes[part] for part in (first_part, second_part)]
subsets_long = [longitudes[part] for part in (first_part, second_part)]

In [4]:
# Initialize the url of the archive endpoint
url = "https://archive-api.open-meteo.com/v1/archive"

# Set the time interval: it starts one day before 2018 and ends on the last day of 2018
start_date = '2017-12-31'
stop_date = '2018-12-31'

# Build the Open-Meteo API client on top of a cached session
# that retries failed requests
cache_session = requests_cache.CachedSession(
    '.cache',
    expire_after=-1,
)
retry_session = retry(
    cache_session,
    retries=5,
    backoff_factor=0.2,
)
openmeteo = openmeteo_requests.Client(session=retry_session)

In [5]:
# Pause, in seconds, between two consecutive API requests so that the
# per-minute limit of the API is not exceeded
SECONDS_BETWEEN_REQUESTS = 90

# One entry per batch of coordinates, in the same order as subsets_lat
responses = []

# Repeat the request for each list of coordinates
for batch_number, (batch_lat, batch_long) in enumerate(zip(subsets_lat, subsets_long)):
    # Wait before every request except the first one: the pause is only needed
    # between requests, so nothing is slept after the last one
    if batch_number > 0:
        time.sleep(SECONDS_BETWEEN_REQUESTS)

    # Define parameters to make the API request
    params = {
        "latitude": batch_lat,
        "longitude": batch_long,
        "hourly": ["temperature_2m", "relative_humidity_2m", "rain", "snowfall", "cloud_cover"],
        "timezone": "auto",
        "start_date": start_date,
        "end_date": stop_date
    }

    # Make the API request and keep its result for this batch
    response = openmeteo.weather_api(url, params=params)
    responses.append(response)

In [6]:
# Flatten the per-batch results into a single list, whatever the number of
# batches. Batch order and the order inside each batch are preserved, so the
# k-th element still corresponds to the k-th row of result_df.
resp = [location_response for batch in responses for location_response in batch]

# Display how many locations were returned in total
len(resp)

242

In [7]:
# Hourly variables, listed in the same order as in the "hourly" parameter of
# the API request: hourly.Variables(k) is read positionally below
hourly_variable_names = ["temperature_2m", "relative_humidity_2m", "rain", "snowfall", "cloud_cover"]

# Zip code of every location, computed once instead of on each iteration
location_zip_codes = list(result_df['Zip Code'])

# Responses are matched to zip codes purely by position, so both sequences
# must have the same length for the labels to be trustworthy
assert len(resp) == len(location_zip_codes), (
    f"{len(resp)} API responses cannot be paired with {len(location_zip_codes)} zip codes"
)

# One dataframe of hourly weather data per location; they are concatenated
# once after the loop instead of re-copying the growing dataset every time
location_frames = []

# Process location's data
for location_zip_code, response in zip(location_zip_codes, resp):
    hourly = response.Hourly()

    # Hourly timestamps from the start (inclusive) to the end (exclusive) of the response.
    # NOTE(review): the values are parsed as seconds since the Unix epoch without
    # a timezone, so the resulting dates look like naive UTC rather than New York
    # local time - confirm this is what the consumers of the dataset expect.
    hourly_data = {"date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s"),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s"),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left"
    )}

    # Save each hourly variable under its own column
    for position, variable_name in enumerate(hourly_variable_names):
        hourly_data[variable_name] = hourly.Variables(position).ValuesAsNumpy()

    # Label every row with the zip code of the location
    hourly_data['zip_code'] = location_zip_code

    location_frames.append(pd.DataFrame(data=hourly_data).sort_values(by='date'))

# Dataframe collecting the hourly weather data of every location
# (left empty when there is no response to process)
weather_dataset = pd.concat(location_frames) if location_frames else pd.DataFrame()

When a time interval is passed to the API, the returned timestamps run from 05:00:00 of the start date (which is why the start date was set to 2017-12-31) to 04:00:00 of the day following the end date. Because of the large amount of downloaded data, the timestamps falling outside 2018 were removed to reduce the dataset size.

In [8]:
# Boolean masks delimiting the year 2018 (both bounds are inclusive)
on_or_after_start = weather_dataset['date'] >= '2018-01-01 00:00:00'
on_or_before_end = weather_dataset['date'] <= '2018-12-31 23:59:59'

# Keep only the rows whose timestamp falls inside 2018
weather_dataset = weather_dataset[on_or_after_start & on_or_before_end]

## Export Data
Export data into a csv file.

In [10]:
# Write the hourly weather table to a csv file, leaving out the dataframe index
output_path = 'Weather.csv'
weather_dataset.to_csv(output_path, index=False)