# Get weather data

Weather can impact the number of bike riders on the road. We want to get weather data from 2013-2022 from [NOAA](https://www.ncei.noaa.gov/data/global-hourly/archive/csv/)
* Bulk download the dataset for specified years from https://www.ncei.noaa.gov/access/metadata/landing-page/bin/iso?id=gov.noaa.ncdc:C00516
* Untar and extract
* Keep only weather station data for the KNYC (Central Park, Manhattan) weather station

Weather stations can be found [here](https://www.ncei.noaa.gov/access/search/data-search/global-summary-of-the-day?startDate=2013-01-01T00:00:00&endDate=2022-03-20T23:59:59&pageNum=1&stations=72505394728). Data can be downloaded manually for specific stations and years in lieu of running this script.
* NY CITY CENTRAL PARK, NY US (72505394728.csv)

### Data
We are interested in temperature, precipitation, and whether a day had snow
* Global Hourly (downloaded using this script)
* Global Summary of Day (manually downloaded from [here](https://www.ncei.noaa.gov/access/search/data-search/global-summary-of-the-day?startDate=2013-01-01T00:00:00&endDate=2022-03-20T23:59:59&pageNum=1&stations=72505394728))

### Data dictionaries
* [Global Hourly CSV Help](https://www.ncei.noaa.gov/data/global-hourly/doc/CSV_HELP.pdf)
* [Global Hourly ISD Format](https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf)

# WARNING: Running the notebook will download many gigabytes of data

In [None]:
import os
import requests
import logging
import sys
import tarfile
import pandas as pd
import numpy as np

In [None]:
# Years to fetch (the upper bound of range() is exclusive, so this is 2013..2022)
YEARS = range(2013, 2022 + 1)

# Remote archive location: one "<year>.tar.gz" file per year
NOAA_URL = "https://www.ncei.noaa.gov/data/global-hourly/archive/csv/"
FILE_EXTENSION = ".tar.gz"

# Local directory layout (all paths keep their trailing slash)
WEATHER_DIR = "data/weather/"
HOURLY_DIR = f"{WEATHER_DIR}hourly/"
DAILY_DIR = f"{WEATHER_DIR}daily/"

# CSV file name of the KNYC (Central Park) station inside each archive
KNYC_FILE = "72505394728.csv"

In [None]:
# Create the directory that weather data is saved to.
# exist_ok=True makes this safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(WEATHER_DIR, exist_ok=True)

### Global Hourly Weather Data

In [None]:
# Download the yearly weather *.tar.gz archive for every year in YEARS.
# Each archive is large, so the response body is written to disk chunk by
# chunk instead of being read into memory in a single call.
for year in YEARS:
    noaa_url = NOAA_URL + str(year) + FILE_EXTENSION
    local_filename = WEATHER_DIR + str(year) + FILE_EXTENSION

    with requests.get(noaa_url, stream=True) as r:
        # abort on HTTP errors before anything is written locally
        r.raise_for_status()

        logging.debug(f"downloaded {noaa_url}. saving to {local_filename}...")
        with open(local_filename, "wb") as f:
            # 1 MiB chunks keep memory use flat regardless of archive size
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

In [None]:
# Extract only the KNYC station file from each yearly archive into
# WEATHER_DIR/<year>/.
for year in YEARS:
    # the context manager closes the archive even if extraction fails
    with tarfile.open(WEATHER_DIR + str(year) + FILE_EXTENSION) as tar:
        YEAR_DIR = WEATHER_DIR + str(year) + "/"

        # safe to re-run: no error if the year directory already exists
        os.makedirs(YEAR_DIR, exist_ok=True)

        tar.extract(member=KNYC_FILE, path=YEAR_DIR)

In [None]:
def c2f(celsius):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit."""
    scaled = celsius * 9 / 5
    return scaled + 32

In [None]:
# Create the hourly weather dir.
# exist_ok=True makes this safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(HOURLY_DIR, exist_ok=True)

In [None]:
# Process and save hourly weather data.
# "Hourly" weather for NYC from the KNYC weather station: for each year, read
# the extracted station CSV, decode the TMP and AA1 fields, drop rows without
# usable values and write the result to HOURLY_DIR/<year>.csv.
for year in YEARS:
    hourly_file = WEATHER_DIR + str(year) + "/" + KNYC_FILE
    hourly = pd.read_csv(hourly_file, usecols=["DATE", "TMP", "AA1"])
    # rows missing any of the three fields cannot be parsed below
    hourly.dropna(axis=0, inplace=True)

    # TMP -> keep digits before comma -> divide by 10, keep sign
    # (in celsius -> convert to F). A decoded value of 999.9 is the
    # missing-value sentinel, so it is blanked out before conversion.
    # np.nan is used because the np.NAN alias no longer exists in NumPy 2.0.
    hourly["TMP"] = (
        hourly["TMP"]
        .apply(lambda s: int(s.split(",")[0]) / 10)
        .replace(999.9, np.nan)
        .map(c2f)
    )
    # AA1 -> precipitation. <?, amount precip in mm/10, ?, ?>
    # Only the second comma-separated field is used; 999.9 again marks missing.
    hourly["AA1"] = (
        hourly["AA1"].apply(lambda a: int(a.split(",")[1]) / 10).replace(999.9, np.nan)
    )
    # TODO: DATE -> need to round to nearest hour, drop duplicate hours,
    # if no hour impute (not implemented yet)

    # drop the 999.9->NAs as time series model needs data
    hourly.dropna(axis=0, inplace=True)
    hourly.rename(
        columns={"DATE": "date", "TMP": "temp_f", "AA1": "precip_mm"}, inplace=True
    )

    hourly.to_csv(HOURLY_DIR + str(year) + ".csv")

### Global Summary of Daily Weather Data
NOTE: This data was downloaded manually as specified above

In [None]:
# Create the daily weather dir.
# exist_ok=True makes this safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(DAILY_DIR, exist_ok=True)

In [None]:
# Process and save daily weather data.
# Daily weather for NYC from the KNYC weather station: for each year, read the
# manually downloaded summary CSV from WEATHER_DIR/<year>.csv, turn snow depth
# into a 0/1 snow-day flag and write the result to DAILY_DIR/<year>.csv.
for year in YEARS:
    daily_file = WEATHER_DIR + str(year) + ".csv"
    # daily weather from KNYC weather station: NY CITY CENTRAL PARK, NY US
    daily = pd.read_csv(daily_file, usecols=["DATE", "TEMP", "SNDP", "PRCP"])

    # TEMP -> leave as is (in Fahrenheit)
    # SNDP -> 0 for 999.9 (missing) and 1 for any snow.
    # The result is assigned back explicitly: calling replace(..., inplace=True)
    # on a column selection is not guaranteed to update the DataFrame
    # (it is a no-op under pandas Copy-on-Write).
    daily["SNDP"] = daily["SNDP"].replace(999.9, 0)
    daily["SNDP"] = np.sign(daily["SNDP"])
    daily["SNDP"] = daily["SNDP"].astype("int")  # 0/1 instead of boolean?

    # NOTE(review): the Global Summary of the Day documentation appears to
    # report PRCP (and SNDP) in inches, with 99.99 marking missing PRCP --
    # confirm whether "precip_mm" needs a unit conversion / sentinel handling.
    daily.rename(
        columns={
            "DATE": "date",
            "TEMP": "temp_f",
            "SNDP": "is_snowday",
            "PRCP": "precip_mm",
        },
        inplace=True,
    )

    daily.to_csv(DAILY_DIR + str(year) + ".csv")