# Notebook that downloads GPM rainfall data for each typhoon

In [25]:
%load_ext jupyter_black

In [3]:
import getpass
import os
from pathlib import Path


import pandas as pd
import datetime as dt
from bs4 import BeautifulSoup
import requests

In [4]:
# Input directory, resolved relative to the STORM_DATA_DIR environment variable
input_dir = Path(os.getenv("STORM_DATA_DIR")).joinpath(
    "analysis/02_new_model_input/03_rainfall/input"
)
# Sub-folder of the input directory in which the GPM data is saved
gpm_file_name = "gpm_data/rainfall_data/output_hhr/"
gpm_folder_path = input_dir / gpm_file_name

In [11]:
# An account is required to download the data; to create one, follow the
# instructions at https://registration.pps.eosdis.nasa.gov/registration/
# The user name and password are prompted for here (input is not echoed),
# so neither value is stored in the notebook.
USERNAME = getpass.getpass("Username: ")
PASSWORD = getpass.getpass("Password: ")

# Number of days relative to the landfall date for which to collect data
DAYS_TO_LANDFALL = 2

Username:  ········
Password:  ········


In [26]:
# Read the typhoon metadata, indexed by typhoon name.
# Only the landfall date is really needed further down.
date_columns = ["startdate", "enddate", "landfalldate"]
typhoon_metadata = pd.read_csv(input_dir / "metadata_typhoons.csv")
typhoon_metadata = typhoon_metadata.set_index("typhoon")
# Parse the day/month/year strings of every date column into datetimes
typhoon_metadata = typhoon_metadata.assign(
    **{
        name: pd.to_datetime(typhoon_metadata[name], format="%d/%m/%Y")
        for name in date_columns
    }
)
typhoon_metadata

Unnamed: 0_level_0,startdate,enddate,landfalldate,landfall_time,imerg_type
typhoon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
durian2006,2006-11-24,2006-12-09,2006-11-30,06:00:00,final
fengshen2008,2008-06-17,2008-06-25,2008-06-20,06:00:00,final
aere2011,2011-05-05,2011-05-15,2011-05-07,21:00:00,final
atsani2020,2020-10-29,2020-11-07,2020-11-06,00:00:00,final
bopha2012,2012-11-25,2012-12-09,2012-12-03,21:00:00,final
danas2019,2019-07-14,2019-07-23,2019-07-17,00:00:00,final
fung-wong2014,2014-09-17,2014-09-25,2014-09-19,03:00:00,final
goni2015,2015-08-13,2015-08-30,2015-08-22,00:00:00,final
goni2020,2020-10-28,2020-11-06,2020-10-31,21:00:00,final
hagupit2014,2014-11-30,2014-12-12,2014-12-06,18:00:00,final


In [None]:
#%% Functions used
def list_files(url):
    """Return the URLs of all files ending in "tif" linked from a web page.

    The page at ``url`` is requested with the module-level ``USERNAME`` and
    ``PASSWORD`` credentials and parsed as HTML.

    Parameters
    ----------
    url : str
        Address of the page listing the files, without a trailing slash.

    Returns
    -------
    list of str
        ``url + "/" + href`` for every anchor whose ``href`` ends in "tif".
    """
    page = requests.get(url, auth=(USERNAME, PASSWORD)).text
    soup = BeautifulSoup(page, "html.parser")
    # href=True keeps only anchors that actually carry an href attribute;
    # without it node.get("href") can be None and .endswith() would raise
    # AttributeError.
    return [
        url + "/" + node.get("href")
        for node in soup.find_all("a", href=True)
        if node.get("href").endswith("tif")
    ]


def download_gpm_http(start_date, end_date, download_path):
    """Download the GPM GeoTIFF files for every day in a date range.

    For each day produced by ``pd.date_range(start_date, end_date)`` the
    files listed under ``<base_url>/YYYY/MM/DD/gis`` are downloaded into
    ``download_path / "YYYYMMDD"``, which is created if it does not exist.

    Parameters
    ----------
    start_date, end_date
        Bounds of the date range, in any form accepted by ``pd.date_range``.
    download_path : pathlib.Path
        Directory under which the per-day folders are created.

    Returns
    -------
    list of pathlib.Path
        Paths of the files that were written.

    Raises
    ------
    requests.HTTPError
        If the server answers a file request with an error status.
    """
    base_url = "https://arthurhouhttps.pps.eosdis.nasa.gov/pub/gpmdata"

    file_list = []

    for date in pd.date_range(start_date, end_date):
        print(f"Downloading data for date {date}")
        day_path = download_path / date.strftime("%Y%m%d")
        day_path.mkdir(parents=True, exist_ok=True)

        url = f"{base_url}/{date.strftime('%Y/%m/%d')}/gis"

        for tiff_file in list_files(url=url):
            file_path = day_path / tiff_file.split("/")[-1]
            # Authenticate with the same credentials as list_files; the
            # previous code referenced an undefined name here.
            r = requests.get(tiff_file, auth=(USERNAME, PASSWORD))
            # Fail loudly rather than saving an HTTP error page as a .tif
            r.raise_for_status()
            with open(file_path, "wb") as tiff_out:
                tiff_out.write(r.content)
            file_list.append(file_path)

    return file_list

## Download the data

This section is for downloading the data.
It takes a long time to complete.

In [None]:
# Download the GPM data for a window of DAYS_TO_LANDFALL days on either side
# of each typhoon's landfall date.
landfall_window = dt.timedelta(days=DAYS_TO_LANDFALL)
for typhoon, metadata in typhoon_metadata.iterrows():
    start_date = metadata["landfalldate"] - landfall_window
    # The window ends after landfall. Subtracting here as well made end_date
    # equal to start_date, so only a single pre-landfall day was fetched.
    end_date = metadata["landfalldate"] + landfall_window
    print(f"Downloading data for {typhoon} between {start_date} and {end_date}")
    download_gpm_http(
        start_date=start_date,
        end_date=end_date,
        download_path=gpm_folder_path / typhoon / "GPM",
    )