In [29]:
import rioxarray
import rasterio
import pandas as pd
import os
import geopandas
import ftplib
import re
import zipfile
import pickle
import shutil
from datetime import datetime
from datetime import timedelta
from tqdm import tqdm

In [22]:
data_input_dir = "data/gpm-imerg"
aoi_dir = "data/aoi.geojson"
geodf = geopandas.read_file(aoi_dir)
grace_dates_dir = "data/grace_dates.pickle"
gpm_imerg_dir = "/Users/pawel/jsimpsonhttps.pps.eosdis.nasa.gov/imerg/gis"

In [4]:
geodf.bounds

Unnamed: 0,minx,miny,maxx,maxy
0,18.406925,50.476079,19.680133,51.179343


In [23]:
with open(grace_dates_dir, 'rb') as f:
    grace_dates = pickle.load(f)

In [24]:
grace_dates = [datetime.strptime(date, "%Y-%m-%d") for date in grace_dates]

In [25]:
def generate_date_path(date):
    return os.path.join(gpm_imerg_dir, date.strftime('%Y'), date.strftime('%m'))

In [39]:
def handle_file(date):
    file_path = generate_date_path(date)
    extracted_path = os.path.join(file_path, "extracted")
    zips = os.listdir(file_path)
    file_pattern = ".*" + date.strftime('%Y%m%d') + "-.*E235959.*7day.zip"
    file_patter = re.compile(file_pattern)
    zips = list(filter(file_patter.match, zips))
    zip = zips[0]
    zip_path = os.path.join(file_path, zip)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_path)
    except:
        print("Error in: " + zip_path)

    files = os.listdir(extracted_path)
    file_pattern = ".*7day.tif"
    file_patter = re.compile(file_pattern)
    file_list = list(filter(file_patter.match, files))
    file = file_list[0]
    tif_path = os.path.join(extracted_path, file)
    data = rasterio.open(tif_path)
    data = rioxarray.open_rasterio(data)[0]
    data = data.rio.write_crs("EPSG:4326")
    data = data.astype("float32")
    data = data.rio.clip(geodf.geometry.values, geodf.crs, all_touched=True)
    data = data.rio.write_crs("EPSG:4326")
    masked = data.to_masked_array()
    shutil.rmtree(extracted_path)
    return masked.data

In [30]:
def aggregate_imerg_data(grace_date):
    file = handle_file(grace_date)
    for i in range(3):
        grace_date = grace_date - timedelta(days=7)
        file = file + handle_file(grace_date)
    return file


In [40]:
df = pd.DataFrame(columns = ["date", "value"])
for grace_date in tqdm(grace_dates):
    df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)

  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_index=True)
  df = df.append({"date": grace_date, "value": aggregate_imerg_data(grace_date)}, ignore_in

In [41]:
df

Unnamed: 0,date,value
0,2002-04-17,"[[nan, nan, nan, nan, nan, nan, 111.0, 83.0, 7..."
1,2002-05-10,"[[nan, nan, nan, nan, nan, nan, 406.0, 328.0, ..."
2,2002-08-16,"[[nan, nan, nan, nan, nan, nan, 223.0, 254.0, ..."
3,2002-09-16,"[[nan, nan, nan, nan, nan, nan, 336.0, 355.0, ..."
4,2002-10-16,"[[nan, nan, nan, nan, nan, nan, 548.0, 595.0, ..."
...,...,...
201,2021-10-16,"[[nan, nan, nan, nan, nan, nan, 249.0, 272.0, ..."
202,2021-11-16,"[[nan, nan, nan, nan, nan, nan, 302.0, 329.0, ..."
203,2021-12-16,"[[nan, nan, nan, nan, nan, nan, 313.0, 277.0, ..."
204,2022-01-16,"[[nan, nan, nan, nan, nan, nan, 197.0, 304.0, ..."


In [4]:
gpm_imerg_zip_dir = "data/gpm-imerg/zip"
gpm_imerg_extracted_dir = "data/gpm-imerg/extracted"
zips = os.listdir(gpm_imerg_zip_dir)
file_pattern = ".*.zip"
file_patter = re.compile(file_pattern)
zips = list(filter(file_patter.match, zips))
for zipf in zips:
    try:
        with zipfile.ZipFile(os.path.join(gpm_imerg_zip_dir, zipf), 'r') as zip_ref:
            zip_ref.extractall(gpm_imerg_extracted_dir)
    except:
        print("Error in: " + zipf)
        

Error in: 3B-MO-L.GIS.IMERG.20190601.V06B.zip


In [5]:
gpm_imerg_extracted_dir = "data/gpm-imerg/extracted"
files = os.listdir(gpm_imerg_extracted_dir)
file_pattern = ".*V06B.tif"
file_patter = re.compile(file_pattern)
file_list = list(filter(file_patter.match, files))

In [6]:
df = pd.DataFrame(columns = ["date", "value"])

for f in file_list: 
    input_raster = os.path.join(gpm_imerg_extracted_dir, f)
    data = rasterio.open(input_raster)
    data = rioxarray.open_rasterio(data)[0]
    data = data.rio.write_crs("EPSG:4326")
    data = data.astype("float32")
    data = data.rio.clip(geodf.geometry.values, geodf.crs, all_touched=True)
    data = data.rio.write_crs("EPSG:4326")
    masked = data.to_masked_array()
    datetime = f[18:22] + "-" + f[22:24] + "-" + f[24:26]
    df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
    #df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)

  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignore_index=True)
  df = df.append({'date': datetime, 'value': masked.data}, ignor

In [42]:
df.to_pickle("data/gpm-imerg_df.pickle")

In [8]:
df

Unnamed: 0,date,value
0,2019-12-01,"[[nan, nan, nan, nan, nan, nan, 18.0, 23.0, 23..."
1,2022-04-01,"[[nan, nan, nan, nan, nan, nan, 64.0, 59.0, 64..."
2,2018-06-01,"[[nan, nan, nan, nan, nan, nan, 71.0, 67.0, 68..."
3,2009-11-01,"[[nan, nan, nan, nan, nan, nan, 30.0, 35.0, 36..."
4,2008-05-01,"[[nan, nan, nan, nan, nan, nan, 60.0, 58.0, 46..."
...,...,...
237,2016-04-01,"[[nan, nan, nan, nan, nan, nan, 43.0, 44.0, 49..."
238,2017-10-01,"[[nan, nan, nan, nan, nan, nan, 120.0, 119.0, ..."
239,2006-07-01,"[[nan, nan, nan, nan, nan, nan, 12.0, 13.0, 14..."
240,2011-04-01,"[[nan, nan, nan, nan, nan, nan, 30.0, 33.0, 53..."
