In [1]:
import pandas as pd

In [2]:
# pd.read_hdf cannot load this file: the call raises the ValueError shown below
# (pandas reports the datasets are incompatible / not in table form), so the
# following cells open the same file with h5py instead.
df = pd.read_hdf('3B-MO.MS.MRG.3IMERG.20190801-S000000-E235959.08.V06B.HDF5')

ValueError: Dataset(s) incompatible with Pandas data types, not table, or no datasets found in HDF5 file.

In [3]:
import h5py

In [4]:
# Open the HDF5 file read-only with h5py. The handle is not closed in any of the
# cells shown, so it stays open for the exploration cells that index into `f`.
f = h5py.File('3B-MO.MS.MRG.3IMERG.20190801-S000000-E235959.08.V06B.HDF5', 'r')

In [5]:
# Echo the handle; its repr shows the file name and the open mode.
f

<HDF5 file "3B-MO.MS.MRG.3IMERG.20190801-S000000-E235959.08.V06B.HDF5" (mode r)>

In [7]:
# List the top-level members of the file (the output shows a single 'Grid' entry).
f.keys()

<KeysViewHDF5 ['Grid']>

In [9]:
# List the names stored under the 'Grid' group.
f['Grid'].keys()

<KeysViewHDF5 ['nv', 'lonv', 'latv', 'time', 'lon', 'lat', 'time_bnds', 'lon_bnds', 'lat_bnds', 'precipitation', 'randomError', 'gaugeRelativeWeighting', 'probabilityLiquidPrecipitation', 'precipitationQualityIndex']>

In [10]:
# Shorthand reference to the 'Grid' group, used by the cells below.
grid = f['Grid']

In [11]:
# Show the longitude dataset's repr (shape and dtype) followed by the names of
# the attributes attached to it, one item per output line.
print(
    "Longitude data: {}".format(grid['lon']),
    "Longitude data attributes: {}".format(list(grid['lon'].attrs)),
    sep="\n",
)

Longitude data: <HDF5 dataset "lon": shape (3600,), type "<f4">
Longitude data attributes: ['DimensionNames', 'Units', 'units', 'standard_name', 'LongName', 'bounds', 'axis', 'CLASS', 'REFERENCE_LIST']


In [12]:
# The attribute values are byte strings here, hence the .decode() before printing
# the longitude's standard name and unit on separate lines.
print(
    "Name: {}".format(grid['lon'].attrs['standard_name'].decode()),
    "Unit: {}".format(grid['lon'].attrs['units'].decode()),
    sep="\n",
)

Name: longitude
Unit: degrees_east


In [13]:
import numpy as np

In [14]:
# Read both coordinate axes and the precipitation field with bulk reads: slicing an
# h5py dataset returns a NumPy array, which avoids iterating the datasets element by
# element in Python.
lon_axis = grid['lon'][:]
lat_axis = grid['lat'][:]
precipitation_values = grid['precipitation'][...].reshape(-1)

# Expand the two 1-D axes to one (lon, lat) pair per grid cell. Longitude varies
# slowest and latitude fastest, which matches the C-order flattening of the
# precipitation array (presumably laid out as (time, lon, lat), as a later cell
# indexes it with [0][lon][lat]). Sizes come from the axes rather than being
# hard-coded as 3600 / 1800.
longitude_values = np.repeat(lon_axis, lat_axis.size)
latitude_values = np.tile(lat_axis, lon_axis.size)

# Fail early with a readable message if the field does not hold exactly one value per
# (lon, lat) pair, e.g. because the file contains more than one time step.
assert precipitation_values.size == longitude_values.size == latitude_values.size, (
    "precipitation has {} values but the lon/lat grid has {} cells".format(
        precipitation_values.size, longitude_values.size))

# Column labels are "<standard_name> (<units>)", taken from each dataset's attributes.
dataset = pd.DataFrame({
    grid['lon'].attrs['standard_name'].decode() + " (" + grid['lon'].attrs['units'].decode() + ")":
        longitude_values,
    grid['lat'].attrs['standard_name'].decode() + " (" + grid['lat'].attrs['units'].decode() + ")":
        latitude_values,
    "Precipitation (" + grid['precipitation'].attrs['units'].decode() + ")":
        precipitation_values,
})
dataset.head()

Unnamed: 0,longitude (degrees_east),latitude (degrees_north),Precipitation (mm/hr)
0,-179.949997,-89.949997,-9999.900391
1,-179.949997,-89.849998,-9999.900391
2,-179.949997,-89.75,-9999.900391
3,-179.949997,-89.650002,-9999.900391
4,-179.949997,-89.550003,-9999.900391


In [28]:
# Replace the fill value (-9999.9, compared with a tolerance because the column holds
# floats) by 0 and keep every other reading unchanged.
# NOTE(review): counting missing cells as zero rainfall pulls the mean down; confirm
# this is intended rather than marking them as NaN.
dataset['Precipitation (mm/hr)'] = dataset['Precipitation (mm/hr)'].where(
    ~np.isclose(dataset['Precipitation (mm/hr)'], -9999.9),
    0,
)

In [32]:
# Summary statistics of the precipitation column after the fill-value replacement
# in the previous cell (the minimum is now 0 instead of the fill value).
dataset['Precipitation (mm/hr)'].describe()

count    6.480000e+06
mean     9.321189e-02
std      1.370286e-01
min      0.000000e+00
25%      3.924359e-04
50%      4.468640e-02
75%      1.331607e-01
max      3.226260e+00
Name: Precipitation (mm/hr), dtype: float64

array([ True,  True,  True, ...,  True,  True,  True])

In [220]:
import gc
import pandas as pd
import h5py
from datetime import datetime
import os
import numpy as np

In [165]:
# Load the location table from a feather file; later cells read its `zoom_lat` and
# `zoom_lon` columns.
# NOTE(review): this is an absolute path on one machine's F: drive, so the cell only
# runs there — consider a configurable data directory.
promed_df = pd.read_feather(r'F:\OneDrive\School\research\thilanka-summer2021\scraper2.0\combined_df_raw.feather')

In [3]:
# Re-open the same HDF5 file so the next cell can read its lon/lat axes.
# NOTE(review): this rebinds `dataset`, which an earlier cell used for the flattened
# DataFrame, and the handle is not closed explicitly in the cells shown.
dataset = h5py.File('3B-MO.MS.MRG.3IMERG.20190801-S000000-E235959.08.V06B.HDF5', 'r')

In [5]:
from scipy.interpolate import RegularGridInterpolator

# Nearest-neighbour lookup over the (lon, lat) grid. The "values" are not data but
# the flat cell number lon_index * 1800 + lat_index, so evaluating the interpolator
# at a (lon, lat) point yields the encoded position of the closest grid cell.
itp = RegularGridInterpolator(
    points=(dataset['Grid']['lon'], dataset['Grid']['lat']),
    values=np.arange(3600 * 1800).reshape(3600, 1800),
    method='nearest',
)

In [166]:
def get_closest_gpm(data, interpolator=None, n_lat=1800):
	"""Attach the indices of the nearest GPM grid cell to one location row.

	Parameters
	----------
	data : pandas.Series
		A row exposing ``zoom_lon`` and ``zoom_lat``.
	interpolator : callable, optional
		Maps a ``(lon, lat)`` tuple to the flat cell number
		``lon_index * n_lat + lat_index``. Defaults to the module-level ``itp``
		built in an earlier cell.
	n_lat : int, optional
		Number of latitude cells per longitude column used to decode the flat
		cell number (1800 for the grid built in this notebook).

	Returns
	-------
	pandas.Series
		A copy of ``data`` with ``gpm_lat_idx`` and ``gpm_lon_idx`` added. Both
		are ``pd.NA`` when the coordinates cannot be looked up (missing,
		non-numeric or outside the grid).
	"""
	if interpolator is None:
		interpolator = itp
	# Work on a copy: DataFrame.apply does not support functions that mutate the
	# row object they are handed.
	result = data.copy()
	try:
		idx = int(interpolator((data.zoom_lon, data.zoom_lat)))
	except (ValueError, TypeError):
		# ValueError: NaN or out-of-bounds coordinates; TypeError: None / pd.NA values.
		# Previously this path fell off the end of the function and returned None.
		result['gpm_lat_idx'] = pd.NA
		result['gpm_lon_idx'] = pd.NA
		return result
	# Undo the lon_index * n_lat + lat_index encoding.
	loni, lati = divmod(idx, n_lat)
	result['gpm_lat_idx'] = lati
	result['gpm_lon_idx'] = loni
	return result

# Run get_closest_gpm on every row (axis=1) of the two coordinate columns and write
# the returned columns back: the coordinates themselves plus the two grid-index columns.
# NOTE(review): this calls the interpolator once per row; a single vectorized call on
# all coordinates would likely be much faster — confirm before changing.
promed_df[['zoom_lat', 'zoom_lon', 'gpm_lat_idx', 'gpm_lon_idx']] = promed_df[['zoom_lat', 'zoom_lon']].apply(get_closest_gpm, axis=1)

In [183]:
# Replace missing grid indices with the sentinel -1 and store both columns as ints.
# NOTE(review): -1 is also a valid "last element" index in Python/NumPy, so any code
# that indexes the grid with these columns must skip negative values explicitly.
promed_df[['gpm_lat_idx', 'gpm_lon_idx']] = promed_df[['gpm_lat_idx', 'gpm_lon_idx']].fillna(-1).astype(int)

In [186]:
# Persist the frame, now carrying the gpm_*_idx columns, to the same feather file it
# was loaded from.
# NOTE(review): this overwrites the file named "..._raw" with derived columns added.
promed_df.to_feather(r'F:\OneDrive\School\research\thilanka-summer2021\scraper2.0\combined_df_raw.feather')

In [184]:
# Distinct (lat_idx, lon_idx) pairs, so each grid cell is looked up once per file.
# Pairs holding the -1 sentinel written by the fillna(-1) cell are still included here.
focus_points = pd.Series(list(zip(promed_df.gpm_lat_idx, promed_df.gpm_lon_idx))).unique()

In [241]:
from collections import defaultdict
from tqdm import tqdm
# Running sum per (lat_idx, lon_idx) key; unseen keys start at int() == 0.
precip_totals = defaultdict(int)

In [240]:
# Add each file's precipitation value at every focus grid cell to that cell's running total.
for dataset_path in tqdm([x for x in os.listdir() if x.endswith('.HDF5')]):
	with h5py.File(dataset_path, 'r') as dataset:
		# Read the first (index 0) slice of the precipitation array into memory ONCE per
		# file. Indexing the h5py dataset inside the inner loop re-read that whole slice
		# from disk for every focus point, which is what made the loop stall.
		precip = dataset['Grid']['precipitation'][0]
	for lat, lon in focus_points:
		# Unmatched rows carry the sentinel -1 (from the fillna(-1) cell). A negative
		# index would silently wrap around to the last grid cell, so skip them.
		if lat < 0 or lon < 0:
			continue
		# NOTE(review): fill values (-9999.9 in the earlier exploration) are added as-is;
		# confirm whether they should be excluded from the totals.
		# float() keeps the running total a Python float regardless of the stored dtype.
		precip_totals[(lat, lon)] += float(precip[lon, lat])

  0%|          | 0/250 [00:13<?, ?it/s]


KeyboardInterrupt: 

In [222]:
# Number of entries in the working directory whose name ends in '.HDF5'.
sum(name.endswith('.HDF5') for name in os.listdir())

250

In [None]:
# One row per accumulated grid cell: the (lat_idx, lon_idx) key is unpacked into its
# two columns and the running sum becomes total_precip; the table is then written to CSV.
output = pd.DataFrame(
	[(lat_i, lon_i, total) for (lat_i, lon_i), total in precip_totals.items()],
	columns=['lat_idx', 'lon_idx', 'total_precip'],
)
output.to_csv('total_precip.csv')

[(0, 0)]