In [None]:
import re
import pandas as pd
import geopandas as gpd
from datetime import datetime
from pathlib import Path
from functools import partial, partialmethod
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.temperature import NcConverter, Aggregator
from projections import raster, utils

In [2]:
def save_location_mapping(row_and_path):
    """Write the raster-cell mapping of a single polygon to its own file.

    Parameters
    ----------
    row_and_path : tuple
        A ``(row, path)`` pair. ``row`` must support ``row['geometry']`` and
        ``row['id']``; ``path`` is the destination file.

    Notes
    -----
    Reads the module-level ``IMAGE``, so that global has to be assigned before
    this function is called. When ``raster.find_subset_with_intersection_area``
    returns an empty frame, a zero-byte file is written instead of a CSV.
    """
    geo_row, target = row_and_path
    overlap = raster.find_subset_with_intersection_area(IMAGE, geo_row['geometry'])

    if not overlap.empty:
        # Tag every returned record with the polygon identifier before saving.
        overlap['id'] = geo_row['id']
        overlap.to_csv(target, index=False)
        return

    # Nothing intersected: leave a zero-byte file behind
    # (presumably so the shape is treated as already processed — TODO confirm).
    with open(target, 'w') as handle:
        handle.write('')


In [3]:
# Directory and file name of the input dataset (read below with NcConverter).
read_path = Path("../Data/GPCP")
filename = "precip.mon.mean.nc"

# Output locations. utils.make_path is a project helper whose result is used
# as a pathlib-style object (supports `/` and `.mkdir`); presumably it also
# creates the directory — TODO confirm in projections.utils.
output_path = utils.make_path('../Output/GPCP/')
partial_path = utils.make_path(output_path / "partial")
by_country_path = utils.make_path(output_path / "by_country")

# exist_ok=True makes this a no-op when the directory is already there.
output_path.mkdir(exist_ok=True)

# Map raster to polygons

In [6]:
# Open the dataset and print its latitude / longitude variables for inspection.
# NOTE(review): the printed longitudes run from 1.25 to 358.75, which is
# presumably why later cells pass lon_offset=-180 — confirm against NcConverter.
converter = NcConverter(nodata_name="missing_value")
converter.read(read_path / filename)
print(converter.ds['lat'])
print(converter.ds['lon'])

<class 'netCDF4._netCDF4.Variable'>
float32 lat(lat)
    units: degrees_north
    actual_range: [ 88.75 -88.75]
    long_name: Latitude
    standard_name: latitude
    axis: Y
unlimited dimensions: 
current shape = (72,)
filling on, default _FillValue of 9.969209968386869e+36 used
<class 'netCDF4._netCDF4.Variable'>
float32 lon(lon)
    units: degrees_east
    long_name: Longitude
    actual_range: [  1.25 358.75]
    standard_name: longitude
    axis: X
unlimited dimensions: 
current shape = (144,)
filling on, default _FillValue of 9.969209968386869e+36 used


In [None]:
# Shapes that the raster is mapped onto; the mapping cell hands this frame to
# utils.yield_missing_shapes (presumably one row per polygon — TODO confirm).
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [7]:
# Reference raster read by save_location_mapping through the module-level IMAGE.
# Only period 0 is used for this mapping step.
IMAGE = converter.get_xarray("precip", period=0, lon_offset=-180)

iterator = partial(utils.yield_missing_shapes, save_path=partial_path, prefix='p0')

# Materialise the work list once so that both branches can show an accurate
# progress total. The bar iterates what the iterator yields, so its total has
# to be the number of yielded items rather than geo_df.shape[0].
# Executor.map collects its input eagerly anyway, so this costs no extra memory
# in the parallel branch.
pending = list(iterator(geo_df))

n_processes = 30
if n_processes == 1:
    for row_and_path in tqdm(pending):
        save_location_mapping(row_and_path)
else:
    with ProcessPoolExecutor(n_processes) as ppe:
        # Results are None; iterating only drives the progress bar and
        # re-raises any exception raised inside a worker.
        for _ in tqdm(
            ppe.map(save_location_mapping, pending),
            total=len(pending)
        ):
            pass

 46%|████████████████▌                   | 56602/122772 [17:57<20:59, 52.53it/s]


# Combine partial mappings by region

In [8]:
# Group the per-shape partial CSVs by their region id, then write one feather
# file per region into by_country_path.
df_by_region = {}
for file in tqdm(partial_path.glob('*.csv'), desc='Reading'):
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # Zero-byte placeholder (written when a shape had no raster overlap).
        continue

    if df.empty:
        # Header-only file: there is no first row to take the region id from.
        continue

    if 'id' not in df.columns:
        # Fall back to the administrative columns, most specific level first.
        # The result is assigned instead of calling fillna(inplace=True) on
        # df['id']: that chained in-place form is deprecated in pandas 2.x and
        # silently leaves df unchanged once Copy-on-Write is enabled.
        df['id'] = df['adm2'].fillna(df['adm1']).fillna(df['adm0'])

    # Each file is keyed by the id found in its first row.
    region = df['id'].iloc[0]
    df_by_region.setdefault(region, []).append(df)

for region, dfs in tqdm(df_by_region.items(), desc='Saving'):
    df = utils.combine_dataframes(dfs)
    df.to_feather(by_country_path / f'{region}.feather')

# Drop the reference to the grouped frames once everything is written.
del df_by_region

Reading: 122772it [03:32, 577.32it/s]
Saving: 100%|████████████████████████████| 48139/48139 [01:10<00:00, 678.41it/s]


# Map all time periods and variables

In [4]:
# Re-open the dataset so this section can run without the mapping cells above.
converter = NcConverter(nodata_name="missing_value")
converter.read(read_path / filename)

# Load the mapping frames once in the parent process; aggregate() passes them
# to every Aggregator it builds through the mapping_dfs argument.
base_agg = Aggregator(
        by_country_path=by_country_path, 
        partial_path=partial_path, 
        in_memory=True
    )
mapping_dfs = base_agg.get_mapping_dfs()

Reading: 48139it [01:07, 708.81it/s]


In [5]:
def aggregate(image):
    """Aggregate one raster period onto the mapped regions.

    A fresh ``Aggregator`` is built on every call from the module-level
    ``by_country_path``, ``partial_path`` and ``mapping_dfs``.

    Parameters
    ----------
    image
        One item yielded by ``converter.iter_periods``.

    Returns
    -------
    Whatever ``Aggregator.aggregate`` returns for ``image``. The result used to
    be dropped, so callers that print or iterate over it only ever saw ``None``.
    NOTE(review): under ``ProcessPoolExecutor.map`` the value is pickled back to
    the parent process — confirm that ``Aggregator.aggregate`` returns something
    small and picklable.
    """
    agg = Aggregator(
        by_country_path=by_country_path,
        partial_path=partial_path,
        in_memory=True,
        mapping_dfs=mapping_dfs
    )
    return agg.aggregate(image)

# Number of worker processes; 1 selects the serial branch below.
n_processes = 30

print(f"Running with {n_processes} processes")
for variable in ("precip",):
    if n_processes != 1:
        # Disable every tqdm bar created from here on by overriding the
        # constructor default. The patch stays on the class until it is
        # overridden again.
        tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
        with ProcessPoolExecutor(n_processes) as pool:
            # Iterating the results only waits for completion and surfaces
            # exceptions raised inside the workers.
            for name in pool.map(aggregate, converter.iter_periods(attribute_name=variable, lon_offset=-180)):
                pass
    else:
        # Serial branch: process one period at a time in this process.
        for image in converter.iter_periods(variable, lon_offset=-180):
            print(aggregate(image))

Running with 30 processes
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF

In [42]:
def rename_value_column(df, file_name, time_array):
    """Rename the ``value`` column of ``df`` in place to ``<var>_<YYYYMM>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``value`` column; it is modified in place.
    file_name : str
        Name of the form ``<var>_p<N>.csv``, where ``N`` indexes ``time_array``.
    time_array
        Indexable sequence of day counts; element ``N`` is converted with
        ``int`` and passed to ``n_days_to_yearmonth``.

    Raises
    ------
    ValueError
        If ``file_name`` has no underscore or no ``p<N>.csv`` suffix.
    """
    # Split on the LAST underscore so variable names containing underscores
    # (e.g. "sea_temp_p3.csv") still parse.
    var, suffix = file_name.rsplit('_', 1)

    # The dot is escaped so that only a literal ".csv" extension is accepted.
    match = re.search(r"p(\d+)\.csv", suffix)
    if match is None:
        raise ValueError(f"Cannot parse a period index from file name {file_name!r}")

    time_n = int(match.group(1))
    n_days = int(time_array[time_n])
    yearmonth = n_days_to_yearmonth(n_days)
    df.rename(columns={"value": f"{var}_{yearmonth}"}, inplace=True)
    
    
def n_days_to_yearmonth(n_days):
    """Turn a whole-day offset from 1800-01-01 into a ``YYYYMM`` string.

    Parameters
    ----------
    n_days : int
        Number of days elapsed since 1 January 1800.

    Returns
    -------
    str
        The year followed by the zero-padded two-digit month, e.g. ``"197901"``.
    """
    epoch_ordinal = datetime(1800, 1, 1).toordinal()
    moment = datetime.fromordinal(epoch_ordinal + n_days)
    year, month = moment.year, moment.month
    return f"{year}{month:02}"


# Merge the per-period CSV files in output_path into one wide table with a
# column per period, and save it as gpcp.csv.
df = None
# Re-enable tqdm bars (the aggregation cell disables them via the constructor).
tqdm.__init__ = partialmethod(tqdm.__init__, disable=False)

combined_path = output_path / "gpcp.csv"
# Looked up once instead of on every iteration.
time_array = converter.ds['time']

for file in tqdm(output_path.glob('*.csv')):
    if file.name == combined_path.name:
        # The combined table lives in the same directory as its inputs.
        # Without this guard a second run would try to parse it as a period
        # file and fail.
        continue

    field = utils.robust_read(file)
    if field.empty:
        print(file.name, "is empty")
        continue

    field.set_index("id", inplace=True)
    rename_value_column(field, file.name, time_array=time_array)

    if df is None:
        df = field
    else:
        # intersection_area is kept from the first non-empty file only.
        # NOTE(review): with how='outer', ids missing from that first file end
        # up with NaN intersection_area — confirm all periods share the same ids.
        df = df.merge(
            field.drop(columns='intersection_area'),
            left_index=True,
            right_index=True,
            how='outer'
        )

if df is None:
    raise RuntimeError(f"No non-empty per-period CSV files found in {output_path}")

# intersection_area first, then the period columns in sorted order.
columns = sorted((x for x in df.columns if x != "intersection_area"))
df = df[["intersection_area"] + columns]
df.reset_index().to_csv(combined_path, index=False)

508it [00:35, 14.47it/s]


In [43]:
# Show 15 randomly chosen rows of the combined table as a sanity check.
# No random_state is set, so the rows differ from run to run.
df.sample(15)

Unnamed: 0_level_0,intersection_area,precip_197901,precip_197902,precip_197903,precip_197904,precip_197905,precip_197906,precip_197907,precip_197908,precip_197909,...,precip_202007,precip_202008,precip_202009,precip_202010,precip_202011,precip_202012,precip_202101,precip_202102,precip_202103,precip_202104
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HND.16.26_1,303887900.0,0.0,0.112798,0.0,0.958087,3.800792,8.410983,11.419774,6.756451,7.112541,...,10.480112,11.166276,11.749871,8.601905,2.493099,1.872023,0.067194,0.330656,0.521225,1.599283
ROU.20.35_1,185684400.0,5.956531,6.059831,3.9803,2.088639,2.422599,2.471656,2.830302,2.699059,5.729964,...,1.459186,3.836083,5.280222,11.206941,4.658408,6.664148,5.21726,6.799782,2.546207,3.13191
GTM.20.2_1,221408100.0,0.0,0.137569,0.008101,0.567952,2.086399,6.047151,9.05508,5.054951,6.78857,...,9.955028,11.063694,10.86996,8.963907,4.832074,3.262987,0.188029,0.422913,0.233045,0.878194
LKA.6.4_1,56617930.0,2.772145,2.613663,2.07785,8.017475,13.829563,4.860833,7.587496,4.524807,4.918789,...,6.640871,2.763447,1.033971,4.082309,3.522593,8.582113,9.624702,1.718467,1.560997,8.735518
COL.17.31_1,388844700.0,4.735024,6.420702,6.633536,9.640895,4.588734,7.846069,6.99968,4.510444,7.865609,...,7.327694,4.358498,8.203504,6.284709,11.655031,7.493047,13.196154,1.577848,6.869431,8.366201
IRN.12.1_1,2177689000.0,2.335242,1.878455,1.483887,1.07815,0.365185,0.217033,0.306677,0.1238,0.001553,...,0.044559,0.089437,0.080477,0.172707,0.579674,0.571993,0.792078,0.996808,1.101325,0.632214
MOZ.2.10_1,5684177000.0,7.410971,1.255407,7.845563,6.655481,1.050228,3.707628,4.610275,2.982217,2.606891,...,5.996288,2.434821,3.68388,7.939962,5.925394,4.464822,0.68309,4.726316,3.184764,2.737675
TWN.6.1_1,278115600.0,2.607592,1.63214,2.43662,2.09811,5.015272,5.538036,1.487379,2.960696,3.238752,...,0.611694,0.991128,8.506593,1.839169,4.490914,1.99674,1.228877,1.912731,2.624125,1.323628
USA.18.25_1,658875500.0,0.070708,0.022251,0.059574,0.17593,0.145907,0.868655,1.790359,0.598452,0.509786,...,0.677058,0.42834,0.148519,0.065574,0.072236,0.077052,0.056939,0.069422,0.18771,0.874095
NLD.8.39_1,37228790.0,5.125485,4.239252,4.114454,4.082963,2.481325,2.679819,3.058338,4.046303,3.998711,...,3.19651,3.008621,3.162277,3.65816,6.11706,4.264613,4.017041,5.252374,2.689831,1.712042
