In [1]:
import re
import pandas as pd
import geopandas as gpd
from datetime import datetime
from pathlib import Path
from functools import partial, partialmethod
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

import os
import sys
# Put the parent directory of the working directory on sys.path so that the
# `projections` package imported below can be resolved from this notebook.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    # Guarded so that re-running the cell does not append duplicates.
    sys.path.append(module_path)

from projections.temperature import NcConverter, Aggregator
from projections import raster, utils



In [2]:
def save_location_mapping(row_and_path):
    """
    Write the raster cells of IMAGE that intersect one polygon to a CSV file.

    Lives in the notebook rather than in `projections` because it reads
    IMAGE from the global scope.

    Parameters
    ----------
    row_and_path : tuple
        A `(row, path)` pair. `row` must support `row['geometry']` and
        `row['id']`; `path` is the destination file.

    Returns
    -------
    None
        The result is written to `path`. When no raster cell intersects the
        polygon an empty file is written instead of a CSV.
    """
    row, path = row_and_path
    subset = raster.find_subset_with_intersection_area(IMAGE, row['geometry'])

    if not subset.empty:
        # Tag every intersecting cell with the id of the polygon it belongs to.
        subset['id'] = row['id']
        subset.to_csv(path, index=False)
    else:
        # Still create the (empty) file so the polygon leaves a trace on disk.
        with open(path, 'w') as empty_file:
            empty_file.write('')


In [3]:
# Input: directory and file name of the GPCP NetCDF file.
read_path = Path("../Data/GPCP")
filename = "precip.mon.mean.nc"

# Output: root folder plus the two working sub-folders used further down.
output_path = utils.make_path('../Output/GPCP/')
partial_path, by_country_path = (
    utils.make_path(output_path / folder) for folder in ("partial", "by_country")
)

output_path.mkdir(exist_ok=True)

# Map raster to polygons

In [4]:
# Open the NetCDF file and show its coordinate variables for inspection.
converter = NcConverter(nodata_name="missing_value")
converter.read(read_path / filename)
for coordinate in ('lat', 'lon'):
    print(converter.ds[coordinate])

<class 'netCDF4._netCDF4.Variable'>
float32 lat(lat)
    units: degrees_north
    actual_range: [ 88.75 -88.75]
    long_name: Latitude
    standard_name: latitude
    axis: Y
unlimited dimensions: 
current shape = (72,)
filling on, default _FillValue of 9.969209968386869e+36 used
<class 'netCDF4._netCDF4.Variable'>
float32 lon(lon)
    units: degrees_east
    long_name: Longitude
    actual_range: [  1.25 358.75]
    standard_name: longitude
    axis: X
unlimited dimensions: 
current shape = (144,)
filling on, default _FillValue of 9.969209968386869e+36 used


In [5]:
# Load the shapefile; geo_df is later handed to utils.yield_missing_shapes,
# whose items are processed by save_location_mapping.
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [6]:
# Raster for period 0 of "precip". It must stay a module-level global because
# save_location_mapping reads IMAGE from the global scope.
# NOTE(review): lon_offset=-180 presumably re-centres the file's 0..360
# longitudes onto -180..180 - confirm in NcConverter.get_xarray.
IMAGE = converter.get_xarray("precip", period=0, lon_offset=-180)

# Called with geo_df below; each yielded item is passed to save_location_mapping.
iterator = partial(utils.yield_missing_shapes, save_path=partial_path, prefix='p0')

n_processes = 30
if n_processes != 1:
    # Parallel path: fan the items out to worker processes and drain the
    # results only to drive the progress bar.
    with ProcessPoolExecutor(n_processes) as ppe:
        pending = ppe.map(save_location_mapping, iterator(geo_df))
        for _ in tqdm(pending, total=geo_df.shape[0]):
            pass
else:
    # Serial path, handy for debugging.
    for row_and_path in tqdm(iterator(geo_df)):
        save_location_mapping(row_and_path)

100%|███████████████████████████████████| 122772/122772 [23:23<00:00, 87.49it/s]


# Union portions

In [7]:
# Read the per-shape files from partial_path and save the combined result in
# by_country_path (the merging logic lives in utils.union_and_save_portions).
utils.union_and_save_portions(read_from=partial_path, save_in=by_country_path)

Reading: 122772it [03:01, 674.95it/s]
Saving: 100%|████████████████████████████| 48139/48139 [01:02<00:00, 775.99it/s]


# Map all time periods and variables

In [8]:
# Re-open the NetCDF file with a fresh converter for the full aggregation.
converter = NcConverter(nodata_name="missing_value")
converter.read(read_path / filename)

# Load the mapping frames once; aggregate() passes them to every Aggregator
# it creates through the `mapping_dfs` argument.
aggregator_options = dict(
    by_country_path=by_country_path,
    partial_path=partial_path,
    in_memory=True,
)
base_agg = Aggregator(**aggregator_options)
mapping_dfs = base_agg.get_mapping_dfs()

Reading: 48139it [01:04, 745.01it/s]


In [9]:
def aggregate(image):
    """
    Aggregate one raster image with a freshly built Aggregator.

    The Aggregator is created inside the function (reusing the module-level
    `mapping_dfs`, `by_country_path` and `partial_path`) so the function can
    be mapped over images by a process pool.

    Parameters
    ----------
    image
        One item produced by `converter.iter_periods(...)`.

    Returns
    -------
    Whatever `Aggregator.aggregate(image)` returns. Previously the result was
    dropped and None was always returned, although the callers print the
    return value (serial run) or iterate over it (parallel run).
    """
    agg = Aggregator(
        by_country_path=by_country_path,
        partial_path=partial_path,
        in_memory=True,
        mapping_dfs=mapping_dfs
    )
    return agg.aggregate(image)

n_processes = 30

print(f"Running with {n_processes} processes")
for variable in ("precip",):
    if n_processes != 1:
        # Silence every tqdm bar created from here on; the workers would
        # otherwise interleave their progress output.
        tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
        with ProcessPoolExecutor(n_processes) as tpe:
            periods = converter.iter_periods(attribute_name=variable, lon_offset=-180)
            # Drain the result iterator so that every task is completed.
            for name in tpe.map(aggregate, periods):
                pass
    else:
        # Serial path: process the periods one by one and show each result.
        for image in converter.iter_periods(variable, lon_offset=-180):
            print(aggregate(image))

Running with 30 processes
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF emptyDF empty

DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF

DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF emptyDF empty

DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
DF empty
D

In [10]:
def rename_value_column(df, file_name, time_array):
    """
    Rename the "value" column of `df` in place to "<variable>_<YYYYMM>".

    The variable name and the period index are parsed from `file_name`, which
    must look like "<variable>_p<N>.csv". The variable part may itself contain
    underscores: only the last underscore separates it from the "p<N>.csv"
    suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a "value" column; modified in place.
    file_name : str
        Name of the file the frame was read from.
    time_array
        Indexable by the period index N; `int(time_array[N])` is passed to
        `n_days_to_yearmonth` as a day count.

    Raises
    ------
    ValueError
        If `file_name` does not follow the "<variable>_p<N>.csv" pattern.
    """
    # rpartition splits on the LAST underscore, so multi-word variable names work.
    var, separator, suffix = file_name.rpartition('_')
    # The dot is escaped so that only a literal ".csv" extension is accepted.
    match = re.search(r"p(\d+)\.csv", suffix) if separator else None
    if match is None:
        raise ValueError(
            f"Unexpected file name {file_name!r}; expected '<variable>_p<N>.csv'"
        )

    time_n = int(match.group(1))
    n_days = int(time_array[time_n])
    yearmonth = n_days_to_yearmonth(n_days)
    df.rename(columns={"value": f"{var}_{yearmonth}"}, inplace=True)
    
    
def n_days_to_yearmonth(n_days):
    """
    Convert a day count relative to 1800-01-01 into a "YYYYMM" string.

    `n_days` is added to the proleptic Gregorian ordinal of 1800-01-01; the
    year of the resulting date is followed by its month zero-padded to two
    digits.
    """
    epoch_ordinal = datetime(1800, 1, 1).toordinal()
    moment = datetime.fromordinal(epoch_ordinal + n_days)
    return "{}{:02}".format(moment.year, moment.month)


# Name of the merged output file. It is written into output_path, i.e. the
# very directory globbed below, so it must be skipped when the cell is re-run;
# otherwise it would be treated as a per-period file and fail name parsing.
combined_name = "gpcp.csv"

df = None
# Re-enable tqdm progress bars (they are disabled for the parallel aggregation run).
tqdm.__init__ = partialmethod(tqdm.__init__, disable=False)

# sorted() makes the merge order reproducible (the first non-empty file is the
# one that supplies the "intersection_area" column) and gives tqdm a total.
for file in tqdm(sorted(output_path.glob('*.csv'))):
    if file.name == combined_name:
        # Result of a previous run of this cell, not a per-period input.
        continue

    field = utils.robust_read(file)
    if field.empty:
        print(file.name, "is empty")
        continue

    field.set_index("id", inplace=True)
    # Turns the generic "value" column into "<variable>_<YYYYMM>".
    rename_value_column(field, file.name, time_array=converter.ds['time'])

    if df is None:
        # The first frame keeps its "intersection_area" column ...
        df = field
    else:
        # ... every later frame contributes only its renamed value column.
        df = df.merge(
            field.drop(columns='intersection_area'),
            left_index=True,
            right_index=True,
            how='outer'
        )

if df is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(f"No non-empty per-period CSV files found in {output_path}")

# "intersection_area" first, then the value columns in sorted order.
columns = sorted(x for x in df.columns if x != "intersection_area")
df = df[["intersection_area"] + columns]
df.reset_index().to_csv(output_path / combined_name, index=False)

508it [00:35, 14.50it/s]
