In [10]:
import pandas as pd
import geopandas as gpd
import numpy as np
import yaml
import re
from tqdm import tqdm
from pathlib import Path
from glob import glob
from functools import partial
from concurrent.futures import ProcessPoolExecutor

import shapefile
import shapely
from shapely.geometry import Point

import os
import sys
# Put the parent of the current working directory on sys.path so the local
# `projections` package imported right below can be resolved. The membership
# check keeps repeated runs of this cell from appending duplicate entries.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.shapefiles import load_shapes, iter_records
from projections.models import Records
from projections import raster, utils, constants

# Show every column when a DataFrame is displayed. The fully qualified key is
# required: option names are resolved by pattern matching, and the short form
# 'max_columns' is ambiguous on pandas versions that also define
# 'styler.render.max_columns' (it raises OptionError there).
pd.set_option('display.max_columns', None)

In [2]:
# Root folder for everything this notebook writes (loc_map.csv, CRU.csv, ...).
# NOTE(review): utils.make_path is a project helper; it presumably creates the
# folder and returns a pathlib.Path (the `/` operator is used on its result) — confirm.
output_folder = utils.make_path('../Output/Precipitaciones/Raster/CRU/')
# One CSV per shapefile row is written here by mapper_worker.
location_folder = utils.make_path(output_folder / 'locations')

In [3]:
def load_df():
    """Load the CRU precipitation table as a GeoDataFrame of raster cells.

    Reads ``../Data/Precipitaciones/cru_80-20.csv``, builds one point geometry
    per row from its ``lon``/``lat`` columns (CRS 4326) and hands the result to
    ``raster.create_by_separation``.

    Returns
    -------
    The object returned by ``raster.create_by_separation`` (a project helper;
    judging by the warnings it emits it adds a ``raster`` column built with
    ``buffer`` — confirm against its source).
    """
    table = pd.read_csv('../Data/Precipitaciones/cru_80-20.csv')
    cell_centres = gpd.points_from_xy(table['lon'], table['lat'])
    geo_table = gpd.GeoDataFrame(table, geometry=cell_centres, crs=4326)
    return raster.create_by_separation(geo_table, lat='lat', lon='lon')

# Map raster to adm2

In [4]:
# Shapefile whose rows are mapped to raster cells. In this process it is only
# used to obtain the row index that gets split into batches; each worker
# re-reads the same file itself inside mapper_worker.
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [11]:
def mapper_worker(indices):
    """Write one raster-cell/region intersection CSV per shapefile row.

    Meant to run inside a worker process: it loads its own copy of the raster
    table and of the shapefile instead of receiving them as arguments.

    For every row selected by ``indices`` the result of
    ``raster.get_intersection_area`` (restricted to cells with a positive
    ``intersection_area``) is saved as ``<id>.csv`` or ``<id>_pNNN.csv`` in
    ``location_folder``. Rows whose file already exists are skipped, so an
    interrupted run can be resumed.

    Parameters
    ----------
    indices : list
        Index labels of the shapefile rows this worker is responsible for.

    Returns
    -------
    int
        Always ``0``; it only signals that the batch finished.
    """
    df = load_df()
    geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

    for _, row in geo_df.loc[indices].iterrows():
        portion = row['portion']
        # A missing portion may arrive as NaN, which is truthy and cannot be
        # converted with int(); treat it like 0/None, i.e. "no portion suffix".
        has_portion = bool(pd.notna(portion) and portion)
        suffix = f"_p{int(portion):03d}" if has_portion else ""
        row_path = location_folder / f"{row['id']}{suffix}.csv"
        if row_path.exists():
            # Already produced by a previous (possibly interrupted) run.
            continue

        subset = raster.get_intersection_area(df, row['geometry'])
        if subset is not None:
            subset = subset[subset["intersection_area"] > 0].copy()
        else:
            # No intersection at all: still write a file so the row counts as
            # done. Note the file is header-less, so readers must tolerate
            # pd.errors.EmptyDataError.
            subset = pd.DataFrame()
        subset.to_csv(row_path, index=False)
    return 0

In [6]:
# Number of worker processes, and therefore the (maximum) number of batches.
n_processes = 15

# Shuffle the shapefile row labels so each batch gets a random mix of rows.
random_index = list(geo_df.index)
np.random.shuffle(random_index)

# Batch size is chosen so that at most n_processes batches are produced.
batch_size = 1 + len(random_index) // n_processes
indices = [
    random_index[start:start + batch_size]
    for start in range(0, len(random_index), batch_size)
]

In [12]:
# Submit every batch up front, then wait for them in submission order;
# result() re-raises any exception that occurred inside a worker.
with ProcessPoolExecutor(n_processes) as ppe:
    futures = []
    for batch in indices:
        futures.append(ppe.submit(mapper_worker, batch))
    for future in futures:
        print(future.result())

Separation:Separation:  {'lat': 0.5, 'lon': 0.5}{'lat': 0.5, 'lon': 0.5}

Separation: {'lat': 0.5, 'lon': 0.5}



  df["raster"] = df.buffer(sep["lat"] / 2, cap_style=3)


Separation: 


  df["raster"] = df.buffer(sep["lat"] / 2, cap_style=3)


{'lat': 0.5, 'lon': 0.5}
Separation: 


  df["raster"] = df.buffer(sep["lat"] / 2, cap_style=3)


{'lat': 0.5, 'lon': 0.5}



  df["raster"] = df.buffer(sep["lat"] / 2, cap_style=3)

  df["raster"] = df.buffer(sep["lat"] / 2, cap_style=3)


Separation:

In [25]:
# Gather the per-row CSVs written by mapper_worker into one cell/region map.
# File names are "<id>.csv" or "<id>_p<portion>.csv"; the id and the portion
# are recovered from the name because the files themselves do not carry them.
portion_pattern = re.compile(r'(.*)_p(\d+)\.csv')

loc_frames = []
for file in tqdm(location_folder.glob('*.csv')):
    try:
        subset = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # Header-less file, written when a row had no intersection at all.
        continue
    if subset.empty:
        # Header only: the row intersects no raster cell with positive area.
        continue

    match = portion_pattern.fullmatch(file.name)
    if match:
        subset['id'] = match.group(1)
        subset['portion'] = match.group(2)
    else:
        subset['id'] = file.stem
    loc_frames.append(subset)

if not loc_frames:
    raise FileNotFoundError(f"No non-empty location files found in {location_folder}")

# DataFrame.append no longer exists in pandas 2.x; concat is the replacement.
locs = pd.concat(loc_frames, ignore_index=True).drop_duplicates()
locs.to_csv(output_folder / 'loc_map.csv', index=False)
print(locs.shape)
locs.head()

122774it [03:47, 539.85it/s]


(320615, 5)


Unnamed: 0,lat,lon,intersection_area,id,portion
0,27.75,-18.25,126323000.0,ESP__GUANCHE,
1,27.75,-17.75,142809800.0,ESP__GUANCHE,
2,27.75,-15.75,755342300.0,ESP__GUANCHE,
3,27.75,-15.25,240791300.0,ESP__GUANCHE,
4,28.25,-17.75,15893210.0,ESP__GUANCHE,


# Map locations back

In [31]:
# Reload the precipitation table and attach to each raster cell the region
# rows of loc_map.csv that share its (lat, lon).
df = load_df()
locs = pd.read_csv(output_folder / 'loc_map.csv')
# locs['portion'].fillna(0, inplace=True)
# merge() defaults to an inner join: cells absent from the map are dropped and
# a cell listed for several regions is repeated once per region.
df = df.merge(locs, on=['lat', 'lon'])

Separation: {'lat': 0.5, 'lon': 0.5}



  df["raster"] = df.buffer(sep["lat"] / 2, cap_style=3)


In [32]:
# Collapse the cell-level table to one row per (id, time), weighting by
# intersection_area.
# NOTE(review): raster.weighted_pivot is a project helper not shown here. In the
# displayed sample, rows built from a single cell hold perfect squares
# (412.09 = 20.3**2, 292.41 = 17.1**2, 28.09 = 5.3**2, 610.09 = 24.7**2,
# 8686.24 = 93.2**2) — verify the helper is not multiplying the value by itself.
pivot = raster.weighted_pivot(
    df, 
    weight='intersection_area', 
    value_name='precipitation', 
    id_vars=['id']
)
# The return value is discarded, so this presumably adds the `year`/`month`
# columns to `pivot` in place (they appear in the output below) — confirm.
utils.map_year_month(pivot, 'time', 1980, 2020)

# Save results
pivot.to_csv(output_folder / 'CRU.csv', index=False)
print(pivot.shape)
pivot.head()

(23595828, 7)


Unnamed: 0,id,time,precipitation,intersection_area_count,intersection_area_sum,year,month
0,ABW,80-01,412.090046,1,2209533.0,1980,1
47959,ABW,80-02,292.410013,1,2209533.0,1980,2
95918,ABW,80-03,28.090002,1,2209533.0,1980,3
143877,ABW,80-04,610.090038,1,2209533.0,1980,4
191836,ABW,80-05,8686.240853,1,2209533.0,1980,5


# Aggregate

In [40]:
df = pd.read_csv(output_folder / 'CRU.csv')

# Split each distinct region id on '.' once and keep its three administrative
# prefixes: level 0 is the first component, level 1 the first two components
# joined by '.', level 2 the full id.
adm = {
    key: {0: key.split('.')[0], 1: '.'.join(key.split('.')[:2]), 2: key}
    for key in df['id'].unique()
}

# Look the levels up per row through a plain id -> value dictionary.
for level in range(3):
    level_by_id = {key: levels[level] for key, levels in adm.items()}
    df[f"adm{level}"] = df['id'].map(level_by_id)

df.sample(5)

Unnamed: 0,id,time,precipitation,intersection_area_count,intersection_area_sum,year,month,adm0,adm1,adm2
7030547,EGY.16.16_1,09-12,38.491658,2,284746600.0,2009,12,EGY,EGY.16,EGY.16.16_1
14768433,PHL.6.17_1,85-10,127734.755637,1,16552830.0,1985,10,PHL,PHL.6,PHL.6.17_1
21837022,USA.3.10_1,87-11,97.584027,21,25894850000.0,1987,11,USA,USA.3,USA.3.10_1
11960739,MEX.30.133_1,98-04,52.148236,4,325087700.0,1998,4,MEX,MEX.30,MEX.30.133_1
6402982,DZA.28.23_1,87-11,5416.959775,1,40145140.0,1987,11,DZA,DZA.28,DZA.28.23_1


In [41]:
# Grouping columns per temporal and per spatial granularity.
time_groups = dict(yearly=['year'], monthly=['year', 'month'])
loc_groups = dict(
    country=['adm0'],
    edo=['adm0', 'adm1'],
    mun=['adm0', 'adm1', 'adm2'],
)

# Every (location, time) combination, keyed "<location>_<time>", maps to the
# location columns followed by the time columns.
groups = {
    f'{loc_name}_{time_name}': loc_group + time_group
    for loc_name, loc_group in loc_groups.items()
    for time_name, time_group in time_groups.items()
}

output_folder = Path('../Output/Precipitaciones/Raster/CRU/')
# Names of the CRU_*.csv files already present in the output folder; they are
# passed on as `done`.
existing_outputs = glob(str(output_folder / 'CRU_*.csv'))
done = [Path(existing).name for existing in existing_outputs]

utils.aggregate_by_groups(df, groups, output_folder, ['precipitation'], done=done, batch=None)

Aggregating: 100%|████████████████████████████████| 6/6 [07:38<00:00, 76.44s/it]
