In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import yaml
import re
from tqdm import tqdm
from pathlib import Path
from glob import glob
from functools import partial
from concurrent.futures import ProcessPoolExecutor
import os

import shapefile
import shapely
from shapely.geometry import Point

import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.shapefiles import load_shapes, iter_records
from projections.models import Records
from projections import raster, utils


# Show every column when a DataFrame is displayed. The fully qualified key is
# used because the abbreviated 'max_columns' pattern is ambiguous on pandas
# versions that also define 'styler.render.max_columns' (raises OptionError).
pd.set_option('display.max_columns', None)



In [2]:
# Column names reused by later cells.
lon, lat, country = 'lon', 'lat', 'country'

# Output locations. Both paths go through utils.make_path
# (presumably it creates the directory when missing - confirm in utils).
output_folder = utils.make_path('../Output/Precipitaciones/Raster/GPCC/')
location_folder = utils.make_path(output_folder / 'locations')

# Load data

In [3]:
# Collect the (lon, lat) pairs found in every monthly GPCC file.
coordinate_columns = ['lon', 'lat']
dfs = []

for file in tqdm(glob('../Data/Precipitaciones/full_data_monthly_v2020_*.csv')):
    # usecols avoids parsing the value columns, which are not needed here; the
    # trailing selection pins the column order regardless of the file layout.
    dfs.append(pd.read_csv(file, usecols=coordinate_columns)[coordinate_columns])

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Row labels are kept unchanged, as the former append call did.
df = pd.concat(dfs)
del dfs

print(df.shape)
print('Unique locs', df.drop_duplicates(['lon', 'lat']).shape)
df.head()

100%|███████████████████████████████████████████| 13/13 [01:46<00:00,  8.17s/it]


(3787420, 2)
Unique locs (291340, 2)


Unnamed: 0,lon,lat
0,-120.125,85.875
1,-119.875,85.875
2,-119.625,85.875
3,-120.125,85.625
4,-119.875,85.625


In [4]:
# Reuse the cached table of unique grid points when it exists; otherwise build
# it from df and write the cache (mapper_worker reads this same file).
locs_cache = output_folder / 'unique_locs_gpcc.csv'
try:
    locs = pd.read_csv(locs_cache)
except FileNotFoundError:
    print('File not found, loading locs from df')
    # The three adm columns start out empty (NaN).
    locs = (
        df[[lon, lat]]
        .drop_duplicates()
        .assign(adm0=np.nan, adm1=np.nan, adm2=np.nan)
    )
    locs.to_csv(locs_cache, index=False)

locs = raster.create_by_separation(locs, lat='lat', lon='lon')
locs.head()

File not found, loading locs from df
Separation: {'lat': 0.25, 'lon': 0.25}
Converting to GeoDataFrame


Unnamed: 0,lon,lat,adm0,adm1,adm2,geometry,raster
0,-120.125,85.875,,,,POINT (-120.12500 85.87500),"POLYGON ((-120.00000 86.00000, -120.00000 85.7..."
1,-119.875,85.875,,,,POINT (-119.87500 85.87500),"POLYGON ((-119.75000 86.00000, -119.75000 85.7..."
2,-119.625,85.875,,,,POINT (-119.62500 85.87500),"POLYGON ((-119.50000 86.00000, -119.50000 85.7..."
3,-120.125,85.625,,,,POINT (-120.12500 85.62500),"POLYGON ((-120.00000 85.75000, -120.00000 85.5..."
4,-119.875,85.625,,,,POINT (-119.87500 85.62500),"POLYGON ((-119.75000 85.75000, -119.75000 85.5..."


# ADM

In [5]:
# Shapes to intersect with the grid; the batching cell below only uses the index.
shapes_path = '../Shapefiles/preprocessed/all_countries_with_eth.shp'
geo_df = gpd.read_file(shapes_path)

In [7]:
def mapper_worker(indices):
    """Compute and store the grid-cell intersections for a batch of shapes.

    The unique-locations CSV and the shapefile are read from disk inside the
    call. For each selected shape one CSV is written to ``location_folder``;
    a shape whose file already exists is skipped, so an interrupted run can
    be resumed without recomputing finished shapes.

    Parameters
    ----------
    indices : list
        Index labels of the shapefile rows handled by this call.

    Returns
    -------
    int
        Always 0.
    """
    df = pd.read_csv(output_folder / 'unique_locs_gpcc.csv')
    df = raster.create_by_separation(df, lat='lat', lon='lon')
    geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp').loc[indices]

    for _, row in geo_df.iterrows():
        portion_value = row['portion']
        # NaN is truthy, so a plain truthiness test would pass a missing
        # portion to int() and raise ValueError. NaN/None are treated like the
        # other empty values (no suffix).
        if pd.isna(portion_value) or not portion_value:
            portion = ""
        else:
            portion = f"_p{int(portion_value):03d}"
        file_name = f"{row['id']}{portion}.csv"
        row_path = location_folder / file_name
        if row_path.exists():
            continue

        subset = raster.get_intersection_area(df, row['geometry'])
        if subset is not None:
            subset = subset[subset["intersection_area"] > 0].copy()
        else:
            # Still write a (column-less) file so the shape counts as processed.
            subset = pd.DataFrame()
        subset.to_csv(row_path, index=False)
    return 0

n_processes = 15

# Shuffle the shape labels before splitting them
# (presumably to even out the work per batch - confirm).
random_index = list(geo_df.index)
np.random.shuffle(random_index)

# Cut the shuffled labels into consecutive chunks of batch_size labels.
batch_size = len(random_index) // n_processes + 1
indices = []
for start in range(0, len(random_index), batch_size):
    indices.append(random_index[start:start + batch_size])

In [11]:
# Submit one mapper_worker task per batch, then print each result in
# submission order; result() re-raises any exception raised in a worker.
with ProcessPoolExecutor(n_processes) as ppe:
    futures = []
    for batch in indices:
        futures.append(ppe.submit(mapper_worker, batch))
    for task in futures:
        print(task.result())

Separation: Separation:{'lat': 0.25, 'lon': 0.25}
 Separation:Separation:Converting to GeoDataFrame{'lat': 0.25, 'lon': 0.25}Separation:Separation:  

{'lat': 0.25, 'lon': 0.25}{'lat': 0.25, 'lon': 0.25}  
Converting to GeoDataFrame
{'lat': 0.25, 'lon': 0.25}Converting to GeoDataFrame{'lat': 0.25, 'lon': 0.25}
Separation:
Separation:
Separation:
Converting to GeoDataFrameConverting to GeoDataFrame  Converting to GeoDataFrame 

{'lat': 0.25, 'lon': 0.25}{'lat': 0.25, 'lon': 0.25}{'lat': 0.25, 'lon': 0.25}



Converting to GeoDataFrameConverting to GeoDataFrameConverting to GeoDataFrame


Separation:Separation:Separation:Separation:Separation:Separation:      {'lat': 0.25, 'lon': 0.25}{'lat': 0.25, 'lon': 0.25}{'lat': 0.25, 'lon': 0.25}{'lat': 0.25, 'lon': 0.25}{'lat': 0.25, 'lon': 0.25}
{'lat': 0.25, 'lon': 0.25}



Converting to GeoDataFrameConverting to GeoDataFrameConverting to GeoDataFrame
Converting to GeoDataFrameConverting to GeoDataFrame


Converting to GeoDataFrame


0
0
0
0
0


In [None]:
# Gather the per-shape CSVs from location_folder into one lookup table.
# File names are either "<id>.csv" or "<id>_p<digits>.csv".
name_pattern = re.compile(r'(.*)_p(\d+)\.csv')

subsets = []
for file in tqdm(location_folder.glob('*.csv')):
    try:
        subset = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # Column-less placeholder (written when a shape had no intersection
        # result); it carries no rows, so skipping it loses nothing.
        continue

    # One search yields both parts that the file name encodes.
    match = name_pattern.search(file.name)
    if match:
        subset['id'] = match.group(1)
        subset['portion'] = match.group(2)
    else:
        subset['id'] = file.name[:-4]
    subsets.append(subset)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
locs = pd.concat(subsets, ignore_index=True)
locs = locs.drop_duplicates()
locs.to_csv(output_folder / 'loc_map.csv', index=False)
print(locs.shape)
locs.head()

122772it [03:31, 579.19it/s]


# Aggregate

In [3]:
# Reload the location map; missing values become the string 'NA'.
locs = pd.read_csv(output_folder / 'loc_map.csv').fillna('NA')
assert not locs.duplicated().any()

# For every id, the dot-separated prefixes used as adm0 / adm1, and the full
# id itself as adm2 (e.g. 'MEX.16.24_1' -> 'MEX', 'MEX.16', 'MEX.16.24_1').
adm = {
    key: {0: key.split('.')[0], 1: '.'.join(key.split('.')[:2]), 2: key}
    for key in locs['id'].unique()
}

for level in range(3):
    locs[f"adm{level}"] = locs['id'].map(lambda key, level=level: adm[key][level])

locs.sample(5)

Unnamed: 0,lat,lon,intersection_area,id,portion,adm0,adm1,adm2
414108,18.375,-102.875,375279500.0,MEX.16.24_1,,MEX,MEX.16,MEX.16.24_1
387376,44.375,20.125,42795280.0,SRB.7.3_1,,SRB,SRB.7,SRB.7.3_1
451855,-5.875,-38.375,5721834.0,BRA.6.93_1,,BRA,BRA.6,BRA.6.93_1
488565,58.875,34.625,401595500.0,RUS.49.7_1,,RUS,RUS.49,RUS.49.7_1
571830,19.375,-5.625,195795400.0,MLI__BERABISH,24.0,MLI__BERABISH,MLI__BERABISH,MLI__BERABISH


In [6]:
# Grouping columns for each output table: every location level combined with
# every time resolution, keyed as "<location>_<time>".
time_groups = {'yearly': ['year'], 'monthly': ['year', 'month']}
loc_groups = {
    'country': ['adm0'],
    'edo': ['adm0', 'adm1'],
    'mun': ['adm0', 'adm1', 'adm2'],
}
groups = {
    f'{loc_name}_{time_name}': loc_group + time_group
    for loc_name, loc_group in loc_groups.items()
    for time_name, time_group in time_groups.items()
}

# Names of the GPCC_* outputs already present; used to skip finished batches.
done = [Path(existing).name for existing in glob(str(output_folder / 'GPCC_*.csv'))]

files = sorted(glob('../Data/Precipitaciones/full_data_monthly_v2020_*.csv'))
for i, file in enumerate(map(Path, files)):
    print(file.name)

    # The two tokens before the last one in the file name are the first and
    # last year covered by the file.
    name_parts = file.name.split('_')
    from_year, to_year = name_parts[-3:-1]
    full_name = f'GPCC_full_{from_year}_{to_year}.csv'
    if full_name in done:
        continue

    # Attach the shape ids and intersection areas to every grid cell.
    df = pd.read_csv(file).merge(locs, on=['lat', 'lon'])

    # Reshape to long format, weighting by intersection area.
    pivot = raster.weighted_pivot(
        df,
        weight='intersection_area',
        value_name='precipitation',
        id_vars=['adm0', 'adm1', 'adm2'],
    )

    # The return value is unused; the year/month columns of pivot are checked
    # right after the call.
    utils.map_year_month(pivot, 'time', int(from_year), int(to_year))
    missing_time = pivot[['year', 'month']].isnull().sum().sum()
    assert missing_time == 0

    # Aggregated tables for this batch, then the micro data.
    utils.aggregate_by_groups(pivot, groups, output_folder, values=['precipitation'], done=done, batch=i)
    pivot.to_csv(output_folder / full_name, index=False)
    print(pivot.shape)

full_data_monthly_v2020_1891_1900_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:43<00:00, 17.20s/it]


(5774520, 9)
full_data_monthly_v2020_1901_1910_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:39<00:00, 16.66s/it]


(5774520, 9)
full_data_monthly_v2020_1911_1920_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:39<00:00, 16.50s/it]


(5774520, 9)
full_data_monthly_v2020_1921_1930_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:39<00:00, 16.64s/it]


(5774520, 9)
full_data_monthly_v2020_1931_1940_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:38<00:00, 16.43s/it]


(5774520, 9)
full_data_monthly_v2020_1941_1950_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:40<00:00, 16.78s/it]


(5774520, 9)
full_data_monthly_v2020_1951_1960_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:38<00:00, 16.41s/it]


(5774520, 9)
full_data_monthly_v2020_1961_1970_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:40<00:00, 16.79s/it]


(5774520, 9)
full_data_monthly_v2020_1971_1980_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:40<00:00, 16.67s/it]


(5774520, 9)
full_data_monthly_v2020_1981_1990_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:40<00:00, 16.78s/it]


(5774520, 9)
full_data_monthly_v2020_1991_2000_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:39<00:00, 16.58s/it]


(5774520, 9)
full_data_monthly_v2020_2001_2010_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:40<00:00, 16.71s/it]


(5774520, 9)
full_data_monthly_v2020_2011_2019_025.csv


Aggregating: 100%|████████████████████████████████| 6/6 [01:30<00:00, 15.09s/it]


(5197068, 9)


In [7]:
# Join all batches together
# NOTE(review): this cell is destructive and not idempotent. The combined
# "<prefix>.csv" files also match 'GPCC*.csv', so running the cell a second
# time would regroup them under a shorter prefix and delete them.
files = [Path(x) for x in glob(str(output_folder / 'GPCC*.csv'))]
files = [x for x in files if not x.name.startswith('GPCC_full')]

# Group the batch files by their name without the last "_"-separated token.
groups = {}
for file in files:
    prefix = '_'.join(file.name.split('_')[:-1])
    groups.setdefault(prefix, []).append(file)

for prefix, batch_files in tqdm(groups.items()):
    # Everything is read as text so values are written back unchanged.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [pd.read_csv(file, dtype=str) for file in batch_files],
        ignore_index=True,
    )

    # NOTE(review): the index is written as an extra first column here, unlike
    # the other to_csv calls in this notebook - confirm this is intended.
    df.to_csv(output_folder / (prefix + '.csv'))

    # Remove the per-batch files only after the combined file has been written.
    for file in batch_files:
        os.remove(file)

100%|█████████████████████████████████████████████| 6/6 [06:38<00:00, 66.41s/it]
