In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import yaml
from tqdm import tqdm
from pathlib import Path
from glob import glob
from functools import partial
from multiprocessing import Pool
import os

import shapefile
import shapely
from shapely.geometry import Point

import sys
# Put the notebook's parent directory on the import path so that the local
# `projections` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from projections.shapefiles import load_shapes, iter_records
from projections.models import Records
from projections import raster, utils


# Show every column when displaying DataFrames.
# The full option key is required: the short form 'max_columns' is treated as a
# pattern and, on recent pandas, matches more than one option (OptionError).
pd.set_option('display.max_columns', None)



In [2]:
# Column names shared by the cells below.
lon, lat, country = 'lon', 'lat', 'country'

# Destination folders: administrative-unit outputs and ethnic-group outputs.
output_folder, ethnic_folder = (
    Path('../Output/Precipitaciones/Raster/GPCC/'),
    Path('../Output/Precipitaciones/Raster/GPCC_ethnic/'),
)

# Load data

In [3]:
# Collect the (lon, lat) coordinates found in every GPCC monthly file.
coord_cols = ['lon', 'lat']

# Sorted so the row order (and therefore the preview below) is reproducible.
data_files = sorted(glob('../Data/Precipitaciones/full_data_monthly_v2020_*.csv'))
if not data_files:
    raise FileNotFoundError(
        'No files match ../Data/Precipitaciones/full_data_monthly_v2020_*.csv'
    )

# Only the coordinates are needed here, so skip parsing the other columns.
# The trailing [coord_cols] pins the column order to lon, lat.
dfs = [pd.read_csv(file, usecols=coord_cols)[coord_cols] for file in tqdm(data_files)]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Each file keeps its own row index, exactly as append() did.
df = pd.concat(dfs)
del dfs

print(df.shape)
print('Unique locs', df.drop_duplicates(['lon', 'lat']).shape)
df.head()

100%|██████████| 13/13 [01:09<00:00,  5.36s/it]


(3787420, 2)
Unique locs (291340, 2)


Unnamed: 0,lon,lat
0,-120.125,85.875
1,-119.875,85.875
2,-119.625,85.875
3,-120.125,85.625
4,-119.875,85.625


In [3]:
# Load the cached table of unique grid locations, or build it from `df` (the
# coordinates loaded in the previous cell) and cache it for the next run.
unique_locs_path = output_folder / 'unique_locs_gpcc.csv'
try:
    locs = pd.read_csv(unique_locs_path)
except FileNotFoundError:
    print('File not found, loading locs from df')
    locs = (
        df[[lon, lat]]
        .drop_duplicates()
        # Match the RangeIndex that read_csv yields on the cached path.
        .reset_index(drop=True)
        # Placeholder administrative columns.
        .assign(adm0=np.nan, adm1=np.nan, adm2=np.nan)
    )
    # A fresh checkout may not have the output folder yet.
    output_folder.mkdir(parents=True, exist_ok=True)
    locs.to_csv(unique_locs_path, index=False)

# Uses the column-name constants from the configuration cell instead of
# repeating the literals. The cell output shows geometry/raster columns being
# added by this helper.
locs = raster.create_by_separation(locs, lat=lat, lon=lon)
locs.head()

Separation: {'lat': 0.25, 'lon': 0.25}
Converting to GeoDataFrame


Unnamed: 0,lon,lat,adm0,adm1,adm2,geometry,raster
0,-120.125,85.875,,,,POINT (-120.12500 85.87500),"POLYGON ((-120.00000 86.00000, -120.00000 85.7..."
1,-119.875,85.875,,,,POINT (-119.87500 85.87500),"POLYGON ((-119.75000 86.00000, -119.75000 85.7..."
2,-119.625,85.875,,,,POINT (-119.62500 85.87500),"POLYGON ((-119.50000 86.00000, -119.50000 85.7..."
3,-120.125,85.625,,,,POINT (-120.12500 85.62500),"POLYGON ((-120.00000 85.75000, -120.00000 85.5..."
4,-119.875,85.625,,,,POINT (-119.87500 85.62500),"POLYGON ((-119.75000 85.75000, -119.75000 85.5..."


# ADM

In [4]:
# Load the preprocessed administrative shapefiles (slow: minutes, see progress bar).
preprocessed_shapes_dir = '../Shapefiles/preprocessed'
shapes = load_shapes(preprocessed_shapes_dir)

100%|██████████| 276/276 [02:18<00:00,  1.99it/s]


In [6]:
# Map raster cells to administrative units: for every polygon yielded by
# `iter_records(shapes)`, collect the rows returned by
# `raster.intersection_ratio` and tag them with the polygon's record fields.
result_parts = []
for pol, record in iter_records(shapes):
    subdf = raster.intersection_ratio(locs, pol)
    # Skip polygons for which no intersection rows came back (same guard as
    # the Ethnic section uses).
    if subdf is None or subdf.empty:
        continue

    # Add record information
    subdf['adm0'] = record[0]
    subdf['adm1'] = record[1]
    subdf['adm2'] = record[2]

    result_parts.append(subdf)

print('Appending')
if not result_parts:
    raise ValueError('No polygon produced any intersection rows; nothing to save')

# pd.concat replaces DataFrame.append (removed in pandas 2.0). Duplicates are
# dropped without in-place mutation; the index keeps its gaps as before.
results = pd.concat(result_parts, ignore_index=True).drop_duplicates()

print('Saving')
results.to_csv(output_folder / 'loc_map.csv', index=False)
print(results.shape)
results.head()

Separation: {'lat': 0.25, 'lon': 0.25}
Converting to GeoDataFrame


Mapping polygons: 100%|██████████| 147767/147767 [3:42:03<00:00, 11.09it/s]      


Appending
Saving
(357836, 6)


Unnamed: 0,lat,lon,intersection_ratio,adm0,adm1,adm2
0,6.875,72.875,0.004304,MDV,,
1,6.875,73.125,0.024442,MDV,,
2,6.625,72.875,0.008656,MDV,,
3,6.625,73.125,0.020311,MDV,,
4,6.375,72.875,0.004036,MDV,,


# Aggregate

In [3]:
# Location -> administrative-unit table written by the ADM section.
# It gets its own name so that `locs` (the raster GeoDataFrame that
# `raster.intersection_ratio` is called with, here and in the Ethnic section)
# is not overwritten when the notebook runs top to bottom.
# Missing adm levels become the literal string 'NA'.
loc_map = pd.read_csv(output_folder / 'loc_map.csv').fillna('NA')
assert loc_map.duplicated().sum() == 0


# Every (location level x time resolution) combination to aggregate by.
time_groups = {'yearly': ['year'], 'monthly': ['year', 'month']}
loc_groups = {'country': ['adm0'],
              'edo': ['adm0', 'adm1'],
              'mun': ['adm0', 'adm1', 'adm2']}

groups = {
    f'{loc_name}_{time_name}': loc_group + time_group
    for loc_name, loc_group in loc_groups.items()
    for time_name, time_group in time_groups.items()
}

# Outputs already on disk; used to skip periods that were processed before.
done = [Path(x).name for x in glob(str(output_folder / 'GPCC_*.csv'))]

files = sorted(glob('../Data/Precipitaciones/full_data_monthly_v2020_*.csv'))
for i, file in enumerate(files):
    file = Path(file)
    print(file.name)

    # File names end in ..._<from_year>_<to_year>_025.csv
    from_year, to_year = file.name.split('_')[-3:-1]
    full_name = f'GPCC_full_{from_year}_{to_year}.csv'
    if full_name in done:
        continue

    # Read file and merge locations back (inner join: grid points that are not
    # in loc_map are dropped)
    df = pd.read_csv(file)
    df = df.merge(loc_map, on=['lat', 'lon'])

    # Reshape to long
    pivot = raster.weighted_pivot(df, value_name='precipitation')

    # Format time; the assert below relies on `pivot` having 'year' and
    # 'month' columns after this call.
    utils.map_year_month(pivot, 'time', int(from_year), int(to_year))
    assert pivot[['year', 'month']].isnull().sum().sum() == 0

    # Compute and save aggregations
    utils.aggregate_by_groups(pivot, groups, output_folder, values=['precipitation'], done=done, batch=i)

    # Save micro data
    pivot.to_csv(output_folder / full_name, index=False)
    print(pivot.shape)

full_data_monthly_v2020_1891_1900_025.csv
full_data_monthly_v2020_1901_1910_025.csv


Aggregating: 100%|██████████| 6/6 [00:58<00:00,  9.78s/it]


(3958200, 9)
full_data_monthly_v2020_1911_1920_025.csv


Aggregating: 100%|██████████| 6/6 [01:01<00:00, 10.24s/it]


(3958200, 9)
full_data_monthly_v2020_1921_1930_025.csv


Aggregating: 100%|██████████| 6/6 [01:02<00:00, 10.35s/it]


(3958200, 9)
full_data_monthly_v2020_1931_1940_025.csv


Aggregating: 100%|██████████| 6/6 [01:01<00:00, 10.19s/it]


(3958200, 9)
full_data_monthly_v2020_1941_1950_025.csv


Aggregating: 100%|██████████| 6/6 [01:03<00:00, 10.58s/it]


(3958200, 9)
full_data_monthly_v2020_1951_1960_025.csv


Aggregating: 100%|██████████| 6/6 [01:02<00:00, 10.37s/it]


(3958200, 9)
full_data_monthly_v2020_1961_1970_025.csv


Aggregating: 100%|██████████| 6/6 [01:02<00:00, 10.39s/it]


(3958200, 9)
full_data_monthly_v2020_1971_1980_025.csv


Aggregating: 100%|██████████| 6/6 [01:02<00:00, 10.34s/it]


(3958200, 9)
full_data_monthly_v2020_1981_1990_025.csv


Aggregating: 100%|██████████| 6/6 [01:04<00:00, 10.71s/it]


(3958200, 9)
full_data_monthly_v2020_1991_2000_025.csv


Aggregating: 100%|██████████| 6/6 [01:05<00:00, 10.89s/it]


(3958200, 9)
full_data_monthly_v2020_2001_2010_025.csv


Aggregating: 100%|██████████| 6/6 [01:03<00:00, 10.54s/it]


(3958200, 9)
full_data_monthly_v2020_2011_2019_025.csv


Aggregating: 100%|██████████| 6/6 [00:56<00:00,  9.49s/it]


(3562380, 9)


In [4]:
# Join all batches together: per-batch aggregation files that share a prefix
# (file name minus its last '_' token) are concatenated into `<prefix>.csv`,
# and the batch files are then deleted.
#
# NOTE(review): the joined files also match 'GPCC*.csv'. Running this cell a
# second time would strip their last '_' token as well, merge them under a
# shorter prefix and delete them. Run it only once per aggregation run.
batch_files = [Path(x) for x in glob(str(output_folder / 'GPCC*.csv'))]
batch_files = [x for x in batch_files if not x.name.startswith('GPCC_full')]

batches_by_prefix = {}
for file in batch_files:
    prefix = '_'.join(file.name.split('_')[:-1])
    batches_by_prefix.setdefault(prefix, []).append(file)

for prefix, prefix_files in tqdm(batches_by_prefix.items()):
    # Read everything as text so values are written back unchanged.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [pd.read_csv(file, dtype=str) for file in prefix_files],
        ignore_index=True,
    )

    # NOTE(review): unlike the other outputs, this file is written WITH the
    # index column; kept as is so existing consumers are not affected.
    df.to_csv(output_folder / (prefix + '.csv'))

    # Batch files are removed only after the joined file has been written.
    for file in prefix_files:
        os.remove(file)

100%|██████████| 6/6 [05:18<00:00, 53.17s/it]


# Ethnic

In [4]:
# Load the preprocessed tribe / adm0 shapefile and preview it.
tribe_adm0_shp = '../Shapefiles/ethnic_preprocessed/tribe_adm0.shp'
adm = gpd.read_file(tribe_adm0_shp)
adm.head()

Unnamed: 0,NAME,TRIBE_CODE,LAT,LON,GID_0,NAME_0,area_tribe,area_adm,area_inter,geometry
0,GUANCHE,250,28.3354,-15.6735,ESP,Spain,7485274000.0,506043800000.0,7112255000.0,"MULTIPOLYGON (((-17.89487 27.78681, -17.89514 ..."
1,JEBALA,312,34.8506,-5.28036,ESP,Spain,16737560000.0,506043800000.0,20285420.0,"MULTIPOLYGON (((-5.37708 35.91704, -5.37708 35..."
2,RIF,651,34.7908,-3.71829,ESP,Spain,20272920000.0,506043800000.0,7560911.0,"MULTIPOLYGON (((-2.92593 35.29208, -2.92708 35..."
3,ADANGME,6,6.076851,0.270457,GHA,Ghana,4986379000.0,238324300000.0,4803454000.0,"MULTIPOLYGON (((0.69465 5.77336, 0.69328 5.775..."
4,ADELE,8,8.244284,0.673651,GHA,Ghana,1413803000.0,238324300000.0,677916100.0,"POLYGON ((0.45975 8.06680, 0.46512 8.07837, 0...."


In [6]:
# Find intersections per shape: for every row of `adm`, collect the rows
# returned by `raster.intersection_ratio` and tag them with the identifiers
# of that row.
ethnic_parts = []
for idx, row in tqdm(adm.iterrows(), total=adm.shape[0]):
    subdf = raster.intersection_ratio(locs, row['geometry'])

    # Skip shapes for which no intersection rows came back.
    if subdf is None or subdf.empty:
        continue

    for col in ('NAME', 'TRIBE_CODE', 'GID_0'):
        subdf[col] = row[col]

    ethnic_parts.append(subdf)

if not ethnic_parts:
    raise ValueError('No shape produced any intersection rows; nothing to save')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
ethnic_df = pd.concat(ethnic_parts, ignore_index=True)

# A fresh checkout may not have the ethnic output folder yet.
ethnic_folder.mkdir(parents=True, exist_ok=True)
ethnic_df.to_csv(ethnic_folder / 'ethnic_map.csv', index=False)

100%|██████████| 1412/1412 [12:35<00:00,  1.87it/s]  


In [7]:
# Location -> ethnic-group table written by the previous cell.
ethnic_df = pd.read_csv(ethnic_folder / 'ethnic_map.csv')
assert ethnic_df.duplicated().sum() == 0

# Every (group x time resolution) combination to aggregate by.
time_groups = {'yearly': ['year'], 'monthly': ['year', 'month']}
loc_groups = {'ethnic': ['NAME', 'TRIBE_CODE', 'GID_0']}

groups = {
    f'{loc_name}_{time_name}': loc_group + time_group
    for loc_name, loc_group in loc_groups.items()
    for time_name, time_group in time_groups.items()
}

# The original cell listed the existing outputs and then immediately replaced
# the list with [], so every period was always recomputed. That choice is now
# an explicit switch. Set it to False to resume an interrupted run: periods
# whose GPCC_full_*.csv already exists in `ethnic_folder` are then skipped.
recompute_all = True
if recompute_all:
    done = []
else:
    done = [Path(x).name for x in glob(str(ethnic_folder / 'GPCC_*.csv'))]

files = sorted(glob('../Data/Precipitaciones/full_data_monthly_v2020_*.csv'))
for i, file in enumerate(files):
    file = Path(file)
    print(file.name)

    # File names end in ..._<from_year>_<to_year>_025.csv
    from_year, to_year = file.name.split('_')[-3:-1]
    full_name = f'GPCC_full_{from_year}_{to_year}.csv'
    if full_name in done:
        continue

    # Read file and merge locations back (inner join: grid points that are not
    # in ethnic_df are dropped)
    df = pd.read_csv(file)
    df = df.merge(ethnic_df, on=['lat', 'lon'])

    # Reshape to long
    # NOTE(review): the ADM loc_map produced by the same intersection helper
    # shows an 'intersection_ratio' column; confirm that ethnic_map.csv really
    # carries the 'intersection_area' column used as weight here.
    pivot = raster.weighted_pivot(
        df,
        value_name='precipitation',
        weight='intersection_area',
        id_vars=('NAME', 'TRIBE_CODE', 'GID_0')
    )

    # Format time; the assert below relies on `pivot` having 'year' and
    # 'month' columns after this call.
    utils.map_year_month(pivot, 'time', int(from_year), int(to_year))
    assert pivot[['year', 'month']].isnull().sum().sum() == 0

    # Compute and save aggregations
    utils.aggregate_by_groups(pivot, groups, ethnic_folder, values=['precipitation'], done=done, batch=i)

    # Save micro data
    pivot.to_csv(ethnic_folder / full_name, index=False)
    print(pivot.shape)

full_data_monthly_v2020_1891_1900_025.csv


Aggregating: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]


(169440, 9)
full_data_monthly_v2020_1901_1910_025.csv


Aggregating: 100%|██████████| 2/2 [00:02<00:00,  1.10s/it]


(169440, 9)
full_data_monthly_v2020_1911_1920_025.csv


Aggregating: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]


(169440, 9)
full_data_monthly_v2020_1921_1930_025.csv


Aggregating: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]


(169440, 9)
full_data_monthly_v2020_1931_1940_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]


(169440, 9)
full_data_monthly_v2020_1941_1950_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


(169440, 9)
full_data_monthly_v2020_1951_1960_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]


(169440, 9)
full_data_monthly_v2020_1961_1970_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


(169440, 9)
full_data_monthly_v2020_1971_1980_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]


(169440, 9)
full_data_monthly_v2020_1981_1990_025.csv


Aggregating: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]


(169440, 9)
full_data_monthly_v2020_1991_2000_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.03it/s]


(169440, 9)
full_data_monthly_v2020_2001_2010_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]


(169440, 9)
full_data_monthly_v2020_2011_2019_025.csv


Aggregating: 100%|██████████| 2/2 [00:01<00:00,  1.13it/s]


(152496, 9)


In [14]:
# Join all batches together: per-batch aggregation files that share a prefix
# (file name minus its last '_' token, with a doubled '_ethnic_ethnic'
# collapsed to '_ethnic') are concatenated into `<prefix>.csv`, and the batch
# files are then deleted.
#
# NOTE(review): the joined files also match 'GPCC*.csv'. Running this cell a
# second time would strip their last '_' token as well, merge them under a
# shorter prefix and delete them. Run it only once per aggregation run.
batch_files = [Path(x) for x in glob(str(ethnic_folder / 'GPCC*.csv'))]
batch_files = [x for x in batch_files if not x.name.startswith('GPCC_full')]

batches_by_prefix = {}
for file in batch_files:
    prefix = '_'.join(file.name.split('_')[:-1]).replace('_ethnic_ethnic', '_ethnic')
    batches_by_prefix.setdefault(prefix, []).append(file)

for prefix, prefix_files in tqdm(batches_by_prefix.items()):
    # Read everything as text so values are written back unchanged.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [pd.read_csv(file, dtype=str) for file in prefix_files],
        ignore_index=True,
    )

    # NOTE(review): unlike the other outputs, this file is written WITH the
    # index column; kept as is so existing consumers are not affected.
    df.to_csv(ethnic_folder / (prefix + '.csv'))

    # Batch files are removed only after the joined file has been written.
    for file in prefix_files:
        os.remove(file)

100%|██████████| 2/2 [00:11<00:00,  5.78s/it]
