In [1]:
import pandas as pd
import numpy as np
import yaml
from tqdm import tqdm
from pathlib import Path
from glob import glob
from functools import partial
from multiprocessing import Pool
import os

import shapefile
import shapely
from shapely.geometry import Point

# Show every column when displaying wide frames. The fully qualified key is
# required: the bare 'max_columns' pattern is ambiguous in recent pandas (it
# also matches 'styler.render.max_columns') and raises OptionError there.
pd.set_option('display.max_columns', None)

In [2]:
# Names of the coordinate / country columns used by the location-matching cells.
lon, lat, country = 'lon', 'lat', 'country'

# Folder where the matched locations and the enriched GPCC files are written.
output_folder = Path('..') / 'Output' / 'Precipitaciones' / 'GPCC'

In [None]:
# Collect the (lon, lat) coordinates found in every raw GPCC file.
csv_pattern = '../Data/Precipitaciones/full_data_monthly_v2020_*.csv'

# Only the two coordinate columns are parsed (usecols) instead of loading each
# full file and discarding the rest; the trailing selection fixes column order.
dfs = [
    pd.read_csv(file, usecols=['lon', 'lat'])[['lon', 'lat']]
    for file in tqdm(glob(csv_pattern))
]
if not dfs:
    raise FileNotFoundError(f'No GPCC files match {csv_pattern}')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives the combined frame unique row labels; without it
# every file contributes its own 0..n-1 labels.
df = pd.concat(dfs, ignore_index=True)
del dfs

print(df.shape)
print('Unique locs', df.drop_duplicates(['lon', 'lat']).shape)
df.head()

In [4]:
# Resume from the checkpoint written by the matching cell when it exists;
# otherwise start from the unique (lon, lat) pairs of the raw files.
try:
    locs = pd.read_csv(output_folder / 'unique_locs_gpcc.csv')
except FileNotFoundError:
    print('File not found, loading locs from df')
    # The matching cell writes results back with locs.loc[row_label, ...], so
    # every location needs a unique label: reset the index inherited from df.
    locs = df[[lon, lat]].drop_duplicates().reset_index(drop=True)
    # Administrative names are strings; create the empty columns as object
    # dtype so later assignments do not have to upcast a float NaN column.
    for adm_col in ('adm0', 'adm1', 'adm2'):
        locs[adm_col] = pd.Series(np.nan, index=locs.index, dtype=object)
    locs['nearest_loc'] = False

File not found, loading locs from df


In [None]:
# Build a lookup: country code -> list of (geometry, *record_fields) tuples,
# one tuple per shape stored in the preprocessed shapefiles.
shps = glob('../Shapefiles/preprocessed/*.shp')
shapes_by_country = {}
for shp_path in tqdm(shps):
    # A file name may carry several codes separated by '_'; each code gets
    # the full list of shapes of that file.
    codes = Path(shp_path).name.replace('.shp', '').split('_')

    # The context manager closes the reader's file handles once the
    # geometries and records have been materialised.
    with shapefile.Reader(shp_path) as reader:
        shapes = [shapely.geometry.shape(s) for s in reader.shapes()]
        records = reader.records()

    entries = [(shape, *record) for shape, record in zip(shapes, records)]
    for code in codes:
        # A code seen again in a later file replaces the earlier entry.
        shapes_by_country[code] = list(entries)

assert 'GT' in shapes_by_country and 'PM' in shapes_by_country, \
    "expected country codes 'GT' and 'PM' among the loaded shapefiles"

  6%|▌         | 17/276 [00:17<10:52,  2.52s/it]

In [7]:
def country_shapes(country=None):
    """Yield the shape tuples to test a point against.

    When ``country`` is a key of the module-level ``shapes_by_country`` only
    that country's entries are yielded (and a short message is printed);
    for ``None`` or an unknown code the entries of every country are yielded.
    """
    is_known = country is not None and country in shapes_by_country
    if not is_known:
        # No usable hint: fall back to scanning all countries.
        for entries in shapes_by_country.values():
            yield from entries
        return

    print('Found country', country)
    yield from shapes_by_country[country]
            
            
def match_single_geo(row, lon_col, lat_col, country_col):
    """Find the administrative units for one location row.

    Parameters
    ----------
    row : pandas.Series or tuple
        A row of the locations frame, or an ``(index, row)`` pair as
        produced by ``DataFrame.iterrows()``.
    lon_col, lat_col, country_col : str
        Names of the longitude, latitude and country-hint fields in ``row``.

    Returns
    -------
    dict
        Always holds ``'idx'`` (the row label) and ``'nearest_loc'``.
        ``'adm0'``, ``'adm1'`` and ``'adm2'`` are present whenever a shape
        was found; ``'nearest_loc'`` is True when the match is the closest
        shape rather than one the point lies within.
    """
    # iterrows() hands over (index, Series) pairs; keep only the Series.
    if isinstance(row, tuple):
        row = row[1]

    row_id = row.name

    # A string in 'adm1' means the row was matched before: echo what is stored.
    if isinstance(row['adm1'], str):
        return dict(nearest_loc=row['nearest_loc'],
                    adm1=row['adm1'],
                    adm2=row['adm2'],
                    adm0=row['adm0'],
                    idx=row_id)

    results = {'nearest_loc': False, 'idx': row_id}
    point = Point((float(row[lon_col]), float(row[lat_col])))
    country_hint = row[country_col]

    # First pass: take the first shape the point lies within.
    containing = next(
        ((adm0, adm1, adm2)
         for geometry, adm0, adm1, adm2 in country_shapes(country_hint)
         if point.within(geometry)),
        None,
    )
    if containing is not None:
        results['adm0'] = containing[0]
        results['adm2'] = containing[2]
        results['adm1'] = containing[1]
        return results

    # Second pass: no shape contains the point, so keep the closest one
    # (the first one encountered wins on ties).
    closest = None
    closest_distance = np.inf
    for geometry, adm0, adm1, adm2 in country_shapes(country_hint):
        gap = point.distance(geometry)
        if gap < closest_distance:
            closest_distance = gap
            closest = (adm0, adm1, adm2)

    if closest:
        results['nearest_loc'] = True
        results['adm0'], results['adm1'], results['adm2'] = closest

    return results
            
            
# Match every still-unmatched location to an administrative unit in parallel,
# checkpointing the results to CSV so an interrupted run can be resumed.

# Placeholder country hint; presumably it matches no key of shapes_by_country,
# in which case country_shapes() scans the shapes of every country.
locs[country] = 'no country'
# Administrative names are strings: make sure the target columns can hold them
# even when they were created (or read back from CSV) as all-NaN floats.
locs = locs.astype({'adm0': object, 'adm1': object, 'adm2': object})

mask = locs['adm0'].isnull()
pool_size = 30
f = partial(match_single_geo, lon_col=lon, lat_col=lat, country_col=country)

# Create the destination up front so the first checkpoint cannot fail on a
# missing folder after the workers have already done part of the job.
output_folder.mkdir(parents=True, exist_ok=True)
checkpoint_path = output_folder / 'unique_locs_gpcc.csv'

# NOTE(review): the workers need match_single_geo and shapes_by_country from
# this notebook; this presumably relies on the 'fork' start method — confirm
# before running on a platform that spawns worker processes.
with Pool(pool_size) as p:
    matches = tqdm(p.imap_unordered(f, locs[mask].iterrows()),
                   total=mask.sum())
    for i, result in enumerate(matches, start=1):
        # Results arrive in arbitrary order; 'idx' is the row label in locs.
        idx = result['idx']
        locs.loc[idx, 'adm2'] = result.get('adm2')
        locs.loc[idx, 'adm1'] = result.get('adm1')
        locs.loc[idx, 'adm0'] = result.get('adm0')
        locs.loc[idx, 'nearest_loc'] = result['nearest_loc']

        # Periodic checkpoint every 10000 processed rows.
        if i % 10000 == 0:
            locs.to_csv(checkpoint_path, index=False)

locs.to_csv(checkpoint_path, index=False)
print('Total nearest:', locs['nearest_loc'].sum())

100%|██████████| 284913/284913 [6:07:30<00:00, 12.92it/s]   


Total nearest: 45031


In [8]:
# Attach the matched administrative units to every raw GPCC file.

# The placeholder country column is not part of the output. errors='ignore'
# keeps the cell re-runnable: a second run no longer raises KeyError because
# the column is already gone.
locs = locs.drop(columns='country', errors='ignore')
for file in tqdm(glob('../Data/Precipitaciones/full_data_monthly_v2020_*.csv')):
    file = Path(file)
    df = pd.read_csv(file)

    # validate='m:1' makes pandas raise if locs holds a (lat, lon) pair twice.
    df = df.merge(locs, on=['lat', 'lon'], how='left', validate='m:1')
    # The row index is written too and shows up as 'Unnamed: 0' when the file
    # is read back.
    # NOTE(review): a later cell reads files with these names from
    # GPCC_preprocessing, not from this folder — presumably they are moved by
    # hand; confirm.
    df.to_csv(output_folder / file.name)
    del df

100%|██████████| 13/13 [11:53<00:00, 54.91s/it]


# Aggregate

In [3]:
def clean_time(df, from_, to_):
    """Add integer ``year`` and ``month`` columns to ``df`` in place.

    ``df['time']`` is split on '-' into a year part and a month part. The
    year part is looked up among the last two digits of every year in
    ``from_``..``to_`` (inclusive) to recover the full year; year parts with
    no entry in that lookup become NaN. Nothing is returned.
    """
    parts = df['time'].str.split('-', expand=True)
    parts.columns = ['year', 'month']

    # Two-digit suffix -> full year, e.g. '91' -> 1891 for from_=1891.
    full_year_by_suffix = dict(
        (str(full_year)[2:], full_year) for full_year in range(from_, to_ + 1)
    )

    df['year'] = parts['year'].map(full_year_by_suffix)
    df['month'] = pd.to_numeric(parts['month'])

In [4]:
# Folder holding the intermediate files of the aggregation stage.
preprocessing_folder = Path('../Output/Precipitaciones/GPCC_preprocessing')

# Grouping columns per temporal and per spatial granularity.
time_groups = dict(yearly=['year'], monthly=['year', 'month'])
loc_groups = dict(country=['adm0'],
                  edo=['adm0', 'adm1'],
                  mun=['adm0', 'adm1', 'adm2'])

# Every spatial x temporal combination, keyed '<loc>_<time>', mapped to the
# full list of group-by columns (spatial columns first).
groups = {
    f'{loc_name}_{time_name}': loc_group + time_group
    for loc_name, loc_group in loc_groups.items()
    for time_name, time_group in time_groups.items()
}

See (computing the percentage of a polygon's area that intersects another polygon): https://gis.stackexchange.com/questions/251812/returning-percentage-of-area-of-polygon-intersecting-another-polygon-using-shape

In [5]:
# Convert every merged CSV into a long-format feather file (one row per
# location and time step), skipping files that were already converted.
files = sorted(glob(str(preprocessing_folder / 'full_data_monthly_v2020_*.csv')))
done = set(Path(x) for x in glob(str(preprocessing_folder / 'full_data_monthly_v2020_*.feather')))
for file in files:
    feather = Path(file).with_suffix('.feather')
    if feather in done:
        continue

    print('Reading', file)
    df = pd.read_csv(file)
    # 'Unnamed: 0' is the index column left by a to_csv() call that wrote the
    # index; tolerate files that do not carry it.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    # Every column that is not an identifier becomes a (time, value) row.
    df = df.melt(id_vars=['lon', 'lat', 'adm0', 'adm1', 'adm2', 'nearest_loc'],
                 var_name='time')

    print('Getting time')
    # File names end in ..._<from>_<to>_<suffix>; parse the stem only so that
    # underscores in the folder path can never shift the fields.
    file_parts = Path(file).stem.split('_')
    from_ = int(file_parts[-3])
    to_ = int(file_parts[-2])
    clean_time(df, from_, to_)

    df.to_feather(feather)

Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1891_1900_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1901_1910_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1911_1920_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1921_1930_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1931_1940_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1941_1950_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1951_1960_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1961_1970_025.csv
Getting time
Reading ../Output/Precipitaciones/GPCC_preprocessing/full_data_monthly_v2020_1971_1980_025.csv
Getting time
Reading ../Output/Precipitac

In [7]:
def _flatten_columns(columns):
    """Join each (column, statistic) label as 'statistic_column', skipping empty levels."""
    return ['_'.join([level for level in reversed(labels) if level])
            for labels in columns]


# Statistics computed on 'value'. String names pin the pandas group-by
# implementations; passing the numpy callables is deprecated in pandas 2.x.
value_stats = ['mean', 'median', 'std', 'count']

# Sorted so that the batch number in the output names is reproducible.
files = sorted(preprocessing_folder.glob('full_data_monthly_v2020_*.feather'))

for i, file in enumerate(files):
    df = pd.read_feather(file)

    for group_name, group in tqdm(groups.items(), desc=file.name):
        fname = f'GPCC_{group_name}_b{i}.feather'

        # All locations: statistics of 'value' plus the share of rows that
        # were matched to the nearest shape.
        pivot = pd.pivot_table(df,
                               values=['value', 'nearest_loc'],
                               index=group,
                               aggfunc={'value': value_stats, 'nearest_loc': 'mean'})
        pivot.columns = _flatten_columns(pivot.columns)
        pivot.reset_index().to_feather(preprocessing_folder / fname)

        # "nnl" variant: only rows that were not matched by nearest shape.
        pivot = pd.pivot_table(df[~df['nearest_loc']],
                               values=['value'],
                               index=group,
                               aggfunc={'value': value_stats})
        # NOTE(review): this column ends up named 'nearest_loc_mean' while the
        # file above gets 'mean_nearest_loc' — confirm the difference is wanted.
        pivot['nearest_loc_mean'] = 0
        pivot.columns = _flatten_columns(pivot.columns)
        pivot.reset_index().to_feather(preprocessing_folder / f'GPCC_nnl_{group_name}_b{i}.feather')

    del df

full_data_monthly_v2020_1891_1900_025.feather: 100%|█| 6/6 [02:50<00:00, 28.44s/
full_data_monthly_v2020_1901_1910_025.feather: 100%|█| 6/6 [02:52<00:00, 28.75s/
full_data_monthly_v2020_1911_1920_025.feather: 100%|█| 6/6 [02:58<00:00, 29.71s/
full_data_monthly_v2020_1921_1930_025.feather: 100%|█| 6/6 [03:09<00:00, 31.55s/
full_data_monthly_v2020_1931_1940_025.feather: 100%|█| 6/6 [02:50<00:00, 28.47s/
full_data_monthly_v2020_1941_1950_025.feather: 100%|█| 6/6 [02:42<00:00, 27.01s/
full_data_monthly_v2020_1951_1960_025.feather: 100%|█| 6/6 [02:41<00:00, 26.97s/
full_data_monthly_v2020_1961_1970_025.feather: 100%|█| 6/6 [02:39<00:00, 26.57s/
full_data_monthly_v2020_1971_1980_025.feather: 100%|█| 6/6 [02:40<00:00, 26.77s/
full_data_monthly_v2020_1981_1990_025.feather: 100%|█| 6/6 [02:47<00:00, 27.97s/
full_data_monthly_v2020_1991_2000_025.feather: 100%|█| 6/6 [02:53<00:00, 28.90s/
full_data_monthly_v2020_2001_2010_025.feather: 100%|█| 6/6 [03:03<00:00, 30.53s/
full_data_monthly_v2020_2011

In [8]:
# Combine the per-batch aggregates into one CSV per grouping and variant.
files = list(preprocessing_folder.glob('GPCC*.feather'))
output_folder = Path('../Output/Precipitaciones/GPCC')

# Group the batch files by everything before the trailing '_b<i>' part.
files_by_prefix = {}
for file in files:
    prefix = '_'.join(file.name.split('_')[:-1])
    files_by_prefix.setdefault(prefix, []).append(file)

for group_name, group in groups.items():
    for pr in ('GPCC_nnl', 'GPCC'):
        prefix = f'{pr}_{group_name}'
        print(prefix)
        batch_files = files_by_prefix[prefix]

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df = pd.concat([pd.read_feather(file) for file in batch_files],
                       ignore_index=True)

        # NOTE(review): averaging per-batch statistics is only exact when no
        # group spans two batch files — presumably true because every grouping
        # includes 'year' and the batches cover disjoint years; confirm.
        df = df.groupby(group).mean().reset_index()

        df.to_csv(output_folder / (prefix + '.csv'), index=False)

#         for file in batch_files:
#             os.remove(file)

GPCC_nnl_country_yearly
GPCC_country_yearly
GPCC_nnl_country_monthly
GPCC_country_monthly
GPCC_nnl_edo_yearly
GPCC_edo_yearly
GPCC_nnl_edo_monthly
GPCC_edo_monthly
GPCC_nnl_mun_yearly
GPCC_mun_yearly
GPCC_nnl_mun_monthly
GPCC_mun_monthly


In [9]:
# Sanity check of the years present in the combined output. `df` is the frame
# left over from the last iteration of the previous cell's loop.
df['year'].unique()

array([1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901,
       1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912,
       1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923,
       1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934,
       1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945,
       1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956,
       1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
       1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978,
       1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
       1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019])