In [1]:
import os
import pandas as pd
import geopandas as gpd

import yaml

from functools import partial
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.elevation import get_indices_by_file
from projections import raster, utils


pd.set_option('max_columns', None)



In [2]:
def save_location_mapping(row_path):
    """
    Common function used to obtain a mapping of polygons to
    the rasters used by the IMAGE. 
    
    This is not part of projections because it assumes IMAGE 
    exists in the global scope.
    """
    row, path = row_path
    shape = row['geometry']
    
    subset = raster.find_subset_with_intersection_area(IMAGE, shape)

    if subset.empty:
        with open(path, 'w') as f:
            f.write('')
        return

    subset['id'] = row['id']
    
    subset.to_csv(path, index=False)
    
    
def yield_missing_shapes(gdf, save_path, prefix):
    for _, row in gdf.iterrows():
        path = save_path / get_save_file_name(prefix, row)
        if path.exists():
            continue
            
        yield row, path
        

def get_save_file_name(prefix, row):
    portion = f"_p{row['portion']}" if row['portion'] else ''
    return f'{prefix}_{row["id"]}{portion}.csv'

In [3]:
read_path = Path('../Data/Elevation/GTOPO30')
output_path = Path('../Output/Elevation/GTOPO30/')
partial_path = output_path / 'partial'
by_country_path = output_path / 'by_country'

output_path.mkdir(exist_ok=True)
partial_path.mkdir(exist_ok=True)
by_country_path.mkdir(exist_ok=True)

# Map raster to polygons

In [7]:
# Read SHP with all polygons (output of PreprocessingLocs.ipynb, also available in Drive)
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [5]:
indices_file = read_path / 'indices_by_file.yml'
if indices_file.exists():
    with open(indices_file, 'r') as f:
        indices_by_file = yaml.safe_load(f)
else:
    files = list(read_path.glob('gt30e*.tif')) + list(read_path.glob('gt30w*.tif'))
    indices_by_file = get_indices_by_file(geo_df[geo_df['id'] != 'ATA'], files)

    with open(read_path / 'indices_by_file.yml', 'w') as f:
        yaml.dump(indices_by_file, f)

In [6]:
n_processes = 30
for file, indices in indices_by_file.items():
    IMAGE = utils.read_tif(read_path / file)
    iterator = partial(
        yield_missing_shapes, 
        save_path=partial_path, 
        prefix=file[:-4],
    )
    
    if n_processes == 1:
        for row_and_path in tqdm(iterator(geo_df.loc[indices])):
            save_location_mapping(row_and_path)
    else:
        with ThreadPoolExecutor(n_processes) as tpe:
            for _ in tqdm(
                tpe.map(save_location_mapping, iterator(geo_df.loc[indices])), 
                total=geo_df.shape[0]
            ):
                pass

  5%|█▋                              | 6625/122772 [1:45:29<30:49:34,  1.05it/s]
  7%|██▍                             | 9125/122772 [1:02:40<13:00:40,  2.43it/s]
  2%|▌                               | 2124/122772 [1:18:52<74:40:04,  2.23s/it]
  2%|▋                               | 2859/122772 [1:33:58<65:41:41,  1.97s/it]
  4%|█▎                                | 4540/122772 [39:11<17:00:50,  1.93it/s]
  0%|                                  | 6/122772 [18:14<6222:22:14, 182.47s/it]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  5%|█▊                                | 6343/122772 [56:53<17:24:22,  1.86it/s]
  3%|█                                 | 3973/122772 [35:09<17:31:14,  1.88it/s]
  2%|▊                                 | 2845/122772 [19:49<13:55:32,  2.39it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  2%|▌                                 | 1899/122772 [17:54<19:00:07,  1.77it/s]
  2%|▌                      

In [4]:
utils.union_and_save_portions(read_from=partial_path, save_in=by_country_path)

Reading: 116303it [08:50, 219.08it/s]
Saving: 100%|████████████████████████████| 47649/47649 [02:44<00:00, 289.49it/s]


# Aggregate
Read the preprocessed results, aggregate them and save a consolidated file

In [5]:
countries = {}
for file in tqdm(by_country_path.glob('*.feather'), desc='Grouping'):
    df = pd.read_feather(file)
    df = df[df['value'] > -9999].copy()
    if df.empty:
        continue
    df = utils.get_weighted_average(df, value='value', weight='intersection_area', by=['id'])
    country = df.loc[0, 'id']
    countries.setdefault(country, []).append(df)

dfs = [utils.combine_dataframes(country_dfs)
      for country_dfs in countries.values()]
df = utils.combine_dataframes(dfs)
df.rename(columns={'_weighted_value_': 'value'}, inplace=True)
df.to_csv(output_path / 'elevation.csv', index=False)

Grouping: 47649it [06:21, 125.05it/s]
