In [None]:
import os
import pandas as pd
import geopandas as gpd

import yaml

from functools import partial
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.elevation import get_indices_by_file
from projections import raster, utils


pd.set_option('max_columns', None)

In [8]:
def save_location_mapping(row_and_path):
    row, path = row_and_path
    shape = row['geometry']
    
    subset = raster.find_subset_with_intersection_area(IMAGE, shape)

    if subset.empty:
        with open(path, 'w') as f:
            f.write('')
        return

    subset['id'] = row['id']
    
    subset.to_csv(path, index=False)
    
    
def yield_missing_shapes(gdf, save_path, prefix):
    for _, row in gdf.iterrows():
        path = save_path / get_save_file_name(prefix, row)
        if path.exists():
            continue
            
        yield row, path
        

def get_save_file_name(prefix, row):
    portion = f"_p{row['portion']}" if row['portion'] else ''
    return f'{prefix}_{row["id"]}{portion}.csv'

In [3]:
read_path = Path('../Data/Elevation/GTOPO30')
output_path = Path('../Output/Elevation/GTOPO30/')
partial_path = output_path / 'partial'
by_country_path = output_path / 'by_country'

output_path.mkdir(exist_ok=True)
partial_path.mkdir(exist_ok=True)
by_country_path.mkdir(exist_ok=True)

# Map raster to polygons

In [4]:
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [7]:
indices_file = read_path / 'indices_by_file.yml'
if indices_file.exists():
    with open(indices_file, 'r') as f:
        indices_by_file = yaml.safe_load(f)
else:
    files = list(read_path.glob('gt30e*.tif')) + list(read_path.glob('gt30w*.tif'))
    indices_by_file = get_indices_by_file(geo_df[geo_df['id'] != 'ATA'], files)

    with open(read_path / 'indices_by_file.yml', 'w') as f:
        yaml.dump(indices_by_file, f)

100%|████████████████████████████████████████| 33/33 [1:09:19<00:00, 126.04s/it]


In [9]:
n_processes = 30
for file, indices in indices_by_file.items():
    IMAGE = utils.read_tif(read_path / file)
    iterator = partial(yield_missing_shapes, save_path=partial_path, prefix=file[:-4])
    
    if n_processes == 1:
        for row_and_path in tqdm(iterator(geo_df.loc[indices])):
            save_location_mapping(row_and_path)
    else:
        with ThreadPoolExecutor(n_processes) as tpe:
            for _ in tqdm(
                tpe.map(save_location_mapping, iterator(geo_df.loc[indices])), 
                total=geo_df.shape[0]
            ):
                pass

  0%|                                                | 0/122772 [00:00<?, ?it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  1%|▎                                 | 1165/122772 [48:40<84:40:17,  2.51s/it]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  2%|▌                                 | 1868/122772 [26:57<29:04:56,  1.15it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  0%|                                     | 1/122772 [00:00<20:01:34,  1.70it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  0%|                                                | 0/122772 [00:00<?, ?it/s]
  0%|                       

# Union portions from different files and shapes

In [12]:
df_by_region = {}
for file in tqdm(partial_path.glob('*.csv'), desc='Reading'):
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        continue
        
    if 'id' not in df.columns:
        df['id'] = df['adm2']
        df['id'].fillna(df['adm1'], inplace=True)
        df['id'].fillna(df['adm0'], inplace=True)
    region = df.loc[0, 'id']
    df_by_region.setdefault(region, []).append(df)

for region, dfs in tqdm(df_by_region.items(), desc='Saving'):
    df = utils.combine_dataframes(dfs)
    df.to_feather(by_country_path / f'{region}.feather')
    
del df_by_region

Reading: 116307it [09:31, 203.66it/s]
Saving: 100%|████████████████████████████| 47912/47912 [05:24<00:00, 147.61it/s]


# Aggregate

In [17]:
countries = {}
for file in tqdm(by_country_path.glob('*.feather'), desc='Grouping'):
    df = pd.read_feather(file)
    df = df[df['value'] > -9999].copy()
    if df.empty:
        continue
    df = utils.get_weighted_average(df, value='value', weight='intersection_area', by=['id'])
    country = df.loc[0, 'id']
    countries.setdefault(country, []).append(df)

dfs = [utils.combine_dataframes(country_dfs)
      for country_dfs in countries.values()]
df = utils.combine_dataframes(dfs)
df.rename(columns={'_weighted_value_': 'value'}, inplace=True)
df.to_csv(output_path / 'elevation.csv', index=False)

Grouping: 47912it [08:00, 99.81it/s] 
