In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import tarfile

import yaml

from functools import partial
from glob import glob, iglob
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections.shapefiles import load_shapes, iter_records
from projections.models import Records
from projections.elevation import get_indices_by_file, SpacialTxt
from projections import raster, utils


# Show every column when displaying DataFrames. The fully qualified option
# name is used because the short form 'max_columns' is resolved by pattern
# matching and becomes ambiguous (OptionError) once pandas registers more than
# one option ending in 'max_columns'.
pd.set_option('display.max_columns', None)



In [2]:
def save_location_mapping(row_and_path):
    """
    Write the raster cells that intersect one polygon to a CSV file.

    The single argument is a ``(row, path)`` tuple so the function can be
    handed directly to ``map``: ``row`` must provide the ``'geometry'`` and
    ``'id'`` keys, and ``path`` is the file that gets written.

    The raster is read from the global ``IMAGE`` rather than being passed
    in, which is why this helper lives in the notebook and not in the
    ``projections`` package.

    If no raster cell intersects the polygon, an empty file is created at
    ``path``. Otherwise the matching subset, with the polygon id stored in
    an ``id`` column, is saved as CSV without its index.
    """
    record, destination = row_and_path
    polygon = record['geometry']

    cells = raster.find_subset_with_intersection_area(IMAGE, polygon)

    if not cells.empty:
        cells['id'] = record['id']
        cells.to_csv(destination, index=False)
        return

    # Nothing intersected: still create the (empty) output file.
    # NOTE(review): presumably this marks the shape as processed so a
    # re-run skips it -- confirm against utils.yield_missing_shapes.
    with open(destination, 'w') as marker:
        marker.write('')

In [3]:
# Directory holding the input rasters, and the root for everything written
# by this notebook.
read_path = Path('../Data/Elevation/Ruggedness')
output_path = Path('../Output/Elevation/Ruggedness/')

# Raster that is mapped to the polygons first. Its outputs live in a
# directory named after the file without its extension.
base_file = 'cellarea.txt'
file_path = output_path / Path(base_file).stem
partial_path = file_path / 'partial'
by_country_path = file_path / 'by_country'

# parents=True also creates output_path and file_path (and any missing
# ancestors such as ../Output/Elevation), so the cell works on a fresh
# checkout instead of raising FileNotFoundError.
for directory in (partial_path, by_country_path):
    directory.mkdir(parents=True, exist_ok=True)

# Map raster to polygons

In [4]:
# Load the shapefile containing all polygons. It is produced by
# PreprocessingLocs.ipynb and is also available in Drive.
geo_df = gpd.read_file(
    '../Shapefiles/preprocessed/all_countries_with_eth.shp'
)

In [5]:
# Number of worker processes; set to 1 to run serially in this process.
n_processes = 30

# Load the base raster. save_location_mapping reads it through the global
# IMAGE, so it must be assigned before any work is dispatched.
spacial_txt = SpacialTxt(read_path / base_file)
spacial_txt.read(save=True)
IMAGE = spacial_txt.get_xarray()

iterator = partial(utils.yield_missing_shapes, save_path=partial_path, prefix=base_file[:-4])

# Materialise the work list once so the progress bar can show the real number
# of pending items in both branches.
# NOTE(review): judging by its name, yield_missing_shapes yields only shapes
# that have no output in partial_path yet, so on a resumed run this can be
# fewer than geo_df.shape[0] -- confirm.
# Executor.map collects its iterable up front anyway, so building the list
# does not add work in the parallel branch.
pending = list(iterator(geo_df))

if n_processes == 1:
    for row_and_path in tqdm(pending):
        save_location_mapping(row_and_path)
else:
    with ProcessPoolExecutor(n_processes) as pool:
        # Results are all None; iterating only drives the progress bar and
        # re-raises any exception raised inside a worker.
        for _ in tqdm(
            pool.map(save_location_mapping, pending),
            total=len(pending)
        ):
            pass

100%|████████████████████████████████| 122772/122772 [13:59:29<00:00,  2.44it/s]


# Union portions from different files and shapes

In [6]:
# Read the per-shape partial files and save the unioned result into the
# by-country directory.
utils.union_and_save_portions(
    read_from=partial_path,
    save_in=by_country_path,
)

Reading: 122772it [10:04, 203.17it/s]
Saving: 100%|████████████████████████████| 48140/48140 [03:03<00:00, 262.92it/s]


# Aggregate

In [7]:
# Aggregate the by-country splits of the base raster into a single output
# saved under output_path with the base file's name.
utils.aggregate_feather_splits_and_save(
    by_country_path,
    output_path / base_file,
    no_data_value=-9999,
)

Reading: 100%|███████████████████████████| 48140/48140 [02:04<00:00, 388.03it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [05:02<00:00, 159.27it/s]


# Additional files with the same raster
Several other files share the same raster as `cellarea.txt`. For each of them, match its values to the already preprocessed raster and perform the same aggregations.

In [8]:
# Extra rasters that are matched against the by-country splits already
# computed for the base raster.
files = ['slope.txt', 'tri.txt']

# The set of by-country splits is the same for every file, so list it once.
# A concrete list also lets tqdm display a total.
country_feathers = list(by_country_path.glob('*.feather'))

for file in files:
    print(file)
    # Cell-local names are used on purpose: reassigning `file_path`,
    # `spacial_txt` or `IMAGE` here would silently change globals set by
    # earlier cells (save_location_mapping reads IMAGE).
    extra_path = output_path / Path(file).stem
    file_by_country_path = extra_path / 'by_country'
    file_by_country_path.mkdir(parents=True, exist_ok=True)

    extra_txt = SpacialTxt(read_path / file)
    extra_txt.read(save=True)
    image = extra_txt.get_xarray()

    for df_path in tqdm(country_feathers):
        df = pd.read_feather(df_path)
        pol = utils.get_mock_polygon_from_df(df, increment=extra_txt.increment)
        subdf = raster.merge_df_to_array_by_lat_lon(df, image, pol)
        if subdf.empty:
            # Nothing is written for this split; only report it.
            print(df_path.name, 'is empty')
        else:
            subdf.to_feather(file_by_country_path / df_path.name)

    utils.aggregate_feather_splits_and_save(file_by_country_path, output_path / file, no_data_value=-9999)

slope.txt


48140it [20:21, 39.41it/s] 
Reading: 100%|███████████████████████████| 48140/48140 [01:52<00:00, 429.19it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [04:53<00:00, 163.76it/s]


tri.txt


48140it [20:07, 39.86it/s] 
Reading: 100%|███████████████████████████| 48140/48140 [01:52<00:00, 427.60it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [04:51<00:00, 165.09it/s]
