In [None]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import tarfile

import yaml

from functools import partial
from glob import glob, iglob
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import sys
# Put the parent directory on the import path (presumably so the
# `projections` imports below resolve); skip it if already registered.
module_path = os.path.abspath('..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from projections.shapefiles import load_shapes, iter_records
from projections.models import Records
from projections.elevation import get_indices_by_file, SpacialTxt
from projections import raster, utils


# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the short form 'max_columns' also matches
# 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns', None)

In [2]:
def save_location_mapping(row_and_path):
    """Write the raster cells that intersect one shape to a CSV file.

    Parameters
    ----------
    row_and_path : tuple
        A ``(row, path)`` pair. ``row`` supplies the ``'geometry'`` and
        ``'id'`` entries; ``path`` is where the CSV is written.

    Notes
    -----
    Reads the module-level ``IMAGE`` raster. When the intersection is empty,
    a zero-byte file is written at ``path`` instead of a CSV.
    """
    row, path = row_and_path

    cells = raster.find_subset_with_intersection_area(IMAGE, row['geometry'])

    if not cells.empty:
        # Tag every cell with the shape it belongs to before saving.
        cells['id'] = row['id']
        cells.to_csv(path, index=False)
        return

    # Nothing intersected: create (or truncate to) an empty file.
    with open(path, 'w'):
        pass

In [4]:
# Folder holding the input rasters, and the root folder for this notebook's output.
read_path = Path('../Data/Elevation/Ruggedness')
output_path = Path('../Output/Elevation/Ruggedness/')

# Raster processed first; its name without the '.txt' suffix names the output sub-folder.
base_file = 'cellarea.txt'
file_path = output_path / base_file[:-4]
partial_path = file_path / 'partial'        # one CSV per shape
by_country_path = file_path / 'by_country'  # one feather file per region id

# parents=True creates missing ancestors (e.g. ../Output/Elevation), so the
# notebook also runs on a fresh checkout; exist_ok=True keeps re-runs harmless.
for directory in (output_path, file_path, partial_path, by_country_path):
    directory.mkdir(parents=True, exist_ok=True)

# Map raster to polygons

In [5]:
# Shapes whose 'geometry' and 'id' are used when mapping raster cells below.
shapefile = '../Shapefiles/preprocessed/all_countries_with_eth.shp'
geo_df = gpd.read_file(shapefile)

In [6]:
# Number of worker threads; 1 selects the plain sequential loop instead.
n_processes = 30

# Load the base raster; IMAGE is the global read by save_location_mapping.
spacial_txt = SpacialTxt(read_path / base_file)
spacial_txt.read(save=True)
IMAGE = spacial_txt.get_xarray()

# Bind the output folder and file prefix so only the GeoDataFrame is left to pass in.
iterator = partial(
    utils.yield_missing_shapes,
    save_path=partial_path,
    prefix=base_file[:-4],
)

if n_processes != 1:
    with ThreadPoolExecutor(n_processes) as tpe:
        results = tpe.map(save_location_mapping, iterator(geo_df))
        # Drain the lazy result iterator purely to advance the progress bar.
        for _ in tqdm(results, total=geo_df.shape[0]):
            pass
else:
    for row_and_path in tqdm(iterator(geo_df)):
        save_location_mapping(row_and_path)

 87%|█████████████████████████▎   | 107092/122772 [17:32:58<18:33:20,  4.26s/it]IOStream.flush timed out
 87%|█████████████████████████▎   | 107098/122772 [17:33:42<21:59:53,  5.05s/it]IOStream.flush timed out
 87%|█████████████████████████▎   | 107099/122772 [17:33:53<29:12:50,  6.71s/it]IOStream.flush timed out
 87%|██████████████████████████▏   | 107169/122772 [17:35:58<5:47:40,  1.34s/it]IOStream.flush timed out
 88%|█████████████████████████▍   | 107721/122772 [17:54:55<25:39:01,  6.14s/it]IOStream.flush timed out
 88%|█████████████████████████▍   | 107724/122772 [17:56:02<38:08:57,  9.13s/it]IOStream.flush timed out
IOStream.flush timed out
 88%|█████████████████████████▍   | 107726/122772 [17:56:45<53:03:08, 12.69s/it]IOStream.flush timed out
 88%|█████████████████████████▍   | 107731/122772 [17:57:44<44:41:59, 10.70s/it]IOStream.flush timed out
IOStream.flush timed out
 88%|█████████████████████████▍   | 107733/122772 [17:58:30<64:20:17, 15.40s/it]IOStream.flush timed out
 88%|

# Union portions from different files and shapes

In [7]:
# Group the per-shape partial CSVs by region id, then write one feather file
# per region into by_country_path.
df_by_region = {}
for file in tqdm(partial_path.glob('*.csv'), desc='Reading'):
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # Zero-byte file: nothing to read.
        continue

    if df.empty:
        # Header-only file: there is no first row to take the region id from.
        continue

    if 'id' not in df.columns:
        # No 'id' column: fall back from adm2 to adm1 to adm0.
        # The result is assigned back rather than calling
        # fillna(inplace=True) on df['id'], which is chained assignment and
        # leaves df unchanged under pandas Copy-on-Write.
        df['id'] = df['adm2'].fillna(df['adm1']).fillna(df['adm0'])

    # The id of the first row is used as the grouping key for the whole file.
    region = df['id'].iloc[0]
    df_by_region.setdefault(region, []).append(df)

for region, dfs in tqdm(df_by_region.items(), desc='Saving'):
    df = utils.combine_dataframes(dfs)
    df.to_feather(by_country_path / f'{region}.feather')

# Drop the reference to the grouped frames once they are saved.
del df_by_region

Reading: 122772it [08:26, 242.25it/s]
Saving: 100%|████████████████████████████| 48138/48138 [03:59<00:00, 201.14it/s]


# Aggregate

In [10]:
# Aggregate the per-region feather files for the base raster; the result is
# presumably written to output_path / base_file, with -9999 passed as the
# raster's no-data value.
utils.aggregate_feather_splits_and_save(
    by_country_path,
    output_path / base_file,
    no_data_value=-9999,
)

Grouping: 48138it [07:17, 110.08it/s]


# Additional files with the same raster

In [13]:
# The remaining rasters are processed from the per-region feather files
# already written for the base raster.
files = ['slope.txt', 'tri.txt']
for file in files:
    print(file)

    # Output folders for this raster, named after the file without '.txt'.
    file_path = output_path / file[:-4]
    file_by_country_path = file_path / 'by_country'
    for folder in (file_path, file_by_country_path):
        folder.mkdir(exist_ok=True)

    spacial_txt = SpacialTxt(read_path / file)
    spacial_txt.read(save=True)
    IMAGE = spacial_txt.get_xarray()

    for feather_path in tqdm(by_country_path.glob('*.feather')):
        df = pd.read_feather(feather_path)
        mock_polygon = utils.get_mock_polygon_from_df(df, increment=spacial_txt.increment)
        merged = raster.merge_df_to_array_by_lat_lon(df, IMAGE, mock_polygon)
        if not merged.empty:
            # Keep the same file name as the source so regions line up.
            merged.to_feather(file_by_country_path / feather_path.name)
        else:
            print(feather_path.name, 'is empty')

    utils.aggregate_feather_splits_and_save(file_by_country_path, output_path / file, no_data_value=-9999)

slope.txt


48138it [22:30, 35.64it/s] 
Grouping: 48138it [06:56, 115.47it/s]


tri.txt


48138it [22:54, 35.03it/s] 
Grouping: 48138it [06:56, 115.58it/s]
