In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import re

import shutil

from functools import partial
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import sys
# Put the parent directory on the import path (presumably where the local
# `projections` package lives) unless it is already listed.
module_path = os.path.abspath('..')
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

from projections import raster, utils


# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the abbreviated 'max_columns' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_columns' and raises
# OptionError there.
pd.set_option('display.max_columns', None)



In [2]:
def save_location_mapping(row_and_path):
    """Write the raster/shape intersection of one shape to a CSV file.

    `row_and_path` is a `(row, path)` pair: `row` provides the `geometry` and
    `id` entries, `path` is the destination file. The raster is taken from the
    module-level `IMAGE`. When the intersection is empty, an empty file is
    written at `path` instead of a CSV.
    """
    shape_row, csv_path = row_and_path
    geometry = shape_row['geometry']

    overlap = raster.find_subset_with_intersection_area(IMAGE, geometry)

    if not overlap.empty:
        # Tag every row with the id of the shape it belongs to.
        overlap['id'] = shape_row['id']
        overlap.to_csv(csv_path, index=False)
        return

    # Nothing intersects: leave an empty file behind at the destination.
    with open(csv_path, 'w') as marker:
        marker.write('')
    
    
def make_partial_path(parent):
    """Return `<output_path>/<parent>/partial`, creating it if necessary."""
    partial_dir = output_path.joinpath(parent, "partial")
    partial_dir.mkdir(parents=True, exist_ok=True)
    return partial_dir

def make_countries_path(parent):
    """Return `<output_path>/<parent>/by_country`, creating it if necessary."""
    countries_dir = output_path.joinpath(parent, "by_country")
    countries_dir.mkdir(parents=True, exist_ok=True)
    return countries_dir

In [3]:
# Directory searched for the input rasters, and root directory for everything
# this notebook writes.
read_path = Path('../Data/Population')
output_path = Path('../Output/Population/')
# parents=True so the cell also works when ../Output does not exist yet.
output_path.mkdir(exist_ok=True, parents=True)

# The per-dataset `partial` and `by_country` sub-directories are created on
# demand by make_partial_path / make_countries_path.

In [4]:
# Validate that all TIF files share the same coordinates and fill value.
x = None
y = None
no_data_value = None

all_tif = sorted(read_path.glob('**/*.tif'))
if not all_tif:
    # Fail here with context instead of a bare IndexError on all_tif[0] later.
    raise FileNotFoundError(f'No .tif files found under {read_path}')

for tif in all_tif:
    image = utils.read_tif(tif)
    if x is None:
        # The first raster becomes the reference the others are checked against.
        x = image.x
        y = image.y
        no_data_value = image._FillValue
        continue

    # Compare the raw coordinate values. np.array_equal also reports a length
    # mismatch as "not equal" instead of broadcasting.
    # NOTE(review): if read_tif returns xarray objects (the .x/.y/_FillValue
    # access suggests so -- confirm), `x == image.x` aligns on coordinate labels
    # before comparing, so differing grids could pass that check vacuously.
    assert np.array_equal(np.asarray(x), np.asarray(image.x)), tif
    assert np.array_equal(np.asarray(y), np.asarray(image.y)), tif
    assert no_data_value == image._FillValue, tif

In [5]:
# The first raster in sorted order is the one the shapes are mapped against;
# the name of its parent directory selects the `partial` and `by_country`
# output directories used by the following cells.
base_file = all_tif[0]
partial_path = make_partial_path(base_file.parent.name)
by_country_path = make_countries_path(base_file.parent.name)

# Map raster to polygons

In [6]:
# Shapes to map onto the raster. Path is relative to the notebook's working
# directory.
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [7]:
# Number of worker threads; 1 runs the mapping sequentially in this thread.
n_processes = 30

# Kept at module level on purpose: save_location_mapping reads IMAGE as a global.
IMAGE = utils.read_tif(base_file)

iterator = partial(utils.yield_missing_shapes, save_path=partial_path, prefix=base_file.stem)

# Materialise the work list so the progress bar total is the number of items
# that will actually be processed. The iterator presumably skips shapes whose
# output already exists (TODO confirm), in which case geo_df.shape[0] is too
# large on a resumed run. Executor.map consumes its iterable eagerly anyway,
# so this adds no extra cost to the threaded branch.
pending = list(iterator(geo_df))

if n_processes == 1:
    for row_and_path in tqdm(pending):
        save_location_mapping(row_and_path)
else:
    with ThreadPoolExecutor(n_processes) as tpe:
        # Drain the result iterator so worker exceptions surface here.
        for _ in tqdm(tpe.map(save_location_mapping, pending), total=len(pending)):
            pass

 93%|██████████████████████████▊  | 113695/122772 [19:08:09<12:25:14,  4.93s/it]IOStream.flush timed out
 93%|██████████████████████████▊  | 113696/122772 [19:08:32<16:32:53,  6.56s/it]IOStream.flush timed out
 93%|██████████████████████████▊  | 113703/122772 [19:09:15<12:32:18,  4.98s/it]IOStream.flush timed out
 93%|██████████████████████████▊  | 113704/122772 [19:09:42<27:21:24, 10.86s/it]IOStream.flush timed out
 93%|██████████████████████████▊  | 113713/122772 [19:10:59<12:16:37,  4.88s/it]IOStream.flush timed out
 93%|██████████████████████████▊  | 113716/122772 [19:11:56<33:55:09, 13.48s/it]IOStream.flush timed out
 93%|██████████████████████████▉  | 114264/122772 [19:27:11<35:09:24, 14.88s/it]IOStream.flush timed out
 93%|██████████████████████████▉  | 114265/122772 [19:27:25<40:03:10, 16.95s/it]IOStream.flush timed out
 93%|██████████████████████████▉  | 114266/122772 [19:27:41<36:36:44, 15.50s/it]IOStream.flush timed out
 93%|██████████████████████████▉  | 114271/122772 [19:2

# Union portions from different files and shapes

In [8]:
# Group the per-shape CSV files by region id, then write one feather file per
# region.
df_by_region = {}
for file in tqdm(partial_path.glob('*.csv'), desc='Reading'):
    try:
        df = pd.read_csv(file)
    except pd.errors.EmptyDataError:
        # Zero-byte file, as written by save_location_mapping for shapes that
        # do not intersect the raster.
        continue

    if df.empty:
        # Header-only file: there is no first row to take the region id from.
        continue

    if 'id' not in df.columns:
        # No explicit id column: fall back to adm2, then adm1, then adm0.
        # Built as one assignment because chained `df['id'].fillna(..., inplace=True)`
        # does not reliably update the frame under pandas copy-on-write.
        df['id'] = df['adm2'].fillna(df['adm1']).fillna(df['adm0'])
    # The first row's id is used as the region key for the whole file.
    region = df['id'].iloc[0]
    df_by_region.setdefault(region, []).append(df)

for region, dfs in tqdm(df_by_region.items(), desc='Saving'):
    df = utils.combine_dataframes(dfs)
    df.to_feather(by_country_path / f'{region}.feather')

# Release the grouped frames; they are not needed once written.
del df_by_region

Reading: 122772it [06:53, 296.96it/s]
Saving: 100%|████████████████████████████| 48136/48136 [03:33<00:00, 225.43it/s]


# Aggregate all files with same raster

In [6]:
def aggregate_one(file):
    """Aggregate one raster over every region file and return the raster's name.

    For each `*.feather` file in the module-level `by_country_path`, the region
    frame is merged with the raster and the result is stored as a feather split
    in a working directory named after the raster. The splits are then combined
    by `utils.aggregate_feather_splits_and_save` (using the module-level
    `no_data_value`) and the working directory is deleted.

    The function is resumable: it returns immediately when the final output
    already exists, and it skips regions whose split is already present.

    Parameters
    ----------
    file : pathlib.Path
        Path of the raster (.tif) to aggregate.

    Returns
    -------
    str
        `file.name`.
    """
    partial_path = make_partial_path(file.parent.name)
    result_path = partial_path.parent / file.name
    if result_path.exists():
        return file.name

    splits_dir = partial_path.parent / file.stem
    splits_dir.mkdir(exist_ok=True)

    # Drop temporary files left behind by an interrupted earlier run so they
    # are never mistaken for finished splits.
    for stale in splits_dir.glob('*.tmp'):
        stale.unlink()

    image = utils.read_tif(file)
    increment = raster.get_increment_from_tif(image)

    for df_path in by_country_path.glob('*.feather'):
        subdf_path = splits_dir / df_path.name
        if subdf_path.exists():
            continue

        df = pd.read_feather(df_path)
        pol = utils.get_mock_polygon_from_df(df, increment=increment)
        subdf = raster.merge_df_to_array_by_lat_lon(df, image, pol)
        if subdf.empty:
            print(df_path.name, 'is empty')
            continue

        # Write under a temporary name and rename afterwards. A split that
        # exists under its final name is then always complete, so a run
        # interrupted mid-write cannot leave a truncated file that the
        # exists() check above would skip on the next run.
        tmp_path = subdf_path.with_name(subdf_path.name + '.tmp')
        subdf.to_feather(tmp_path)
        tmp_path.replace(subdf_path)

    utils.aggregate_feather_splits_and_save(
        input_path=splits_dir,
        output_path=result_path,
        no_data_value=no_data_value
    )
    shutil.rmtree(splits_dir)
    return file.name
    
# The cap of 1 keeps the aggregation sequential; raise it to use threads.
n_processes = min(len(all_tif), 1)
print(f"Running with {n_processes} processes")
if n_processes <= 1:
    # `<= 1` also covers an empty all_tif (n_processes == 0), for which
    # ThreadPoolExecutor(0) would raise ValueError.
    for tif_file in all_tif:
        print(aggregate_one(tif_file))
else:
    with ThreadPoolExecutor(n_processes) as tpe:
        for name in tpe.map(aggregate_one, all_tif):
            print(name)

Running with 1 processes


Grouping:   0%|                             | 59/48136 [00:00<07:33, 106.13it/s]

Error while reading ../Output/Population/Count/gpw_v4_population_count_rev11_2000_30_sec/AGO__YOMBE.feather. File is too small to be a well-formed file


Grouping: 100%|██████████████████████████| 48136/48136 [06:50<00:00, 117.32it/s]


gpw_v4_population_count_rev11_2000_30_sec.tif


Grouping:   0%|                              | 64/48136 [00:00<08:32, 93.72it/s]

Error while reading ../Output/Population/Count/gpw_v4_population_count_rev11_2005_30_sec/BEN__EGBA.feather. File is too small to be a well-formed file


Grouping: 100%|██████████████████████████| 48136/48136 [07:02<00:00, 113.80it/s]


gpw_v4_population_count_rev11_2005_30_sec.tif


Grouping:   0%|                              | 59/48136 [00:00<08:24, 95.37it/s]

Error while reading ../Output/Population/Count/gpw_v4_population_count_rev11_2010_30_sec/AGO__LUNDA.feather. File is too small to be a well-formed file


Grouping: 100%|██████████████████████████| 48136/48136 [07:25<00:00, 108.17it/s]


gpw_v4_population_count_rev11_2010_30_sec.tif


Grouping:   0%|                              | 63/48136 [00:00<08:06, 98.75it/s]

Error while reading ../Output/Population/Count/gpw_v4_population_count_rev11_2015_30_sec/AGO__HOLO.feather. File is too small to be a well-formed file


Grouping: 100%|██████████████████████████| 48136/48136 [07:13<00:00, 111.15it/s]


gpw_v4_population_count_rev11_2015_30_sec.tif


Grouping:   0%|                              | 64/48136 [00:00<08:15, 97.06it/s]

Error while reading ../Output/Population/Count/gpw_v4_population_count_rev11_2020_30_sec/AGO__YOMBE.feather. File is too small to be a well-formed file


Grouping: 100%|██████████████████████████| 48136/48136 [07:22<00:00, 108.68it/s]


gpw_v4_population_count_rev11_2020_30_sec.tif


Grouping: 100%|██████████████████████████| 48136/48136 [07:21<00:00, 109.13it/s]


gpw_v4_population_density_rev11_2000_30_sec.tif


Grouping:   0%|                              | 61/48136 [00:00<08:39, 92.61it/s]

Error while reading ../Output/Population/Density/gpw_v4_population_density_rev11_2005_30_sec/GHA__BRONG.feather. File is too small to be a well-formed file


Grouping:  99%|██████████████████████████▋| 47509/48136 [06:48<00:18, 34.20it/s]

Error while reading ../Output/Population/Density/gpw_v4_population_density_rev11_2005_30_sec/ATA.feather. Not an Arrow file


Grouping: 100%|██████████████████████████| 48136/48136 [07:10<00:00, 111.82it/s]


gpw_v4_population_density_rev11_2005_30_sec.tif


Grouping:   0%|                             | 50/48136 [00:00<07:49, 102.33it/s]

Error while reading ../Output/Population/Density/gpw_v4_population_density_rev11_2010_30_sec/AGO__MBAGANI.feather. File is too small to be a well-formed file


Grouping:  99%|██████████████████████████▋| 47510/48136 [06:56<00:17, 34.87it/s]

Error while reading ../Output/Population/Density/gpw_v4_population_density_rev11_2010_30_sec/ATA.feather. Not an Arrow file


Grouping: 100%|██████████████████████████| 48136/48136 [07:18<00:00, 109.74it/s]


gpw_v4_population_density_rev11_2010_30_sec.tif


Grouping:   0%|                              | 61/48136 [00:00<08:32, 93.81it/s]

Error while reading ../Output/Population/Density/gpw_v4_population_density_rev11_2015_30_sec/GHA__DAGOMBA.feather. File is too small to be a well-formed file


Grouping: 100%|██████████████████████████| 48136/48136 [07:17<00:00, 109.94it/s]


gpw_v4_population_density_rev11_2015_30_sec.tif


Grouping:   0%|                              | 57/48136 [00:00<08:18, 96.54it/s]

Error while reading ../Output/Population/Density/gpw_v4_population_density_rev11_2020_30_sec/GHA__DAGOMBA.feather. File is too small to be a well-formed file


Grouping: 100%|██████████████████████████| 48136/48136 [07:20<00:00, 109.35it/s]


gpw_v4_population_density_rev11_2020_30_sec.tif


# Join together results

In [10]:
def read_population_df(file):
    """Load one table and give its `value` column a dataset-specific name.

    The new column name is `<parent directory name>_<year>`, where the year is
    extracted from the file name by `get_year_from_population_file`.
    """
    frame = utils.robust_read(file)
    column_label = '{}_{}'.format(file.parent.name, get_year_from_population_file(file))
    frame.rename(columns={'value': column_label}, inplace=True)
    return frame

def get_year_from_population_file(file):
    """Return the four-digit year embedded in `file.name` as a string.

    The year is the four-digit group that has an underscore on both sides.
    Because the leading `.*` is greedy, the last such group in the name is the
    one returned.

    Raises
    ------
    IndexError
        If the name contains no `_<4 digits>_` group.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return re.findall(r".*_(\d{4})_.*", file.name)[0]

In [13]:
df = None

# Sorted so the merge order, and with it the column order, is reproducible.
for file in tqdm(sorted(output_path.glob('**/*.csv'))):
    if file.parent.name == 'partial':
        # Per-shape intermediate files, not aggregated results.
        continue
    if file.parent == output_path and file.name == 'population.csv':
        # The combined table this notebook writes into output_path. Without
        # this guard a re-run would pick it up and fail in the year lookup.
        continue

    field = read_population_df(file)
    if field.empty:
        continue
    elif df is None:
        df = field
    else:
        # Only the first table keeps `intersection_area`; it is dropped from
        # the others so the merge does not produce suffixed duplicates.
        df = df.merge(field.drop(columns='intersection_area'), on='id', how='outer')

122782it [00:02, 58513.95it/s] 


In [15]:
# Write the merged table to the root output directory, without the index.
df.to_csv(output_path / "population.csv", index=False)