In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import re

import shutil

from functools import partial
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from projections import raster, utils


pd.set_option('max_columns', None)



In [2]:
def save_location_mapping(row_and_path):
    """
    Common function used to obtain a mapping of polygons to
    the rasters used by the IMAGE. 
    
    This is not part of projections because it assumes IMAGE 
    exists in the global scope.
    """
    row, path = row_and_path
    shape = row['geometry']
    
    subset = raster.find_subset_with_intersection_area(IMAGE, shape)

    if subset.empty:
        with open(path, 'w') as f:
            f.write('')
        return

    subset['id'] = row['id']
    
    subset.to_csv(path, index=False)   

In [3]:
read_path = Path('../Data/Population')
output_path = utils.make_path('../Output/Population/')
partial_path = utils.make_path(output_path / 'partial')
by_country_path = utils.make_path(output_path / 'by_country')

In [4]:
# Validate that all TIF share the same coordinates
x = None
y = None
no_data_value = None

all_tif = sorted(read_path.glob('**/*.tif'))
for tif in all_tif:
    image = utils.read_tif(tif)
    if x is None:
        x = image.x
        y = image.y
        no_data_value = image._FillValue
    else:
        assert np.all(x == image.x) and np.all(y == image.y), tif
        assert no_data_value == image._FillValue
        
base_file = all_tif[0]

# Map raster to polygons
This preprocessing step creates one csv file per shape with the correct raster's coordinates and their intersection area with each polygon

In [5]:
# Read SHP with all polygons (output of PreprocessingLocs.ipynb, also available in Drive)
geo_df = gpd.read_file('../Shapefiles/preprocessed/all_countries_with_eth.shp')

In [6]:
n_processes = 15

IMAGE = utils.read_tif(base_file)

iterator = partial(utils.yield_missing_shapes, save_path=partial_path, prefix=base_file.name[:-4])

if n_processes == 1:
    for row_and_path in tqdm(iterator(geo_df)):
        save_location_mapping(row_and_path)
else:
    with ThreadPoolExecutor(n_processes) as tpe:
        for _ in tqdm(
            tpe.map(save_location_mapping, iterator(geo_df)), 
            total=geo_df.shape[0]
        ):
            pass

  0%|                                  | 25/122772 [34:34<1595:54:20, 46.81s/it]IOStream.flush timed out
  0%|                                  | 26/122772 [34:47<1280:57:10, 37.57s/it]IOStream.flush timed out
  1%|▍                                 | 1777/122772 [51:20<58:15:18,  1.73s/it]


In [7]:
utils.union_and_save_portions(read_from=partial_path, save_in=by_country_path)

Reading: 122772it [12:17, 166.40it/s]
Saving: 100%|████████████████████████████| 48140/48140 [03:33<00:00, 225.53it/s]


# Aggregate all files with same raster
As there are multiple TIFs but all of them share the same raster, match the values from these to the preprocessed raster and perform aggregations.

In [5]:
def aggregate_one(file):
    partial_path = utils.make_path(f'../Output/Population/{file.parent.name}/partial')
    output_path = partial_path.parent / file.name
    if output_path.exists():
        return file.name
    
    file_path = partial_path.parent / file.name[:-4]
    file_path.mkdir(exist_ok=True)

    IMAGE = utils.read_tif(file)
    increment = raster.get_increment_from_tif(IMAGE)
    
    for df_path in by_country_path.glob('*.feather'):
        subdf_path = file_path / df_path.name
        if subdf_path.exists():
            continue
            
        df = pd.read_feather(df_path)
        pol = utils.get_mock_polygon_from_df(df, increment=increment)
        subdf = raster.merge_df_to_array_by_lat_lon(df, IMAGE, pol)
        if subdf.empty:
            print(df_path.name, 'is empty')
        else:
            subdf.to_feather(subdf_path)
            
    utils.aggregate_feather_splits_and_save(
        input_path=file_path, 
        output_path=output_path, 
        no_data_value=no_data_value
    )
    shutil.rmtree(file_path)
    return file.name
    
n_processes = min(len(all_tif), 1)
print(f"Running with {n_processes} processes")
if n_processes == 1:
    for tif_file in all_tif[1:]:
        print(aggregate_one(tif_file))
else:
    with ThreadPoolExecutor(n_processes) as tpe:
        for name in tpe.map(aggregate_one, all_tif):
            print(name)

Running with 1 processes


Reading: 100%|███████████████████████████| 48140/48140 [02:18<00:00, 347.24it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:38<00:00, 104.89it/s]


gpw_v4_population_count_rev11_2005_30_sec.tif


Reading: 100%|███████████████████████████| 48140/48140 [02:49<00:00, 283.53it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:20<00:00, 109.25it/s]


gpw_v4_population_count_rev11_2010_30_sec.tif


Reading: 100%|███████████████████████████| 48140/48140 [02:20<00:00, 341.62it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:14<00:00, 110.75it/s]


gpw_v4_population_count_rev11_2015_30_sec.tif


Reading:  43%|███████████▋               | 20798/48140 [01:15<01:06, 412.05it/s]

Error while reading ../Output/Population/Count/gpw_v4_population_count_rev11_2020_30_sec/JPN.22.1_1.feather. File is too small to be a well-formed file


Reading: 100%|███████████████████████████| 48140/48140 [02:22<00:00, 337.49it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:13<00:00, 110.93it/s]


gpw_v4_population_count_rev11_2020_30_sec.tif


Reading:  43%|███████████▋               | 20840/48140 [01:14<01:03, 428.65it/s]

Error while reading ../Output/Population/Density/gpw_v4_population_density_rev11_2000_30_sec/JPN.23.6_1.feather. File is too small to be a well-formed file


Reading: 100%|███████████████████████████| 48140/48140 [02:22<00:00, 338.66it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:15<00:00, 110.66it/s]


gpw_v4_population_density_rev11_2000_30_sec.tif


Reading: 100%|███████████████████████████| 48140/48140 [02:21<00:00, 339.09it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:14<00:00, 110.68it/s]


gpw_v4_population_density_rev11_2005_30_sec.tif


Reading: 100%|███████████████████████████| 48140/48140 [02:22<00:00, 338.62it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:15<00:00, 110.44it/s]


gpw_v4_population_density_rev11_2010_30_sec.tif


Reading: 100%|███████████████████████████| 48140/48140 [02:19<00:00, 344.16it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:15<00:00, 110.54it/s]


gpw_v4_population_density_rev11_2015_30_sec.tif


Reading: 100%|███████████████████████████| 48140/48140 [02:22<00:00, 337.67it/s]
Grouping: 100%|██████████████████████████| 48140/48140 [07:16<00:00, 110.31it/s]


gpw_v4_population_density_rev11_2020_30_sec.tif


# Join results
Combine all the intermediate results and save a consolidated file

In [12]:
def read_population_df(file):
    df = utils.robust_read(file)
    year = get_year_from_population_file(file)
    renames = {'a_value': f'average_{file.parent.name}_{year}', "s_value": f'sum_{file.parent.name}_{year}'}
    df.rename(columns=renames, inplace=True)
    return df

def get_year_from_population_file(file):
    return re.findall(".*_(\d{4})_.*", file.name)[0]

In [18]:
df = None

for file in tqdm(output_path.glob('**/*.csv')):
    if file.parent.name == 'partial':
        continue
        
    field = read_population_df(file)
    if field.empty:
        continue
    elif df is None:
        df = field
    else:
        field.drop(columns='n_grids', inplace=True)
        df = df.merge(field.drop(columns='intersection_area'), on='id', how='outer')

122781it [00:02, 50738.25it/s]


In [19]:
df.to_csv(output_path / "population.csv", index=False)
df.head()

Unnamed: 0,id,average_Count_2005,intersection_area,sum_Count_2005,n_grids,average_Count_2010,sum_Count_2010,average_Count_2015,sum_Count_2015,average_Count_2020,sum_Count_2020,average_Density_2000,sum_Density_2000,average_Density_2005,sum_Density_2005,average_Density_2010,sum_Density_2010,average_Density_2015,sum_Density_2015,average_Density_2020,sum_Density_2020
0,ESP__JEBALA,2124.700901,20285970.0,61957.43473,55,2299.92632,67067.082923,2490.245142,72616.856351,2697.109862,78649.108859,3235.96216,94362.598853,3501.463327,102104.748955,3789.509177,110504.308021,4102.195191,119622.379562,4441.853972,129526.981204
1,GHA__BASARI,35.645827,1095963.0,46.282871,7,42.640694,55.365083,51.339321,66.659477,62.167306,80.718653,35.332634,45.876208,41.961965,54.483803,50.196276,65.175323,60.436239,78.471004,73.182871,95.021374
2,GHA__AVATIME,195.322298,121094700.0,27856.667075,175,233.119314,33247.236738,278.271609,39686.807988,332.213036,47379.875395,191.51921,27314.276476,228.542221,32594.458779,272.76764,38901.840329,325.599298,46436.634001,388.714914,55438.114224
3,GHA__ADELE,28.954485,677864200.0,23195.041591,882,33.683694,26983.546696,39.185701,31391.128635,45.586826,36518.982668,29.225065,23411.800934,33.998141,27235.446677,39.551144,31683.881246,46.011558,36859.231244,53.527712,42880.319615
4,GHA__MOBA,28.341024,2560286.0,86.269693,11,32.497303,98.921357,37.310472,113.572586,42.893741,130.567994,29.237103,88.997343,33.484403,101.926072,38.394973,116.873788,44.081648,134.183959,50.678184,154.263737
