In [None]:
# Defaults
# Region key used to build the input/output file names that are read below.
REGION = 'fortportal'
# EPSG code the layers are reprojected to. Areas are later divided by 1,000,000
# and labelled "sq. km", so this CRS is expected to use metres as its unit.
UTM = 32636
# NOTE(review): not referenced in the visible cells — presumably a pipeline-stage
# label; confirm before removing.
PIPELINE = 'output'
# Name of the population-count column in the grids GeoDataFrame.
POP = 'grid_population'

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime as dt
from pathlib import Path

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures created below.
plt.style.use('fivethirtyeight')
# Render figures inline in the cell output.
%matplotlib inline

In [None]:
# All paths are relative, so the notebook has to be run from the directory
# that contains `data/`.
CWD = Path('.')  # NOTE(review): not referenced in the visible cells
DATA = Path('data')

# Sub-folders of the data directory.
INTER  = DATA/'inter'   # NOTE(review): not referenced in the visible cells
INPUT  = DATA/'input'   # the region boundary GeoJSON is read from here
OUTPUT = DATA/'output'  # the grids GeoJSON is read from here

In [None]:
# Shell escape (needs a POSIX `ls` on the PATH): list the data directory.
!ls {DATA}

In [None]:
# Shell escape (needs POSIX `ls` and `grep`): list output files whose name contains REGION.
!ls {OUTPUT} | grep {REGION}

In [None]:
# Grid cells. The file name carries a `4326` suffix (presumably the EPSG code
# it was written in); the layer is reprojected to the UTM CRS right after loading.
grids_gdf = (
    gpd.read_file(f'{OUTPUT}/{REGION}_grids_output_{4326}.geojson')
    .to_crs(epsg=UTM)
)

# Region boundary, reprojected to whatever CRS the grids ended up in so that
# both layers can be drawn on the same axes.
region = (
    gpd.read_file(f'{INPUT/REGION}.geojson')
    .to_crs(grids_gdf.crs)
)

In [None]:
# Sanity check: both layers should report the same CRS after reprojection.
region.crs, grids_gdf.crs

In [None]:
def show(region, gdf, col):
    """Draw a percentile-classed choropleth of `gdf[col]` with the region outline on top.

    Parameters
    ----------
    region : GeoDataFrame-like
        Layer whose `.boundary` is drawn in dark gray over the map.
    gdf : GeoDataFrame-like
        Layer to colour; must contain the column `col`.
    col : str
        Name of the column that drives the colour classes.

    Returns
    -------
    None
        The figure is produced as a side effect.
    """
    _, axes = plt.subplots(figsize=(10, 10))
    axes.set_axis_off()  # map only: no ticks, spines or axis labels

    choropleth_style = dict(cmap='RdYlGn_r', alpha=0.8, scheme='percentiles', legend=True)
    gdf.plot(column=col, ax=axes, **choropleth_style)
    region.boundary.plot(ax=axes, color='darkgray', alpha=0.8)

In [None]:
# Map the population column for every grid cell, with the region outline on top.
show(region, grids_gdf, POP)

### Analyse the population count data

In [None]:
# Distribution of the per-grid population in 100 bins; the count axis is
# logarithmic (logy=True). The column is looked up through POP so this cell
# follows the configured column name like the other cells do.
grids_gdf[POP].plot(kind='hist', bins=100, logy=True)

In [None]:
def _population(gdf, col, percentage=None, cutoff=None, top=True):
    total_pop = gdf[col].sum()
    gdf = gdf.sort_values(by=col, ascending=False)
    gdf["pop_percentile"] = (gdf[col].cumsum() / total_pop * 100).astype(np.float32)

    if percentage:
        if top:
            gdf = gdf.query("pop_percentile <= @percentage")
        else:
            gdf = gdf.query("pop_percentile > (100 - @percentage)")

    if cutoff:
        if top:
            gdf = gdf.query(f"{col} >= @cutoff")
        else:
            gdf = gdf.query(f"{col} <  @cutoff")
            
    return gdf[['idx', 'geometry', 'grid_population']]

In [None]:
# Most populous cells whose cumulative share of the total population is <= 50%.
population = _population(grids_gdf, POP, percentage=50)

In [None]:
# (rows, columns) of the selection: how many grid cells made the cut.
population.shape

In [None]:
# Map the 50% selection. Reuses `population` from the earlier cell instead of
# recomputing the identical selection, so the map and `population.shape` can
# never drift apart.
show(region, population, POP)

In [None]:
def _building(gdf, metrics='area'):
    return gdf[['idx', 'geometry', f'grid_building_{metrics}']]

In [None]:
# Per-grid building counts (`idx`, `geometry`, `grid_building_count`).
building = _building(grids_gdf, metrics='count')

In [None]:
# Percentile-classed map of the building count; `.plot` returns the axes it
# drew on, so the axis frame is switched off directly on that object.
building.plot(
    column='grid_building_count',
    scheme='percentiles',
    cmap='RdYlGn_r',
    legend=True,
    figsize=(10, 10),
).set_axis_off()

In [None]:
def _pairings(gdf):
    """Pair population-share steps with the population and area they cover.

    For every step in 5, 10, ..., 95, 99, 100 the top cells are selected with
    `_population(gdf, POP, percentage=step)`, and one record is produced with:

    - "per":  the step itself,
    - "pop":  the sum of the POP column over the selected cells,
    - "area": the summed geometry area of those cells divided by 1,000,000,
    - "unit": the fixed label "sq. km" (this assumes the CRS unit is metres).

    Returns the records as a list of dicts, in step order.
    """
    sq_m_per_sq_km = 1000000
    percent_steps = [*range(5, 100, 5), 99, 100]

    def _record(step):
        # One summary row for the cells selected at this percentage step.
        selected = _population(gdf, POP, percentage=step)
        return {
            "per": step,
            "pop": selected[POP].sum(),
            "area": selected.geometry.area.sum() / sq_m_per_sq_km,
            "unit": "sq. km",
        }

    return [_record(step) for step in percent_steps]

In [None]:
# One row per percentage step, with columns per / pop / area / unit.
pairings = pd.DataFrame(_pairings(grids_gdf))

In [None]:
# Show the table with an in-cell bar drawn on the `area` column.
pairings.style.bar(subset='area')

In [None]:
# Area covered (sq. km) as a function of the population percentage step.
sns.lineplot(data=pairings, x='per', y='area')

In [None]:
def _stats(gdf):
    """Build a dictionary of pre-formatted summary figures for the grid layer.

    Every value is a string (or a nested dict of strings):

    - "area": total geometry area, divided by 1,000,000 and labelled "sq. km"
      (this assumes the CRS unit is metres).
    - "pop": sum of the POP column, thousands-separated with no decimals.
    - "release_area" / "exclusion_area": area of the cells where POP > 0 and
      where POP <= 0 respectively, formatted like "area".
    - "analysis_date": `dt.now()` formatted as "%Y-%m-%d %I-%M-%S %p".
    - "pop_stats": min / max / mean / median of POP with two decimals.
    - "building_stats": min / max / mean of `grid_building_area` (labelled
      "sq. m") and min / max / truncated mean of `grid_building_count`.
    """
    sq_m_per_sq_km = 1000000

    def _area_label(frame):
        # Summed geometry area of `frame`, rendered as "<x.xx> sq. km".
        return f"{frame.geometry.area.sum() / sq_m_per_sq_km:.2f} sq. km"

    pop = gdf[POP]
    building_area = gdf.grid_building_area
    building_count = gdf.grid_building_count

    populated = gdf.query(f"{POP} > 0")
    unpopulated = gdf.query(f"{POP} <= 0")

    pop_stats = {
        "min": f"{pop.min():,.2f}",
        "max": f"{pop.max():,.2f}",
        "mean": f"{pop.mean():,.2f}",
        "median": f"{pop.median():,.2f}",
    }
    building_stats = {
        "min_area": f"{building_area.min()} sq. m",
        "max_area": f"{building_area.max()} sq. m",
        "mean_area": f"{building_area.mean()} sq. m",
        "min_count": f"{building_count.min()}",
        "max_count": f"{building_count.max()}",
        "mean_count": f"{int(building_count.mean())}",  # int() truncates the mean
    }

    return {
        "area": _area_label(gdf),
        "pop": f"{pop.sum():,.0f}",
        "release_area": _area_label(populated),
        "exclusion_area": _area_label(unpopulated),
        "analysis_date": dt.now().strftime("%Y-%m-%d %I-%M-%S %p"),
        "pop_stats": pop_stats,
        "building_stats": building_stats,
    }

In [None]:
# Summary figures for the full grid layer, as formatted strings.
stats = _stats(grids_gdf)

In [None]:
# Display the summary dictionary.
stats