In [None]:
import copy
import json
import re
import time
import unicodedata
import warnings
from functools import partial

from tqdm.autonotebook import tqdm
from urllib.parse import urlencode
import contextily
import folium
import geopandas as gpd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyproj
import rasterio
import rasterio.warp
import rasterio.features
import rasterio.plot
import requests
import requests_cache
import seaborn as sns
import shapely.wkt

%matplotlib inline

In [None]:
# Notebook-wide display settings: higher-resolution figures and wider DataFrame output.
mpl.rcParams.update({'figure.dpi': 200})
pd.set_option('display.float_format', '{:,.3f}'.format)  # thousands separator, 3 decimals
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 2000)

In [None]:
# Disclosing: load the CDP table and drop the columns this notebook never reads.
df_disclosing = pd.read_csv('../input/cdp-unlocking-climate-solutions/Cities/Cities Disclosing/2020_Cities_Disclosing_to_CDP.csv')
df_disclosing = df_disclosing.drop(
    columns=['Access', 'Last update', 'Year Reported to CDP', 'Reporting Authority', 'First Time Discloser'],
)

# Use '' instead of NaN for strings, so truthiness checks and string methods behave.
for text_column in ['City', 'Organization']:
    df_disclosing[text_column] = df_disclosing[text_column].fillna('')

# Keep the location exactly as reported (parsed before the manual fixes below are applied).
df_disclosing['city_location_raw'] = df_disclosing['City Location'].fillna('').apply(
    lambda wkt: shapely.wkt.loads(wkt) if wkt else None
)

# Fix points, keyed by account number.
point_fixes = {
    68383: 'POINT(-46.5 -23)',  # Missing -ve sign.
    826212: 'POINT(-103.6167 18.820)',  # No location given.
    826380: 'POINT(-104.361 19.7713)',  # No location given.
}
for account_number, fixed_wkt in point_fixes.items():
    df_disclosing.loc[df_disclosing['Account Number'] == account_number, 'City Location'] = fixed_wkt

# Assign point geometry.
df_disclosing['City Location'] = df_disclosing['City Location'].fillna('').apply(
    lambda wkt: shapely.wkt.loads(wkt) if wkt else None
)
df_disclosing = gpd.GeoDataFrame(df_disclosing, geometry='City Location', crs='EPSG:4326')

# Tidy population year: expand the truncated / mistyped years, leave everything else untouched.
fixed_population = {19: 2019, 214: 2014, 216: 2016, 217: 2017}
df_disclosing['Population Year'] = df_disclosing['Population Year'].apply(
    lambda year: fixed_population.get(year, year)
)

# Fix incorrect populations. Each entry is (column to match, value, reported population, corrected population);
# the reported value is part of the match so an already-corrected row is left alone.
population_fixes = [
    ('City', 'Montevideo', 1_383_432_134, 1_383_432),
    ('Account Number', 826211, 399_724_000, 399_724),
    ('Account Number', 54652, 676, 676_000),
    ('Account Number', 841003, 5_247, 524_700),
    ('Account Number', 839980, 26_492, 264_920),
]
for match_column, match_value, reported, corrected in population_fixes:
    is_wrong = (df_disclosing[match_column] == match_value) & (df_disclosing['Population'] == reported)
    df_disclosing.loc[is_wrong, 'Population'] = corrected

df_disclosing

In [None]:
def log_print(*args, **kwargs):
    """Debug-logging hook that is currently switched off.

    Accepts the same arguments as ``print`` and discards them. To see the
    geocoding trace, forward them instead: ``print(*args, **kwargs)``.
    """
    return None
          

def osm_query(**kwargs):
    """Run one Nominatim search and return the hits as a DataFrame.

    Args:
        **kwargs: Nominatim search parameters, passed through unchanged
            (the callers in this notebook use ``q``, ``city``, ``county``,
            ``state`` and ``country``).

    Returns:
        ``None`` when the search has no hits. Otherwise a DataFrame with one
        row per hit plus three added columns: ``geometry`` (the hit's GeoJSON
        parsed by shapely, or ``None`` if parsing raised ``AttributeError``),
        ``q`` (the query as a JSON string) and ``q_rank`` (the hit's position
        in the response). Rows whose ``category`` is ``'boundary'`` are moved
        ahead of all other rows.

    Raises:
        requests.HTTPError: if Nominatim answers with an error status.
        requests.Timeout: if no response arrives within the timeout.
    """
    params = copy.deepcopy(kwargs)
    params.update({
        'format': 'jsonv2',
        'polygon_geojson': '1',
        'email': 'andrew@ajnisbet.com',
    })
    url = 'https://nominatim.openstreetmap.org/search'

    log_print('    https://nominatim.openstreetmap.org/ui/search.html?' + urlencode(kwargs))

    # Without a timeout a single stalled connection would hang the whole batch loop.
    response = requests.get(url, params=params, timeout=60)
    # Fail loudly on HTTP errors instead of parsing an error body as search results.
    response.raise_for_status()
    df_response = pd.DataFrame.from_records(response.json())
    if len(df_response) == 0:
        return None

    def parse_geojson(x):
        try:
            return shapely.geometry.shape(x)
        except AttributeError:
            # The value was not a GeoJSON mapping.
            return None

    df_response['geometry'] = df_response.geojson.apply(parse_geojson)
    unused_columns = ['licence', 'boundingbox', 'place_id', 'osm_id', 'icon']
    df_response = df_response.drop([x for x in unused_columns if x in df_response.columns], axis=1)
    df_response['q'] = json.dumps(kwargs)
    df_response['q_rank'] = df_response.index.values

    # Boundaries first, keeping Nominatim's relative order inside each group.
    is_boundary = df_response.category == 'boundary'
    df_response = pd.concat([df_response[is_boundary], df_response[~is_boundary]])

    return df_response
    
    
    
def validate_osm_response(df, disclosure_point=None, country=None):
    """Pick the first trustworthy boundary row from an ``osm_query`` result.

    A row is accepted when its ``category`` is ``'boundary'``, its
    ``display_name`` contains ``country`` (case-insensitive, only checked when
    ``country`` is given) and its geometry intersects ``disclosure_point``
    (only checked when a point is given).

    Returns:
        The first accepted row, or ``None`` if the response is missing/empty
        or no row passes every check.
    """
    # Nothing to validate.
    if df is None or len(df) == 0:
        log_print('    Empty response.')
        return None

    # Only administrative boundaries are of interest.
    boundaries = df[df.category == 'boundary']
    if len(boundaries) == 0:
        log_print('    No polygons in response.')
        return None

    # Optional country check against the human-readable display name.
    if country:
        first_display_name = boundaries.display_name.iloc[0]
        mentions_country = boundaries.display_name.str.lower().str.contains(country.lower(), regex=False)
        boundaries = boundaries[mentions_country]
        if len(boundaries) == 0:
            log_print(f'    Country mismatch {country} vs {first_display_name}.')
            return None

    # Without a disclosure point the best remaining candidate is accepted as is.
    if not disclosure_point:
        log_print('    Success! No point to validate.')
        return boundaries.iloc[0]

    # Otherwise the boundary has to touch the disclosed location.
    candidates = gpd.GeoDataFrame(boundaries, geometry='geometry', crs='EPSG:4326')
    candidates['contains_point'] = candidates.intersects(disclosure_point)
    matches = candidates[candidates.contains_point]
    if len(matches) == 0:
        log_print('    Point not in any polygon.')
        return None

    log_print('    Success! Point in polygon.')
    return matches.iloc[0]
    
    
def parse_city_from_org(org):
    """Strip government boilerplate from an organization name to leave the place name.

    The text after the last ``'Municipal Government of '`` is kept, then each
    known boilerplate fragment is removed in order (the result is stripped
    after every removal, so rule order matters). Finally only the text after
    the last ``'Estado de'`` and before the first ``'('`` is kept.

    Args:
        org: Organization name; ``''`` is allowed and returns ``''``.

    Returns:
        The cleaned name. Names matching no rule come back unchanged apart
        from surrounding whitespace.
    """
    boilerplate = (
        'Junta Municipal de Medio Ambiente ',
        'Junta Intermunicipal de Medio Ambiente ',
        "Municipal People's Government",
        'County Council',
        'City Council',
        'Municipalidad Distrital de ',
        # Must run before the shorter 'Municipalidad de ' rule: in its original
        # position (after it) this rule could never match.
        'Municipalidad de Provincial de ',
        'Municipalidad de ',
        'Alcaldía Municipal de ',
        'Ayuntamiento de ',
        'Prefeitura de ',
        'Junta Intermunicipal de la ',
        'Town of ',
        'City Government',
        'Gobierno Municipal de',
        'City of ',
        'Concejo Municipal de Distrito de ',
        'Municipalidad Provincial de ',
        'Municipal Government',
        'Municipio de ',
        'Municipality of',
        ' Council',
        'Gemeente',
    )

    org = org.split('Municipal Government of ')[-1].strip()
    for fragment in boilerplate:
        org = org.replace(fragment, '').strip()
    org = org.split('Estado de')[-1].strip()
    org = org.split('(')[0].strip()
    return org

def cdp_to_osm_country(c):
    """Translate a CDP country name into the spelling used for OSM lookups.

    Names without an entry in the table are returned unchanged.
    """
    osm_spelling = {
        'Taiwan, Greater China': 'Taiwan',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'China, Hong Kong Special Administrative Region': 'Hong Kong',
        'Republic of Korea': 'South Korea',
        'Bolivia (Plurinational State of)': 'Bolivia',
        'Brazil': 'Brasil',
        'Mexico': 'México',
    }
    if c in osm_spelling:
        return osm_spelling[c]
    return c
    
def load_boundary_from_osm(i):
    """Find an OSM result for row ``i`` of the module-level ``df_disclosing``.

    A sequence of Nominatim queries is tried, from most to least specific, and
    the first response that passes ``validate_osm_response`` wins.

    Args:
        i: Positional row index into ``df_disclosing``.

    Returns:
        * the validated OSM row (a Series with a ``geometry`` entry), or
        * if nothing validated, the first row of the first non-empty response
          (unvalidated; it need not be a boundary), or
        * if every response was empty, the row's disclosure point.

    Raises:
        ValueError: if OSM returned nothing and the row has no disclosure point.
    """
    city_name = df_disclosing.City.iloc[i]
    country_name = df_disclosing.Country.iloc[i]
    osm_country = cdp_to_osm_country(country_name)
    organization_name = df_disclosing.Organization.iloc[i]
    disclosure_point = df_disclosing.geometry.iloc[i]
    city_from_org = parse_city_from_org(organization_name)
    city_filled = city_name or city_from_org

    # Every raw response is kept so the fallback at the end can reuse it.
    dfs = []

    def run_query(label, **query):
        """Log the step, run the query and remember the raw response."""
        log_print(label)
        df_osm = osm_query(**query)
        dfs.append(df_osm)
        return df_osm

    def attempt(label, **query):
        """Run one query and validate it against the disclosure point."""
        return validate_osm_response(run_query(label, **query), disclosure_point=disclosure_point)

    log_print()
    log_print(f'[{i}] {city_name or None} ({organization_name or None}), {country_name} [{"point" if disclosure_point else "no_point"}]')

    # "City, ST" style names.
    if country_name in {'United States of America', 'Canada'} and city_name and re.search(', [A-Z]{2}$', city_name):
        # Strip both parts: rsplit leaves the space after the comma on the state.
        city, state = (part.strip() for part in city_name.rsplit(',', 1))
        val = attempt('Checking structured City, State', city=city, state=state, country=country_name)
        if val is not None:
            return val

    # County and state provided.
    # Matching once up front means the branch only runs when both groups exist.
    county_and_state = re.match('(.*?) County, ([A-Z]{2})$', organization_name) if organization_name else None
    if country_name == 'United States of America' and county_and_state:
        county, state = county_and_state.groups()
        val = attempt('Checking structured US County, State', county=county, state=state, country=country_name)
        if val is not None:
            return val

    # Just county.
    if country_name == 'United States of America' and organization_name and organization_name.endswith(' County'):
        county = organization_name.split(' County')[0]
        # Keep what follows a standalone "of" (e.g. "Government of X"); splitting
        # on the bare substring would also cut inside names such as "Hoffman".
        county = re.split(r'\bof\b', county)[-1].strip()
        val = attempt('Checking structured US County', county=county, country=country_name)
        if val is not None:
            return val

    # Structured city, country.
    if city_name:
        val = attempt('Checking structured City, County', city=city_name, country=country_name)
        if val is not None:
            return val

    # Check base organisation name with point safety.
    if organization_name and disclosure_point:
        val = attempt('Checking origanization only with point', q=organization_name)
        if val is not None:
            return val

    # Check city with point safety.
    if city_filled and disclosure_point:
        val = attempt('Checking city only with point', city=city_filled)
        if val is not None:
            return val

    # Filled city, structured.
    if city_filled:
        val = attempt('Structured City, Country', city=city_filled, country=country_name)
        if val is not None:
            return val

    # Filled city, unstructured.
    if city_filled:
        val = attempt('Unstructured City, Country', q=f'{city_filled}, {osm_country}')
        if val is not None:
            return val

    # City only: one query, validated against the CDP country name and then the OSM spelling.
    if city_filled:
        df_osm = run_query('City only, check country', q=f'{city_filled}')
        for expected_country in (country_name, osm_country):
            val = validate_osm_response(df_osm, disclosure_point=disclosure_point, country=expected_country)
            if val is not None:
                return val

    # Nothing validated: fall back to the first raw OSM hit, then to the disclosure point.
    dfs = [x for x in dfs if x is not None]
    if len(dfs) > 0:
        df = dfs[0]
        return df.iloc[0]
    elif disclosure_point:
        print(f'{i}: returning dp')
        return disclosure_point
    else:
        raise ValueError('No polygon found')
    

    
def buffer_meters(geom, buffer):
    """Buffer a lon/lat (WGS84) geometry by a distance given in meters.

    The geometry is projected into an azimuthal equidistant projection centred
    on its own centroid (units: meters), buffered there, and projected back.

    Args:
        geom: Shapely geometry in lon/lat WGS84 coordinates.
        buffer: Buffer distance in meters, passed to shapely's ``buffer``;
            callers in this notebook also pass negative values to shrink.

    Returns:
        The buffered geometry in lon/lat WGS84 coordinates.
    """
    center = geom.centroid
    local_azimuthal_projection = f"+proj=aeqd +R=6371000 +units=m +lat_0={center.y} +lon_0={center.x}"
    wgs84 = '+proj=longlat +datum=WGS84 +no_defs'

    # Transformer objects replace the deprecated pyproj.transform, which also
    # rebuilt the transformation on every call.
    wgs84_to_aeqd = pyproj.Transformer.from_crs(wgs84, local_azimuthal_projection, always_xy=True).transform
    aeqd_to_wgs84 = pyproj.Transformer.from_crs(local_azimuthal_projection, wgs84, always_xy=True).transform

    geom_transformed = shapely.ops.transform(wgs84_to_aeqd, geom)
    buffered = geom_transformed.buffer(buffer)
    return shapely.ops.transform(aeqd_to_wgs84, buffered)
        
        
    
    


In [None]:
# Resolve one polygon per disclosing city and record where each one came from.
geoms = []
geom_sources = []

for i in tqdm(range(len(df_disclosing))):
    raw_geom = load_boundary_from_osm(i)
    geom = raw_geom
    geom_source = 'CDP point'

    # An OSM result arrives as a row; unwrap its geometry.
    if isinstance(geom, (pd.Series, gpd.GeoSeries)):
        geom = geom.geometry
        geom_source = 'OSM polygon'

    # Multi-part and linear results are reduced to their convex hull.
    if isinstance(geom, (shapely.geometry.MultiPolygon, shapely.geometry.LineString)):
        geom = geom.convex_hull
        geom_source = 'OSM polygon'

    if isinstance(geom, shapely.geometry.Point):
        came_from_cdp = isinstance(raw_geom, shapely.geometry.Point)
        geom_source = 'CDP point' if came_from_cdp else 'OSM point'

        # Exact radius doesn't matter so much, it will be adjusted to match population.
        geom = buffer_meters(geom, 1000)

    if geom is None:
        raise ValueError('No boundary found.')

    geoms.append(geom)
    geom_sources.append(geom_source)

df_disclosing['city_geom'] = gpd.GeoSeries(geoms)
df_disclosing['city_geom_source'] = geom_sources
df_disclosing = df_disclosing.set_geometry('city_geom')

In [None]:
def point_to_latlon(point):
    """Return the point's coordinates as ``(y, x)``, i.e. latitude first."""
    lat, lon = point.y, point.x
    return lat, lon

def point_to_lonlat(point):
    """Return the point's coordinates as ``(x, y)``, i.e. longitude first."""
    lon, lat = point.x, point.y
    return lon, lat


def point_to_xy(point):
    """Alias of ``point_to_lonlat``: returns ``(x, y)``."""
    xy = point_to_lonlat(point)
    return xy

def total_population_from_geom(f, a, window, g):
    """Sum the raster cells of ``a`` that fall inside geometry ``g``.

    Args:
        f: Raster dataset; only used for ``f.window_transform(window)``.
        a: 2-D array read from ``f`` through ``window``.
        window: The window ``a`` was read with.
        g: Geometry to sum within, in the raster's coordinates.

    Returns:
        The sum of ``a`` over the cells selected by the geometry mask.
    """
    inside = rasterio.features.geometry_mask(
        [g],
        out_shape=a.shape,
        transform=f.window_transform(window),
        invert=True,  # True inside the geometry instead of outside.
    )
    weights = inside.astype(int)
    return (a * weights).sum()

def shrink_around_point(point, poly, meters_to_shrink):
    """Clip ``poly`` to a circle around ``point``.

    The circle's radius is the polygon's maximum distance from ``point`` (as
    measured by ``max_distance``, in meters) minus ``meters_to_shrink``.

    Args:
        point: Centre of the clipping circle, lon/lat WGS84.
        poly: Polygon to clip, lon/lat WGS84.
        meters_to_shrink: How much smaller than the polygon's maximum reach
            the clipping circle should be, in meters.

    Returns:
        The clipped polygon. If the intersection is not a single ``Polygon``,
        its convex hull is returned instead.
    """
    # Reuse max_distance rather than repeating its projection code here.
    max_radius = max_distance(point, poly)
    new_radius = max_radius - meters_to_shrink
    circle_to_cut = buffer_meters(point, new_radius)

    new_poly = poly.intersection(circle_to_cut)
    if not isinstance(new_poly, shapely.geometry.Polygon):
        new_poly = new_poly.convex_hull

    return new_poly


def max_distance(point, poly):
    """Return the Hausdorff distance between ``poly`` and ``point``, in meters.

    Both geometries are projected into an azimuthal equidistant projection
    centred on ``point`` (units: meters) before measuring.

    Args:
        point: Shapely point in lon/lat WGS84.
        poly: Shapely geometry in lon/lat WGS84.

    Returns:
        ``poly_t.hausdorff_distance(point_t)`` in the projected coordinates.
    """
    local_azimuthal_projection = f"+proj=aeqd +R=6371000 +units=m +lat_0={point.y} +lon_0={point.x}"

    # A single Transformer replaces the deprecated pyproj.transform.
    wgs84_to_aeqd = pyproj.Transformer.from_crs(
        '+proj=longlat +datum=WGS84 +no_defs',
        local_azimuthal_projection,
        always_xy=True,
    ).transform

    poly_t = shapely.ops.transform(wgs84_to_aeqd, poly)
    point_t = shapely.ops.transform(wgs84_to_aeqd, point)

    return poly_t.hausdorff_distance(point_t)

POP_PATH_1k = '../input/ppp_2020_1km_Aggregated.tif'
POP_PATH_100m = '../input/MOSAIC_ppp_prj_2020/MOSAIC_ppp_prj_2020.vrt'


def load_pop_raster(geom, target_population, buffer=10_000, pop_path=POP_PATH_1k):
    """Read the population raster around ``geom``, widening the window if it holds too few people.

    Args:
        geom: Geometry in lon/lat WGS84.
        target_population: Population the geometry will later be resized to.
        buffer: Margin in meters added around ``geom`` before reading.
        pop_path: Path of the raster to read.

    Returns:
        ``(f, a, window)``: the dataset object, the band-1 array for the
        window with masked cells filled with 0, and the window itself.
        ``f`` is returned after its ``with`` block has exited.

    The function calls itself with ``buffer * 3`` when all of the following
    hold: the window holds less than twice the target population, the
    geometry holds less than the target, and the shortfall exceeds 25% of the
    target.
    """
    # Load population raster.
    search_area = buffer_meters(geom, buffer)
    with rasterio.open(pop_path) as f:
        window = rasterio.windows.from_bounds(*search_area.bounds, f.transform)
        a = np.ma.filled(f.read(1, window=window, masked=True), 0)

    # Check population in raster is enough.
    raster_population = a.sum()
    geom_population = total_population_from_geom(f, a, window, geom)
    window_too_small = raster_population < target_population*2
    if (
        window_too_small
        and target_population > geom_population
        and (target_population - geom_population) / target_population > 0.25
    ):
        print(f'Target population {target_population} more than half raster {raster_population}')
        return load_pop_raster(geom, target_population, buffer=buffer*3, pop_path=pop_path)

    return f, a, window

def shrink_with_point(f, a, window, geom, point, target_population):
    """Search for how far to shrink ``geom`` around ``point`` to reach the target population.

    Ten halving steps are run on the shrink distance passed to
    ``shrink_around_point``, between 0 and the polygon's maximum distance
    from ``point`` minus 100.

    Returns:
        ``(geometry, population)`` of the last candidate tried.
    """
    max_to_remove = int(max_distance(point, geom)) - 100

    step = max_to_remove / 2
    x_test = max_to_remove - step

    for _ in range(10):
        x_test = int(x_test)
        geom_test = shrink_around_point(point, geom, x_test)
        pop_test = total_population_from_geom(f, a, window, geom_test)
        step /= 2
        # Too few people: shrink less next time. Otherwise shrink more.
        x_test = x_test - step if pop_test < target_population else x_test + step
    return geom_test, pop_test

def shrink_without_point(f, a, window, geom, target_population):
    """Search for the negative buffer that brings ``geom`` down to the target population.

    Ten halving steps are run on the inward buffer distance, between 0 and
    the polygon's maximum distance from its centroid minus 100.

    Returns:
        ``(geometry, population)`` of the last candidate tried.
    """
    max_to_remove = int(max_distance(geom.centroid, geom)) - 100

    step = max_to_remove / 2
    x_test = max_to_remove - step

    for _ in range(10):
        x_test = int(x_test)
        geom_test = buffer_meters(geom, -x_test)
        pop_test = total_population_from_geom(f, a, window, geom_test)
        step /= 2
        # Too few people: shrink less next time. Otherwise shrink more.
        x_test = x_test - step if pop_test < target_population else x_test + step
    return geom_test, pop_test

def grow_without_point(f, a, window, geom, target_population):
    """Search for the outward buffer that brings ``geom`` up to the target population.

    The buffer starts at a tenth of the polygon's maximum distance from its
    centroid and is doubled until the buffered geometry holds at least the
    target population. Ten halving steps then refine the distance.

    Returns:
        ``(geometry, population)`` of the last candidate tried.
    """
    to_add = max_distance(geom.centroid, geom) / 10

    # Keep doubling until the buffered geometry holds enough people.
    candidate = buffer_meters(geom, to_add)
    while total_population_from_geom(f, a, window, candidate) < target_population:
        to_add *= 2
        candidate = buffer_meters(geom, to_add)

    min_to_add = to_add / 2
    max_to_add = to_add
    step = (max_to_add - min_to_add) / 2 + min_to_add
    x_test = max_to_add - step

    for _ in range(10):
        x_test = int(x_test)
        geom_test = buffer_meters(geom, x_test)
        pop_test = total_population_from_geom(f, a, window, geom_test)
        step /= 2
        # Too few people: grow more next time. Otherwise grow less.
        x_test = x_test + step if pop_test < target_population else x_test - step
    return geom_test, pop_test
        
    
def point_to_polygon_meters(point, poly):
    """Distance in meters from ``point`` to the exterior ring of ``poly``.

    Args:
        point: Shapely point in lon/lat WGS84, or a falsy value.
        poly: Shapely polygon in lon/lat WGS84.

    Returns:
        ``None`` if ``point`` is falsy, ``0`` if the point intersects the
        polygon, otherwise the distance to the polygon's exterior measured in
        an azimuthal equidistant projection centred on ``point`` (meters).
    """
    if not point:
        return None

    if point.intersects(poly):
        return 0

    local_azimuthal_projection = f"+proj=aeqd +R=6371000 +units=m +lat_0={point.y} +lon_0={point.x}"

    # A single Transformer replaces the deprecated pyproj.transform.
    wgs84_to_aeqd = pyproj.Transformer.from_crs(
        '+proj=longlat +datum=WGS84 +no_defs',
        local_azimuthal_projection,
        always_xy=True,
    ).transform

    poly_t = shapely.ops.transform(wgs84_to_aeqd, poly)
    point_t = shapely.ops.transform(wgs84_to_aeqd, point)

    # Measure on the projected geometries: the unprojected ones give degrees, not meters.
    return poly_t.exterior.distance(point_t)
    
    

In [None]:
# Rescale each geometry to match population.
new_geoms = []
raw_geom_pops = []

for i in tqdm(range(len(df_disclosing))):
    print()
    print(f'i: {i}')

    # Get data from df. A disclosed point lying outside the boundary is not used as an anchor.
    geom = df_disclosing.city_geom.iloc[i]
    point = df_disclosing['City Location'].iloc[i]
    if point and not geom.intersects(point):
        point = None
    target_population = df_disclosing.Population.iloc[i]

    # No population to aim for: keep the boundary unchanged.
    if not target_population or pd.isnull(target_population):
        new_geoms.append(geom)
        raw_geom_pops.append(None)
        continue

    # Load raster; switch to the 100 m raster when the 1 km window is under 100 cells on its longest side.
    f, a, window = load_pop_raster(geom, target_population)
    if max(a.shape) < 100:
        f, a, window = load_pop_raster(geom, target_population, pop_path=POP_PATH_100m)
    geom_population = total_population_from_geom(f, a, window, geom)
    raw_geom_pops.append(geom_population)

    # Within 25% of the target counts as close enough.
    if abs(geom_population - target_population) / target_population < 0.25:
        new_geoms.append(geom)
        continue

    # Do the transformation: grow when short of the target, otherwise shrink
    # around the disclosed point, or around the centroid when there is none.
    if geom_population <= target_population:
        resized_geom, resized_population = grow_without_point(f, a, window, geom, target_population)
    elif geom_population > target_population:
        resized_geom, resized_population = shrink_with_point(
            f, a, window, geom, point if point else geom.centroid, target_population
        )

    new_geoms.append(resized_geom)
    del a

In [None]:
# Make the resized polygons the active geometry, then derive the per-city columns from them.
df_disclosing['city_geom_resized'] = new_geoms
df_disclosing = df_disclosing.set_geometry('city_geom_resized', crs='epsg:4326')

# Area is measured after reprojecting to epsg:6933, then converted from m^2 to km^2.
df_disclosing['area_km2'] = df_disclosing.to_crs('epsg:6933').area / 1000**2

# Compare the location as originally reported with the final polygon.
df_disclosing['city_location_error'] = [
    point_to_polygon_meters(raw_point, resized_poly)
    for raw_point, resized_poly in zip(df_disclosing['city_location_raw'], new_geoms)
]
df_disclosing['city_name_from_org'] = df_disclosing['Organization'].fillna('').apply(parse_city_from_org)
df_disclosing['raw_geom_population'] = raw_geom_pops

In [None]:
# Save dataset: keep only the published columns, rename the two CDP-style headers, write GeoJSON.
df_dataset = df_disclosing[[
    'Account Number',
    'Population',
    'city_geom_source',
    'area_km2',
    'city_geom_resized',
    'city_location_error',
    'city_name_from_org',
    'raw_geom_population',
]].rename(columns={
    'Account Number': 'account_number',
    'Population': 'population_corrected',
})
df_dataset.to_file('../data/city_boundaries_100m.geojson', driver='GeoJSON')

In [None]:
# Plot a single result: population raster, starting boundary and resized boundary side by side.
# This cell only reads from df_disclosing; it does not touch new_geoms / raw_geom_pops.

i = 4

print()
print(f'i: {i}')
# Get data from df.
geom = df_disclosing.city_geom.iloc[i]
point = df_disclosing['City Location'].iloc[i]
if point and not geom.intersects(point):
    point = None
target_population = df_disclosing.Population.iloc[i]

# Rows without a population are skipped by the resizing loop, so there is nothing to plot for them.
if not target_population or pd.isnull(target_population):
    raise ValueError(f'Row {i} has no reported population; choose another row.')

# Load raster.
f, a, window = load_pop_raster(geom, target_population)
if max(a.shape) < 100:
    f, a, window = load_pop_raster(geom, target_population, pop_path=POP_PATH_100m)
geom_population = total_population_from_geom(f, a, window, geom)

# Do the transformation. Unlike the batch loop, the geometry is resized even
# when it is already within 25% of the target, so there is always a result to show.
if geom_population <= target_population:
    resized_geom, resized_population = grow_without_point(f, a, window, geom, target_population)
elif geom_population > target_population:
    resized_geom, resized_population = shrink_with_point(
        f, a, window, geom, point if point else geom.centroid, target_population
    )

# Plot results.
fig, axes = plt.subplots(1, 3, figsize=(20, 8))

# Panel 1: log-scaled population raster, reprojected to web mercator to match the basemaps.
ax = axes[0]
a_re, t_re = rasterio.warp.reproject(
    source=a,
    src_transform=f.window_transform(window),
    src_crs='epsg:4326',
    dst_crs='epsg:3857',
)
rasterio.plot.show(np.log10(a_re + 0.0001), transform=t_re, ax=ax)
ax.set_title(f'Bounds population: {int(a.sum()):,}')
ax.axis('off')

# Panels 2 and 3: boundary before and after resizing, drawn on the raster panel's extent.
panels = [
    (axes[1], geom, f'Starting population: {int(geom_population):,}'),
    (axes[2], resized_geom, f'Final population: {int(resized_population):,} (target {int(target_population):,})'),
]
for ax, panel_geom, panel_title in panels:
    gpd.GeoSeries([panel_geom], crs='EPSG:4326').to_crs('epsg:3857').plot(ax=ax, alpha=0.5, edgecolor='k')
    if point:
        gpd.GeoSeries([point], crs='EPSG:4326').to_crs('epsg:3857').plot(ax=ax, color='red', alpha=0.5, edgecolor='k', markersize=200)
    ax.set_title(panel_title)
    ax.set_xlim(axes[0].get_xlim())
    ax.set_ylim(axes[0].get_ylim())
    contextily.add_basemap(ax, source=contextily.providers.OpenStreetMap.Mapnik)
    ax.axis('off')