In [1]:
# ---- user config ----
YOUR_NAME = 'sara'
AWS_PROFILE = 'cities'

# If you want to limit the run (debug):
# LIMIT = 20
LIMIT = None

# Cities to ignore for now
IGNORE_CITIES = [
    'Ngo__Nigeria',
    'Bugama__Nigeria',
    'Mubi__Nigeria',
    'San_Pedro_de_Macoris__Dominican_Republic',
]


In [2]:
%load_ext autoreload
%autoreload 2


In [3]:
# ---- shared S3 paths (mirrors gather_data_executor + pre_processing.py) ----
MAIN_PATH = 's3://wri-cities-sandbox/identifyingLandSubdivisions/data'
INPUT_PATH = f'{MAIN_PATH}/input'
CITY_INFO_PATH = f'{INPUT_PATH}/city_info'
EXTENTS_PATH = f'{CITY_INFO_PATH}/extents'
BUILDINGS_PATH = f'{INPUT_PATH}/buildings'
ROADS_PATH = f'{INPUT_PATH}/roads'
INTERSECTIONS_PATH = f'{INPUT_PATH}/intersections'
NATURAL_FEATURES_PATH = f'{INPUT_PATH}/natural_features_and_railroads'
BLOCKS_PATH = f'{INPUT_PATH}/blocks'
OUTPUT_PATH = f'{MAIN_PATH}/output'


In [4]:
# ---- check S3 connection using AWS_PROFILE ----
import boto3, os

session = boto3.Session(profile_name=AWS_PROFILE)
s3 = session.client('s3')

# Export profile so s3fs/cloudpathlib inherit it
os.environ['AWS_PROFILE'] = AWS_PROFILE

s3.list_buckets()


{'ResponseMetadata': {'RequestId': 'G3XK1EMJZX4BAYVW',
  'HostId': 'Vo0lFsWeMDJWffd82rSzGPqVMi418ZYA8CQ8toCw+Z8R/uSMR1lgCFDWctfLXe+3J46qmKLgBVM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Vo0lFsWeMDJWffd82rSzGPqVMi418ZYA8CQ8toCw+Z8R/uSMR1lgCFDWctfLXe+3J46qmKLgBVM=',
   'x-amz-request-id': 'G3XK1EMJZX4BAYVW',
   'date': 'Tue, 20 Jan 2026 21:59:44 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Buckets': [{'Name': 'aft-sandbox-540362055257',
   'CreationDate': datetime.datetime(2022, 9, 13, 15, 12, 20, tzinfo=tzutc()),
   'BucketArn': 'arn:aws:s3:::aft-sandbox-540362055257'},
  {'Name': 'amplify-citiesindicatorsapi-dev-10508-deployment',
   'CreationDate': datetime.datetime(2023, 8, 30, 5, 5, 13, tzinfo=tzutc()),
   'BucketArn': 'arn:aws:s3:::amplify-citiesindicatorsapi-dev-10508-deployment'},
  {'Name': 'cities-heat',
   'CreationDate': datetime.datetime(2023, 6, 1, 13, 22, 1, tzinfo=tzutc

In [None]:
# ---- start Coiled cluster (adjust n_workers / instance type as needed) ----
import coiled

cluster = coiled.Cluster(
    workspace='wri-cities-data',
    name=f'ils-blocks-{YOUR_NAME}',
    region='us-west-2',
    arm=True,
    worker_vm_types='r8g.xlarge',
    spot_policy='spot',
    n_workers=10,
    package_sync_ignore=['pyspark', 'pypandoc'],
    worker_options={'nthreads': 1},
)
client = cluster.get_client()
print(f'Started Dask client. Dashboard: {client.dashboard_link}')


[2026-01-20 16:59:45,566][INFO    ][coiled] Fetching latest package priorities...
[2026-01-20 16:59:45,567][INFO    ][coiled.package_sync] Resolving your local subdivisions2 Python environment...
[2026-01-20 16:59:46,049][INFO    ][coiled.package_sync] Scanning 446 conda packages...
[2026-01-20 16:59:46,057][INFO    ][coiled.package_sync] Scanning 261 python packages...
[2026-01-20 16:59:47,475][INFO    ][coiled.software_utils] No username or password found for https://conda.anaconda.org/conda-forge
[2026-01-20 16:59:47,940][INFO    ][coiled] Running pip check...
[2026-01-20 16:59:48,412][INFO    ][coiled] Validating environment...
[2026-01-20 16:59:49,332][INFO    ][coiled] Creating wheel for ~/Documents/Identifying Land Subdivisions/identifyingLandSubdivisions...
[2026-01-20 16:59:49,425][INFO    ][coiled] Creating wheel for /opt/spark-2.2.0/python...
[2026-01-20 16:59:49,593][INFO    ][coiled.package_sync] Package - aiobotocore, Pip check had the following issues that need resolving

Started Dask client. Dashboard: https://cluster-cheop.dask.host/IFEsUAuouKSgYJb3/status



+---------+--------+-----------+---------+
| Package | Client | Scheduler | Workers |
+---------+--------+-----------+---------+
| lz4     | 4.4.4  | 4.4.5     | 4.4.5   |
+---------+--------+-----------+---------+


2026-01-20 17:22:44,712 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


In [6]:
# ---- build city list from S3 (roads folders) ----
import s3fs

fs = s3fs.S3FileSystem(anon=False)

# Each city folder is under ROADS_PATH/<city>/...
road_city_dirs = fs.ls(ROADS_PATH)
all_cities = sorted([p.split('/')[-1] for p in road_city_dirs])

# remove empty / sentinel items if any
all_cities = [c for c in all_cities if c and c not in ['.DS_Store']]

# ignore the 4 cities you want to skip
cities = [c for c in all_cities if c not in set(IGNORE_CITIES)]

print('roads city dirs:', len(all_cities))
print('cities to run:', len(cities))

if LIMIT:
    cities = cities[:LIMIT]
    print('LIMIT applied ->', len(cities))


roads city dirs: 1234
cities to run: 1234


In [7]:
# ---- logging + runner (inspired by gather_data_executor) ----
import os, time, socket, traceback
from datetime import datetime, timezone
from cloudpathlib import S3Path

from pre_processing import produce_blocks

RUN_ID = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
LOGS_S3_DIR = f"{OUTPUT_PATH}/logs/blocks_calculation/{RUN_ID}"
SUMMARY_S3_PATH = f"{LOGS_S3_DIR}/summary.csv"
LOCAL_LOG_DIR = f"/tmp/blocks_calculation_logs/{RUN_ID}"


def utc_now():
    return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')


def append_log(path, msg):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(msg)


def safe_s3_upload(local_path, s3_uri):
    s3p = S3Path(s3_uri)
    s3p.parent.mkdir(parents=True, exist_ok=True)
    s3p.upload_from(local_path)
    return s3_uri


def blocks_already_exist(city):
    # pre_processing saves: {BLOCKS_PATH}/{city}/{city}_blocks_{YOUR_NAME}.geoparquet
    s3_uri = f"{BLOCKS_PATH}/{city}/{city}_blocks_{YOUR_NAME}.geoparquet"
    try:
        return fs.exists(s3_uri)
    except Exception:
        return False


def required_inputs_exist(city):
    paths = {
        'roads': f"{ROADS_PATH}/{city}/{city}_OSM_roads.geoparquet",
        'natural_features': f"{NATURAL_FEATURES_PATH}/{city}/{city}_OSM_natural_features_and_railroads.geoparquet",
    }
    exists = {}
    ok = True
    for k, p in paths.items():
        try:
            ex = fs.exists(p)
        except Exception:
            ex = False
        exists[k] = ex
        ok = ok and ex
    return ok, exists


def run_city_blocks(city):
    city_clean = city.strip()
    host = socket.gethostname()
    log_path = os.path.join(LOCAL_LOG_DIR, f"{city_clean}.log")

    append_log(log_path, f"[{utc_now()}] START city={city_clean} host={host} run_id={RUN_ID}\n")

    if city_clean in IGNORE_CITIES:
        append_log(log_path, f"[{utc_now()}] SKIP city is in IGNORE_CITIES\n")
        s3_log = safe_s3_upload(log_path, f"{LOGS_S3_DIR}/{city_clean}.log")
        return {'city': city_clean, 'status': 'skipped_ignore', 'log_s3': s3_log}

    if blocks_already_exist(city_clean):
        append_log(log_path, f"[{utc_now()}] SKIP blocks already exist for YOUR_NAME={YOUR_NAME}\n")
        s3_log = safe_s3_upload(log_path, f"{LOGS_S3_DIR}/{city_clean}.log")
        return {'city': city_clean, 'status': 'skipped_exists', 'log_s3': s3_log}

    ok_inputs, exists = required_inputs_exist(city_clean)
    if not ok_inputs:
        append_log(log_path, f"[{utc_now()}] FAIL missing inputs: {exists}\n")
        s3_log = safe_s3_upload(log_path, f"{LOGS_S3_DIR}/{city_clean}.log")
        out = {'city': city_clean, 'status': 'missing_inputs', 'log_s3': s3_log}
        out.update({f"has_{k}": v for k, v in exists.items()})
        return out

    t0 = time.time()
    dt = None
    try:
        blocks = produce_blocks(city_clean, YOUR_NAME).compute()
        n = 0 if blocks is None else len(blocks)
        dt = time.time() - t0
        append_log(log_path, f"[{utc_now()}] OK blocks={n} elapsed_sec={dt:.2f}\n")
        status = 'ok'
        extra = {'n_blocks': n}
    except Exception as e:
        dt = time.time() - t0
        append_log(log_path, f"[{utc_now()}] ERROR elapsed_sec={dt:.2f} err={repr(e)}\n")
        append_log(log_path, traceback.format_exc() + "\n")
        status = 'error'
        extra = {}

    s3_log = safe_s3_upload(log_path, f"{LOGS_S3_DIR}/{city_clean}.log")
    out = {'city': city_clean, 'status': status, 'elapsed_sec': dt, 'log_s3': s3_log}
    out.update(extra)
    return out


  RUN_ID = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')


In [8]:
# ---- execute across cities with dask.bag ----
import dask.bag as db
import pandas as pd

bag = db.from_sequence(cities, partition_size=1)
records = bag.map(run_city_blocks).compute()

summary = pd.DataFrame(records)
print(summary['status'].value_counts(dropna=False))

local_summary = f"/tmp/summary_blocks_calculation_{RUN_ID}.csv"
summary.to_csv(local_summary, index=False, sep=';')

S3Path(LOGS_S3_DIR).mkdir(parents=True, exist_ok=True)
S3Path(SUMMARY_S3_PATH).upload_from(local_summary)

print('✅ Summary written to:', SUMMARY_S3_PATH)


status
skipped_exists    1234
Name: count, dtype: int64
✅ Summary written to: s3://wri-cities-sandbox/identifyingLandSubdivisions/data/output/logs/blocks_calculation/20260120T220155Z/summary.csv


In [9]:
# ---- quick peek at failures ----
# Show a few errors/missing inputs for debugging

import pandas as pd

errs = summary[summary['status'].isin(['error', 'missing_inputs'])].copy()
print('n problematic:', len(errs))
errs.head(25)


n problematic: 0


Unnamed: 0,city,status,log_s3


### Notes
- This notebook builds the city list from existing **ROADS** folders on S3, then skips the 4 ignore cities.
- It also skips cities where a blocks file already exists for this `YOUR_NAME`.
- Logs and a summary CSV are uploaded to: `s3://.../output/logs/calc_blocks/<RUN_ID>/`.
