In [None]:
# ================================================================
# Retrieve Million Neighbourhoods (MN) data and create labels
# ================================================================
# Purpose:
#   1. Read the Million Neighbourhoods (MN) GeoParquet file
#   2. Extract per-country MN blocks intersecting DEPRIMAP city segments
#   3. For each country, create MN-based deprivation labels for segments
#      using Rules A/B/C and majority thresholds 0.1, 0.2, 0.3.
#
# IMPORTANT DATA NOTES (for GitHub / reproducibility):
#   - The original MN GeoParquet is NOT stored in this repo due to size.
#     It can be downloaded from:
#       https://www.millionneighborhoods.africa/download
#     For dataset methodology, see:
#       Bettencourt, L.M.A., Marchio, N. (2025),
#       "Infrastructure deficits and informal settlements in sub-Saharan Africa",
#       Nature 645, 399–406. https://doi.org/10.1038/s41586-025-09465-2
#
#   - The intermediate per-country MN block files
#       MN_Blocks_by_country/{country}_mn_blocks.gpkg
#     are NOT shipped in the GitHub repo (size + intermediate nature).
#
#   - The final MN comparison files
#       MN_Comparison_Files/{country}/{country}_segments_mnlabels_k{K_THR}_maj{XX}.gpkg/.csv
#     are provided as a single ZIP on Zenodo. In the GitHub repo, a text file
#     points to that Zenodo archive instead of storing all GPKGs/CSVs directly.
#
#   - The label-creation part of this notebook MUST be run TWICE:
#       1) with K_THR = 3  (high-k = k_complexity > 3)
#       2) with K_THR = 5  (high-k = k_complexity > 5)
#     Each run writes a separate set of MN label files with the k-threshold
#     encoded in the filename, e.g. *_k3_* and *_k5_*.
# ================================================================


# 1️⃣ MN blocks → per-country GPKG (retain full blocks; DuckDB reader)

In [None]:
from pathlib import Path
import re
import duckdb
import pandas as pd
import geopandas as gpd
from shapely import wkb

# --- EDIT THESE PATHS TO MATCH YOUR LOCAL SETUP ---
# MN GeoParquet (NOT included in repo; download from MN website).
# NOTE(review): the leading "..." looks like a placeholder rather than a real
# relative path -- replace it with the folder that holds the download.
PARQUET_PATH = Path(".../MN/africa_geodata.parquet")

# Segment files used as ROIs to select MN blocks
# (these are your CSD/RF segment predictions; path may differ on your machine).
# The African countries that are part of the MN dataset are stored in a
# separate folder, but they are the same as the prediction files in
# 2_modelling/02_application/predictions.
ROIS_DIR = Path("../2_modelling/02_application/Filtered_80pct_allattributes/MN_African_Countries")


# Output folder for per-country MN blocks (INTERMEDIATE; not stored in repo)
OUT_DIR_BLOCKS = Path("../MN/Outputs/MN_Blocks_by_country")

# Create the output folder (and any missing parents) up front; no-op if it exists.
OUT_DIR_BLOCKS.mkdir(parents=True, exist_ok=True)



In [None]:
# Subset of the MN GeoParquet schema this notebook works with.
# Columns that are absent from the file are reported and skipped; only
# "geometry" is mandatory.
WANT_COLS = [
    # country identifiers
    "country_name",
    "country_code",
    # urban-centre attributes
    "urban_id",
    "urban_center_name",
    "urban_country_name",
    "class_urban_hierarchy",
    # block complexity (used later to derive the high-k flag)
    "k_complexity",
    # building / parcel statistics
    "building_area_m2",
    "average_building_area_m2",
    "parcel_count",
    "average_parcel_area_m2",
    # population estimates
    "landscan_population",
    "landscan_population_un",
    "worldpop_population",
    "worldpop_population_un",
    # block geometry (WKB)
    "geometry",
]

# Step 1: ask DuckDB for the file schema without loading any rows.
print("üîé Discovering available columns via DuckDB‚Ä¶")
describe_sql = f"DESCRIBE SELECT * FROM parquet_scan('{PARQUET_PATH.as_posix()}')"
schema_df = duckdb.query(describe_sql).to_df()
available = set(schema_df["column_name"])

# Step 2: keep the wanted columns that exist, preserving WANT_COLS order.
cols = [name for name in WANT_COLS if name in available]
if "geometry" not in available:
    raise RuntimeError("No 'geometry' column found in the Parquet; cannot proceed.")

# Step 3: report optional columns that are not in the file.
missing = [name for name in WANT_COLS if name != "geometry" and name not in available]
if missing:
    print(f"‚ö†Ô∏è Missing columns (will be skipped): {missing}")

# Step 4: load only the selected columns into a pandas DataFrame.
cols_sql = ", ".join(cols)
print("üì• Reading selected columns from Parquet via DuckDB‚Ä¶")
select_sql = f"SELECT {cols_sql} FROM parquet_scan('{PARQUET_PATH.as_posix()}')"
df = duckdb.query(select_sql).to_df()
print(f"‚úÖ Loaded {len(df):,} MN rows")

# ---- Convert geometry (WKB bytes) → Shapely ----
def _to_geom(v):
    if v is None:
        return None
    try:
        return wkb.loads(bytes(v))
    except Exception:
        return None

# Decode every WKB value; rows that fail to decode get a None geometry
# (see _to_geom), they are not dropped here.
print("üîÑ Converting WKB geometries to Shapely‚Ä¶")
geom = df["geometry"].apply(_to_geom)
# Attribute columns come from `df` minus the raw WKB column; the decoded
# Series becomes the active geometry.
blocks = gpd.GeoDataFrame(df.drop(columns=["geometry"]), geometry=geom)

# Set CRS if missing (MN GeoParquet is typically EPSG:4326).
# No crs is passed to the constructor above, so this branch is expected to run.
# NOTE(review): EPSG:4326 is an assumption about the source file -- confirm.
if blocks.crs is None:
    print("‚ÑπÔ∏è Setting MN blocks CRS to EPSG:4326 (adjust if needed).")
    blocks = blocks.set_crs("EPSG:4326")

# Remember the CRS so every ROI can be reprojected to match the blocks.
blocks_crs = blocks.crs
_ = blocks.sindex  # build spatial index

# ---- Helper: parse country name from "{country}_rf_preds.gpkg" ----
def parse_country_name(path: Path) -> str:
    """Return the country part of an ROI file name.

    'angola_rf_preds.gpkg' -> 'angola' (suffix matched case-insensitively).
    File names that do not follow the pattern fall back to the file stem.
    """
    hit = re.match(r"(.+?)_rf_preds\.gpkg$", path.name, flags=re.IGNORECASE)
    if hit is None:
        return path.stem
    return hit.group(1)

# Candidate ROI files: every "*_rf_preds.gpkg" directly inside ROIS_DIR, in sorted order.
# NOTE(review): the label-building step further down reads
# "{country}_rf_preds_filtered80.gpkg" from the same folder, which this glob
# does not match -- confirm that both file variants are present.
roi_files = sorted(ROIS_DIR.glob("*_rf_preds.gpkg"))
print(f"Found {len(roi_files)} ROI files in {ROIS_DIR}")

# For every ROI file: select the MN blocks that intersect the ROI and write
# them, unclipped, to OUT_DIR_BLOCKS / "{country}_mn_blocks.gpkg".
# Read / reproject / intersect / write failures are printed and the country
# is skipped, so one bad file does not stop the remaining countries.
for roi_path in roi_files:
    country = parse_country_name(roi_path)
    print(f"\n==== {country} ====")

    # Read ROI and reproject to blocks CRS
    try:
        roi = gpd.read_file(roi_path)
    except Exception as e:
        print(f"‚ùå Failed to read ROI {roi_path}: {e}")
        continue

    if roi.empty:
        print("‚ö†Ô∏è ROI empty; skipping.")
        continue

    # An ROI without a CRS also ends up here: to_crs() raises and the
    # country is skipped with the reprojection message.
    if roi.crs != blocks_crs:
        try:
            roi = roi.to_crs(blocks_crs)
        except Exception as e:
            print(f"‚ùå Failed to reproject ROI to {blocks_crs}: {e}")
            continue

    # Disjoint segments are fine; unify for efficient spatial filtering
    roi_union = roi.unary_union
    if roi_union.is_empty:
        print("‚ö†Ô∏è ROI union empty; skipping.")
        continue

    # BBOX prefilter to reduce candidate set
    # (spatial-index lookup on the union's bounding box; positions index into `blocks`)
    minx, miny, maxx, maxy = roi_union.bounds
    cand_idx = list(blocks.sindex.intersection((minx, miny, maxx, maxy)))
    if not cand_idx:
        print("‚ÑπÔ∏è No MN candidates in ROI bbox; skipping.")
        continue

    cand = blocks.iloc[cand_idx].copy()

    # Keep full original blocks that intersect the ROI (no geometric clipping)
    try:
        mask = cand.intersects(roi_union)
    except Exception as e:
        print(f"‚ùå intersects() failed: {e}")
        continue

    sel = cand.loc[mask]
    if sel.empty:
        print("‚ÑπÔ∏è No intersecting MN blocks; skipping.")
        continue

    # Ensure only requested columns that actually exist (plus geometry)
    out_cols = [c for c in WANT_COLS if c in sel.columns or c == "geometry"]
    sel = sel[out_cols]

    # One GeoPackage per country, written to the layer "blocks".
    out_path = OUT_DIR_BLOCKS / f"{country}_mn_blocks.gpkg"
    try:
        sel.to_file(out_path, layer="blocks", driver="GPKG")
        print(f"‚úÖ Saved {out_path}  ({len(sel):,} blocks)")
    except Exception as e:
        print(f"‚ùå Failed to write {out_path}: {e}")

# 2️⃣ MN labels via Rules A/B/C (run TWICE: K_THR = 3 and K_THR = 5)

In [None]:
from pathlib import Path
import re
import math
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely
from shapely.validation import make_valid
from shapely.prepared import prep
from joblib import Parallel, delayed

# ---- ROOTS ----
# SEG_ROOT: folder with the per-country segment files
#           ({country}_rf_preds_filtered80.gpkg); same folder as ROIS_DIR above
# BLK_ROOT: per-country MN blocks created above
# OUT_ROOT: where MN label comparison files are written
SEG_ROOT = Path("../2_modelling/02_application/Filtered_80pct_allattributes/MN_African_Countries")

# Reuses OUT_DIR_BLOCKS from the block-extraction step (MN_Blocks_by_country),
# so that configuration cell must have been run in this kernel first.
BLK_ROOT = OUT_DIR_BLOCKS
OUT_ROOT = Path("../MN/Outputs/MN_Comparison_Files")

# Create the comparison-output folder (and parents) if it does not exist yet.
OUT_ROOT.mkdir(parents=True, exist_ok=True)

In [None]:
# ---- THRESHOLDS ----
# IMPORTANT: Run this notebook twice: once with K_THR=3 and once with K_THR=5.
# K_THR is encoded in every output file name, so the two runs do not overwrite
# each other.
K_THR = 3          # high-k cutoff: a block is high-k when k_complexity > K_THR (use 3 or 5)
MAJ_THRS = [0.1, 0.2, 0.3]  # segment-level majority thresholds (one output set per value)
COVER_THR_A = 0.90          # Rule A: full-cover threshold (>= 90% of segment)
EPSILON_M2 = 1.0            # ignore tiny overlaps (< 1 m^2)
EA_CRS = "EPSG:6933"        # equal-area CRS for area calculations

# ---- PARALLEL ----
N_JOBS = -1      # use all cores; set to e.g. 4 if you prefer
VERBOSE = 5      # joblib verbosity
GRID_SIZE = 0.05 # meters in EPSG:6933; tweak if needed (snapping grid for intersections)

# Point the reader at the *other* cutoff; the original message always said
# "K_THR = 5", which was wrong during the K_THR = 5 run.
_OTHER_K_THR = 5 if K_THR == 3 else 3
print(f"Using K_THR = {K_THR}. Run again with K_THR = {_OTHER_K_THR} for alternative labels.")

# 3️⃣ Helper functions

In [None]:
def safe_make_valid(g):
    """Repair a geometry without ever raising.

    Strategy: `make_valid(g)` first; when that yields None or an empty
    geometry, or when anything in that step raises, fall back to
    `g.buffer(0)`. If the fallback raises as well, the input is returned
    unchanged. None is passed through as None.
    """
    if g is None:
        return None

    def _buffered_or_original():
        # Last line of defence: zero-width buffer, else hand back the input.
        try:
            return g.buffer(0)
        except Exception:
            return g

    try:
        repaired = make_valid(g)
        unusable = repaired is None or repaired.is_empty
        return g.buffer(0) if unusable else repaired
    except Exception:
        return _buffered_or_original()

def parse_country_from_blocks(path: Path) -> str:
    """Return the country part of a per-country blocks file name.

    'angola_mn_blocks.gpkg' -> 'angola' (suffix matched case-insensitively).
    Names that do not follow the pattern fall back to the file stem.
    """
    found = re.match(r"(.+?)_mn_blocks\.gpkg$", path.name, flags=re.IGNORECASE)
    if found:
        return found.group(1)
    return path.stem

def seg_label_by_rules(full_cover, full_cover_highk, n_cent, share_cent, share_cov, maj_thr,
                       cover_thr=None):
    """
    Apply Rules A -> B -> C given precomputed metrics for a segment.

    Args:
        full_cover: largest share of the segment covered by any single block.
        full_cover_highk: high-k flag (0/1) of that best-covering block.
        n_cent: number of blocks whose centroid falls inside the segment.
        share_cent: share of those centroid blocks that are high-k (NaN if none).
        share_cov: summed segment coverage of the high-k blocks (NaN if unknown).
        maj_thr: majority threshold used by Rules B and C.
        cover_thr: Rule A full-cover threshold. Defaults to the notebook-level
            COVER_THR_A when omitted, so existing calls behave as before.

    Returns:
        label (0/1), rule_used ('A', 'B', or 'C')
    """
    if cover_thr is None:
        cover_thr = COVER_THR_A

    # Rule A: a single block covers >= cover_thr of the segment,
    # so the segment simply inherits that block's high-k flag.
    if full_cover >= cover_thr:
        return int(full_cover_highk), 'A'

    # Rule B: centroid-majority among blocks whose centroids fall inside the segment.
    # Comparisons are strict on both sides, so an exact tie decides nothing here.
    if n_cent > 0 and not np.isnan(share_cent):
        if share_cent > maj_thr:
            return 1, 'B'
        if share_cent < maj_thr:
            return 0, 'B'
        # tie => fall through to Rule C

    # Rule C: coverage-share of high-k blocks (missing share counts as 0.0;
    # note the inclusive >= here, unlike Rule B).
    share_cov = 0.0 if np.isnan(share_cov) else float(share_cov)
    return (1 if share_cov >= maj_thr else 0), 'C'

def safe_intersection_area(a, b, grid_size=GRID_SIZE):
    """
    Area of the intersection of two geometries, never raising.

    The area is returned in the square units of the geometries' CRS, so both
    inputs should already be in an equal-area CRS.

    Args:
        a, b: geometries to intersect; None for either one yields 0.0.
        grid_size: precision grid passed to `shapely.intersection`
            (defaults to the notebook-level GRID_SIZE).

    Returns:
        float area of the intersection; 0.0 when the intersection is empty or
        when it cannot be computed even after repairing both inputs.
    """
    if a is None or b is None:
        return 0.0

    def _area(x, y):
        # Snapped intersection; empty / missing results count as zero area.
        inter = shapely.intersection(x, y, grid_size=grid_size)
        if inter is None or inter.is_empty:
            return 0.0
        return float(inter.area)

    try:
        return _area(a, b)
    except Exception:
        # First attempt failed: repair both inputs and retry below.
        pass

    # Last resort. safe_make_valid() swallows its own errors, so -- unlike a
    # bare buffer(0) call -- the repair step cannot abort the caller's loop.
    try:
        return _area(safe_make_valid(a), safe_make_valid(b))
    except Exception:
        return 0.0

# 4️⃣ Process one country (build MN labels via Rules A/B/C)

In [None]:
def process_country(country: str):
    """
    Build MN labels for one country with Rules A/B/C.

    Inputs:
      - {country}_rf_preds_filtered80.gpkg  (segments, in SEG_ROOT)
      - {country}_mn_blocks.gpkg            (MN blocks, in BLK_ROOT)

    Outputs (written to OUT_ROOT / country), one pair per value in MAJ_THRS:
      - {country}_segments_mnlabels_k{K_THR}_maj{XX}.gpkg   (XX = 10, 20, 30)
      - {country}_segments_mnlabels_k{K_THR}_maj{XX}.csv

    Per-segment metrics are computed once and reused for every threshold:
      - full_cover / full_cover_highk: best single-block coverage share of the
        segment and that block's high-k flag (Rule A)
      - n_blocks_cent / share_cent: blocks whose centroid lies within the
        segment and the high-k share among them (Rule B)
      - share_cov: summed coverage share of the high-k blocks (Rule C)

    Returns:
        dict with "country" and "status". On success status is "ok" and the
        dict also carries "n_segments" and "summaries" (one label-count
        DataFrame per majority threshold). Otherwise status names the reason
        the country was skipped or failed (skip_no_blocks, skip_no_segments,
        read_error, empty_segments, empty_blocks, missing_k_complexity,
        segments_missing_crs, write_error).
    """
    seg_path = SEG_ROOT / f"{country}_rf_preds_filtered80.gpkg"
    blk_path = BLK_ROOT / f"{country}_mn_blocks.gpkg"

    if not blk_path.exists():
        return {"country": country, "status": "skip_no_blocks"}

    if not seg_path.exists():
        return {"country": country, "status": "skip_no_segments"}

    # --- output folder per country
    out_dir = OUT_ROOT / country
    out_dir.mkdir(parents=True, exist_ok=True)

    # --- load data
    try:
        segments = gpd.read_file(seg_path)
        blocks   = gpd.read_file(blk_path)
    except Exception as e:
        return {"country": country, "status": f"read_error: {e}"}

    if segments.empty:
        return {"country": country, "status": "empty_segments"}
    if blocks.empty:
        return {"country": country, "status": "empty_blocks"}

    # --- ensure required block fields (checked before any expensive geometry work)
    if "k_complexity" not in blocks.columns:
        return {"country": country, "status": "missing_k_complexity"}

    # --- CRS alignment (work in blocks CRS)
    if blocks.crs is None:
        blocks = blocks.set_crs("EPSG:4326")
    if segments.crs is None:
        # to_crs() would raise on CRS-less segments; report it like the other
        # input problems instead of aborting the whole parallel run.
        return {"country": country, "status": "segments_missing_crs"}
    if segments.crs != blocks.crs:
        segments = segments.to_crs(blocks.crs)

    # --- fix geometries (native CRS)
    segments["geometry"] = segments.geometry.apply(safe_make_valid)
    blocks["geometry"]   = blocks.geometry.apply(safe_make_valid)

    # --- derive high-k flag (a missing k_complexity compares False -> 0)
    blocks["highk"] = (blocks["k_complexity"] > K_THR).astype("int8")

    # --- equal-area versions for area calcs & precompute segment area
    segments_ea = segments.to_crs(EA_CRS)
    blocks_ea   = blocks.to_crs(EA_CRS)

    # reprojection can re-introduce invalid geometries, so repair again
    segments_ea["geometry"] = segments_ea.geometry.apply(safe_make_valid)
    blocks_ea["geometry"]   = blocks_ea.geometry.apply(safe_make_valid)

    seg_area = segments_ea.geometry.area.values

    # --- spatial index & centroids (native CRS)
    sindex = blocks.sindex
    blocks_cent = blocks.geometry.centroid

    # --- arrays to store metrics per segment
    # (defaults describe a segment that no block touches)
    n = len(segments)
    full_cover = np.zeros(n, dtype="float64")
    full_cover_highk = np.zeros(n, dtype="int8")
    n_blocks_intersect = np.zeros(n, dtype="int32")

    n_blocks_cent = np.zeros(n, dtype="int32")
    share_cent = np.full(n, np.nan, dtype="float64")

    share_cov = np.full(n, np.nan, dtype="float64")

    # --- iterate segments (sequential within a country)
    for i in range(n):
        seg = segments.geometry.iat[i]
        area_i = float(seg_area[i])

        if seg is None or seg.is_empty or area_i <= 0:
            continue

        # candidate blocks by bbox; then precise intersects (native CRS)
        minx, miny, maxx, maxy = seg.bounds
        cand_idx = list(sindex.intersection((minx, miny, maxx, maxy)))
        if not cand_idx:
            continue

        # same positional rows in both frames, so they stay aligned
        cand     = blocks.iloc[cand_idx]
        cand_ea  = blocks_ea.iloc[cand_idx]

        inter_mask = cand.geometry.intersects(seg)
        if not inter_mask.any():
            continue

        cand     = cand.loc[inter_mask].copy()
        cand_ea  = cand_ea.loc[inter_mask].copy()
        n_blocks_intersect[i] = len(cand)

        # ---------- Rule A/C metrics: coverage within segment in EA CRS ----------
        seg_ea      = segments_ea.geometry.iat[i]
        prep_seg_ea = prep(seg_ea)

        overlaps = []
        for geom in cand_ea.geometry.values:
            # quick reject via prepared predicate
            if not prep_seg_ea.intersects(geom):
                overlaps.append(0.0)
                continue
            a = safe_intersection_area(geom, seg_ea, grid_size=GRID_SIZE)
            overlaps.append(a)

        overlaps = np.array(overlaps, dtype="float64")
        overlaps[overlaps < EPSILON_M2] = 0.0  # drop sliver overlaps

        # per-block share of the segment's area
        cov = overlaps / area_i if area_i > 0 else np.zeros_like(overlaps)

        max_cov = float(cov.max()) if cov.size else 0.0
        full_cover[i] = max_cov
        if max_cov >= COVER_THR_A:
            # `cov` is ordered like `cand`, so argmax indexes the same block
            j = int(cov.argmax())
            full_cover_highk[i] = int(cand["highk"].iloc[j])

        # ---------- Rule B metrics: centroid-majority (native CRS) ----------
        cent = blocks_cent.loc[cand.index]
        cent_mask = cent.within(seg)
        n_cent = int(cent_mask.sum())
        n_blocks_cent[i] = n_cent
        if n_cent > 0:
            assigned = cand.loc[cent_mask]
            share_cent[i] = float(assigned["highk"].sum()) / n_cent

        # ---------- Rule C metric: coverage-share (of high-k) ----------
        if cov.size:
            hk_mask = cand["highk"].values.astype(bool)
            share_cov[i] = float(cov[hk_mask].sum())

    # --- outputs for each majority threshold (reuse metrics) ---
    summaries = []
    for maj_thr in MAJ_THRS:
        labels = np.zeros(n, dtype="int8")
        rules  = np.array([""]*n, dtype=object)

        for i in range(n):
            lab, rule = seg_label_by_rules(
                full_cover=full_cover[i],
                full_cover_highk=full_cover_highk[i],
                n_cent=int(n_blocks_cent[i]),
                share_cent=share_cent[i],
                share_cov=share_cov[i],
                maj_thr=maj_thr
            )
            labels[i] = lab
            rules[i]  = rule

        seg_out = segments.copy()
        seg_out["mn_label"]            = labels
        seg_out["mn_rule_used"]        = rules
        seg_out["mn_full_cover"]       = full_cover
        seg_out["mn_n_blocks_intersect"]   = n_blocks_intersect
        seg_out["mn_n_blocks_centroid"]    = n_blocks_cent
        seg_out["mn_share_highk_centroid"] = share_cent
        seg_out["mn_share_highk_cov"]      = share_cov
        seg_out["mn_k_thr"]                = K_THR
        seg_out["mn_maj_thr"]             = maj_thr

        # write files
        # round() before int(): plain truncation turns e.g. 0.29 * 100
        # (= 28.999999999999996) into 28 and would mislabel the file.
        thr_tag = f"{int(round(maj_thr * 100)):02d}"  # 10, 20, 30
        out_gpkg = out_dir / f"{country}_segments_mnlabels_k{K_THR}_maj{thr_tag}.gpkg"
        out_csv  = out_dir / f"{country}_segments_mnlabels_k{K_THR}_maj{thr_tag}.csv"
        try:
            seg_out.to_file(out_gpkg, layer="segments", driver="GPKG")
            seg_out.drop(columns=["geometry"]).to_csv(out_csv, index=False)
        except Exception as e:
            # stop at the first failed write; remaining thresholds are not attempted
            return {"country": country, "status": f"write_error({thr_tag}): {e}"}

        # quick per-threshold summary (columns: label, n)
        s = (
            pd.Series(labels)
              .value_counts()
              .rename_axis("label")
              .reset_index(name="n")
        )
        summaries.append({"maj_thr": maj_thr, "summary": s})

    return {"country": country, "status": "ok", "n_segments": n, "summaries": summaries}

# 5️⃣ Run all countries in parallel & aggregate counts

In [None]:
# discover countries from MN_Blocks_by_country folder
block_files = sorted(BLK_ROOT.glob("*_mn_blocks.gpkg"))
countries = [parse_country_from_blocks(p) for p in block_files]

# keep only those that have corresponding segments
countries = [
    c for c in countries
    if (SEG_ROOT / f"{c}_rf_preds_filtered80.gpkg").exists()
]

print(f"Will process {len(countries)} countries with K_THR={K_THR}:")
print(", ".join(countries))

# One task per country; each task returns a status dict (see process_country).
results = Parallel(n_jobs=N_JOBS, verbose=VERBOSE)(
    delayed(process_country)(c) for c in countries
)


def _label_counts(res: dict) -> dict:
    """Flatten one result's per-threshold label counts into scalar columns.

    Produces n_label0_maj{XX} / n_label1_maj{XX} for every entry of
    res["summaries"]; results without summaries (skipped or failed countries)
    yield an empty dict.
    """
    flat = {}
    for item in res.get("summaries") or []:
        tag = f"{int(round(item['maj_thr'] * 100)):02d}"
        summary = item["summary"]  # DataFrame with columns "label" and "n"
        counts = {int(lab): int(cnt) for lab, cnt in zip(summary["label"], summary["n"])}
        flat[f"n_label0_maj{tag}"] = counts.get(0, 0)
        flat[f"n_label1_maj{tag}"] = counts.get(1, 0)
    return flat


# Collate a simple report: the original result fields are kept as-is and the
# aggregated label counts are appended as plain numeric columns, so the CSV is
# usable without parsing the nested "summaries" objects.
report = pd.DataFrame([{**res, **_label_counts(res)} for res in results])
report_path = OUT_ROOT / f"mn_label_build_report_k{K_THR}.csv"
report.to_csv(report_path, index=False)
print(f"\n‚úÖ Saved build report to: {report_path}")

# Rich display inside Jupyter; fall back to plain text when IPython is absent.
try:
    from IPython.display import display
    display(report)
except ImportError:
    print(report.to_string(index=False))