# Apply Random Forest Model to City Segments (Global Predictions)

This notebook:

1. Loads the trained Random Forest model (`rf_best_model.joblib`).
2. Iterates over all country folders containing:
   - `{country}_segments.shp`
   - `{country}_segments_vars_with_ratios.csv`
3. Computes `rf_prob` and `rf_label` (using a chosen threshold, e.g. τ = 0.40).
4. Writes one **GeoPackage per country**:
   - `{country}_rf_preds.gpkg` with geometry + RF outputs.

---

## Data & model locations

**Model**

- The best RF model is not stored in this repository (file is too large).
- There is a text file at:

`../01_training/rf_outputs/bestmodel_joblib.txt`

which contains the link to a Zenodo archive where  
`rf_best_model.joblib` can be downloaded.

**Inputs (the same as described in the `1_preprocessing` step)**

1. **City Segments v1 raw data** (shapefiles)  
   Must be downloaded from Harvard Dataverse:  
   https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/XLRSF0  
   Folder structure expected (example):

   ```
   data/raw/CitySegments/
   ├── argentina/
   │   ├── argentina_segments.shp
   │   └── argentina_segments_vars_with_ratios.csv
   ├── brazil/
   │   ├── brazil_segments.shp
   │   └── brazil_segments_vars_with_ratios.csv
   └── ...
   ```


2. **Preprocessed CSVs** (`*_segments_vars_with_ratios.csv`)  
   These are produced in the preprocessing step (`1_preprocessing/01_preprocess_city_segments.ipynb`) and are stored alongside the shapefiles.

**Outputs**

- GPKGs are written to:

`2_modelling/02_application/predictions/`

Each file:

`{country}_rf_preds.gpkg`

Due to size, the full set of prediction GPKGs (for 107 countries)  
is not stored in this repository. Instead, a text file in the `predictions/` folder
contains a Zenodo link where the complete zip archive is hosted.


# Imports and configuration

In [None]:
import os
from pathlib import Path

import joblib
import pandas as pd
import geopandas as gpd
from joblib import Parallel, delayed


In [None]:
# ------------------------------------------------
# Notebook settings (every path is relative, so the repo stays portable)
# ------------------------------------------------

# Trained RF model; the file itself is fetched from Zenodo
# (the link is in the TXT file inside the rf_outputs folder).
MODEL_PATH = Path("../01_training/rf_outputs/rf_best_model.joblib")

# Region mapping JSON written during RF training.
REGION_MAP_PATH = Path("../01_training/rf_outputs/region_mapping.json")

# Root folder with one subfolder per country, each expected to hold:
#   {country}_segments.shp
#   {country}_segments_vars_with_ratios.csv
PARENT_FOLDER = Path("../../data/raw/CitySegments")

# Destination of the per-country prediction GPKGs (created if missing).
OUTPUT_FOLDER = Path("predictions")
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Probability cut-off applied to rf_prob to obtain rf_label.
THRESHOLD = 0.40

# Predictor columns, in the EXACT order used in training.
predictor_cols = [
    "i5_par_area",
    "i1_pop_area",
    "i6_paru_area",
    "i8_paru_par",
    "B_AVG_SEG",
    "i9_roads_par",
    "PARU_A_SEG",
    "B_AREA_SEG",
    "B_CV_SEG",
    "REGION_CODE",
]

# Number of parallel workers (-1 = use all cores).
N_JOBS = -1

# Echo the resolved locations so a wrong working directory is spotted early.
for _name, _location in (
    ("MODEL_PATH:", MODEL_PATH),
    ("PARENT_FOLDER:", PARENT_FOLDER),
    ("OUTPUT_FOLDER:", OUTPUT_FOLDER),
):
    print(_name, _location.resolve())


In [None]:
# ------------------
# Load trained RF model
# ------------------
# Fail early with download instructions: the model file is not shipped with
# the repository.
if not MODEL_PATH.exists():
    raise FileNotFoundError(
        f"Model file not found at {MODEL_PATH}.\n"
        "Download rf_best_model.joblib from the Zenodo link in "
        "../01_training/rf_outputs/bestmodel_joblib.txt"
    )

# SECURITY NOTE: joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load a model file obtained from the project's own
# Zenodo archive (or another source you trust).
rf = joblib.load(MODEL_PATH)
print(f"‚úÖ Loaded model: {MODEL_PATH.name}")

# Not every estimator object exposes n_features_in_, so look it up explicitly
# instead of hiding every possible error behind a blanket try/except.
_n_features_in = getattr(rf, "n_features_in_", None)
if _n_features_in is not None:
    print("Model expects n_features_in_ =", _n_features_in)


In [None]:
# ------------------
# Optional region mapping
# ------------------
import json

# region_map stays None when the mapping file produced during training is
# absent; otherwise it holds the parsed JSON with an "Unknown" entry
# guaranteed to exist.
if not REGION_MAP_PATH.exists():
    region_map = None
    print("‚ÑπÔ∏è No region_mapping.json found. If REGION_CODE is missing in CSVs, default 0 will be used.")
else:
    region_map = json.loads(REGION_MAP_PATH.read_text(encoding="utf-8"))
    region_map.setdefault("Unknown", 0)
    print(f"‚ÑπÔ∏è Loaded region mapping with {len(region_map)} entries.")


In [None]:
def map_region_code_from_text(s):
    """Translate a region name into its numeric code.

    Uses the module-level ``region_map``. Returns 0 when no map is loaded.
    Missing values (as detected by ``pd.isna``) and names absent from the map
    both resolve to the map's ``"Unknown"`` entry, or 0 if that entry does
    not exist. The lookup key is ``str(s)``.
    """
    if region_map is None:
        return 0

    fallback = region_map.get("Unknown", 0)
    if pd.isna(s):
        return fallback
    return region_map.get(str(s), fallback)


# Per country worker

In [None]:
def process_country(country_path: Path) -> None:
    """Score one country's segments with the RF model and write a GeoPackage.

    Reads ``{country}_segments_vars_with_ratios.csv`` and
    ``{country}_segments.shp`` from ``country_path`` (the folder name is used
    as the country name). Rows with a missing value in any predictor are
    dropped, the remaining rows are scored with the module-level model ``rf``
    (``rf_prob``) and thresholded with ``THRESHOLD`` (``rf_label``). The
    result is left-joined onto the shapefile geometries and written to
    ``OUTPUT_FOLDER / "{country}_rf_preds.gpkg"``.

    CSV rows and shapes are matched on the ``(ID_HDC_G0, ID_SEG)`` pair, each
    compared as a string. The output still carries the ``JOIN_KEY`` column
    (both IDs concatenated without a separator), but the join itself no
    longer relies on it because that concatenation is ambiguous.

    Args:
        country_path: Folder holding the country's CSV and shapefile.

    Returns:
        None. Missing inputs, missing columns and unexpected exceptions are
        reported with ``print`` and the country is skipped; nothing is raised.
    """
    country_name = country_path.name
    print(f"\nüü° Processing: {country_name}")

    try:
        # Input paths
        csv_path = country_path / f"{country_name}_segments_vars_with_ratios.csv"
        shp_path = country_path / f"{country_name}_segments.shp"

        if not csv_path.exists():
            print(f"‚ùå Missing CSV: {csv_path}")
            return
        if not shp_path.exists():
            print(f"‚ùå Missing SHP: {shp_path}")
            return

        # Load CSV
        df = pd.read_csv(csv_path)
        n0 = len(df)
        print(f"üìÑ Loaded {csv_path.name} ‚Üí {n0} rows")

        # Ensure REGION_CODE exists if it's part of predictors
        if "REGION_CODE" in predictor_cols and "REGION_CODE" not in df.columns:
            if "REG1_GHSL" in df.columns:
                df["REGION_CODE"] = df["REG1_GHSL"].map(map_region_code_from_text)
                print("üîß Built REGION_CODE from REG1_GHSL using region map (or default 0).")
            else:
                df["REGION_CODE"] = 0
                print("üîß REGION_CODE not found and REG1_GHSL missing; using 0 for all rows.")

        # Check predictors exist
        missing_pred = [c for c in predictor_cols if c not in df.columns]
        if missing_pred:
            print(f"‚ùå Missing predictors in CSV: {missing_pred}")
            return

        # Drop NA in predictors. The explicit copy makes the later column
        # assignments operate on an independent frame (no chained-assignment
        # warning).
        df = df.dropna(subset=predictor_cols).copy()
        n_after = len(df)
        print(f"üßπ Dropped NA in predictors ‚Üí {n_after} rows (removed {n0 - n_after})")
        if df.empty:
            print(f"‚ö†Ô∏è Skipped {country_name}: all rows dropped after NA-removal in predictors.")
            return

        # Verify the join columns before spending time on prediction.
        if "ID_HDC_G0" not in df.columns or "ID_SEG" not in df.columns:
            print("‚ùå CSV missing join columns ID_HDC_G0/ID_SEG")
            return

        # Predict
        X = df[predictor_cols].to_numpy()
        n_expected = getattr(rf, "n_features_in_", None)
        if n_expected is not None and n_expected != X.shape[1]:
            print(f"‚ùå Feature count mismatch: model expects {n_expected}, got {X.shape[1]}")
            return

        # Column 1 of predict_proba is used as the probability of interest.
        df["rf_prob"] = rf.predict_proba(X)[:, 1]
        df["rf_label"] = (df["rf_prob"] >= THRESHOLD).astype(int)

        # Load SHP (geometry)
        gdf = gpd.read_file(shp_path)
        print(f"üó∫Ô∏è  Loaded {shp_path.name} ‚Üí {len(gdf)} shapes")

        if "ID_HDC_G0" not in gdf.columns or "ID_SEG" not in gdf.columns:
            print("‚ùå SHP missing join columns ID_HDC_G0/ID_SEG")
            return
        if gdf.empty:
            # Nothing to join onto; also avoids dividing by zero in the
            # match-rate diagnostic below.
            print(f"‚ö†Ô∏è Skipped {country_name}: shapefile contains no features.")
            return

        # If the shapefile already carries prediction columns, drop them so
        # the merge yields exactly one rf_prob / rf_label (otherwise pandas
        # would suffix them as *_x / *_y and the lookups below would fail).
        stale_cols = [c for c in ("rf_prob", "rf_label") if c in gdf.columns]
        if stale_cols:
            gdf = gdf.drop(columns=stale_cols)

        # Kept in the output for backward compatibility. It is NOT used for
        # the join: without a separator, (1, 12) and (11, 2) both give "112".
        gdf["JOIN_KEY"] = gdf["ID_HDC_G0"].astype(str) + gdf["ID_SEG"].astype(str)

        # CSV columns to add (never overwrite columns the shapes already have).
        csv_cols_to_add = [c for c in df.columns if c not in gdf.columns]

        # Join on the two IDs as separate string keys, held in temporary
        # columns so the original ID columns keep their dtypes in the output.
        # NOTE(review): if one side stores an ID as float and the other as
        # int, the string forms differ ("12.0" vs "12") and nothing matches;
        # the match rate printed below would reveal that.
        key_cols = ["_key_hdc", "_key_seg"]
        left = gdf.assign(
            _key_hdc=gdf["ID_HDC_G0"].astype(str),
            _key_seg=gdf["ID_SEG"].astype(str),
        )
        right = df[csv_cols_to_add].assign(
            _key_hdc=df["ID_HDC_G0"].astype(str),
            _key_seg=df["ID_SEG"].astype(str),
        )

        # Merge (left join on shapes)
        merged = left.merge(right, on=key_cols, how="left").drop(columns=key_cols)

        # Diagnostics
        matched = merged["rf_prob"].notna().sum()
        print(f"üîó Match rate: {matched}/{len(merged)} shapes ({matched/len(merged):.1%})")

        # Save to GeoPackage
        out_path = OUTPUT_FOLDER / f"{country_name}_rf_preds.gpkg"
        merged.to_file(out_path, driver="GPKG")
        print(f"‚úÖ Saved: {out_path}")

    except Exception as e:
        # Deliberate per-country boundary: one failing country must not abort
        # the remaining countries of the batch.
        print(f"‚ùå Error processing {country_name}: {e}")


# Run over all countries in parallel

In [None]:
# Every sub-directory of PARENT_FOLDER is treated as one country.
countries = []
for entry in PARENT_FOLDER.iterdir():
    if entry.is_dir():
        countries.append(entry)
print(f"\nFound {len(countries)} country folders under: {PARENT_FOLDER}")

# One task per country folder, dispatched to loky worker processes.
run_country = delayed(process_country)
tasks = (run_country(folder) for folder in countries)
Parallel(n_jobs=N_JOBS, backend="loky")(tasks)

print("\nüéâ Done.")
