# Creating Labeled Training Data for Random Forest (8 IDEABench Cities)

This notebook merges:
1. **City Segments processed features**  
   → created in Notebook 01 (`*_segments_vars_with_ratios.csv`)
2. **IDEABench-derived slum labels**  
   → stored in 8 per-city GeoPackages (GPKG)

The output is a set of clean CSV files containing:
- join keys (`ID_HDC_G0`, `ID_SEG`)
- built-environment predictors (i1–i10, base variables)
- `slum_fraction` from IDEABench
- `slum_label1`, a binary label using a threshold of 0.30 (1 where `slum_fraction` ≥ 0.30, otherwise 0)

These labeled CSVs are used in the Random Forest training stage.

---

## ⚠️ **Data Availability Notes**

### **City Segments v1 dataset (raw input)**
Not included due to size; download from Harvard Dataverse:

https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/XLRSF0


### **IDEABench slum reference data (GPKG files)**
Not included in this repository due to access restrictions.  
You must obtain the data from the authors:

https://phys-techsciences.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/PT/X4NJII

We cannot share the GPKGs in this repository.  
Only **the processed labeled CSVs** generated by this notebook will be included in the repo.

---

## 📁 Required Directory Structure

Place IDEABench-labelled GPKG files here:

../data/private/ideabench_labeled_segments/

    Buenos_Aires_CSV_with_slum_labels.gpkg

    Jakarta_CSV_with_slum_labels.gpkg

    Lagos_CSV_with_slum_labels.gpkg

    ...




## Output labeled CSVs will be saved into:

../LabelledData_For_RF/

# Imports and configuration

In [None]:
from pathlib import Path
import geopandas as gpd
import pandas as pd

# ---------------------------------------
# Directory Configuration (relative)
# ---------------------------------------
# All paths below are relative, so they resolve against the current
# working directory of the running kernel.

# IDEABench-labelled city gpkg directory (not included in repo)
GPKG_DIR = Path("../data/private/ideabench_labeled_segments")

# Processed City Segments (from Notebook 01)
COUNTRY_PARENT = Path("../data/raw/CitySegments")

# Output folder (these CSVs WILL be included in the repo)
OUT_DIR = Path("../LabelledData_For_RF")
# Created at configuration time; exist_ok=True makes re-running the cell safe.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Label threshold
THRESHOLD = 0.30  # slum_fraction ≥ 0.30 → slum_label1 = 1

# Join keys
KEY_COLS = ["ID_HDC_G0", "ID_SEG"]

# Mapping from GPKG filename → country folder name
# (the country folder name is also the prefix of that country's
# `*_segments_vars_with_ratios.csv` file)
CITY_TO_COUNTRY = {
    "Buenos_Aires_CSV_with_slum_labels.gpkg": "argentina",
    "Jakarta_CSV_with_slum_labels.gpkg":      "indonesia",
    "Lagos_CSV_with_slum_labels.gpkg":        "nigeria",
    "Medellin_CSV_with_slum_labels.gpkg":     "colombia",
    "Mexico_City_CSV_with_slum_labels.gpkg":  "mexico",
    "Mumbai_CSV_with_slum_labels.gpkg":       "india",
    "Nairobi_CSV_with_slum_labels.gpkg":      "kenya",
    "Salvador_CSV_with_slum_labels.gpkg":     "brazil",
}

# Bare expression: shown as the cell output so the configured paths can be checked.
GPKG_DIR, COUNTRY_PARENT, OUT_DIR


# Helper functions

In [None]:
def read_gpkg_labels(gpkg_path: Path) -> pd.DataFrame:
    """
    Read one city's GPKG and return its join keys plus slum attributes.

    Geometry is skipped at read time (``ignore_geometry=True``), so the
    result is a plain DataFrame that writes cleanly to CSV. This also works
    for layers that carry no geometry, where probing ``.geometry`` on the
    loaded table would raise instead of returning ``None``.

    Args:
        gpkg_path: Path to the city's GeoPackage file.

    Returns:
        A new DataFrame with the ``KEY_COLS`` columns (cast to ``str``)
        followed by whichever of ``slum_fraction`` / ``slum_label`` are
        present in the file.

    Raises:
        ValueError: If a column listed in ``KEY_COLS`` is missing.
    """
    # Only attribute columns are needed; not parsing geometry is also faster.
    table = pd.DataFrame(gpd.read_file(gpkg_path, ignore_geometry=True))

    # Ensure join keys exist
    missing = [c for c in KEY_COLS if c not in table.columns]
    if missing:
        raise ValueError(f"{gpkg_path.name}: missing join key {missing[0]}")

    # Keep only relevant columns (the label columns are optional)
    keep_cols = [c for c in ["slum_fraction", "slum_label"] if c in table.columns]
    df = table[KEY_COLS + keep_cols].copy()

    # Standardize key types.
    # NOTE(review): the later join compares these strings with the CSV keys,
    # so both sides must render identically (e.g. "12" vs "12.0") — confirm
    # the key columns have the same numeric type in both sources.
    df[KEY_COLS] = df[KEY_COLS].astype(str)

    print(f"✔️ Loaded {gpkg_path.name} — columns kept: {df.columns.tolist()}")
    return df


In [None]:
def read_country_subset(country_dir: Path, join_keys: pd.DataFrame) -> pd.DataFrame:
    """
    Read the country's ``*_with_ratios.csv`` file and keep only the rows
    whose join keys appear in ``join_keys``.

    Args:
        country_dir: Country folder; its name is also the CSV filename prefix
            (``<country>_segments_vars_with_ratios.csv``).
        join_keys: Table containing at least the ``KEY_COLS`` columns, with
            values already cast to ``str``. Extra columns are ignored.

    Returns:
        The matching CSV rows with all CSV columns; key columns are ``str``.
        Each CSV row appears at most once, even if ``join_keys`` repeats a
        key pair.

    Raises:
        FileNotFoundError: If the expected CSV does not exist.
        ValueError: If the CSV lacks a column listed in ``KEY_COLS``.
    """
    country = country_dir.name
    csv_path = country_dir / f"{country}_segments_vars_with_ratios.csv"

    if not csv_path.exists():
        raise FileNotFoundError(f"Missing CSV: {csv_path}")

    features = pd.read_csv(csv_path)

    missing = [c for c in KEY_COLS if c not in features.columns]
    if missing:
        raise ValueError(f"{csv_path.name}: missing join key {missing[0]}")
    features[KEY_COLS] = features[KEY_COLS].astype(str)

    # Deduplicate the keys first: an inner merge emits one row per matching
    # pair, so a repeated key in join_keys would duplicate the feature row
    # and this "subset" could end up larger than the CSV itself.
    wanted = join_keys[KEY_COLS].drop_duplicates()

    # Subset by join keys
    return features.merge(wanted, on=KEY_COLS, how="inner")


In [None]:
def merge_and_label(df_csv: pd.DataFrame, df_gpkg: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """
    Inner-join the feature table with the label table on ``KEY_COLS`` and
    add a thresholded ``slum_label1`` column.

    ``slum_fraction`` is coerced to numeric (unparseable values become NaN).
    ``slum_label1`` is a nullable ``Int64`` column: 1 where
    ``slum_fraction >= threshold``, 0 where it is below, and ``<NA>`` where
    the fraction is missing. If the joined table has no ``slum_fraction``
    column, ``slum_label1`` is ``<NA>`` for every row.

    Returns a new plain DataFrame; any column named ``geometry`` is dropped.
    """
    # errors="ignore" makes the drop a no-op when no geometry column exists.
    joined = pd.DataFrame(
        df_csv.merge(df_gpkg, how="inner", on=KEY_COLS).drop(
            columns="geometry", errors="ignore"
        )
    )

    # No fraction to threshold: emit an all-missing label column.
    if "slum_fraction" not in joined.columns:
        joined["slum_label1"] = pd.Series(pd.NA, index=joined.index, dtype="Int64")
        return joined

    fractions = pd.to_numeric(joined["slum_fraction"], errors="coerce")
    joined["slum_fraction"] = fractions

    # Compare every row, then blank out rows whose fraction is missing:
    # "unknown" must stay distinct from "below threshold".
    labels = fractions.ge(threshold).astype("Int64")
    labels[fractions.isna()] = pd.NA
    joined["slum_label1"] = labels

    return joined


# Main workflow

In [None]:
def main():
    """
    Build one labeled CSV per city in ``CITY_TO_COUNTRY`` and print a summary.

    For each city: load the GPKG labels, load the matching City Segments
    rows, merge and threshold them, drop geometry-like columns, and write
    ``<city>_labeled_thr<NNN>.csv`` into ``OUT_DIR``, where ``NNN`` is
    ``THRESHOLD`` as a zero-padded percentage.

    Processing is best-effort per city: a missing input or any exception is
    recorded in the summary and the loop moves on to the next city. Returns
    ``None``; results are the files on disk plus the printed report.
    """
    summaries = []
    n_failed = 0

    # Derive the filename tag from THRESHOLD (0.30 -> "thr030") so the name
    # cannot disagree with the threshold that was actually applied.
    thr_tag = f"thr{round(THRESHOLD * 100):03d}"

    for city_file, country_name in CITY_TO_COUNTRY.items():
        try:
            city_path = GPKG_DIR / city_file
            if not city_path.exists():
                # Recorded in the summary so skipped cities are not lost.
                summaries.append(f"❌ Missing GPKG: {city_file}")
                n_failed += 1
                continue

            country_dir = COUNTRY_PARENT / country_name
            if not country_dir.exists():
                summaries.append(f"❌ Missing country folder: {country_dir}")
                n_failed += 1
                continue

            # Step 1: load slum labels from GPKG
            df_gpkg = read_gpkg_labels(city_path)

            # Step 2: load only the matching City Segments rows
            df_csv = read_country_subset(country_dir, df_gpkg)

            # Step 3: merge + threshold
            df_merged = merge_and_label(df_csv, df_gpkg, THRESHOLD)

            # Step 4: remove geometry-like columns if they sneak in
            # (substring match on "geom", case-insensitive)
            geom_like = [c for c in df_merged.columns if "geom" in c.lower()]
            if geom_like:
                print(f"⚠️ {city_file} — geometry-like columns removed: {geom_like}")
                df_merged = df_merged.drop(columns=geom_like)

            # Step 5: save CSV output (included in GitHub repo)
            city_stem = city_path.stem.replace("_CSV_with_slum_labels", "")
            out_name = f"{city_stem.lower()}_labeled_{thr_tag}.csv"
            out_path = OUT_DIR / out_name
            df_merged.to_csv(out_path, index=False)

            # Step 6: summarize (plain ints so the ",d" format spec always applies)
            n = len(df_merged)
            if "slum_fraction" in df_merged.columns:
                n_nan_frac = int(df_merged["slum_fraction"].isna().sum())
            else:
                # No fraction column at all: every row counts as missing.
                n_nan_frac = n
            n_ones = int((df_merged["slum_label1"] == 1).sum())
            n_zeros = int((df_merged["slum_label1"] == 0).sum())
            n_na = int(df_merged["slum_label1"].isna().sum())

            summaries.append(
                f"✔️ {city_stem:<12} | {country_name:<10} | rows: {n:>7,d} | "
                f"slum_fraction NaN: {n_nan_frac:>6,d} | "
                f"label1 → 1:{n_ones:>6,d}, 0:{n_zeros:>6,d}, NA:{n_na:>6,d} | "
                f"saved → {out_name}"
            )

        except Exception as e:
            # Per-city boundary: report the failure and keep going.
            n_failed += 1
            summaries.append(f"❌ {city_file:<35} | {country_name:<10} | ERROR: {e}")

    print("\n".join(summaries))
    if n_failed:
        print(
            f"\n⚠️ Done — {n_failed} of {len(CITY_TO_COUNTRY)} cities "
            "failed or were skipped (see ❌ lines above)."
        )
    else:
        print("\n🎉 Done — all labeled CSVs created.")


# Run the workflow

In [None]:
# Process every city in CITY_TO_COUNTRY and print the per-city summary.
main()

## Outputs generated

../LabelledData_For_RF/

    buenos_aires_labeled_thr030.csv 

    jakarta_labeled_thr030.csv

    lagos_labeled_thr030.csv

    ...

