# City Segments — Preprocessing Script
This notebook processes raw City Segments v1 data and computes all required
built-environment indicators (i1–i10), along with contextual filling rules.

**Input:**  
Raw City Segments dataset (download separately)  
→ https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/XLRSF0

**Output:**  
For each country folder:
`{country}_segments_vars_with_ratios.csv`

Raw data is **not** included in the repository due to size.  
Place the downloaded folders inside: `../data/raw/CitySegments/`



The structure should look like:

```
data/
└── raw/
    └── CitySegments/
        ├── algeria/
        │   ├── algeria_segments_vars.csv
        ├── india/
        │   ├── india_segments_vars.csv
        └── ...
```


This notebook writes its output files inside each country folder, next to the raw input CSV.


# Configuration (relative paths)

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

# Worker count passed to joblib: -1 requests all available cores.
N_JOBS = -1

# Root folder of the raw City Segments dataset (relative path).
PARENT = Path("..") / "data" / "raw" / "CitySegments"

print("Using dataset directory:", PARENT.resolve())


# Helper functions

In [None]:
def safe_div(num, den):
    """
    Safe division for computing ratio indicators.

    Divides ``num`` by ``den`` element-wise, but only where the denominator
    is strictly positive. Everywhere else (zero, negative or NaN
    denominator) the result is NaN.

    Parameters
    ----------
    num, den : scalar or array-like
        Numerator and denominator (e.g. pandas Series, NumPy arrays, lists
        or plain numbers). They are converted to float arrays and broadcast
        against each other; values are paired by position, not by index.

    Returns
    -------
    numpy.ndarray
        Float array of the broadcast shape (0-d for scalar inputs).
    """
    num_arr, den_arr = np.broadcast_arrays(
        np.asarray(num, dtype=float), np.asarray(den, dtype=float)
    )

    # Start from NaN and divide only where the denominator is valid, so no
    # division by zero is ever evaluated (no ZeroDivisionError for scalars,
    # no divide-by-zero RuntimeWarning for arrays).
    result = np.full(num_arr.shape, np.nan)
    np.divide(num_arr, den_arr, out=result, where=den_arr > 0)
    return result


# Country processing function

In [None]:
def process_country(country_dir: Path):
    """
    Process a single country's City Segments data.

    Reads ``{country}_segments_vars.csv`` from ``country_dir``, computes the
    ratio indicators, applies the contextual filling rules and writes the
    result back into the same folder.

    Parameters
    ----------
    country_dir : Path
        Country folder; its name is used as the ``{country}`` file prefix.

    Returns
    -------
    str
        One-line status message: a success summary, a missing-file notice,
        or the error text. Exceptions are reported through this message
        instead of being raised, so a failing country does not abort the
        processing of the others.

    Saves:
        {country}_segments_vars_with_ratios.csv
    """
    country = country_dir.name
    csv_path = country_dir / f"{country}_segments_vars.csv"

    if not csv_path.exists():
        return f"❌ Missing CSV for {country}"

    try:
        df = pd.read_csv(csv_path)

        # Ensure numeric fields exist (missing ones become all-NaN columns)
        base_cols = ["POP_SEG", "AREAHA_SEG", "ROAD_SEG", "PAR_N_SEG", "PARU_N_SEG"]
        for col in base_cols:
            if col not in df.columns:
                df[col] = np.nan

        df[base_cols] = df[base_cols].apply(pd.to_numeric, errors="coerce")

        # ---------------------------------------------
        # Compute ratio indicators (i1–i10)
        # ---------------------------------------------
        df["i1_pop_area"]    = safe_div(df["POP_SEG"], df["AREAHA_SEG"])
        df["i2_pop_par"]     = safe_div(df["POP_SEG"], df["PAR_N_SEG"])
        df["i3_pop_paru"]    = safe_div(df["POP_SEG"], df["PARU_N_SEG"])
        df["i4_pop_roads"]   = safe_div(df["POP_SEG"], df["ROAD_SEG"])
        df["i6_paru_area"]   = safe_div(df["AREAHA_SEG"] * 10_000, df["PARU_N_SEG"])
        df["i7_roads_area"]  = safe_div(df["ROAD_SEG"], df["AREAHA_SEG"])
        df["i8_paru_par"]    = safe_div(df["PARU_N_SEG"], df["PAR_N_SEG"])
        df["i10_roads_paru"] = safe_div(df["ROAD_SEG"], df["PARU_N_SEG"])
        # i5_par_area and i9_roads_par are not computed here; they are
        # expected to be present in the source CSV already.

        # ---------------------------------------------
        # Contextual filling rules
        # ---------------------------------------------

        # A. Untouched-parcel structural zeros
        no_paru = df["PARU_N_SEG"] == 0
        if "PARU_A_SEG" in df.columns:
            df.loc[no_paru, "PARU_A_SEG"] = 0
        df.loc[no_paru, ["i3_pop_paru", "i6_paru_area", "i10_roads_paru"]] = 0

        # B. Parcel/building variation zeros
        # Only existing columns are filled: assigning through .loc to a
        # missing column would create it with NaN in every unmatched row.
        few_parcels = df["PAR_N_SEG"] <= 1
        if "PAR_CV_SEG" in df.columns:
            df.loc[few_parcels, "PAR_CV_SEG"] = 0
        if "B_CV_SEG" in df.columns:
            no_variation = few_parcels
            # B_AREA_SEG is optional; without it only the parcel rule applies.
            if "B_AREA_SEG" in df.columns:
                no_variation = no_variation | (df["B_AREA_SEG"] == 0)
            df.loc[no_variation, "B_CV_SEG"] = 0

        # C. Other ratio zeros
        no_area = df["AREAHA_SEG"] <= 0
        no_parcels = df["PAR_N_SEG"] == 0
        no_roads = df["ROAD_SEG"] == 0

        df.loc[no_area, "i1_pop_area"] = 0
        df.loc[no_parcels, "i2_pop_par"] = 0
        df.loc[no_roads, "i4_pop_roads"] = 0

        if "i5_par_area" in df.columns:
            df.loc[no_parcels | no_area, "i5_par_area"] = 0

        df.loc[no_area | no_roads, "i7_roads_area"] = 0

        df.loc[no_parcels, "i8_paru_par"] = 0
        if "i9_roads_par" in df.columns:
            df.loc[no_parcels, "i9_roads_par"] = 0

        # D. Diagnostics
        if {"B_CV_SEG", "PAR_N_SEG", "B_AREA_SEG"}.issubset(df.columns):
            unresolved = df[(df["PAR_N_SEG"] > 1) &
                            (df["B_AREA_SEG"] > 0) &
                            df["B_CV_SEG"].isna()]
            if len(unresolved):
                print(f"⚠️ {country}: {len(unresolved)} rows → ambiguous building CV")

        # ---------------------------------------------
        # Save output
        # ---------------------------------------------
        out_path = country_dir / f"{country}_segments_vars_with_ratios.csv"
        df.to_csv(out_path, index=False)

        # Summarise missing values over the indicator columns only
        # (names starting with "i<number>_"); a substring match on "i" would
        # also pick up any unrelated column whose name contains that letter.
        total_rows = len(df)
        indicator_cols = df.filter(regex=r"^i\d+_")
        nan_rows = int(indicator_cols.isna().any(axis=1).sum())
        pct_nan = (nan_rows / total_rows) * 100 if total_rows else 0

        return f"✅ {country} | Rows: {total_rows:,} | NaN rows: {nan_rows:,} ({pct_nan:.1f}%)"

    except Exception as e:
        # Top-level boundary for one country: report instead of raising.
        # The exception type is included because some messages (e.g. a
        # KeyError's) are just a bare column name.
        return f"❌ Error in {country}: {type(e).__name__}: {e}"


# Run in parallel for all countries

In [None]:
# Fail early with a clear message if the raw data has not been placed yet.
if not PARENT.is_dir():
    raise FileNotFoundError(
        f"Raw City Segments directory not found: {PARENT.resolve()}"
    )

# Sort the folders so the processing order and the printed summary are
# reproducible; Path.iterdir() yields entries in arbitrary order.
countries = sorted(p for p in PARENT.iterdir() if p.is_dir())
print(f"Found {len(countries)} country folders.")

# One task per country folder; each task returns a one-line status string.
results = Parallel(n_jobs=N_JOBS, backend="loky", verbose=5)(
    delayed(process_country)(c) for c in countries
)

for r in results:
    print(r)

print("\n🎉 All countries processed.")
