In [None]:
# ===============================================================
# Figure 3 — Country lollipop plots + city-size inset bar charts
#
# Inputs (relative to repo root):
#   - 2_modelling/02_application/summary_statistics/
#       └─ city_deprivation_with_sizeclass_80pct.csv
#   - 4_Figures_Tables/
#       └─ country_ISOcodes_new.csv
#
# Outputs:
#   - 4_Figures_Tables/Figures/Figure3_Lollipop_Regional_Deprivation.png
#   - 4_Figures_Tables/Figures/Figure3_Africa_CitySize.png
#   - 4_Figures_Tables/Figures/Figure3_Asia_CitySize.png
#   - 4_Figures_Tables/Figures/Figure3_LAC_CitySize.png
#   - 4_Figures_Tables/Figures/Figure3_Lollipops_withcitysize.jpg
#       (created manually and uploaded; not written by this notebook)
# ===============================================================

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# ---------------------------------------------------------------
# 1️⃣ Paths
# ---------------------------------------------------------------
def find_repo_root(start, markers=("2_modelling", "4_Figures_Tables")):
    """Locate the repository root by walking upwards from ``start``.

    Parameters
    ----------
    start : str or Path
        Directory where the search begins (normally the current working directory).
    markers : tuple of str
        Directory names that must all exist directly inside the root. The
        defaults are the two top-level folders this notebook reads from and
        writes to.

    Returns
    -------
    Path
        The closest directory (``start`` itself or one of its ancestors) that
        contains every marker. If none qualifies, the resolved ``start`` is
        returned, i.e. the working directory is treated as the root.
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        if all((candidate / marker).is_dir() for marker in markers):
            return candidate
    return start


# The working directory is not guaranteed to be the repo root (the kernel may be
# started from the notebook's own folder), so search upwards for it instead of
# trusting Path.cwd() directly.
REPO_ROOT = find_repo_root(Path.cwd())
CITY_PATH = REPO_ROOT / "2_modelling" / "02_application" / "summary_statistics" / "city_deprivation_with_sizeclass_80pct.csv"
ISO_PATH  = REPO_ROOT / "4_Figures_Tables" / "country_ISOcodes_new.csv"
FIG_DIR   = REPO_ROOT / "4_Figures_Tables" / "Figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder on first run

print("City file:", CITY_PATH)
print("ISO file :", ISO_PATH)
print("Figures  :", FIG_DIR)

In [None]:
# ---------------------------------------------------------------
# 2️⃣ Load data
# ---------------------------------------------------------------
city_df = pd.read_csv(CITY_PATH)
iso_df  = pd.read_csv(ISO_PATH)

# Aggregate city rows to one row per (Region, Country).
country_totals = (
    city_df.groupby(["Region", "Country"])
    .agg(
        TotalPop=("TotalPop", "sum"),
        DeprivedPop=("DeprivedPop", "sum"),
    )
    .reset_index()
)
country_totals["PctDeprived"] = (
    country_totals["DeprivedPop"] / country_totals["TotalPop"]
) * 100

# A country listed twice in the ISO table would duplicate its row in the merge
# below (and therefore its lollipop), so keep only the first entry and say so.
duplicated_iso = iso_df["Country"].duplicated(keep="first")
if duplicated_iso.any():
    repeated = sorted(iso_df.loc[duplicated_iso, "Country"].astype(str).unique())
    print("WARNING: duplicate ISO rows ignored for:", ", ".join(repeated))

# Attach ISO codes; the left join keeps every aggregated country row.
country_df = country_totals.merge(iso_df[~duplicated_iso], on="Country", how="left")

# Countries whose name has no match in the ISO table end up with a NaN code;
# report them here instead of letting them surface later as unlabeled rows.
missing_iso = sorted(
    country_df.loc[country_df["ISO"].isna(), "Country"].astype(str).unique()
)
if missing_iso:
    print("WARNING: no ISO code matched for:", ", ".join(missing_iso))

# Global max for scaling dot size consistently across all regional panels
global_max_deprived = country_df["DeprivedPop"].max()

# Manual fine-tuning offsets in percentage points (for labels of abs. deprived pop),
# keyed by ISO code; countries not listed use the default inside plot_lollipop.
label_offsets = {
    "NGA": 2.5,
    "EGY": 2.5,
    "COD": 2.5,
    "IND": 2.5,
    "PAK": 2.5,
    "BGD": 2.5,
    "BRA": 3.0,
    "COL": 3.0,
    "MEX": 2.5,
    # add/adjust if needed
}

# Figure defaults applied to every plot created after this cell
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["figure.dpi"] = 300

In [None]:
# ---------------------------------------------------------------
# 3️⃣ Lollipop plotting function (per region)
# ---------------------------------------------------------------
def plot_lollipop(
    ax,
    sub,
    region_name,
    add_legend=False,
    color="#d62728",
    max_deprived=None,
    offsets=None,
    x_max=75,
):
    """Draw one regional lollipop panel: share of deprived population per country.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    sub : pandas.DataFrame
        Country-level rows for one region. Must contain the columns ``ISO``,
        ``PctDeprived``, ``DeprivedPop`` and ``TotalPop``. If a ``Country``
        column is present it is used as the row label wherever ``ISO`` is missing.
    region_name : str
        Region label appended to the panel title.
    add_legend : bool, default False
        Add the dot-size legend (1M / 10M / 50M deprived) to this panel.
    color : str, default "#d62728"
        Colour of the dots and of the legend markers.
    max_deprived : float, optional
        Deprived population that maps to the largest dot area (400). Defaults to
        the notebook-level ``global_max_deprived`` so all panels share one scale.
    offsets : dict, optional
        Horizontal offset in percentage points, keyed by ISO code, for the
        absolute-population labels. Defaults to the notebook-level
        ``label_offsets``; countries not listed use 2.0.
    x_max : float, default 75
        Upper x-axis limit; keep it identical across panels so they line up.

    Raises
    ------
    ValueError
        If ``sub`` has no rows (for example, a misspelled region filter).
    """
    if sub.empty:
        raise ValueError(f"No country rows to plot for region {region_name!r}")

    # Fall back to the notebook-level settings when the caller passes nothing.
    if max_deprived is None:
        max_deprived = global_max_deprived
    if offsets is None:
        offsets = label_offsets

    # Sort by % deprived
    sub = sub.sort_values("PctDeprived", ascending=False)

    # The categorical y-axis only accepts strings, so a missing ISO code
    # (unmatched country in the ISO merge) is replaced by the country name.
    y_labels = sub["ISO"]
    if "Country" in sub.columns:
        y_labels = y_labels.fillna(sub["Country"])
    y_labels = y_labels.fillna("n/a").astype(str)
    sub = sub.assign(_ylabel=y_labels)

    # Region average (population-weighted)
    region_avg = (sub["DeprivedPop"].sum() / sub["TotalPop"].sum()) * 100

    # Dot area grows with the square root of the deprived population,
    # normalised so that ``max_deprived`` gets an area of 400.
    sizes = np.sqrt(sub["DeprivedPop"]) / np.sqrt(max_deprived) * 400

    # Stems
    ax.hlines(
        y=sub["_ylabel"],
        xmin=0,
        xmax=sub["PctDeprived"],
        color="lightgrey",
        linewidth=1,
    )

    # Dots
    ax.scatter(
        sub["PctDeprived"],
        sub["_ylabel"],
        s=sizes,
        color=color,
        alpha=0.8,
        zorder=3,
    )

    # Region average line; the label's y is a numeric position and relies on
    # the categorical axis placing the n row labels at 0..n-1.
    ax.axvline(region_avg, color="navy", linestyle="--", linewidth=1)
    ax.text(
        region_avg + 0.5,
        len(sub) - 1.5,
        f"Region Avg: {region_avg:.1f}%",
        color="navy",
        fontsize=11,
        va="bottom",
    )

    # --- Highlight top 3 by % deprived (percent labels) ---
    for _, row in sub.head(3).iterrows():
        ax.text(
            row["PctDeprived"] + 1,
            row["_ylabel"],
            f"{row['PctDeprived']:.1f}%",
            va="center",
            fontsize=9,
            color="black",
        )

    # --- Highlight top 3 by absolute deprived population (millions) ---
    for _, row in sub.nlargest(3, "DeprivedPop").iterrows():
        offset_x = offsets.get(row["ISO"], 2.0)  # default offset if not specified
        ax.text(
            row["PctDeprived"] + offset_x,
            row["_ylabel"],
            f"{row['DeprivedPop'] / 1e6:.0f}M",
            va="center",
            ha="left",
            fontsize=10,
            color="darkred",
            fontweight="bold",
        )

    # Formatting
    ax.set_title(
        f"Share of population in deprived segments by country — {region_name}",
        fontsize=11,
        weight="bold",
    )
    ax.set_xlabel("Deprived population (%)")
    ax.set_ylabel("")
    ax.grid(axis="x", linestyle=":", alpha=0.3)
    ax.set_xlim(0, x_max)  # synchronized across panels

    # Dot-size legend: empty scatters that only contribute legend handles,
    # sized with the same formula as the data dots.
    if add_legend:
        for pop in [1e6, 10e6, 50e6]:  # 1M, 10M, 50M deprived
            ax.scatter(
                [],
                [],
                s=np.sqrt(pop) / np.sqrt(max_deprived) * 400,
                color=color,
                alpha=0.6,
                label=f"{int(pop / 1e6)}M",
            )
        ax.legend(
            title="Population in deprived\nsegments (millions)",
            loc="upper right",
            fontsize=9,
            title_fontsize=10,
            frameon=False,
        )

In [None]:
# ---------------------------------------------------------------
# 4️⃣ Main lollipop figure (Africa, Asia, LAC)
# ---------------------------------------------------------------
# Layout: Africa fills the whole left column, Asia and LAC are stacked on the right.
fig = plt.figure(figsize=(14, 12))
gs = fig.add_gridspec(nrows=2, ncols=2, width_ratios=[1.1, 1], height_ratios=[1.2, 0.8])

ax_africa = fig.add_subplot(gs[:, 0])  # left, full height
ax_asia = fig.add_subplot(gs[0, 1])    # top-right
ax_lac = fig.add_subplot(gs[1, 1])     # bottom-right

# (axes, value in the Region column, title label, draw the dot-size legend?)
for panel_ax, region_key, panel_title, with_legend in (
    (ax_africa, "Africa", "Africa", True),
    (ax_asia, "Asia", "Asia", False),
    (ax_lac, "Latin America and the Caribbean", "LAC", False),
):
    region_rows = country_df[country_df["Region"] == region_key]
    plot_lollipop(panel_ax, region_rows, panel_title, add_legend=with_legend)

fig.tight_layout()

# Save at a higher resolution than the on-screen default
outfile_lollipop = FIG_DIR / "Figure3_Lollipop_Regional_Deprivation.png"
fig.savefig(outfile_lollipop, dpi=500, bbox_inches="tight")
print(f"✅ Saved lollipop figure to: {outfile_lollipop}")

plt.show()

In [None]:
# ---------------------------------------------------------------
# 5️⃣ City-size inset plots
# ---------------------------------------------------------------
def _summarise_city_sizes(region_cities, order):
    """Aggregate one region's city rows into per-size-class totals and percentage shares."""
    totals = region_cities.groupby("CitySizeClass").agg(
        TotalPop=("TotalPop", "sum"),
        DeprivedPop=("DeprivedPop", "sum"),
    )

    # Known classes first, in the requested order; any class name not in
    # ``order`` is kept and appended rather than being turned into a NaN label.
    known = [size for size in order if size in totals.index]
    unexpected = [size for size in totals.index if size not in order]
    summary = totals.loc[known + unexpected].reset_index()

    summary["NonDeprivedPop"] = summary["TotalPop"] - summary["DeprivedPop"]
    summary["DeprivedPct"] = summary["DeprivedPop"] / summary["TotalPop"] * 100
    summary["NonDeprivedPct"] = 100 - summary["DeprivedPct"]
    return summary


def plot_region_citysize(df, region_name, filename=None):
    """
    City-size stacked bar chart for a given region.

    Each horizontal bar is one city-size class, split into the non-deprived
    share (grey, starting at 0) and the deprived share (red, stacked after it).
    Deprived segments wider than 2 % are labelled with their share and the
    absolute deprived population in millions.

    Parameters
    ----------
    df : pandas.DataFrame
        City-level table with the columns:
          - Region
          - CitySizeClass (Small / Medium / Large / Very large / Megacity)
          - TotalPop, DeprivedPop
    region_name : str
        Value of ``Region`` to plot.
    filename : str or Path, optional
        If given, the figure is also saved there (dpi=500).

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``region_name``.
    """
    region_cities = df[df["Region"] == region_name]
    if region_cities.empty:
        raise ValueError(f"No cities found for region {region_name!r}")

    # WUP order for city sizes
    order = ["Small", "Medium", "Large", "Very large", "Megacity"]
    size_summary = _summarise_city_sizes(region_cities, order)

    # --- Plot ---
    fig, ax = plt.subplots(figsize=(6, 4))

    ax.barh(
        size_summary["CitySizeClass"],
        size_summary["NonDeprivedPct"],
        color="lightgrey",
        label="Non-deprived",
        height=0.8,
    )
    deprived_bars = ax.barh(
        size_summary["CitySizeClass"],
        size_summary["DeprivedPct"],
        left=size_summary["NonDeprivedPct"],
        color="#d62728",
        label="Deprived",
        height=0.8,
    )

    # Labels centred inside the deprived segment; very thin segments are skipped
    for bar, pct, absval, base in zip(
        deprived_bars,
        size_summary["DeprivedPct"],
        size_summary["DeprivedPop"] / 1e6,
        size_summary["NonDeprivedPct"],
    ):
        if pct > 2:
            ax.text(
                base + pct / 2,
                bar.get_y() + bar.get_height() / 2,
                f"{pct:.1f}%\n({absval:.0f}M)",
                ha="center",
                va="center",
                color="white",
                fontsize=10,
                fontweight="bold",
            )

    # Style
    ax.set_xlabel("Population share (%)", fontsize=11)
    ax.set_xlim(0, 100)
    ax.set_title(
        "Population share by city size",
        fontsize=12,
        weight="bold",
    )
    ax.tick_params(axis="y", labelsize=11)
    ax.grid(axis="x", linestyle=":", alpha=0.4)

    ax.legend(
        loc="lower left",
        frameon=True,
        framealpha=1,
        edgecolor="black",
        fancybox=False,
        fontsize=9,
        title_fontsize=10,
    )

    fig.tight_layout()

    if filename:
        fig.savefig(filename, dpi=500, bbox_inches="tight")
        print(f"✅ Saved city-size figure to: {filename}")

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)

# One inset per region: (value in the Region column, output file name)
for region_label, png_name in (
    ("Africa", "Figure3_Africa_CitySize.png"),
    ("Asia", "Figure3_Asia_CitySize.png"),
    ("Latin America and the Caribbean", "Figure3_LAC_CitySize.png"),
):
    plot_region_citysize(city_df, region_label, filename=FIG_DIR / png_name)