
# Malaria Spending & Outcomes — Shift–Share Analysis

This notebook reproduces the portfolio visuals from the processed datasets:
- `data/malaria_spending_outcomes_master.csv` (5-year buckets; includes bundled shares and outcomes)
- `data/high_incidence_fastminuslow_abs_usd_2yr.csv` (fast − slow median **absolute USD** differences, 2-year buckets, high-incidence bins)
- `data/high_incidence_fastminuslow_velocity_2yr.csv` (fast − slow median **velocity of share change** per year, 2-year buckets, high-incidence bins)

> Note: This notebook **does not** rebuild 2-year buckets from raw IHME/WHO/OWID sources; it **loads the pre-computed** 2-year difference tables.  
> The 5-year share-based boxplots are computed directly from the master CSV.


In [None]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Default canvas size for any figure that does not pass its own figsize.
plt.rcParams["figure.figsize"] = (7, 4)

# Inputs are read from DATA_DIR; rendered figures are written to FIG_DIR,
# which is created up front so later savefig calls cannot fail on a missing folder.
DATA_DIR = "data"
FIG_DIR = "figures"
os.makedirs(FIG_DIR, exist_ok=True)

# 5-year master table.
MASTER_PATH = os.path.join(DATA_DIR, "malaria_spending_outcomes_master.csv")
# Pre-computed 2-year fast-minus-slow difference tables (absolute USD, share velocity).
ABS_DIFF_2YR_PATH = os.path.join(DATA_DIR, "high_incidence_fastminuslow_abs_usd_2yr.csv")
VEL_DIFF_2YR_PATH = os.path.join(DATA_DIR, "high_incidence_fastminuslow_velocity_2yr.csv")


## Load 5-year master and compute Fast vs Slow improvers (within bins)

In [None]:

master = pd.read_csv(MASTER_PATH)

# Columns every downstream step relies on.
needed_cols = [
    "iso3","location_name","period","incidence_per_1000_at_risk_avg","malaria_death_rate_avg",
    "total_specific_spend_usd",
    "share_txdiag","share_vector","share_systems"
]

# Each bundled share is the sum of these detail shares; used only to rebuild a
# bundle column that is absent from the CSV.
bundle_components = {
    "share_txdiag": ["share_anti_malarial_medicines", "share_diagnostics"],
    "share_vector": ["share_itns", "share_insecticide_and_spraying_materials"],
    "share_systems": [
        "share_human_resources_and_technical_assistance",
        "share_communication_and_advocacy",
        "share_monitoring_and_evaluation",
        "share_prevention_other",
        "share_iepaopsm",
    ],
}

# Rebuild only the bundles that are actually missing, so bundle columns already
# shipped in the CSV are never overwritten. A missing detail share counts as 0.
for bundle, parts in bundle_components.items():
    if bundle not in master.columns and all(p in master.columns for p in parts):
        master[bundle] = master[parts].fillna(0).sum(axis=1)

# Re-validate after the rebuild: identifier/outcome columns cannot be derived
# from detail shares, so anything still absent here is a hard error.
missing = [c for c in needed_cols if c not in master.columns]
if missing:
    raise ValueError(f"Master missing columns: {missing} and cannot rebuild from detail shares.")

def bin_incidence(x):
    """Map an incidence value to its bin label.

    Bins are half-open on the right: values below 0.1 fall in "0–0.1",
    [0.1, 10) in "0.1–10", [10, 50) in "10–50", [50, 100) in "50–100",
    and everything from 100 upward in "100+". Missing input yields NaN.
    """
    if pd.isna(x):
        return np.nan
    # Exclusive upper bounds in ascending order; the first one exceeding x wins.
    upper_bounds = (
        (0.1, "0–0.1"),
        (10, "0.1–10"),
        (50, "10–50"),
        (100, "50–100"),
    )
    for upper, label in upper_bounds:
        if x < upper:
            return label
    return "100+"

# Derive the incidence bin only when the CSV does not already provide one.
if "incidence_bin" not in master.columns:
    master["incidence_bin"] = master["incidence_per_1000_at_risk_avg"].apply(bin_incidence)

# Chronological 5-year buckets; each period maps to its successor (last one maps to None).
period_order_5y = ["2000–2004","2005–2009","2010–2014","2015–2019","2020"]
pmap = {p: (period_order_5y[i+1] if i+1 < len(period_order_5y) else None) for i,p in enumerate(period_order_5y)}
master["next_period"] = master["period"].map(pmap)

# Self-join: attach each country's incidence in the following period.
nxt = master[["iso3","period","incidence_per_1000_at_risk_avg"]].rename(
    columns={"period":"next_period","incidence_per_1000_at_risk_avg":"incidence_next"}
)
# validate="many_to_one" makes the merge fail loudly if (iso3, period) is not
# unique; otherwise duplicate rows would silently fan out and double-count
# countries in the quartiles and medians computed below.
master = master.merge(nxt, on=["iso3","next_period"], how="left", validate="many_to_one")

# Negative delta = incidence fell between this period and the next.
master["delta_incidence_next"] = master["incidence_next"] - master["incidence_per_1000_at_risk_avg"]

def label_fast_slow(group, delta_col="delta_incidence_next", min_n=8):
    """Label each row of one group as a fast, middle or slow improver.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group. The input is not modified; a labelled copy is returned.
    delta_col : str
        Column holding the change used for ranking.
    min_n : int
        Minimum number of non-missing values of ``delta_col`` required to
        compute quartiles. Smaller groups get NaN for every row.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group`` with an added ``improve_group`` column:
        "fast" where the value is <= the 25th percentile, "slow" where it is
        >= the 75th percentile, "middle" otherwise, NaN where the value is missing.
    """
    # Work on a copy: mutating the object handed over by groupby is unsupported
    # by pandas and would also alter a caller's frame when used directly.
    labelled = group.copy()
    vals = labelled[delta_col].dropna()
    # max(..., 1) keeps the percentile call away from an empty sample when min_n <= 0.
    if len(vals) < max(min_n, 1):
        labelled["improve_group"] = np.nan
        return labelled
    q25, q75 = np.percentile(vals, [25, 75])

    def lab(x):
        if pd.isna(x):
            return np.nan
        # The "fast" test comes first, so ties when q25 == q75 resolve to "fast".
        if x <= q25:
            return "fast"
        if x >= q75:
            return "slow"
        return "middle"

    labelled["improve_group"] = labelled[delta_col].apply(lab)
    return labelled

# Label quartiles within each (period, incidence bin) group.
# Groups are iterated explicitly instead of using groupby.apply: pandas >= 2.2
# deprecates apply operating on the grouping columns, and "period" /
# "incidence_bin" must survive in the result for the plots below.
# Each group is copied so the labelling never writes into a slice of `master`.
labelled_groups = [
    label_fast_slow(g.copy())
    for _, g in master.groupby(["period","incidence_bin"])
]
if labelled_groups:
    # sort_index restores the original row order after concatenating by group.
    master = pd.concat(labelled_groups).sort_index()
else:
    # No group has both keys present: keep the schema, with no rows.
    master = master.iloc[0:0].assign(improve_group=np.nan)

# Keep only the two tails; "middle" and unlabelled rows are dropped.
master_fastslow = master[master["improve_group"].isin(["fast","slow"])].copy()
master_fastslow.head()


## Boxplots: Bundle shares for Fast vs Slow improvers by incidence bin (5-year buckets)

In [None]:

bundle_cols = ["share_txdiag","share_vector","share_systems"]
bin_order = ["0–0.1","0.1–10","10–50","50–100","100+"]
# Keep only bins that actually occur among the fast/slow rows.
bin_order = [b for b in bin_order if b in master_fastslow["incidence_bin"].unique()]

# Per-bin median bundle share for each improver group, written next to the inputs.
summary_box = (
    master_fastslow.groupby(["incidence_bin","improve_group"])[bundle_cols]
    .median()
    .reindex(bin_order, level="incidence_bin")
    .reset_index()
)
summary_box_path = os.path.join(DATA_DIR, "shiftshare_boxplot_summary.csv")
summary_box.to_csv(summary_box_path, index=False)

# Distinct fill colours so the two groups can be told apart in each figure.
group_colors = {"fast": "tab:green", "slow": "tab:orange"}

for b in bin_order:
    sub = master_fastslow[master_fastslow["incidence_bin"]==b]
    # A comparison needs both groups present in the bin.
    if sub.empty or sub["improve_group"].nunique() < 2:
        continue
    fig, ax = plt.subplots(figsize=(8,5))
    positions = np.arange(len(bundle_cols))
    legend_handles = []
    # Fast boxes sit left of each tick, slow boxes right of it.
    for grp, offset in (("fast", -0.2), ("slow", 0.2)):
        data = [sub.loc[sub["improve_group"]==grp, c].dropna() for c in bundle_cols]
        bp = ax.boxplot(
            data, positions=positions+offset, widths=0.35, patch_artist=True,
            boxprops={"facecolor": group_colors[grp]},
        )
        legend_handles.append(bp["boxes"][0])
    ax.set_xticks(positions)
    ax.set_xticklabels(["Tx/Diag","Vector","Systems"])
    ax.set_ylabel("Bundle share")
    ax.set_title(f"Fast vs Slow bundle shares — Incidence bin {b} (5-year)")
    ax.legend(legend_handles, ["Fast improvers", "Slow improvers"])
    outp = os.path.join(FIG_DIR, f"boxplots_fast_slow_bin_{b.replace('>','plus').replace('/','-')}.png")
    fig.tight_layout()
    fig.savefig(outp, dpi=200, bbox_inches="tight")
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

summary_box_path


## Heatmaps & Bar Charts: High-incidence (2-year buckets), Fast − Slow differences

In [None]:

import os
def _save_show_close(fig, filename):
    """Save `fig` under FIG_DIR at 200 dpi, display it, close it, and return the path."""
    path = os.path.join(FIG_DIR, filename)
    fig.tight_layout()
    fig.savefig(path, dpi=200, bbox_inches="tight")
    plt.show()
    # Close after showing so repeated calls do not accumulate open figures.
    plt.close(fig)
    return path


def _plot_diff_heatmap(table, value_cols, bins, title, filename):
    """Draw a heatmap with one row per column in `value_cols` and one column per bin.

    `table` must hold one row per entry of `bins`, in that order. Returns the saved path.
    """
    fig, ax = plt.subplots(figsize=(6,3.5))
    im = ax.imshow(table[value_cols].values.T, aspect="auto", interpolation="nearest")
    fig.colorbar(im, ax=ax)
    ax.set_yticks(range(len(value_cols)))
    ax.set_yticklabels(value_cols)
    ax.set_xticks(range(len(bins)))
    ax.set_xticklabels(bins)
    ax.set_title(title)
    return _save_show_close(fig, filename)


def _plot_diff_bars(table, value_cols, bar_labels, ylabel, title_suffix, file_prefix):
    """Draw one bar chart per row (incidence bin) of `table`; return the last saved path."""
    path = None
    for _, row in table.iterrows():
        binlab = row["incidence_bin"]
        fig, ax = plt.subplots(figsize=(7,4))
        ax.bar(bar_labels, [row[c] for c in value_cols])
        # Zero line separates "fast higher" from "slow higher".
        ax.axhline(0, linewidth=0.8)
        ax.set_ylabel(ylabel)
        ax.set_title(f"High-incidence {binlab}: {title_suffix}")
        path = _save_show_close(fig, f"{file_prefix}_{binlab.replace('+','plus')}.png")
    return path


have_abs = os.path.exists(ABS_DIFF_2YR_PATH)
have_vel = os.path.exists(VEL_DIFF_2YR_PATH)

# Both pre-computed tables are required; otherwise the whole section is skipped.
if not (have_abs and have_vel):
    print("2-year difference CSVs not found in 'data/'. Skipping heatmaps and bar charts section.")
else:
    abs_diff = pd.read_csv(ABS_DIFF_2YR_PATH)
    vel_diff = pd.read_csv(VEL_DIFF_2YR_PATH)

    abs_cols = ["spend_txdiag_usd","spend_vector_usd","spend_systems_usd","total_specific_spend_usd"]
    vel_cols = ["velocity_share_txdiag_per_year","velocity_share_vector_per_year","velocity_share_systems_per_year"]
    bin_order_hi = ["50–100","100+"]
    # Reindex fixes the bin order; a bin absent from the CSV becomes a row of NaN.
    abs_plot = abs_diff.set_index("incidence_bin").reindex(bin_order_hi).reset_index()
    vel_plot = vel_diff.set_index("incidence_bin").reindex(bin_order_hi).reset_index()

    # Heatmap: ABS USD differences
    outp_abs = _plot_diff_heatmap(
        abs_plot, abs_cols, bin_order_hi,
        "Fast − Slow median ABS USD (2-year, high-incidence)",
        "heatmap_abs_usd.png",
    )

    # Heatmap: Velocity differences
    outp_vel = _plot_diff_heatmap(
        vel_plot, vel_cols, bin_order_hi,
        "Fast − Slow median velocity (share/year) (2-year, high-incidence)",
        "heatmap_velocity.png",
    )

    # Bar charts for differences
    _plot_diff_bars(
        abs_plot, abs_cols, ["Tx/Diag","Vector","Systems","Total"],
        "USD difference (Fast minus Slow)",
        "Fast − Slow median ABS USD",
        "bar_abs_usd",
    )
    _plot_diff_bars(
        vel_plot, vel_cols, ["Tx/Diag","Vector","Systems"],
        "Share change per year (Fast minus Slow)",
        "Fast − Slow median velocity (share/year)",
        "bar_velocity",
    )



### Notes
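- **Fast improvers** = bottom quartile (≤ 25th percentile) of next-period incidence change within each period × incidence-bin group (most negative change).
- **Slow improvers** = top quartile (≥ 75th percentile) of the same within-group distribution (least negative / positive change). Groups with fewer than 8 non-missing changes are left unlabelled.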
- Heatmaps and bars visualize **differences (fast − slow)**, not raw group medians.
- See `data/shiftshare_boxplot_summary.csv` for per-bin medians (fast vs slow) on bundle shares.
