# 03 — IDPs (latest snapshot by province)

**Dataset (source file):** `raw_data/hdx_hapi_idps_afg.csv`  
**Geo boundary used by the website:** `raw_data/geo/afghanistan_adm1.geojson`  
**Website section:** `js/sections/idps.js`

This notebook:
1) reads raw IDPs data,
2) filters to `admin_level=1` and a valid date + positive population,
3) finds the latest snapshot date,
4) aggregates by province,
5) saves:
   - cleaned snapshot rows to `dataset/cleaned/idps_admin1_latest_clean.csv`
   - dropped rows to `dataset/cleaned/idps_dropped.csv`
   - aggregates to `dataset/derived/idps_latest_by_province.csv` + `dataset/derived/idps_latest_total.json`


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Input location: raw source files are read from here.
RAW_DIR = Path("raw_data")

# Output locations: the website reads its data from the dataset tree.
DATASET_DIR = Path("dataset")
CLEAN_DIR = DATASET_DIR / "cleaned"
DERIVED_DIR = DATASET_DIR / "derived"

# Create both output folders up front (no error if they already exist).
for out_dir in (CLEAN_DIR, DERIVED_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

def to_number(s: pd.Series) -> pd.Series:
    """Convert a Series of number-like values to numeric.

    Values are stringified, commas are removed, surrounding whitespace is
    stripped, and anything that still cannot be parsed becomes NaN.
    """
    as_text = s.astype(str)
    without_commas = as_text.str.replace(",", "", regex=False)
    trimmed = without_commas.str.strip()
    return pd.to_numeric(trimmed, errors="coerce")

def parse_year(val) -> float:
    """Extract a year from a date-like value.

    Two shapes are recognised, checked in this order:

    1. The first four characters are decimal digits (e.g. ``"2020-01-31"``,
       ``2021``): those four characters are the year.
    2. The text splits into exactly three parts on ``/`` or ``-`` and the
       last part is all decimal digits (e.g. ``"31/12/2019"``): the last
       part is the year.

    Args:
        val: Any scalar; it is converted with ``str()`` before parsing.

    Returns:
        The year as a float, or NaN when ``val`` is missing or matches
        neither shape.
    """
    if pd.isna(val):
        return np.nan
    s = str(val).strip()
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which float() cannot parse and would raise on.
    if len(s) >= 4 and s[:4].isdecimal():
        return float(s[:4])
    for sep in ("/", "-"):
        parts = s.split(sep)
        if len(parts) == 3 and parts[-1].isdecimal():
            return float(parts[-1])
    return np.nan

def show_filter_report(df_before: pd.DataFrame, mask: pd.Series, title: str, sample_n: int = 10):
    """Split a frame by a boolean mask and print a kept/dropped summary.

    Args:
        df_before: The frame to split.
        mask: Boolean selector for ``df_before``; True marks rows to keep.
        title: Label printed in the report header.
        sample_n: Maximum number of dropped rows shown in the preview.

    Returns:
        A ``(kept, dropped)`` tuple; both are copies, so later edits do not
        touch ``df_before``.
    """
    kept = df_before[mask].copy()
    dropped = df_before[~mask].copy()
    print(f"=== {title} ===")
    print("rows_before:", len(df_before))
    print("rows_kept:", len(kept))
    print("rows_dropped:", len(dropped))
    if len(dropped):
        print("\nSample dropped rows:")
        # `display` is only defined when running under IPython/Jupyter.
        # Fall back to print so the helper also works in a plain interpreter
        # instead of raising NameError.
        try:
            show = display
        except NameError:
            show = print
        show(dropped.head(sample_n))
    return kept, dropped


In [None]:
# Read the raw IDPs export as-is; cleaning happens in the following cells.
raw_path = RAW_DIR.joinpath("hdx_hapi_idps_afg.csv")
df = pd.read_csv(raw_path)
# Preview the first rows as the cell output.
df.head()


In [None]:
# Fail early if the raw file lacks a column the filter depends on.
required = {"admin_level", "reference_period_start", "population"}
missing = sorted(required - set(df.columns))
assert not missing, f"Missing columns: {missing}"

# Work on a copy so the raw frame stays untouched.
df2 = df.copy()
# Kept as a helper column because it is written to the output CSVs.
df2["admin_level_str"] = df2["admin_level"].astype(str).str.strip()
# Unparseable dates become NaT and unparseable populations become NaN;
# both are filtered out by the mask below.
df2["date"] = pd.to_datetime(df2["reference_period_start"], errors="coerce")
df2["population_num"] = to_number(df2["population"])

# Compare the admin level numerically rather than as text. If pandas reads
# the column as float (e.g. because some cells are blank), the string form is
# "1.0" and a comparison against "1" would silently drop every row.
is_admin1 = to_number(df2["admin_level"]) == 1
mask = is_admin1 & df2["date"].notna() & df2["population_num"].notna() & (df2["population_num"] > 0)
kept, dropped = show_filter_report(df2, mask, "IDPs filter (admin_level=1, valid date, numeric pop, pop>0)")


In [None]:
# The most recent date among the rows that passed the filter.
latest_date = kept["date"].max()
assert pd.notna(latest_date), "No valid dates after filtering"
latest_iso = latest_date.strftime("%Y-%m-%d")

# Keep only the rows belonging to that latest snapshot.
snap = kept[kept["date"] == latest_date].copy()

# Normalise province names. When the column is absent, fall back to an empty
# name for every row. The previous `snap.get("admin1_name", "")` returned a
# plain str in that case, and `.astype` then raised AttributeError.
if "admin1_name" in snap.columns:
    snap["admin1_name"] = snap["admin1_name"].astype(str).str.strip()
else:
    snap["admin1_name"] = ""

# Persist the cleaned snapshot rows.
snap_out = CLEAN_DIR / "idps_admin1_latest_clean.csv"
snap.to_csv(snap_out, index=False)
snap_out, latest_iso


In [None]:
# `json` is needed for the summary file below and is not part of the
# notebook's import cell, so import it here to avoid a NameError.
import json

# Sum the snapshot population per province, largest first.
by_prov = (snap.groupby("admin1_name", as_index=False)["population_num"]
             .sum()
             .sort_values("population_num", ascending=False))
by_prov.rename(columns={"population_num":"idps_count"}, inplace=True)

# Per-province table.
out_csv = DERIVED_DIR / "idps_latest_by_province.csv"
by_prov.to_csv(out_csv, index=False)

# Single-number summary: snapshot date plus the total across all provinces.
# float() converts the numpy scalar to a built-in type that json can serialise.
total = float(by_prov["idps_count"].sum())
out_json = DERIVED_DIR / "idps_latest_total.json"
out_json.write_text(json.dumps({"latest_date": latest_iso, "idps_total_admin1": total}, indent=2), encoding="utf-8")

(out_csv, out_json)


In [None]:
# Save the rows rejected by the filter so they can be reviewed later.
dropped_out = CLEAN_DIR.joinpath("idps_dropped.csv")
dropped.to_csv(dropped_out, index=False)
# Show the output path as the cell result.
dropped_out
