In [2]:
# %%
# Sanity / validation checks for adm2_risk_daily.csv

import pandas as pd
import numpy as np
from pathlib import Path

# Location of the pipeline output under validation.
OUT_DIR = Path("out")
CSV_PATH = OUT_DIR / "adm2_risk_daily.csv"

df = pd.read_csv(CSV_PATH)
row_count, col_count = df.shape
print(f"Loaded {row_count:,} rows × {col_count} columns from {CSV_PATH}\n")

# --- Basic structure and completeness ---
# Column inventory first, then per-column null counts with the worst offenders on top.
print("Columns:", df.columns.tolist())
missing_per_col = df.isna().sum().sort_values(ascending=False)
print("\nMissing values per column:")
print(missing_per_col)

# --- Numeric coverage and all-zero detection ---
# NaN is treated as zero everywhere in this section, so "all zero" and
# "non-zero" are exact complements of each other.
num_cols = df.select_dtypes(include=[np.number]).columns
nonzero_mask = df[num_cols].fillna(0).ne(0)
zero_cols = [c for c in num_cols if not nonzero_mask[c].any()]
const_cols = [c for c in num_cols if df[c].nunique(dropna=True) <= 1]

print("\n⚠️ Columns that are entirely zero:", zero_cols)
print("⚠️ Columns with constant values:", const_cols)

# --- Non-zero rate (fraction of rows whose value is neither 0 nor missing) ---
# Uses `!= 0` rather than `> 0`: a `> 0` test reports negative values in
# signed columns as "zero", which contradicts the `== 0` check used for
# zero_cols above.
nz_rate = nonzero_mask.sum() / len(df)
print("\nNon-zero rate by column:")
print(nz_rate.round(3))

# --- Basic numeric ranges for main indices ---
key_cols = ["DCR100", "PRS100", "priority100", "access_A", "strain_H", "mvi", "cast_state"]
for c in key_cols:
    if c not in df.columns:
        continue
    if c not in num_cols:
        # The float formatting below would raise on a non-numeric column.
        print(f"\n[Skip] {c} range — column is not numeric.")
        continue
    print(f"\n{c} range: {df[c].min():.3f} to {df[c].max():.3f}, "
          f"mean={df[c].mean():.3f}, nonzero%={nonzero_mask[c].mean() * 100:.1f}")

# --- ADM1-level zero checks (e.g., ensure no entire state is zero) ---
group_keys = ["adm1_name"]
check_cols = ["facilities", "pop_wra", "DCR100", "PRS100", "priority100", "access_A", "strain_H", "mvi"]

print("\nChecking for ADM1 with all-zero values in key metrics...")
# Resolved once: the set of metric columns present does not vary by group.
present_cols = [col for col in check_cols if col in df.columns]
if not all(col in df.columns for col in group_keys):
    print("[Skip] ADM1 check — missing adm1_name column.")
elif df.empty or not present_cols:
    # Without rows or metric columns nothing is actually checked, so the
    # success message further down would be vacuous and misleading.
    print("[Skip] ADM1 check — no rows or no key metric columns to check.")
else:
    # A metric is reported for a state when every row is 0 or missing.
    zero_report = [
        (adm1, c)
        for adm1, g in df.groupby("adm1_name")
        for c in present_cols
        if (g[c].fillna(0) == 0).all()
    ]
    if zero_report:
        print("⚠️ ADM1-level all-zero metrics detected:")
        for adm1, c in zero_report:
            print(f"  - {adm1}: all zeros in {c}")
    else:
        print("✓ All ADM1 regions have non-zero values for at least one row in each key metric.")

# --- Sanity thresholds / warnings ---
# Each warning is independent; any combination of them may fire.
if zero_cols:
    print("\n[Warning] Some numeric columns are all zeros; check upstream calculations.")
if not len(df):
    print("\n[Warning] Empty CSV — pipeline may have filtered everything out.")
if "priority100" in df.columns:
    if df["priority100"].max() <= 0:
        print("\n[Warning] priority100 has no positive values (possible normalization issue).")

# --- Optional: quick correlation sanity check (should have some variability) ---
if len(df) > 5:
    corr = df[num_cols].corr()
    # Restrict the matrix to the key metrics that are numeric, keeping key_cols order.
    key_present = [c for c in key_cols if c in corr.index]
    print("\nTop correlations among key metrics:")
    print(corr.loc[key_present, key_present].round(2))

print("\nValidation complete.")

Loaded 2,457 rows × 17 columns from out/adm2_risk_daily.csv

Columns: ['run_date', 'adm1_name', 'adm2_name', 'adm2_code', 'pop_wra', 'w_exposure', 'v30', 'v3m', 'dlt_v30_raw', 'spillover', 'cast_state', 'access_A', 'strain_H', 'mvi', 'DCR100', 'PRS100', 'priority100']

Missing values per column:
run_date       0
spillover      0
PRS100         0
DCR100         0
mvi            0
strain_H       0
access_A       0
cast_state     0
dlt_v30_raw    0
adm1_name      0
v3m            0
v30            0
w_exposure     0
pop_wra        0
adm2_code      0
adm2_name      0
priority100    0
dtype: int64

⚠️ Columns that are entirely zero: []
⚠️ Columns with constant values: []

Non-zero rate by column:
pop_wra        0.999
w_exposure     1.000
v30            0.058
v3m            0.185
dlt_v30_raw    0.041
spillover      0.280
cast_state     0.924
access_A       0.868
strain_H       0.950
mvi            0.950
DCR100         0.910
PRS100         0.913
priority100    0.913
dtype: float64

DCR100 rang

In [2]:
# %% [markdown]
# Input Join Checker — ADM2/ADM1 consistency and ACLED spatial/name joins

# %%
import os
from pathlib import Path
import pandas as pd
import geopandas as gpd
import numpy as np
from unidecode import unidecode

# Input and report locations; every input path is derived from DATA_DIR.
DATA_DIR = Path("data")
OUT_DIR  = Path("out") / "checks"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Optional: spatial join test for ACLED event points if present
ACLED_POINTS_CSV = DATA_DIR / "acled_events_90d.csv"  # optional; needs latitude, longitude

# Core inputs (as produced/used in your pipeline)
ADM2_SHP         = DATA_DIR.joinpath("mex_admbnda_govmex_20210618_SHP", "mex_admbnda_adm2_govmex_20210618.shp")
POP_CSV          = DATA_DIR.joinpath("pop_adm2.csv")                    # adm2_code, pop_total, pop_wra
CLUES_CSV        = DATA_DIR.joinpath("clues_facility_counts_adm2.csv")  # adm2_code, facilities
CONEVAL_CSV      = DATA_DIR.joinpath("coneval_muni.csv")                # adm2_code, poverty_rate
CAST_STATE_CSV   = DATA_DIR.joinpath("cast_state.csv")                  # adm1_name, cast_raw (or scaled)

# Utility: consistent name normalization (strip accents, case, extra spaces)
def norm_name(s: pd.Series) -> pd.Series:
    """Return a normalized copy of a Series of place names for name-based joins.

    Missing values become empty strings, values are cast to ``str``,
    surrounding whitespace is trimmed and internal whitespace runs collapse
    to a single space. Each value is then passed through ``unidecode``,
    stripped again, and title-cased.
    """
    def _ascii_title(name: str) -> str:
        return unidecode(name).strip().title()

    text = s.fillna("").astype(str)
    squeezed = text.str.strip().str.replace(r"\s+", " ", regex=True)
    return squeezed.apply(_ascii_title)

def as_str_no_nan(x):
    """Cast a Series to ``str`` with missing values replaced by empty strings.

    Filling happens before the cast, so missing entries end up as ``""``
    rather than as the text of a NaN/NA marker.
    """
    filled = x.fillna("")
    return filled.astype(str)

print("Loading ADM2 shapefile…")
# Shapefile attribute → column name used by the checks below.
adm2_renames = {
    "ADM1_ES": "adm1_name",
    "ADM1_PCODE": "adm1_code",
    "ADM2_ES": "adm2_name",
    "ADM2_PCODE": "adm2_code",
}
# Keep only what we need: the renamed attributes plus the geometry.
keep_cols = [*adm2_renames, "geometry"]
adm2 = (
    gpd.read_file(ADM2_SHP)[keep_cols]
    .rename(columns=adm2_renames)
    .to_crs(4326)
)

# Normalize readable names (codes remain as-is, only cast to str without NaN)
for level in ("adm1", "adm2"):
    adm2[f"{level}_name_norm"] = norm_name(adm2[f"{level}_name"])
adm2["adm2_code"] = as_str_no_nan(adm2["adm2_code"])

print(f"ADM2 polygons: {len(adm2):,} rows, {adm2.crs}")

# --- CAST (ADM1) check ---
print("\nChecking CAST (state-level)…")
if not CAST_STATE_CSV.exists():
    print("CAST file not found; skipping CAST checks.")
else:
    cast = pd.read_csv(CAST_STATE_CSV)
    # Accept either a raw or a pre-scaled column; cast_raw wins when present,
    # and the column is created as all-NaN when neither exists.
    if "cast_raw" in cast.columns:
        cast["cast_state"] = cast["cast_raw"]
    elif "cast_state" not in cast.columns:
        cast["cast_state"] = np.nan
    cast["adm1_name_norm"] = norm_name(cast["adm1_name"])
    # One row per distinct (state, value) pair
    cast1 = cast[["adm1_name_norm", "cast_state"]].drop_duplicates()

    # Attach CAST to every ADM2 polygon via the normalized state name
    adm2_cast = adm2.merge(cast1, on="adm1_name_norm", how="left")

    # Coverage: share of ADM2 rows with a CAST value, plus the unmatched states
    has_cast = adm2_cast["cast_state"].notna()
    missing_cast = adm2_cast.loc[~has_cast, ["adm1_name", "adm1_code"]].drop_duplicates()
    print(f"CAST coverage: matched {has_cast.mean():.1%} of ADM2 rows (by state).")
    if not missing_cast.empty:
        print("States with missing CAST matches:")
        print(missing_cast.to_string(index=False))
        missing_cast.to_csv(OUT_DIR / "cast_missing_states.csv", index=False)

# --- CONEVAL (ADM2) check ---
print("\nChecking CONEVAL municipal poverty (ADM2)…")
# Reference set of polygon codes. Built outside the file-existence guard
# because the CLUES and POP checks further down reuse it; defining it only
# inside the branch raised NameError there whenever the CONEVAL file was absent.
adm2_codes = set(adm2["adm2_code"])
if CONEVAL_CSV.exists():
    # Codes are read as strings so leading zeros and prefixes survive.
    coneval = pd.read_csv(CONEVAL_CSV, dtype={"adm2_code":"string"})
    coneval["adm2_code"] = as_str_no_nan(coneval["adm2_code"])
    # coverage: codes on each side that have no counterpart on the other
    cv_codes   = set(coneval["adm2_code"])
    missing_in_coneval = sorted(adm2_codes - cv_codes)
    extra_in_coneval   = sorted(cv_codes - adm2_codes)

    print(f"CONEVAL coverage: {len(cv_codes & adm2_codes):,} / {len(adm2_codes):,} ADM2 codes matched ({len(missing_in_coneval)} missing, {len(extra_in_coneval)} extra).")
    pd.DataFrame({"adm2_code": missing_in_coneval}).to_csv(OUT_DIR/"coneval_missing_adm2.csv", index=False)
    pd.DataFrame({"adm2_code": extra_in_coneval}).to_csv(OUT_DIR/"coneval_extra_adm2.csv", index=False)

    # Quick distribution of poverty_rate
    if "poverty_rate" in coneval.columns:
        desc = coneval["poverty_rate"].describe()
        print("poverty_rate describe():")
        print(desc)
else:
    print("CONEVAL file not found; skipping CONEVAL checks.")

# --- CLUES (ADM2) check ---
print("\nChecking CLUES facility counts (ADM2)…")
if CLUES_CSV.exists():
    # Codes are read as strings so leading zeros and prefixes survive.
    clues = pd.read_csv(CLUES_CSV, dtype={"adm2_code":"string"})
    clues["adm2_code"] = as_str_no_nan(clues["adm2_code"])

    # Computed here rather than inherited from the CONEVAL section, where it
    # is only assigned when the CONEVAL file exists (NameError otherwise).
    adm2_codes = set(adm2["adm2_code"])
    cl_codes = set(clues["adm2_code"])
    missing_in_clues = sorted(adm2_codes - cl_codes)
    extra_in_clues   = sorted(cl_codes - adm2_codes)

    print(f"CLUES coverage: {len(cl_codes & adm2_codes):,} / {len(adm2_codes):,} ADM2 codes matched ({len(missing_in_clues)} missing, {len(extra_in_clues)} extra).")
    pd.DataFrame({"adm2_code": missing_in_clues}).to_csv(OUT_DIR/"clues_missing_adm2.csv", index=False)
    pd.DataFrame({"adm2_code": extra_in_clues}).to_csv(OUT_DIR/"clues_extra_adm2.csv", index=False)

    # sanity on counts
    if "facilities" in clues.columns:
        print("facilities describe():")
        print(clues["facilities"].describe())
else:
    print("CLUES file not found; skipping CLUES checks.")

# --- POP (ADM2) check ---
print("\nChecking Population (ADM2)…")
if POP_CSV.exists():
    # Codes are read as strings so leading zeros and prefixes survive.
    pop = pd.read_csv(POP_CSV, dtype={"adm2_code":"string"})
    pop["adm2_code"] = as_str_no_nan(pop["adm2_code"])

    # Computed here rather than inherited from the CONEVAL section, where it
    # is only assigned when the CONEVAL file exists (NameError otherwise).
    adm2_codes = set(adm2["adm2_code"])
    pp_codes = set(pop["adm2_code"])
    missing_in_pop = sorted(adm2_codes - pp_codes)
    extra_in_pop   = sorted(pp_codes - adm2_codes)

    print(f"Population coverage: {len(pp_codes & adm2_codes):,} / {len(adm2_codes):,} ADM2 codes matched ({len(missing_in_pop)} missing, {len(extra_in_pop)} extra).")
    pd.DataFrame({"adm2_code": missing_in_pop}).to_csv(OUT_DIR/"pop_missing_adm2.csv", index=False)
    pd.DataFrame({"adm2_code": extra_in_pop}).to_csv(OUT_DIR/"pop_extra_adm2.csv", index=False)

    # Distribution summary for whichever population columns are present
    for col in ["pop_total","pop_wra"]:
        if col in pop.columns:
            print(f"{col} describe():")
            print(pop[col].describe())
else:
    print("Population file not found; skipping POP checks.")

# --- Optional: ACLED points name-join vs spatial-join comparison ---
print("\nACLED events verification (optional)…")
if ACLED_POINTS_CSV.exists():
    # Expect columns: latitude, longitude; optionally admin1/admin2
    events = pd.read_csv(ACLED_POINTS_CSV)
    # Coordinate columns are located case-insensitively.
    latcol = next((c for c in events.columns if c.lower() == "latitude"), None)
    loncol = next((c for c in events.columns if c.lower() == "longitude"), None)
    if not latcol or not loncol:
        print("ACLED points present but missing latitude/longitude columns; skipping spatial join test.")
    else:
        # Drop rows without coordinates first, then build the points from that
        # same filtered frame. Building geometry from the unfiltered `events`
        # yields more points than rows as soon as one coordinate is missing,
        # and the GeoDataFrame constructor rejects the length mismatch.
        events_xy = events.dropna(subset=[latcol, loncol]).copy()
        ev_gdf = gpd.GeoDataFrame(
            events_xy,
            geometry=gpd.points_from_xy(events_xy[loncol], events_xy[latcol]),
            crs=4326
        )

        # Keep small sample if extremely large
        if len(ev_gdf) > 250_000:
            ev_gdf = ev_gdf.sample(250_000, random_state=42).copy()
            print(f"Sampled 250,000 events for spatial join speed (from {len(events):,}).")

        # Left join keeps events that fall outside every polygon (adm2_code is NaN for those).
        ev_in_adm2 = gpd.sjoin(ev_gdf, adm2[["adm2_code","adm1_name_norm","adm2_name_norm","geometry"]],
                               how="left", predicate="within").drop(columns=["index_right"])

        # Name-join attempt (if admin1/admin2 exist)
        a1 = next((c for c in events.columns if c.lower() in {"admin1","adm1","state"}), None)
        a2 = next((c for c in events.columns if c.lower() in {"admin2","adm2","municipio","municipality"}), None)

        if a1 and a2:
            tmp = ev_in_adm2.copy()
            tmp["admin1_norm"] = norm_name(tmp[a1])
            tmp["admin2_norm"] = norm_name(tmp[a2])

            # merge to ADM2 names to get a code by names; columns that exist on
            # both sides keep their name on the left (spatial result) and get
            # the "_adm" suffix on the right (name-based result).
            name_join = tmp.merge(
                adm2[["adm2_code","adm1_name_norm","adm2_name_norm"]],
                left_on=["admin1_norm","admin2_norm"],
                right_on=["adm1_name_norm","adm2_name_norm"],
                how="left",
                suffixes=("","_adm")
            )

            # compare codes: spatial vs name-based
            # NOTE(review): an event with neither a spatial nor a name match
            # compares "" == "" and therefore counts as agreement — confirm
            # that this is the intended definition of the rate.
            both = name_join[["adm2_code", "adm2_code_adm"]].copy()
            both["match"] = both["adm2_code"].fillna("") == both["adm2_code_adm"].fillna("")
            rate = both["match"].mean()
            mism = both[~both["match"]].head(20)
            print(f"Name vs spatial join agreement: {rate:.1%} (sample of {len(both):,} events)")
            if not mism.empty:
                mism.to_csv(OUT_DIR/"acled_name_vs_spatial_mismatches_sample.csv", index=False)
                print("Wrote sample mismatches to out/checks/acled_name_vs_spatial_mismatches_sample.csv")
        else:
            print("ACLED events lack admin1/admin2 columns; only spatial coverage was checked.")

        # Spatial coverage rate
        cov = ev_in_adm2["adm2_code"].notna().mean()
        print(f"Spatial join coverage (events within ADM2 polygons): {cov:.1%} of events")
else:
    print("No ACLED events CSV at data/acled_events_90d.csv; skipping ACLED spatial/name join checks.")

print("\nChecks complete. Reports (if any) are in out/checks/")

Loading ADM2 shapefile…
ADM2 polygons: 2,457 rows, EPSG:4326

Checking CAST (state-level)…
CAST coverage: matched 98.6% of ADM2 rows (by state).
States with missing CAST matches:
           adm1_name adm1_code
    Distrito Federal      MX09
Querétaro de Arteaga      MX22

Checking CONEVAL municipal poverty (ADM2)…
CONEVAL coverage: 2,457 / 2,457 ADM2 codes matched (0 missing, 12 extra).
poverty_rate describe():
count    2466.000000
mean       62.002065
std        21.903723
min         5.450951
25%        45.580691
50%        62.745101
75%        80.316135
max        99.646676
Name: poverty_rate, dtype: float64

Checking CLUES facility counts (ADM2)…
CLUES coverage: 932 / 2,457 ADM2 codes matched (1525 missing, 6 extra).
facilities describe():
count    938.000000
mean       6.152452
std       12.040008
min        1.000000
25%        1.000000
50%        2.000000
75%        6.000000
max      135.000000
Name: facilities, dtype: float64

Checking Population (ADM2)…
Population coverage: 2,45

In [1]:
# %%
import os
import datetime as dt
import requests
import pandas as pd
from dotenv import load_dotenv

# --- Config ---
TOKEN_URL = "https://acleddata.com/oauth/token"
READ_URL  = "https://acleddata.com/api/acled/read"  # default JSON
COUNTRY   = "Mexico"

# --- Auth ---
# Credentials come from the environment (loaded from .env), never from the notebook.
load_dotenv()
ACLED_USER, ACLED_PASS = os.getenv("ACLED_USER"), os.getenv("ACLED_PASS")
assert ACLED_USER and ACLED_PASS, "Set ACLED_USER and ACLED_PASS in your .env"

# Password-grant form posted to the token endpoint.
token_form = {
    "username": ACLED_USER,
    "password": ACLED_PASS,
    "grant_type": "password",
    "client_id": "acled",
}
tok = requests.post(
    TOKEN_URL,
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    data=token_form,
    timeout=60,
)
tok.raise_for_status()
access_token = tok.json()["access_token"]

# --- Date window: last 30 days ---
end = dt.date.today()
start = end - dt.timedelta(days=30)
print(f"Requesting {COUNTRY} events {start} → {end}")

# --- Fetch JSON (default) ---
params = {
    "country": COUNTRY,
    "event_date": f"{start}|{end}",
    "event_date_where": "BETWEEN",
    "limit": 5000,  # single request, no pagination; truncation is flagged below
}
r = requests.get(
    READ_URL,
    headers={"Authorization": f"Bearer {access_token}"},
    params=params,
    timeout=120,
)
r.raise_for_status()

js = r.json()
# The body's own "status" field is checked in addition to the HTTP status;
# a 200 here can still come with an empty 'data' list.
status = js.get("status")
data = js.get("data", [])

if status != 200:
    print(f"ACLED returned status {status}. Full response:\n{js}")
elif not data:
    # Helpful diagnostics when empty
    msg = js.get("message") or js.get("detail") or "(no message)"
    print("No rows returned. Possible causes: recency cap or filters too narrow.")
    print(f"Server message: {msg}")
else:
    df = pd.DataFrame(data)
    print(f"Downloaded {len(df):,} rows; columns: {list(df.columns)[:10]}…")
    if len(df) >= params["limit"]:
        # Hitting the cap exactly means more rows may exist that this single
        # request did not return; without this notice the CSV would be
        # silently incomplete.
        print(f"[Warning] Row count reached the request limit ({params['limit']:,}); "
              "results may be truncated — paginate or narrow the date window.")
    out = "acled_mexico_30d.csv"
    df.to_csv(out, index=False)
    print(f"Saved → {out}")

Requesting Mexico events 2025-09-25 → 2025-10-25
No rows returned. Possible causes: recency cap or filters too narrow.
Server message: (no message)
