In [1]:
from __future__ import annotations

import re
import gc
from pathlib import Path

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

YEARS = list(range(2014, 2020))

CACHE_DIR = Path("data/cache")
OUT_DIR   = Path("data/processed")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

CHUNKSIZE = 300_000

CANDIDATE_MA_ROOTS = [
    Path("/scion/5261/econ470001/ma-data/ma"),
    Path("/econ470/a0/work/ma-data/ma"),
    Path.cwd().parent / "ma-data" / "ma",
]

def pick_existing(paths: list[Path]) -> Path | None:
    for p in paths:
        if p.exists():
            return p
    return None

MA_ROOT = pick_existing(CANDIDATE_MA_ROOTS)
if MA_ROOT is None:
    raise FileNotFoundError("Could not find ma-data/ma. Update CANDIDATE_MA_ROOTS if needed.")

ENROLL_EXTRACTED = MA_ROOT / "enrollment" / "Extracted Data"
SAREA_EXTRACTED  = MA_ROOT / "service-area" / "Extracted Data"
PEN_EXTRACTED    = MA_ROOT / "penetration" / "Extracted Data"

print("MA_ROOT:", MA_ROOT)
print("ENROLL_EXTRACTED exists:", ENROLL_EXTRACTED.exists())
print("SAREA_EXTRACTED exists:", SAREA_EXTRACTED.exists())
print("PEN_EXTRACTED exists:", PEN_EXTRACTED.exists())

FORCE_REBUILD_CACHE   = True
FORCE_REBUILD_OUTPUTS = True

print("FORCE_REBUILD_CACHE:", FORCE_REBUILD_CACHE)
print("FORCE_REBUILD_OUTPUTS:", FORCE_REBUILD_OUTPUTS)

# Optional cleanup
if FORCE_REBUILD_CACHE:
    n = 0
    for p in CACHE_DIR.glob("plan_county_year_*.csv"):
        p.unlink()
        n += 1
    print(f"Deleted {n} cache files in {CACHE_DIR}/")

if FORCE_REBUILD_OUTPUTS:
    targets = [
        OUT_DIR / "county_plan_counts_2014_2019.csv",
        OUT_DIR / "county_hhi_ma_share_2014_2019.csv",
    ]
    for p in targets:
        if p.exists():
            p.unlink()
            print("Deleted output:", p)

MA_ROOT: /scion/5261/econ470001/ma-data/ma
ENROLL_EXTRACTED exists: True
SAREA_EXTRACTED exists: True
PEN_EXTRACTED exists: True
FORCE_REBUILD_CACHE: True
FORCE_REBUILD_OUTPUTS: True
Deleted 6 cache files in data/cache/
Deleted output: data/processed/county_plan_counts_2014_2019.csv
Deleted output: data/processed/county_hhi_ma_share_2014_2019.csv


In [2]:
TRUE_SET  = {"1", "1.0", "true", "t", "yes", "y"}
FALSE_SET = {"0", "0.0", "false", "f", "no", "n", "nan", "none", ""}

def parse_boolish_series(x: pd.Series) -> pd.Series:
    s = x.astype(str).str.strip().str.lower()
    s = s.replace({"<na>": "", "na": "", "n/a": ""})

    out = pd.Series(False, index=s.index)
    out[s.isin(TRUE_SET)] = True
    out[s.isin(FALSE_SET)] = False

    num = pd.to_numeric(s, errors="coerce")
    out[num.notna()] = (num[num.notna()] != 0)
    return out

def norm_colname(s: str) -> str:
    s = str(s).strip().lower()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [norm_colname(c) for c in df.columns]
    return df

def first_existing_col(cols: list[str], candidates: list[str]) -> str | None:
    s = set(cols)
    for c in candidates:
        if c in s:
            return c
    return None

def coerce_id_series(x: pd.Series, width: int | None = None) -> pd.Series:
    s = x.astype(str).str.strip()
    s = s.replace({"nan": np.nan, "None": np.nan, "": np.nan})
    s = s.str.replace(r"\.0$", "", regex=True)
    if width is not None:
        s = s.str.zfill(width)
    return s

def to_numeric_clean(x: pd.Series) -> pd.Series:
    s = x.astype(str).str.replace(",", "", regex=False).str.strip()
    s = s.replace({"nan": np.nan, "None": np.nan, "": np.nan, "*": np.nan, "$-": np.nan, "-": np.nan})
    s = s.str.replace("$", "", regex=False)
    return pd.to_numeric(s, errors="coerce")

def _digits_zfill(x: pd.Series, width: int) -> pd.Series:
    s = x.astype(str).str.strip()
    s = s.replace({"nan": np.nan, "None": np.nan, "": np.nan})
    s = s.str.replace(r"\.0$", "", regex=True)
    s = s.str.replace(r"[^0-9]", "", regex=True)
    s = s.where(s.str.len() > 0, np.nan)
    return s.str.zfill(width)

def coerce_fips(df: pd.DataFrame) -> pd.Series:
    cols = list(df.columns)

    fips_col = first_existing_col(
        cols,
        [
            "fips",
            "fips_state_county_code",
            "fipsstatecountycode",
            "county_fips",
            "countyfips",
            "cnty_fips",
            "fipscounty",
            "fipscnty",
        ],
    )
    if fips_col is not None:
        return _digits_zfill(df[fips_col], 5)

    st_col = first_existing_col(cols, ["state_fips", "statefips", "fipsst", "fips_state"])
    ct_col = first_existing_col(cols, ["county_fips", "countyfips", "fipscnty", "fips_county"])

    if st_col is not None and ct_col is not None:
        st = _digits_zfill(df[st_col], 2)
        ct = _digits_zfill(df[ct_col], 3)
        out = (st.fillna("") + ct.fillna("")).replace({"": np.nan})
        return out

    return pd.Series([np.nan] * len(df), index=df.index, dtype="object")

In [3]:
def find_month_file(root: Path, year: int, month: int, kind: str) -> Path:
    m2 = f"{month:02d}"
    files = [p for p in root.rglob("*") if p.is_file()]

    if kind == "enroll":
        pats = [rf".*{year}.*{m2}.*enroll.*\.(csv|txt)$", rf".*enroll.*{year}.*{m2}.*\.(csv|txt)$"]
    elif kind == "sarea":
        pats = [
            rf".*{year}.*{m2}.*(service|sa|sarea|cnty_sa|service_area).*?\.(csv|txt)$",
            rf".*(service|sa|sarea|cnty_sa|service_area).*{year}.*{m2}.*?\.(csv|txt)$",
        ]
    else:
        raise ValueError("unknown kind")

    for pat in pats:
        rx = re.compile(pat, flags=re.I)
        hits = [p for p in files if rx.match(p.name)]
        if hits:
            hits.sort(key=lambda p: len(p.name))
            return hits[0]

    rx = re.compile(rf".*{year}.*{m2}.*\.(csv|txt)$", flags=re.I)
    hits = [p for p in files if rx.match(p.name)]
    if hits:
        hits.sort(key=lambda p: len(p.name))
        return hits[0]

    raise FileNotFoundError(f"Could not find {kind} file for {year}-{m2} under {root}")

def find_year_file(root: Path, year: int, kind: str) -> Path:
    files = [p for p in root.rglob("*") if p.is_file()]
    if kind != "penetration":
        raise ValueError("unknown kind")

    pats = [rf".*{year}.*penet.*\.(csv|txt)$", rf".*penet.*{year}.*\.(csv|txt)$"]
    for pat in pats:
        rx = re.compile(pat, flags=re.I)
        hits = [p for p in files if rx.match(p.name)]
        if hits:
            hits.sort(key=lambda p: len(p.name))
            return hits[0]

    rx = re.compile(rf".*{year}.*\.(csv|txt)$", flags=re.I)
    hits = [p for p in files if rx.match(p.name)]
    if hits:
        hits.sort(key=lambda p: len(p.name))
        return hits[0]

    raise FileNotFoundError(f"Could not find {kind} file for {year} under {root}")

In [4]:
def read_csv_any(path: Path, usecols=None) -> pd.DataFrame:
    return pd.read_csv(
        path,
        dtype=str,
        na_values=["*", "NA", "N/A", ""],
        keep_default_na=True,
        encoding_errors="replace",
        low_memory=False,
        usecols=usecols,
    )

def read_enroll_min(path: Path) -> pd.DataFrame:
    want = {
        "contract_number",
        "contract_id",
        "plan_id",
        "fips_state_county_code",
        "enrollment",
        "snp",
        "snp_type",
        "partc",
        "part_c",
        "plan_type",
        "contract_type",
    }

    df = read_csv_any(path, usecols=lambda c: norm_colname(c) in want)
    df = normalize_columns(df)

    contract_col = first_existing_col(list(df.columns), ["contract_id", "contract_number"])
    if contract_col is None:
        raise KeyError("Enrollment file missing contract column")
    if "plan_id" not in df.columns:
        raise KeyError("Enrollment file missing plan_id column")
    if "enrollment" not in df.columns:
        raise KeyError("Enrollment file missing enrollment column")

    df["contract_id"] = coerce_id_series(df[contract_col], None)
    df["plan_id"] = coerce_id_series(df["plan_id"], 3)

    fips_src = df.get("fips_state_county_code", pd.Series([np.nan] * len(df), index=df.index))
    df["fips"] = _digits_zfill(fips_src, 5)

    df["enrollment"] = to_numeric_clean(df["enrollment"])

    keep = ["contract_id", "plan_id", "fips", "enrollment"]
    for c in ["snp", "snp_type", "partc", "part_c", "plan_type", "contract_type"]:
        if c in df.columns:
            keep.append(c)

    return df[keep]

def read_sarea_min(path: Path) -> pd.DataFrame:
    keep = {
        "contract_id",
        "contract_number",
        "plan_id",
        "plan",
        "planid",
        "plan_number",
        "fips",
        "fips_state_county_code",
        "fipscounty",
        "fipscnty",
        "county_fips",
        "cnty_fips",
    }

    df = read_csv_any(path, usecols=lambda c: norm_colname(c) in keep)
    df = normalize_columns(df)

    cols = list(df.columns)
    contract_col = first_existing_col(cols, ["contract_id", "contract_number", "contract"])
    if contract_col is None:
        raise KeyError(f"Service area missing contract column. Sample cols: {sorted(cols)[:60]}")

    df["contract_id"] = coerce_id_series(df[contract_col], None)

    plan_col = first_existing_col(cols, ["plan_id", "planid", "plan", "plan_number"])
    if plan_col is None:
        df["plan_id"] = np.nan
    else:
        df["plan_id"] = coerce_id_series(df[plan_col], 3)

    df["fips"] = coerce_fips(df)

    return df[["contract_id", "plan_id", "fips"]]

In [5]:
def flag_exclusions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Produces:
      is_snp, is_800_series, is_pdp_only, drop_hw2  (all 0/1 ints)
    """
    df = df.copy()

    df["plan_id"] = df["plan_id"].astype(str).str.strip().str.zfill(3)
    df["is_800_series"] = df["plan_id"].str.startswith("8").astype(int)

    cols = list(df.columns)

    # SNP
    snp_col = first_existing_col(cols, ["snp"])
    snp_type_col = first_existing_col(cols, ["snp_type"])

    if snp_col is not None:
        df["is_snp"] = parse_boolish_series(df[snp_col]).astype(int)
    elif snp_type_col is not None:
        st = df[snp_type_col].astype(str).str.strip().str.lower()
        df["is_snp"] = st.str.contains("snp", na=False).astype(int)
    else:
        df["is_snp"] = 0

    # PDP-only (no Part C)
    partc_col = first_existing_col(cols, ["partc", "part_c"])
    if partc_col is not None:
        is_partc = parse_boolish_series(df[partc_col])
        df["is_pdp_only"] = (~is_partc).astype(int)
    else:
        plan_type_col = first_existing_col(cols, ["plan_type", "contract_type"])
        if plan_type_col is None:
            df["is_pdp_only"] = 0
        else:
            s = df[plan_type_col].astype(str).str.lower().str.strip()
            df["is_pdp_only"] = (
                s.str.contains(r"\bpdp\b", regex=True, na=False)
                | s.str.contains("prescription drug", na=False)
                | s.str.contains("part d", na=False)
            ).astype(int)

    df["drop_hw2"] = ((df["is_snp"] == 1) | (df["is_800_series"] == 1) | (df["is_pdp_only"] == 1)).astype(int)
    return df

In [6]:
def build_plan_county_year(year: int) -> Path:
    out_path = CACHE_DIR / f"plan_county_year_{year}.csv"

    if FORCE_REBUILD_CACHE and out_path.exists():
        out_path.unlink()
        print("Deleted cache:", out_path)

    if out_path.exists():
        return out_path

    accum = None

    for m in range(1, 13):
        enroll_path = find_month_file(ENROLL_EXTRACTED, year, m, "enroll")
        sarea_path  = find_month_file(SAREA_EXTRACTED,  year, m, "sarea")

        enroll = read_enroll_min(enroll_path)
        sarea  = read_sarea_min(sarea_path)

        enroll = enroll.dropna(subset=["contract_id", "plan_id", "fips"])
        sarea  = sarea.dropna(subset=["contract_id", "fips"])

        has_plan = sarea["plan_id"].notna().any()
        if has_plan:
            sarea = sarea.dropna(subset=["plan_id"])
            merged = enroll.merge(sarea, on=["contract_id", "plan_id", "fips"], how="inner")
        else:
            merged = enroll.merge(sarea[["contract_id", "fips"]], on=["contract_id", "fips"], how="inner")

        merged["year"] = int(year)
        merged = flag_exclusions(merged)

        g = merged.groupby(["contract_id", "plan_id", "fips", "year"], as_index=False).agg(
            enroll_sum=("enrollment", "sum"),
            is_snp=("is_snp", "max"),
            is_800_series=("is_800_series", "max"),
            is_pdp_only=("is_pdp_only", "max"),
            drop_hw2=("drop_hw2", "max"),
        )

        g["months_observed"] = 1
        g["dec_enrollment"] = g["enroll_sum"] if m == 12 else np.nan

        if accum is None:
            accum = g
        else:
            keys = ["contract_id", "plan_id", "fips", "year"]
            accum = accum.merge(g, on=keys, how="outer", suffixes=("", "_new"))

            accum["enroll_sum"] = accum["enroll_sum"].fillna(0) + accum["enroll_sum_new"].fillna(0)
            accum["months_observed"] = accum["months_observed"].fillna(0) + accum["months_observed_new"].fillna(0)

            for c in ["is_snp", "is_800_series", "is_pdp_only", "drop_hw2"]:
                accum[c] = np.maximum(
                    pd.to_numeric(accum[c], errors="coerce").fillna(0),
                    pd.to_numeric(accum[f"{c}_new"], errors="coerce").fillna(0),
                ).astype(int)

            accum["dec_enrollment"] = accum["dec_enrollment"].combine_first(accum["dec_enrollment_new"])

            dropcols = [c for c in accum.columns if c.endswith("_new")]
            accum = accum.drop(columns=dropcols)

        del enroll, sarea, merged, g
        gc.collect()

    accum["avg_enrollment"] = accum["enroll_sum"] / accum["months_observed"].replace({0: np.nan})

    out = accum[
        [
            "contract_id",
            "plan_id",
            "fips",
            "year",
            "avg_enrollment",
            "dec_enrollment",
            "months_observed",
            "is_snp",
            "is_800_series",
            "is_pdp_only",
            "drop_hw2",
        ]
    ].copy()

    # enforce clean types for downstream notebooks
    out["contract_id"] = out["contract_id"].astype(str).str.strip()
    out["plan_id"] = out["plan_id"].astype(str).str.strip().str.zfill(3)
    out["fips"] = out["fips"].astype(str).str.strip().str.zfill(5)
    out["year"] = pd.to_numeric(out["year"], errors="coerce").astype(int)

    for c in ["is_snp", "is_800_series", "is_pdp_only", "drop_hw2"]:
        out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0).astype(int)

    for c in ["avg_enrollment", "dec_enrollment", "months_observed"]:
        out[c] = pd.to_numeric(out[c], errors="coerce")

    # IMPORTANT: enforce uniqueness
    out = out.drop_duplicates(subset=["contract_id", "plan_id", "fips", "year"]).copy()

    out.to_csv(out_path, index=False)

    del accum, out
    gc.collect()
    return out_path

In [7]:
def build_county_plan_counts_2014_2019() -> Path:
    out_path = OUT_DIR / "county_plan_counts_2014_2019.csv"

    if FORCE_REBUILD_OUTPUTS and out_path.exists():
        out_path.unlink()
        print("Deleted output:", out_path)

    if out_path.exists():
        return out_path

    rows = []

    for y in YEARS:
        print("Plan counts. Year:", y)
        yp = build_plan_county_year(y)

        usecols = ["fips", "year", "drop_hw2"]
        pieces = []
        for chunk in pd.read_csv(yp, dtype=str, usecols=usecols, chunksize=CHUNKSIZE):
            keep = chunk["drop_hw2"].astype(str).str.strip().isin(["0", "0.0"])
            c = chunk.loc[keep, ["fips", "year"]].dropna().copy()
            c["fips"] = c["fips"].astype(str).str.zfill(5)
            c["year"] = pd.to_numeric(c["year"], errors="coerce").astype(int)
            # Each row is already one (contract-plan)-county-year after the cache build.
            tmp = c.groupby(["fips", "year"], as_index=False).size().rename(columns={"size": "plan_count"})
            pieces.append(tmp)

            del chunk, c, tmp
            gc.collect()

        out_y = pd.concat(pieces, ignore_index=True).groupby(["fips","year"], as_index=False)["plan_count"].sum()
        rows.append(out_y)
        del pieces, out_y
        gc.collect()

    out = pd.concat(rows, ignore_index=True).sort_values(["year","fips"]).reset_index(drop=True)
    out.to_csv(out_path, index=False)
    print("Wrote:", out_path, "rows:", len(out))
    return out_path

county_counts_path = build_county_plan_counts_2014_2019()

Plan counts. Year: 2014
Plan counts. Year: 2015
Plan counts. Year: 2016
Plan counts. Year: 2017
Plan counts. Year: 2018
Plan counts. Year: 2019
Wrote: data/processed/county_plan_counts_2014_2019.csv rows: 19059


In [8]:
def read_penetration_year(year: int) -> pd.DataFrame:
    path = find_year_file(PEN_EXTRACTED, year, "penetration")
    df = read_csv_any(path)
    df = normalize_columns(df)

    df["fips"] = coerce_fips(df)

    cols = list(df.columns)
    ma_col = first_existing_col(cols, ["ma_enrollment", "ma_enroll", "enrolled", "enroll"])
    elig_col = first_existing_col(cols, ["eligibles", "total_eligibles", "medicare_eligibles", "tot_eligibles"])

    if ma_col is None or elig_col is None:
        raise KeyError(f"Penetration file missing enrolled or eligibles. Col sample: {sorted(cols)[:100]}")

    out = df[["fips", ma_col, elig_col]].copy()
    out = out.rename(columns={ma_col: "ma_enrollment", elig_col: "eligibles"})
    out["ma_enrollment"] = to_numeric_clean(out["ma_enrollment"])
    out["eligibles"] = to_numeric_clean(out["eligibles"])
    out["year"] = int(year)

    out = out.dropna(subset=["fips"])
    out["fips"] = out["fips"].astype(str).str.zfill(5)
    return out

pen = pd.concat([read_penetration_year(y) for y in YEARS], ignore_index=True)
print("Penetration shape:", pen.shape)

Penetration shape: (19494, 4)


In [9]:
def build_county_hhi_ma_share_2014_2019(pen: pd.DataFrame) -> Path:
    out_path = OUT_DIR / "county_hhi_ma_share_2014_2019.csv"

    if FORCE_REBUILD_OUTPUTS and out_path.exists():
        out_path.unlink()
        print("Deleted output:", out_path)

    if out_path.exists():
        return out_path

    pen2 = pen.copy()
    pen2["year"] = pen2["year"].astype(int)
    pen2["fips"] = pen2["fips"].astype(str).str.zfill(5)

    rows = []

    for y in YEARS:
        print("HHI + MA share. Year:", y)
        yp = build_plan_county_year(y)

        usecols = ["fips", "year", "contract_id", "dec_enrollment", "avg_enrollment", "drop_hw2"]
        contract_pieces = []

        for chunk in pd.read_csv(yp, dtype=str, usecols=usecols, chunksize=CHUNKSIZE):
            keep = chunk["drop_hw2"].astype(str).str.strip().isin(["0", "0.0"])
            chunk = chunk.loc[keep].copy()

            chunk["fips"] = chunk["fips"].astype(str).str.zfill(5)
            chunk["year"] = pd.to_numeric(chunk["year"], errors="coerce").astype(int)
            chunk["contract_id"] = chunk["contract_id"].astype(str).str.strip()

            dec = pd.to_numeric(chunk["dec_enrollment"], errors="coerce")
            avg = pd.to_numeric(chunk["avg_enrollment"], errors="coerce")
            e = dec.fillna(avg).fillna(0.0)
            chunk["e"] = e

            # IMPORTANT: HHI across firms => aggregate to contract within county-year first
            tmp = (
                chunk.groupby(["fips", "year", "contract_id"], as_index=False)
                     .agg(contract_enroll=("e", "sum"))
            )
            contract_pieces.append(tmp)

            del chunk, tmp
            gc.collect()

        cdf = pd.concat(contract_pieces, ignore_index=True)
        cdf = cdf.groupby(["fips", "year", "contract_id"], as_index=False)["contract_enroll"].sum()

        cdf["enroll_sq"] = cdf["contract_enroll"] * cdf["contract_enroll"]

        out = (
            cdf.groupby(["fips", "year"], as_index=False)
               .agg(
                   ma_total_from_plans=("contract_enroll", "sum"),
                   sumsq_enroll=("enroll_sq", "sum"),
               )
        )

        out["hhi"] = out["sumsq_enroll"] / (out["ma_total_from_plans"] * out["ma_total_from_plans"]).replace({0: np.nan})

        out = out.merge(pen2, on=["fips", "year"], how="left")
        out["ma_share"] = out["ma_enrollment"] / out["eligibles"]
        out["ma_share_clipped"] = out["ma_share"].clip(lower=0, upper=1)

        rows.append(out)

        del contract_pieces, cdf, out
        gc.collect()

    final = pd.concat(rows, ignore_index=True).sort_values(["year","fips"]).reset_index(drop=True)
    final.to_csv(out_path, index=False)
    print("Wrote:", out_path, "rows:", len(final))
    return out_path

county_hhi_path = build_county_hhi_ma_share_2014_2019(pen)

HHI + MA share. Year: 2014
Deleted cache: data/cache/plan_county_year_2014.csv
HHI + MA share. Year: 2015
Deleted cache: data/cache/plan_county_year_2015.csv
HHI + MA share. Year: 2016
Deleted cache: data/cache/plan_county_year_2016.csv
HHI + MA share. Year: 2017
Deleted cache: data/cache/plan_county_year_2017.csv
HHI + MA share. Year: 2018
Deleted cache: data/cache/plan_county_year_2018.csv
HHI + MA share. Year: 2019
Deleted cache: data/cache/plan_county_year_2019.csv
Wrote: data/processed/county_hhi_ma_share_2014_2019.csv rows: 19059


In [10]:
counts = pd.read_csv(OUT_DIR / "county_plan_counts_2014_2019.csv", dtype={"fips": str})
hhi_df = pd.read_csv(OUT_DIR / "county_hhi_ma_share_2014_2019.csv", dtype={"fips": str})

counts["fips"] = counts["fips"].str.zfill(5)
counts["year"] = counts["year"].astype(int)

hhi_df["fips"] = hhi_df["fips"].str.zfill(5)
hhi_df["year"] = hhi_df["year"].astype(int)

print("Counts rows:", counts.shape)
print("HHI rows:", hhi_df.shape)

print("\nPlan count sanity:")
print(counts.groupby("year")["plan_count"].agg(["count","mean","median","min","max"]))

hhi_df["hhi"] = pd.to_numeric(hhi_df["hhi"], errors="coerce")
hhi_df["ma_share"] = pd.to_numeric(hhi_df["ma_share"], errors="coerce")

print("\nHHI sanity:")
print(hhi_df.groupby("year")["hhi"].agg(["count","mean","median","min","max"]))

print("\nMA share sanity:")
print(hhi_df.groupby("year")["ma_share"].agg(["count","mean","median","min","max"]))

print("\nPenetration merge missingness (ma_enrollment):", hhi_df["ma_enrollment"].isna().mean())
print("Penetration merge missingness (eligibles):", hhi_df["eligibles"].isna().mean())

# .gitignore: never push cache
gitignore_path = Path(".gitignore")
existing = gitignore_path.read_text().splitlines() if gitignore_path.exists() else []

rules = [
    "data/cache/",
    "data/cache/cms_payment_extracted/",
]

new_lines = existing[:]
for r in rules:
    if r not in new_lines:
        new_lines.append(r)

gitignore_path.write_text("\n".join(new_lines) + "\n")
print("Updated .gitignore with cache rules.")

Counts rows: (19059, 3)
HHI rows: (19059, 9)

Plan count sanity:
      count       mean  median  min  max
year                                    
2014   3162  23.867805    15.0    1  389
2015   3169  24.770275    17.0    1  359
2016   3175  26.181102    17.0    1  398
2017   3172  26.918979    18.0    1  408
2018   3185  32.416954    22.0    1  469
2019   3196  36.135169    24.0    1  525

HHI sanity:
      count      mean    median       min  max
year                                          
2014   2980  0.452670  0.387899  0.101179  1.0
2015   2977  0.450364  0.380093  0.089584  1.0
2016   2988  0.453184  0.383756  0.103697  1.0
2017   2987  0.459304  0.387529  0.091483  1.0
2018   2994  0.440822  0.367966  0.089141  1.0
2019   2987  0.398212  0.316754  0.086397  1.0

MA share sanity:
      count      mean    median       min       max
year                                               
2014   3108  0.224649  0.194643  0.004293  1.156381
2015   3108  0.237273  0.210380  0.003261  1