In [1]:
from __future__ import annotations

import re
import gc
from pathlib import Path

import numpy as np
import pandas as pd

# Widen pandas' text display so wide frames print without being truncated.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

# Analysis years: 2014 through 2019 inclusive (the range end is exclusive).
YEARS = list(range(2014, 2020))

# CACHE_DIR receives the per-year plan_county_year_{year}.csv intermediates;
# OUT_DIR receives the final county-level CSVs. Both are relative to the working directory.
CACHE_DIR = Path("data/cache")
OUT_DIR   = Path("data/processed")
# Create both directories up front; no error if they already exist.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Rows per chunk when the cached CSVs are streamed back in with pd.read_csv(chunksize=...).
CHUNKSIZE = 300_000

In [2]:
# Candidate locations of the raw "ma-data/ma" tree, checked in order; the first one
# that exists is used. The first two are absolute, machine-specific paths; the last
# is resolved relative to the parent of the current working directory.
CANDIDATE_MA_ROOTS = [
    Path("/scion/5261/econ470001/ma-data/ma"),
    Path("/econ470/a0/work/ma-data/ma"),
    Path.cwd().parent / "ma-data" / "ma",
]

def pick_existing(paths: list[Path]) -> Path | None:
    """Return the first path in ``paths`` that exists on disk, or ``None`` if none do."""
    return next((candidate for candidate in paths if candidate.exists()), None)

# Resolve the raw-data root; fail fast with a clear message if no candidate exists.
MA_ROOT = pick_existing(CANDIDATE_MA_ROOTS)
if MA_ROOT is None:
    raise FileNotFoundError("Could not find ma-data/ma. Update CANDIDATE_MA_ROOTS if needed.")

# Sub-folders holding the extracted enrollment, service-area and penetration files.
ENROLL_EXTRACTED = MA_ROOT / "enrollment" / "Extracted Data"
SAREA_EXTRACTED  = MA_ROOT / "service-area" / "Extracted Data"
PEN_EXTRACTED    = MA_ROOT / "penetration" / "Extracted Data"

# Report what was found so a missing folder is visible before any heavy work starts.
print("MA_ROOT:", MA_ROOT)
print("ENROLL_EXTRACTED exists:", ENROLL_EXTRACTED.exists())
print("SAREA_EXTRACTED exists:", SAREA_EXTRACTED.exists())
print("PEN_EXTRACTED exists:", PEN_EXTRACTED.exists())

MA_ROOT: /scion/5261/econ470001/ma-data/ma
ENROLL_EXTRACTED exists: True
SAREA_EXTRACTED exists: True
PEN_EXTRACTED exists: True


In [3]:
# When True, existing plan_county_year_*.csv cache files are deleted so they get rebuilt
# from the raw monthly files.
FORCE_REBUILD_CACHE   = True  
# When True, the processed county-level outputs are deleted and regenerated.
FORCE_REBUILD_OUTPUTS = True   

print("FORCE_REBUILD_CACHE:", FORCE_REBUILD_CACHE)
print("FORCE_REBUILD_OUTPUTS:", FORCE_REBUILD_OUTPUTS)

FORCE_REBUILD_CACHE: True
FORCE_REBUILD_OUTPUTS: True


In [4]:
# One-time cleanup: remove stale cached/processed files so later cells cannot read old results.
if FORCE_REBUILD_CACHE:
    stale_caches = list(CACHE_DIR.glob("plan_county_year_*.csv"))
    for p in stale_caches:
        p.unlink()
    n = len(stale_caches)
    print(f"Deleted {n} cache files in {CACHE_DIR}/")

if FORCE_REBUILD_OUTPUTS:
    targets = [
        OUT_DIR / "county_plan_counts_2014_2019.csv",
        OUT_DIR / "county_hhi_ma_share_2014_2019.csv",
    ]
    # Only outputs that are actually present are removed (and reported).
    for p in targets:
        if not p.exists():
            continue
        p.unlink()
        print("Deleted output:", p)

Deleted 6 cache files in data/cache/
Deleted output: data/processed/county_plan_counts_2014_2019.csv
Deleted output: data/processed/county_hhi_ma_share_2014_2019.csv


In [5]:
TRUE_SET  = {"1", "1.0", "true", "t", "yes", "y"}
FALSE_SET = {"0", "0.0", "false", "f", "no", "n", "nan", "none", ""}

def parse_boolish_series(x: pd.Series) -> pd.Series:
    """
    Parse a column of loosely formatted truth values into a boolean Series.

    Values are compared case-insensitively after stripping whitespace. Tokens in
    TRUE_SET map to True, tokens in FALSE_SET map to False, and anything that parses
    as a number is True exactly when it is non-zero (the numeric rule wins over the
    token sets). Everything else, including missing values, is False.
    """
    text = x.astype(str).str.strip().str.lower()
    text = text.replace({"<na>": "", "na": "", "n/a": ""})

    as_number = pd.to_numeric(text, errors="coerce")
    is_number = as_number.notna()

    # Token verdict applies only to non-numeric entries; FALSE_SET overrides TRUE_SET.
    token_true = text.isin(TRUE_SET) & ~text.isin(FALSE_SET) & ~is_number
    number_true = is_number & as_number.ne(0)

    return pd.Series((token_true | number_true).to_numpy(dtype=bool), index=text.index)

def norm_colname(s: str) -> str:
    """
    Normalize a column header to a lowercase snake_case identifier.

    Whitespace runs become a single underscore, every character outside
    ``[a-z0-9_]`` is removed (not replaced), repeated underscores are collapsed,
    and leading/trailing underscores are stripped.
    """
    cleaned = str(s).strip().lower()
    substitutions = (
        (r"\s+", "_"),
        (r"[^a-z0-9_]", ""),
        (r"_+", "_"),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip("_")

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels have been passed through ``norm_colname``."""
    renamed = df.copy()
    renamed.columns = list(map(norm_colname, renamed.columns))
    return renamed

def first_existing_col(cols: list[str], candidates: list[str]) -> str | None:
    """
    Return the first entry of ``candidates`` that is present in ``cols``.

    Priority follows the order of ``candidates``, not the order of ``cols``.
    Returns ``None`` when there is no overlap.
    """
    available = set(cols)
    return next((name for name in candidates if name in available), None)

def coerce_id_series(x: pd.Series, width: int | None = None) -> pd.Series:
    """
    Clean an identifier column (contract id, plan id, ...) into trimmed strings.

    Parameters
    ----------
    x : pd.Series
        Raw identifier values of any dtype.
    width : int | None
        If given, identifiers are left-padded with zeros to this width.

    Returns
    -------
    pd.Series
        Object-dtype Series of strings with a trailing ".0" removed (an artifact of
        ids that were parsed as floats) and NaN where the input was missing. The
        stringified placeholders "nan", "None", "<NA>" and "" count as missing.
    """
    text = x.astype(str).str.strip()
    missing = text.isin(["nan", "None", "<NA>", ""])

    cleaned = text.str.replace(r"\.0$", "", regex=True)
    if width is not None:
        cleaned = cleaned.str.zfill(width)

    # Missing values are blanked out only after all string operations have run.
    # Replacing them with NaN first can turn an all-missing column into float64 on
    # older pandas, which makes the `.str` accessor raise AttributeError.
    # The result is pinned to object dtype so merge keys keep a consistent dtype.
    return cleaned.mask(missing).astype(object)

def to_numeric_clean(x: pd.Series) -> pd.Series:
    """
    Convert a column of formatted numbers to a numeric Series.

    Thousands separators (",") and dollar signs are removed before parsing.
    Anything that still does not parse as a number becomes NaN. This covers the
    placeholder tokens "nan", "None", "", "*", "-" and "$-" as well as real
    missing values.
    """
    text = x.astype(str).str.replace(",", "", regex=False).str.strip()
    text = text.str.replace("$", "", regex=False)
    # errors="coerce" maps every placeholder to NaN, so no separate replace step is
    # needed. The former replace-with-NaN step could turn an all-missing column into
    # float64 on older pandas and make the following `.str` call raise.
    return pd.to_numeric(text, errors="coerce")

def _digits_zfill(x: pd.Series, width: int) -> pd.Series:
    s = x.astype(str).str.strip()
    s = s.replace({"nan": np.nan, "None": np.nan, "": np.nan})
    s = s.str.replace(r"\.0$", "", regex=True)
    s = s.str.replace(r"[^0-9]", "", regex=True)
    s = s.where(s.str.len() > 0, np.nan)
    return s.str.zfill(width)

def coerce_fips(df: pd.DataFrame) -> pd.Series:
    """
    Derive a 5-digit county FIPS string for every row of ``df``.

    Lookup order (column names are expected to be already normalized):
      1. a single combined column, zero-padded to 5 digits;
      2. a state column (2 digits) concatenated with a county column (3 digits);
      3. otherwise an all-NaN object Series aligned to ``df``.

    NOTE(review): several county-only names ("county_fips", "countyfips", "fipscnty")
    appear in the combined list as well, so they are consumed by step 1 and padded to
    5 digits on their own. Confirm that those columns hold full 5-digit codes in the
    inputs used here.
    """
    cols = list(df.columns)

    combined_candidates = [
        "fips",
        "fips_state_county_code",
        "fipsstatecountycode",
        "county_fips",
        "countyfips",
        "cnty_fips",
        "fipscounty",
        "fipscnty",
    ]
    combined_col = first_existing_col(cols, combined_candidates)
    if combined_col is not None:
        return _digits_zfill(df[combined_col], 5)

    state_col = first_existing_col(cols, ["state_fips", "statefips", "fipsst", "fips_state"])
    county_col = first_existing_col(cols, ["county_fips", "countyfips", "fipscnty", "fips_county"])

    if state_col is None or county_col is None:
        return pd.Series([np.nan] * len(df), index=df.index, dtype="object")

    # A missing half contributes an empty string; only rows where both halves are
    # missing collapse to NaN.
    state_part = _digits_zfill(df[state_col], 2).fillna("")
    county_part = _digits_zfill(df[county_col], 3).fillna("")
    return (state_part + county_part).replace({"": np.nan})

In [6]:
def find_month_file(root: Path, year: int, month: int, kind: str) -> Path:
    """
    Locate the monthly CSV/TXT file of a given kind under ``root`` (searched recursively).

    Parameters
    ----------
    root : Path
        Directory tree to search.
    year, month : int
        Period to look for; the month is matched as two digits.
    kind : str
        "enroll" for enrollment files or "sarea" for service-area files.

    Returns
    -------
    Path
        The best match. Kind-specific name patterns are tried first, then a generic
        "<year>...<month>" pattern as a last resort. Within the first pattern that
        matches, the shortest file name wins; ties are broken by path so the choice
        does not depend on directory walk order.

    Raises
    ------
    ValueError
        If ``kind`` is not recognized.
    FileNotFoundError
        If no file name matches any pattern.
    """
    m2 = f"{month:02d}"

    if kind == "enroll":
        pats = [rf".*{year}.*{m2}.*enroll.*\.(csv|txt)$", rf".*enroll.*{year}.*{m2}.*\.(csv|txt)$"]
    elif kind == "sarea":
        pats = [
            rf".*{year}.*{m2}.*(service|sa|sarea|cnty_sa|service_area).*?\.(csv|txt)$",
            rf".*(service|sa|sarea|cnty_sa|service_area).*{year}.*{m2}.*?\.(csv|txt)$",
        ]
    else:
        raise ValueError("unknown kind")

    # Generic fallback: any CSV/TXT whose name contains the year followed by the month.
    pats.append(rf".*{year}.*{m2}.*\.(csv|txt)$")

    files = [p for p in root.rglob("*") if p.is_file()]

    for pat in pats:
        rx = re.compile(pat, flags=re.I)
        hits = [p for p in files if rx.match(p.name)]
        if hits:
            # Shortest name first; the path is the tie-breaker for determinism.
            return min(hits, key=lambda p: (len(p.name), p.as_posix()))

    raise FileNotFoundError(f"Could not find {kind} file for {year}-{m2} under {root}")

def find_year_file(root: Path, year: int, kind: str) -> Path:
    """
    Locate one CSV/TXT file for ``year`` of a given kind under ``root`` (searched recursively).

    Only ``kind == "penetration"`` is supported. Names containing both the year and
    "penet" are preferred; any CSV/TXT containing the year is the last resort.
    Within the first pattern that matches, the shortest file name wins and ties are
    broken by path, so when several equally named-length files exist for the year
    the same one is always chosen.

    Raises
    ------
    ValueError
        If ``kind`` is not "penetration".
    FileNotFoundError
        If no file name matches.
    """
    if kind != "penetration":
        raise ValueError("unknown kind")

    pats = [
        rf".*{year}.*penet.*\.(csv|txt)$",
        rf".*penet.*{year}.*\.(csv|txt)$",
        # Generic fallback: any CSV/TXT whose name contains the year.
        rf".*{year}.*\.(csv|txt)$",
    ]

    files = [p for p in root.rglob("*") if p.is_file()]

    for pat in pats:
        rx = re.compile(pat, flags=re.I)
        hits = [p for p in files if rx.match(p.name)]
        if hits:
            # Shortest name first; the path is the tie-breaker for determinism.
            return min(hits, key=lambda p: (len(p.name), p.as_posix()))

    raise FileNotFoundError(f"Could not find {kind} file for {year} under {root}")

In [7]:
def read_csv_any(path: Path, usecols=None) -> pd.DataFrame:
    """
    Read a delimited file with every column kept as a string.

    "*", "NA", "N/A" and empty cells are treated as missing in addition to pandas'
    default NA markers, and undecodable bytes are replaced rather than raising.
    ``usecols`` is forwarded to ``pd.read_csv`` unchanged.
    """
    read_options = dict(
        dtype=str,
        na_values=["*", "NA", "N/A", ""],
        keep_default_na=True,
        encoding_errors="replace",
        low_memory=False,
        usecols=usecols,
    )
    return pd.read_csv(path, **read_options)

def read_enroll_min(path: Path) -> pd.DataFrame:
    """
    Load the minimal set of columns needed from one monthly enrollment file.

    Only columns whose normalized name is in the whitelist below are read.
    The result always contains ``contract_id``, ``plan_id`` (zero-padded to 3),
    ``fips`` (5 digits, NaN if the FIPS column is absent) and numeric ``enrollment``;
    any of the optional plan-attribute columns present in the file are appended.

    Raises
    ------
    KeyError
        If the contract, plan_id or enrollment column is missing.
    """
    wanted = {
        "contract_number",
        "contract_id",
        "plan_id",
        "fips_state_county_code",
        "enrollment",
        "snp",
        "snp_type",
        "partc",
        "part_c",
        "plan_type",
        "contract_type",
    }
    optional_cols = ["snp", "snp_type", "partc", "part_c", "plan_type", "contract_type"]

    df = normalize_columns(read_csv_any(path, usecols=lambda c: norm_colname(c) in wanted))

    contract_col = first_existing_col(list(df.columns), ["contract_id", "contract_number"])
    if contract_col is None:
        raise KeyError("Enrollment file missing contract column")

    required_messages = {
        "plan_id": "Enrollment file missing plan_id column",
        "enrollment": "Enrollment file missing enrollment column",
    }
    for col, message in required_messages.items():
        if col not in df.columns:
            raise KeyError(message)

    df["contract_id"] = coerce_id_series(df[contract_col], None)
    df["plan_id"] = coerce_id_series(df["plan_id"], 3)

    # Fall back to an all-NaN column when the file carries no FIPS code.
    if "fips_state_county_code" in df.columns:
        fips_src = df["fips_state_county_code"]
    else:
        fips_src = pd.Series([np.nan] * len(df), index=df.index)
    df["fips"] = _digits_zfill(fips_src, 5)

    df["enrollment"] = to_numeric_clean(df["enrollment"])

    selected = ["contract_id", "plan_id", "fips", "enrollment"]
    selected += [c for c in optional_cols if c in df.columns]
    return df[selected]

def read_sarea_min(path: Path) -> pd.DataFrame:
    """
    Load contract / plan / county identifiers from one monthly service-area file.

    Only columns whose normalized name is in the whitelist below are read. The
    whitelist covers every name that the contract, plan and FIPS lookups search
    for, so each fallback (including split state + county FIPS columns) is usable.

    Returns
    -------
    pd.DataFrame
        Columns ``contract_id``, ``plan_id`` (zero-padded to 3, or all-NaN when the
        file has no plan column) and ``fips`` (5 digits, via ``coerce_fips``).

    Raises
    ------
    KeyError
        If no contract column is present.
    """
    keep = {
        # contract identifiers
        "contract_id",
        "contract_number",
        "contract",
        # plan identifiers
        "plan_id",
        "plan",
        "planid",
        "plan_number",
        # combined county FIPS columns recognized by coerce_fips
        "fips",
        "fips_state_county_code",
        "fipsstatecountycode",
        "fipscounty",
        "fipscnty",
        "county_fips",
        "countyfips",
        "cnty_fips",
        # split state / county columns recognized by coerce_fips
        "state_fips",
        "statefips",
        "fipsst",
        "fips_state",
        "fips_county",
    }

    df = read_csv_any(path, usecols=lambda c: norm_colname(c) in keep)
    df = normalize_columns(df)

    cols = list(df.columns)
    contract_col = first_existing_col(cols, ["contract_id", "contract_number", "contract"])
    if contract_col is None:
        raise KeyError(f"Service area missing contract column. Sample cols: {sorted(cols)[:60]}")

    df["contract_id"] = coerce_id_series(df[contract_col], None)

    plan_col = first_existing_col(cols, ["plan_id", "planid", "plan", "plan_number"])
    if plan_col is None:
        # No plan-level detail in this file; callers fall back to contract-county matching.
        df["plan_id"] = np.nan
    else:
        df["plan_id"] = coerce_id_series(df[plan_col], 3)

    df["fips"] = coerce_fips(df)

    return df[["contract_id", "plan_id", "fips"]]

In [8]:
def flag_exclusions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add 0/1 exclusion flags to a copy of ``df``.

    Adds ``is_800_series``, ``is_snp``, ``is_pdp_only`` and ``drop_hw2`` (1 when any
    of the three flags is 1). ``plan_id`` is re-normalized to a 3-character string.

    Sources, in priority order:
      * is_800_series: ``plan_id`` starts with "8".
      * is_snp: the ``snp`` column parsed as a boolean; else ``snp_type`` containing
        "snp"; else 0.
      * is_pdp_only: the negation of ``partc`` / ``part_c`` parsed as a boolean; else
        ``plan_type`` / ``contract_type`` mentioning PDP, "prescription drug" or
        "part d"; else 0.
    """
    flagged = df.copy()

    # plan id + 800-series
    flagged["plan_id"] = flagged["plan_id"].astype(str).str.strip().str.zfill(3)
    flagged["is_800_series"] = flagged["plan_id"].str.startswith("8").astype(int)

    available = list(flagged.columns)

    # SNP
    if "snp" in available:
        flagged["is_snp"] = parse_boolish_series(flagged["snp"]).astype(int)
    elif "snp_type" in available:
        snp_type_text = flagged["snp_type"].astype(str).str.strip().str.lower()
        flagged["is_snp"] = snp_type_text.str.contains("snp", na=False).astype(int)
    else:
        flagged["is_snp"] = 0

    # PDP-only
    partc_col = first_existing_col(available, ["partc", "part_c"])
    type_col = first_existing_col(available, ["plan_type", "contract_type"])
    if partc_col is not None:
        # Values that do not parse as true (including missing ones) count as PDP-only.
        has_partc = parse_boolish_series(flagged[partc_col])
        flagged["is_pdp_only"] = (~has_partc).astype(int)
    elif type_col is not None:
        type_text = flagged[type_col].astype(str).str.lower().str.strip()
        mentions_pdp = type_text.str.contains(r"\bpdp\b", regex=True, na=False)
        mentions_drug = type_text.str.contains("prescription drug", na=False)
        mentions_part_d = type_text.str.contains("part d", na=False)
        flagged["is_pdp_only"] = (mentions_pdp | mentions_drug | mentions_part_d).astype(int)
    else:
        flagged["is_pdp_only"] = 0

    any_flag = flagged[["is_snp", "is_800_series", "is_pdp_only"]].eq(1).any(axis=1)
    flagged["drop_hw2"] = any_flag.astype(int)
    return flagged

In [9]:
# Years whose cache file has already been (re)built in this kernel session.
# With FORCE_REBUILD_CACHE on, each year is rebuilt once per session instead of once
# per call; several downstream builders ask for the same year.
_CACHE_REBUILT_YEARS = set()


def _collapse_plan_county_month(year: int, month: int) -> pd.DataFrame:
    """Return one month of enrollment collapsed to one row per contract-plan-county.

    Enrollment rows are kept only if they match the month's service-area file, on
    contract-plan-county when that file carries plan ids and on contract-county
    otherwise.
    """
    enroll_path = find_month_file(ENROLL_EXTRACTED, year, month, "enroll")
    sarea_path  = find_month_file(SAREA_EXTRACTED,  year, month, "sarea")

    enroll = read_enroll_min(enroll_path).dropna(subset=["contract_id", "plan_id", "fips"])
    sarea  = read_sarea_min(sarea_path).dropna(subset=["contract_id", "fips"])

    if sarea["plan_id"].notna().any():
        join_keys = ["contract_id", "plan_id", "fips"]
        sarea = sarea.dropna(subset=["plan_id"])
    else:
        join_keys = ["contract_id", "fips"]

    # The service area acts as a filter. De-duplicate its keys so repeated rows
    # cannot multiply enrollment rows in the join and inflate the sums below.
    approved = sarea[join_keys].drop_duplicates()
    merged = enroll.merge(approved, on=join_keys, how="inner")

    merged["year"] = int(year)
    merged = flag_exclusions(merged)

    # NOTE(review): "sum" over rows whose enrollment is all-missing yields 0, not
    # NaN, so such rows count as zero enrollment for the month. Confirm this is intended.
    monthly = merged.groupby(["contract_id", "plan_id", "fips", "year"], as_index=False).agg(
        enroll_sum=("enrollment", "sum"),
        is_snp=("is_snp", "max"),
        is_800_series=("is_800_series", "max"),
        is_pdp_only=("is_pdp_only", "max"),
        drop_hw2=("drop_hw2", "max"),
    )

    monthly["months_observed"] = 1
    monthly["dec_enrollment"] = monthly["enroll_sum"] if month == 12 else np.nan
    return monthly


def _accumulate_month(accum: pd.DataFrame, monthly: pd.DataFrame) -> pd.DataFrame:
    """Fold one collapsed month into the running year-to-date table."""
    keys = ["contract_id", "plan_id", "fips", "year"]
    accum = accum.merge(monthly, on=keys, how="outer", suffixes=("", "_new"))

    # Rows present on only one side have NaN on the other; treat that as zero.
    accum["enroll_sum"] = accum["enroll_sum"].fillna(0) + accum["enroll_sum_new"].fillna(0)
    accum["months_observed"] = accum["months_observed"].fillna(0) + accum["months_observed_new"].fillna(0)

    # A flag raised in any month stays raised for the year.
    for c in ["is_snp", "is_800_series", "is_pdp_only", "drop_hw2"]:
        accum[c] = np.maximum(
            pd.to_numeric(accum[c], errors="coerce").fillna(0),
            pd.to_numeric(accum[f"{c}_new"], errors="coerce").fillna(0),
        ).astype(int)

    accum["dec_enrollment"] = accum["dec_enrollment"].combine_first(accum["dec_enrollment_new"])

    return accum.drop(columns=[c for c in accum.columns if c.endswith("_new")])


def build_plan_county_year(year: int) -> Path:
    """
    Build (or reuse) the cached contract-plan-county table for one year.

    The twelve monthly enrollment files are filtered by the matching service-area
    file, collapsed to contract-plan-county, and accumulated into:
    ``avg_enrollment`` (sum over months / months observed), ``dec_enrollment``
    (December value), ``months_observed`` and the exclusion flags.

    Caching: the result is written to ``CACHE_DIR/plan_county_year_{year}.csv``.
    If that file exists it is returned as-is, except that with FORCE_REBUILD_CACHE
    set, the first call for a given year in this session deletes and rebuilds it.
    Later calls for the same year reuse the freshly built file.

    Returns
    -------
    Path
        Path of the cache CSV.
    """
    out_path = CACHE_DIR / f"plan_county_year_{year}.csv"

    force_now = FORCE_REBUILD_CACHE and int(year) not in _CACHE_REBUILT_YEARS
    if force_now and out_path.exists():
        out_path.unlink()
        print("Deleted cache:", out_path)

    if out_path.exists():
        return out_path

    accum = None

    for m in range(1, 13):
        monthly = _collapse_plan_county_month(year, m)
        accum = monthly if accum is None else _accumulate_month(accum, monthly)

        # Monthly files are large; release them before loading the next month.
        del monthly
        gc.collect()

    # Zero months observed would divide by zero; map it to NaN instead.
    accum["avg_enrollment"] = accum["enroll_sum"] / accum["months_observed"].replace({0: np.nan})

    out = accum[
        [
            "contract_id",
            "plan_id",
            "fips",
            "year",
            "avg_enrollment",
            "dec_enrollment",
            "months_observed",
            "is_snp",
            "is_800_series",
            "is_pdp_only",
            "drop_hw2",
        ]
    ].copy()

    # enforce clean types for downstream notebooks
    out["contract_id"] = out["contract_id"].astype(str).str.strip()
    out["plan_id"] = out["plan_id"].astype(str).str.strip().str.zfill(3)
    out["fips"] = out["fips"].astype(str).str.strip().str.zfill(5)
    out["year"] = pd.to_numeric(out["year"], errors="coerce").astype(int)

    for c in ["is_snp", "is_800_series", "is_pdp_only", "drop_hw2"]:
        out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0).astype(int)

    for c in ["avg_enrollment", "dec_enrollment", "months_observed"]:
        out[c] = pd.to_numeric(out[c], errors="coerce")

    out.to_csv(out_path, index=False)
    _CACHE_REBUILT_YEARS.add(int(year))

    del accum, out
    gc.collect()
    return out_path

In [10]:
# Spot check on a single year (2018) before running the full 2014-2019 builds.
yp = build_plan_county_year(2018)
print("Built / found:", yp)

# Read everything as strings so zero-padded ids survive, then re-pad defensively.
df = pd.read_csv(yp, dtype=str)
df["plan_id"] = df["plan_id"].str.zfill(3)
df["fips"] = df["fips"].str.zfill(5)

print("Rows:", df.shape[0])
print("Unique counties:", df["fips"].nunique())

# Rows to keep are those whose drop flag is 0 (the flag may be serialized as "0" or "0.0").
keep = df["drop_hw2"].astype(str).str.strip().isin(["0", "0.0"])
print("Drop rate (drop_hw2):", (~keep).mean())

# A plan is identified by contract id + plan id.
df_nodrop = df.loc[keep].copy()
df_nodrop["plan_key"] = df_nodrop["contract_id"].str.strip() + "-" + df_nodrop["plan_id"].str.strip()

# Distinct plans and distinct contracts per county-year.
plan_ct = df_nodrop.groupby(["fips", "year"])["plan_key"].nunique()
contract_ct = df_nodrop.groupby(["fips", "year"])["contract_id"].nunique()

print("\n2018 plan counts (contract-plan) summary:")
print(plan_ct.describe(percentiles=[0.05,0.25,0.5,0.75,0.95]))

print("\n2018 contract counts (firms) summary:")
print(contract_ct.describe(percentiles=[0.05,0.25,0.5,0.75,0.95]))

Built / found: data/cache/plan_county_year_2018.csv
Rows: 1366487
Unique counties: 3225
Drop rate (drop_hw2): 0.9244427499127325

2018 plan counts (contract-plan) summary:
count    3185.000000
mean       32.416954
std        39.516647
min         1.000000
5%          4.000000
25%        12.000000
50%        22.000000
75%        39.000000
95%        96.000000
max       469.000000
Name: plan_key, dtype: float64

2018 contract counts (firms) summary:
count    3185.000000
mean       11.720879
std        10.009149
min         1.000000
5%          2.000000
25%         5.000000
50%         9.000000
75%        14.000000
95%        31.000000
max        82.000000
Name: contract_id, dtype: float64


In [11]:
def build_county_plan_counts_2014_2019() -> Path:
    """
    Count distinct non-excluded plans (contract id + plan id) per county and year.

    Streams each year's plan-county cache in chunks, keeps rows whose ``drop_hw2``
    flag is 0, and writes ``fips, year, plan_count`` (sorted by year, then fips) to
    ``OUT_DIR/county_plan_counts_2014_2019.csv``. An existing output is reused
    unless FORCE_REBUILD_OUTPUTS is set, in which case it is deleted and rebuilt.

    Returns the output path.
    """
    out_path = OUT_DIR / "county_plan_counts_2014_2019.csv"

    if out_path.exists():
        if not FORCE_REBUILD_OUTPUTS:
            print("Existing file:", out_path)
            return out_path
        out_path.unlink()
        print("Deleted output:", out_path)

    id_cols = ["fips", "year", "contract_id", "plan_id"]
    plans_by_county_year: dict[tuple[str, int], set[str]] = {}

    for y in YEARS:
        print("Counting unique plans. Year:", y)
        cache_path = build_plan_county_year(y)

        reader = pd.read_csv(cache_path, dtype=str, usecols=id_cols + ["drop_hw2"], chunksize=CHUNKSIZE)
        for chunk in reader:
            retained = chunk["drop_hw2"].astype(str).str.strip().isin(["0", "0.0"])
            rows = chunk.loc[retained, id_cols].dropna()

            rows["fips"] = rows["fips"].astype(str).str.zfill(5)
            rows["year"] = pd.to_numeric(rows["year"], errors="coerce").astype(int)
            rows["plan_id"] = rows["plan_id"].astype(str).str.zfill(3)

            rows["plan_key"] = rows["contract_id"].astype(str).str.strip() + "-" + rows["plan_id"].astype(str).str.strip()

            # Sets are merged across chunks, so a county split over two chunks is
            # still counted once per plan.
            for (fips, year), plan_keys in rows.groupby(["fips", "year"])["plan_key"]:
                bucket = plans_by_county_year.setdefault((str(fips), int(year)), set())
                bucket.update(plan_keys.unique().tolist())

            del chunk, rows
            gc.collect()

    records = [(fips, year, len(plans)) for (fips, year), plans in plans_by_county_year.items()]
    out = pd.DataFrame(records, columns=["fips", "year", "plan_count"])
    out = out.sort_values(["year","fips"]).reset_index(drop=True)

    out.to_csv(out_path, index=False)
    return out_path

# Build (or reuse) the county-level plan counts, then summarize them by year.
county_counts_path = build_county_plan_counts_2014_2019()
print("Wrote:", county_counts_path)

# Read FIPS as a string so leading zeros are preserved.
counts = pd.read_csv(county_counts_path, dtype={"fips": str})
counts["fips"] = counts["fips"].str.zfill(5)
counts["year"] = counts["year"].astype(int)

print("\nPlan count summary by year:")
print(counts.groupby("year")["plan_count"].describe(percentiles=[0.05,0.25,0.5,0.75,0.95]))

print("\nMedians by year:")
print(counts.groupby("year")["plan_count"].median())

Counting unique plans. Year: 2014
Counting unique plans. Year: 2015
Counting unique plans. Year: 2016
Counting unique plans. Year: 2017
Counting unique plans. Year: 2018
Deleted cache: data/cache/plan_county_year_2018.csv
Counting unique plans. Year: 2019
Wrote: data/processed/county_plan_counts_2014_2019.csv

Plan count summary by year:
       count       mean        std  min   5%   25%   50%   75%    95%    max
year                                                                        
2014  3162.0  23.867805  29.273318  1.0  3.0   9.0  15.0  29.0   65.0  389.0
2015  3169.0  24.770275  29.153822  1.0  4.0  10.0  17.0  30.0   71.0  359.0
2016  3175.0  26.181102  31.409309  1.0  4.0  10.0  17.0  31.5   78.0  398.0
2017  3172.0  26.918979  32.100049  1.0  4.0  10.0  18.0  32.0   78.0  408.0
2018  3185.0  32.416954  39.516647  1.0  4.0  12.0  22.0  39.0   96.0  469.0
2019  3196.0  36.135169  43.513600  1.0  4.0  14.0  24.0  43.0  105.0  525.0

Medians by year:
year
2014    15.0
2015    

In [12]:
def read_penetration_year(year: int) -> pd.DataFrame:
    """
    Load county-level MA enrollment and eligibles for one year.

    Uses the file chosen by ``find_year_file`` under PEN_EXTRACTED.

    Returns
    -------
    pd.DataFrame
        Columns ``fips`` (5-character string), ``ma_enrollment`` and ``eligibles``
        (numeric) and ``year``. Rows without a usable FIPS code are dropped.

    Raises
    ------
    KeyError
        If no enrollment or eligibles column can be identified.
    """
    source = find_year_file(PEN_EXTRACTED, year, "penetration")
    frame = normalize_columns(read_csv_any(source))
    frame["fips"] = coerce_fips(frame)

    cols = list(frame.columns)
    ma_col = first_existing_col(cols, ["ma_enrollment", "ma_enroll", "enrolled", "enroll"])
    elig_col = first_existing_col(cols, ["eligibles", "total_eligibles", "medicare_eligibles", "tot_eligibles"])

    if None in (ma_col, elig_col):
        raise KeyError(f"Penetration file missing enrolled or eligibles. Col sample: {sorted(cols)[:100]}")

    out = frame[["fips", ma_col, elig_col]].copy()
    out = out.rename(columns={ma_col: "ma_enrollment", elig_col: "eligibles"})
    for measure in ("ma_enrollment", "eligibles"):
        out[measure] = to_numeric_clean(out[measure])
    out["year"] = int(year)

    out = out.dropna(subset=["fips"])
    return out.assign(fips=out["fips"].astype(str).str.zfill(5))

# Stack the per-year penetration tables into one frame (one row per county-year read).
pen = pd.concat([read_penetration_year(y) for y in YEARS], ignore_index=True)
print("Penetration shape:", pen.shape)
pen.head()

Penetration shape: (19494, 4)


Unnamed: 0,fips,ma_enrollment,eligibles,year
0,1001,3000.0,9438,2014
1,1003,13183.0,41640,2014
2,1005,859.0,6004,2014
3,1007,1665.0,4599,2014
4,1009,4680.0,11193,2014


In [13]:
def build_county_hhi_ma_share_2014_2019(pen: pd.DataFrame) -> Path:
    """
    Compute county-year HHI and MA share and write them to OUT_DIR.

    For each year the plan-county cache is streamed in chunks; rows with
    ``drop_hw2`` equal to 0 are kept and each row's size is its December enrollment,
    falling back to average enrollment and then to 0. Per county-year:

      * ``ma_total_from_plans`` = sum of sizes
      * ``sumsq_enroll``        = sum of squared sizes
      * ``hhi``                 = sumsq_enroll / ma_total_from_plans**2
        (NaN when the total is 0)
      * ``ma_share``            = ma_enrollment / eligibles, from ``pen``
        (left-joined on fips and year)

    Parameters
    ----------
    pen : pd.DataFrame
        Needs columns ``fips``, ``year``, ``ma_enrollment`` and ``eligibles``.

    Returns
    -------
    Path
        ``OUT_DIR/county_hhi_ma_share_2014_2019.csv``. An existing file is reused
        unless FORCE_REBUILD_OUTPUTS is set, in which case it is deleted and rebuilt.
    """
    out_path = OUT_DIR / "county_hhi_ma_share_2014_2019.csv"

    if out_path.exists():
        if not FORCE_REBUILD_OUTPUTS:
            print("Existing file:", out_path)
            return out_path
        out_path.unlink()
        print("Deleted output:", out_path)

    # Work on a copy so the caller's frame is not modified; align the key dtypes.
    penetration = pen.copy()
    penetration["year"] = penetration["year"].astype(int)
    penetration["fips"] = penetration["fips"].astype(str).str.zfill(5)

    county_keys = ["fips", "year"]
    yearly_frames = []

    for y in YEARS:
        print("HHI and MA share. Year:", y)
        cache_path = build_plan_county_year(y)

        reader = pd.read_csv(
            cache_path,
            dtype=str,
            usecols=["fips", "year", "dec_enrollment", "avg_enrollment", "drop_hw2"],
            chunksize=CHUNKSIZE,
        )

        partials = []
        for chunk in reader:
            retained_mask = chunk["drop_hw2"].astype(str).str.strip().isin(["0", "0.0"])
            retained = chunk.loc[retained_mask].copy()

            december = pd.to_numeric(retained["dec_enrollment"], errors="coerce")
            average = pd.to_numeric(retained["avg_enrollment"], errors="coerce")
            size = december.fillna(average).fillna(0.0)

            retained["e"] = size
            retained["e2"] = size * size

            partials.append(
                retained.groupby(county_keys, as_index=False)
                        .agg(total_enroll=("e", "sum"), sumsq_enroll=("e2", "sum"))
            )

            del chunk, retained
            gc.collect()

        # A county can span several chunks, so the partial sums are added up again.
        county = pd.concat(partials, ignore_index=True)
        county = county.groupby(county_keys, as_index=False).agg(
            total_enroll=("total_enroll", "sum"),
            sumsq_enroll=("sumsq_enroll", "sum"),
        )

        county["hhi"] = county["sumsq_enroll"] / (county["total_enroll"] * county["total_enroll"])
        county = county.rename(columns={"total_enroll": "ma_total_from_plans"})
        county["fips"] = county["fips"].astype(str).str.zfill(5)
        county["year"] = county["year"].astype(int)

        county = county.merge(penetration, on=county_keys, how="left")
        county["ma_share"] = county["ma_enrollment"] / county["eligibles"]

        yearly_frames.append(county)

        del county, partials
        gc.collect()

    final = pd.concat(yearly_frames, ignore_index=True)
    final = final.sort_values(["year", "fips"]).reset_index(drop=True)
    final.to_csv(out_path, index=False)
    return out_path

# Build (or reuse) the HHI / MA-share output and preview its first rows.
county_hhi_path = build_county_hhi_ma_share_2014_2019(pen)
print("Wrote:", county_hhi_path)

# FIPS is read as a string so leading zeros are preserved in the preview.
pd.read_csv(county_hhi_path, dtype={"fips": str}, nrows=5)

HHI and MA share. Year: 2014
Deleted cache: data/cache/plan_county_year_2014.csv
HHI and MA share. Year: 2015
Deleted cache: data/cache/plan_county_year_2015.csv
HHI and MA share. Year: 2016
Deleted cache: data/cache/plan_county_year_2016.csv
HHI and MA share. Year: 2017
Deleted cache: data/cache/plan_county_year_2017.csv
HHI and MA share. Year: 2018
Deleted cache: data/cache/plan_county_year_2018.csv
HHI and MA share. Year: 2019
Deleted cache: data/cache/plan_county_year_2019.csv
Wrote: data/processed/county_hhi_ma_share_2014_2019.csv


Unnamed: 0,fips,year,ma_total_from_plans,sumsq_enroll,hhi,ma_enrollment,eligibles,ma_share
0,1001,2014,2996.0,1222586.0,0.136206,3000.0,9438.0,0.317864
1,1003,2014,12948.0,27882662.0,0.166314,13183.0,41640.0,0.316595
2,1005,2014,844.0,251720.0,0.353373,859.0,6004.0,0.143071
3,1007,2014,1596.0,481766.0,0.189134,1665.0,4599.0,0.362035
4,1009,2014,4511.0,3401253.0,0.167145,4680.0,11193.0,0.418118


In [14]:
# Re-read both processed outputs from disk and print per-year sanity summaries.
# FIPS is read as a string so leading zeros survive, and so `counts` keeps the same
# string FIPS type it had when it was first loaded in an earlier cell.
counts = pd.read_csv(OUT_DIR / "county_plan_counts_2014_2019.csv", dtype={"fips": str})
hhi_df = pd.read_csv(OUT_DIR / "county_hhi_ma_share_2014_2019.csv", dtype={"fips": str})

print("Counts rows:", counts.shape)
print("HHI rows:", hhi_df.shape)

print("\nPlan count sanity:")
print(counts.groupby("year")["plan_count"].agg(["count","mean","median","min","max"]))

print("\nHHI sanity:")
hhi_df["hhi"] = pd.to_numeric(hhi_df["hhi"], errors="coerce")
print(hhi_df.groupby("year")["hhi"].agg(["count","mean","median","min","max"]))

print("\nMA share sanity:")
hhi_df["ma_share"] = pd.to_numeric(hhi_df["ma_share"], errors="coerce")
print(hhi_df.groupby("year")["ma_share"].agg(["count","mean","median","min","max"]))

# Share of county-years that found no match in the penetration data.
print("\nPenetration merge missingness (ma_enrollment):", hhi_df["ma_enrollment"].isna().mean())
print("Penetration merge missingness (eligibles):", hhi_df["eligibles"].isna().mean())

Counts rows: (19059, 3)
HHI rows: (19059, 8)

Plan count sanity:
      count       mean  median  min  max
year                                    
2014   3162  23.867805    15.0    1  389
2015   3169  24.770275    17.0    1  359
2016   3175  26.181102    17.0    1  398
2017   3172  26.918979    18.0    1  408
2018   3185  32.416954    22.0    1  469
2019   3196  36.135169    24.0    1  525

HHI sanity:
      count      mean    median       min  max
year                                          
2014   2980  0.323920  0.244332  0.051506  1.0
2015   2977  0.318471  0.238641  0.047353  1.0
2016   2988  0.315634  0.233470  0.051027  1.0
2017   2987  0.312165  0.228132  0.055574  1.0
2018   2994  0.286846  0.212237  0.046200  1.0
2019   2987  0.251319  0.182044  0.039916  1.0

MA share sanity:
      count      mean    median       min       max
year                                               
2014   3108  0.224649  0.194643  0.004293  1.156381
2015   3108  0.237273  0.210380  0.003261  1

In [15]:
# Make sure the cache directory is git-ignored; existing .gitignore lines are kept as-is.
gitignore_path = Path(".gitignore")
if gitignore_path.exists():
    existing = gitignore_path.read_text().splitlines()
else:
    existing = []

rules = ["data/cache/"]

# Append only the rules that are not already present, each at most once.
new_lines = list(existing)
new_lines.extend(rule for rule in dict.fromkeys(rules) if rule not in existing)

gitignore_path.write_text("\n".join(new_lines) + "\n")

print("Updated .gitignore")
print("Keep these for HW2 analysis:")
for kept_name in ("county_plan_counts_2014_2019.csv", "county_hhi_ma_share_2014_2019.csv"):
    print(" ", OUT_DIR / kept_name)
print("Do not push data/cache/")

Updated .gitignore
Keep these for HW2 analysis:
  data/processed/county_plan_counts_2014_2019.csv
  data/processed/county_hhi_ma_share_2014_2019.csv
Do not push data/cache/
