In [1]:
from __future__ import annotations

import re
import zipfile
import shutil
import gc
from pathlib import Path

import numpy as np
import pandas as pd

# Widen pandas' console display so wide frames are printed without truncation.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 180)

# Years processed by this notebook: 2014 through 2019 inclusive.
YEARS = list(range(2014, 2020))

# Working directories, relative to the current working directory.
# Both are created up front (including parents) if they do not exist yet.
CACHE_DIR = Path("data/cache")
OUT_DIR   = Path("data/processed")
CACHE_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Candidate locations of the "ma" data root, listed in priority order.
# The last entry is resolved relative to the parent of the current working directory.
CANDIDATE_MA_ROOTS = [
    Path("/scion/5261/econ470001/ma-data/ma"),
    Path("/econ470/a0/work/ma-data/ma"),
    Path.cwd().parent / "ma-data" / "ma",
]

def pick_existing(paths: list[Path]) -> Path:
    """Return the first path in ``paths`` that exists on disk.

    Args:
        paths: Candidate paths, checked in the order given.

    Returns:
        The first candidate for which ``Path.exists()`` is true.

    Raises:
        FileNotFoundError: If none of the candidates exists. The message
            lists every candidate, one per line.
    """
    found = next((candidate for candidate in paths if candidate.exists()), None)
    if found is None:
        listing = "\n".join(map(str, paths))
        raise FileNotFoundError("No candidate path exists:\n" + listing)
    return found

# Resolve the data root from the candidate list and locate the directory
# holding the CMS payment zip archives; fail fast if it is not there.
MA_ROOT = pick_existing(CANDIDATE_MA_ROOTS)
CMSPAY_ZIP_DIR = MA_ROOT / "cms-payment"

if not CMSPAY_ZIP_DIR.exists():
    raise FileNotFoundError(f"cms-payment not found: {CMSPAY_ZIP_DIR}")

# Quick sanity report: resolved root plus the first 15 archive names (sorted).
print("MA_ROOT:", MA_ROOT)
print("CMSPAY_ZIP_DIR exists:", CMSPAY_ZIP_DIR.exists())
print("CMSPAY zip examples:", sorted(zp.name for zp in CMSPAY_ZIP_DIR.glob("*.zip"))[:15])

MA_ROOT: /scion/5261/econ470001/ma-data/ma
CMSPAY_ZIP_DIR exists: True
CMSPAY zip examples: ['2006paymentdata-.zip', '2007paymentdata-.zip', '2008paymentdata.zip', '2009paymentdata-.zip', '2010paymentdata-.zip', '2011paymentdata.zip', '2012paymentdata.zip', '2013paymentdata.zip', '2014paymentdata.zip', '2015-payment-data.zip', '2016paymentdata.zip', '2017paymentdata.zip', '2018paymentdata.zip', '2019paymentdata_0.zip', '2020Paymentdata_1_0.zip']


In [2]:
def norm_colname(s: str) -> str:
    """Normalize a column label into a lowercase, underscore-separated name.

    The value is converted with ``str()``, trimmed and lowercased. Runs of
    whitespace become a single underscore, every character outside
    ``a-z``, ``0-9`` and ``_`` is removed, repeated underscores are
    collapsed, and leading/trailing underscores are stripped.

    Args:
        s: The raw label (any object; it is passed through ``str()``).

    Returns:
        The normalized label, possibly an empty string.
    """
    text = str(s).strip().lower()
    # Applied strictly in this order: whitespace -> "_", drop other symbols,
    # then collapse the underscore runs the first two steps may have produced.
    substitutions = (
        (r"\s+", "_"),
        (r"[^a-z0-9_]", ""),
        (r"_+", "_"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip("_")

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels went through ``norm_colname``.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with every column label normalized.
    """
    renamed = df.copy()
    renamed.columns = list(map(norm_colname, renamed.columns))
    return renamed

def clean_contract_id(x: pd.Series) -> pd.Series:
    """Clean a series of contract identifiers.

    Values are converted to text and trimmed; the tokens ``"nan"``,
    ``"none"``, ``"None"`` and the empty string become missing; a trailing
    ``".0"`` (left behind when an id was parsed as a float) is removed.

    Args:
        x: Raw contract identifiers.

    Returns:
        The cleaned series, with NaN where the value was a missing token.
    """
    missing_tokens = {"nan": np.nan, "none": np.nan, "None": np.nan, "": np.nan}
    as_text = x.astype(str).str.strip()
    with_missing = as_text.replace(missing_tokens)
    return with_missing.str.replace(r"\.0$", "", regex=True)

def clean_plan_id(x: pd.Series) -> pd.Series:
    """Clean a series of plan identifiers into zero-padded digit strings.

    Values are converted to text and trimmed; the tokens ``"nan"``,
    ``"none"``, ``"None"`` and the empty string become missing; a trailing
    ``".0"`` is removed; every non-digit character is dropped; values left
    with no digits become missing; the rest are left-padded with zeros to
    at least three characters.

    Args:
        x: Raw plan identifiers.

    Returns:
        The cleaned series, with NaN where no digits remained.
    """
    missing_tokens = {"nan": np.nan, "none": np.nan, "None": np.nan, "": np.nan}
    ids = x.astype(str).str.strip().replace(missing_tokens)
    ids = ids.str.replace(r"\.0$", "", regex=True)
    digits_only = ids.str.replace(r"[^0-9]", "", regex=True)
    # Entries that contained no digits at all are treated as missing.
    has_digits = digits_only.str.len() > 0
    return digits_only.where(has_digits, np.nan).str.zfill(3)

def to_num(x: pd.Series) -> pd.Series:
    """Parse a series of formatted numbers (currency, percent, thousands separators).

    Values are converted to pandas' ``"string"`` dtype and trimmed. The
    placeholders ``"nan"``, ``"none"``, ``"None"``, ``""``, ``"*"``,
    ``"$-"`` and ``"-"`` become missing. Commas, dollar signs and percent
    signs are removed, and the remainder is parsed with ``pd.to_numeric``;
    anything that still fails to parse becomes missing.

    Args:
        x: Raw values.

    Returns:
        A numeric series.
    """
    placeholders = {
        "nan": pd.NA, "none": pd.NA, "None": pd.NA, "": pd.NA,
        "*": pd.NA, "$-": pd.NA, "-": pd.NA,
    }
    text = x.astype("string").str.strip().replace(placeholders)
    # Placeholders are matched on the whole cell before symbols are stripped,
    # so "$-" is recognised as missing rather than reduced to "-".
    for symbol in (",", "$", "%"):
        text = text.str.replace(symbol, "", regex=False)
    return pd.to_numeric(text, errors="coerce")

In [3]:
def find_payment_zip(year: int) -> Path:
    """Locate the CMS payment zip archive for ``year`` inside ``CMSPAY_ZIP_DIR``.

    A file matches when its name contains the year, followed somewhere later
    by ``payment``, and ends in ``.zip``. Matching is case-insensitive, so
    archives such as ``2020Paymentdata_1_0.zip`` are found as well as
    ``2014paymentdata.zip`` or ``2015-payment-data.zip``.

    Args:
        year: The payment year to look up.

    Returns:
        The matching archive with the shortest file name; ties are broken
        alphabetically (case-insensitively first, then by exact name).

    Raises:
        FileNotFoundError: If no archive in ``CMSPAY_ZIP_DIR`` matches.
    """
    # Equivalent to the glob "*{year}*payment*.zip", but ignoring letter case.
    name_re = re.compile(rf"{re.escape(str(year))}.*payment.*\.zip$", re.IGNORECASE)
    hits = [
        p for p in CMSPAY_ZIP_DIR.iterdir()
        if p.is_file() and name_re.search(p.name)
    ]
    if not hits:
        raise FileNotFoundError(f"No payment zip found for {year} in {CMSPAY_ZIP_DIR}")
    # Shortest name wins; the exact name is the final key so the choice does
    # not depend on directory listing order.
    return min(hits, key=lambda p: (len(p.name), p.name.lower(), p.name))

def year_dir_has_data(year_dir: Path) -> bool:
    """Report whether ``year_dir`` contains at least one readable data file.

    The directory is searched recursively. A file counts when its suffix
    (case-insensitive) is one of ``.xlsx``, ``.xls``, ``.csv``, ``.txt`` or
    ``.tsv``, its name does not start with ``~$``, and it is not the
    ``_EXTRACTED_OK.txt`` marker (compared case-insensitively).

    Args:
        year_dir: Directory to inspect.

    Returns:
        ``True`` if such a file exists; ``False`` if there is none or the
        directory itself does not exist.
    """
    if not year_dir.exists():
        return False

    data_suffixes = {".xlsx", ".xls", ".csv", ".txt", ".tsv"}

    def is_data_file(entry: Path) -> bool:
        if not entry.is_file():
            return False
        if entry.suffix.lower() not in data_suffixes:
            return False
        if entry.name.startswith("~$"):
            return False
        # The extraction marker is a .txt file but carries no data.
        return entry.name.lower() != "_extracted_ok.txt"

    return any(is_data_file(entry) for entry in year_dir.rglob("*"))

def extract_payment_zip(year: int) -> Path:
    """Extract the payment archive for ``year`` into the cache and return its directory.

    The target is ``CACHE_DIR / "cms_payment_extracted" / <year>``. If that
    directory already holds data files and the ``_EXTRACTED_OK.txt`` marker,
    it is returned as is. Otherwise any existing directory is deleted, the
    archive found by ``find_payment_zip`` is unpacked into a fresh one, and
    the marker is written once data files are confirmed present.

    Args:
        year: The payment year to extract.

    Returns:
        The directory containing the extracted files.

    Raises:
        RuntimeError: If the archive unpacks without yielding any data file.
    """
    target = CACHE_DIR / "cms_payment_extracted" / str(year)
    marker = target / "_EXTRACTED_OK.txt"

    already_extracted = year_dir_has_data(target) and marker.exists()
    if already_extracted:
        return target

    archive = find_payment_zip(year)

    # Start from an empty directory so a partial earlier extraction cannot linger.
    if target.exists():
        shutil.rmtree(target)
    target.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(archive, "r") as zf:
        zf.extractall(target)

    if year_dir_has_data(target):
        # The marker is written last, so its presence implies a completed extraction.
        marker.write_text("ok\n")
        print("Extracted", year, "from", archive.name, "into", target)
        return target

    raise RuntimeError(f"Extraction for {year} produced no readable data files. Zip was {archive}")

In [4]:
import openpyxl

def score_cols(cols: list[str]) -> int:
    """Score how closely a set of column names resembles the plan-level payment table.

    Names are normalized with ``norm_colname`` and compared as whole names.
    A table must have at least one contract key and one plan key; otherwise
    the score is ``-10``. A qualifying table starts at 10 and gains 2 points
    for a rebate column, 1 for a payment column and 1 for a risk-score column.

    Args:
        cols: Column names of the candidate table.

    Returns:
        ``-10`` for an unusable table, otherwise a score between 10 and 14.
    """
    names = {norm_colname(col) for col in cols}

    contract_keys = {"contract_id", "contract_number", "contract"}
    plan_keys = {"plan_id", "pbp", "plan_benefit_package", "planbenefitpackage"}
    if names.isdisjoint(contract_keys) or names.isdisjoint(plan_keys):
        return -10

    # (points, accepted column names) for each optional measure.
    bonuses = (
        (2, ("average_rebate_pmpm_payment", "rebate")),
        (1, ("average_ab_pmpm_payment", "payment")),
        (1, ("average_part_c_risk_score", "average_part_risk_score", "risk")),
    )
    return 10 + sum(points for points, aliases in bonuses if not names.isdisjoint(aliases))

def read_sheet_as_table_openpyxl(path: Path, sheet_name: str, max_scan_rows: int = 250) -> pd.DataFrame | None:
    """Read one worksheet as a table, locating the header row heuristically.

    The first ``max_scan_rows`` rows are scanned and the row mentioning the
    most header keywords (contract, pbp, rebate, risk, payment, ...) is taken
    as the header; on a tie the earliest row wins. Every non-blank row below
    it becomes a data row. All cell values are returned as strings, with
    empty cells as ``""``.

    The workbook is opened in openpyxl's read-only mode and is always closed
    before returning, including on the early ``None`` returns and on errors.

    Args:
        path: Path of the ``.xlsx`` workbook.
        sheet_name: Name of the worksheet to read.
        max_scan_rows: Number of leading rows searched for the header row.

    Returns:
        A frame of strings with normalized column names, or ``None`` when the
        sheet does not exist, is empty, has no row containing a header
        keyword, or has no non-blank data under the header.
    """
    keys = ["contract", "contract number", "pbp", "plan benefit", "plan_benefit", "rebate", "risk", "payment"]

    def as_text(row) -> list[str]:
        return [("" if v is None else str(v)) for v in row]

    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    try:
        if sheet_name not in wb.sheetnames:
            return None
        ws = wb[sheet_name]

        # Pass 1: collect the leading rows used for header detection.
        rows = []
        for i, row in enumerate(ws.iter_rows(values_only=True)):
            if i >= max_scan_rows:
                break
            rows.append(as_text(row))

        if len(rows) == 0:
            return None

        best_i = None
        best_hits = -1
        for i, r in enumerate(rows):
            txt = " ".join(r).lower()
            hits = sum(k in txt for k in keys)
            if hits > best_hits:
                best_hits = hits
                best_i = i

        if best_i is None or best_hits <= 0:
            return None

        # Pass 2: read everything below the header. best_i is 0-based while
        # min_row is 1-based, so the first data row is best_i + 2.
        data_rows = []
        for row in ws.iter_rows(min_row=best_i + 2, values_only=True):
            r = as_text(row)
            if all(x.strip() == "" for x in r):
                continue
            data_rows.append(r)
    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()

    if len(data_rows) == 0:
        return None

    header = [norm_colname(x) for x in rows[best_i]]
    header = [("col" + str(j) if h == "" else h) for j, h in enumerate(header)]

    # Pad the header and the rows to a common width so the frame is rectangular.
    max_len = max(len(header), max(len(r) for r in data_rows))
    header = header + [f"col_extra_{j}" for j in range(len(header), max_len)]
    data_rows = [r + [""] * (max_len - len(r)) for r in data_rows]

    df = pd.DataFrame(data_rows, columns=header)
    df = df.dropna(how="all")
    df = normalize_columns(df)

    # Drop columns that hold nothing but blanks.
    keep_cols = [c for c in df.columns if df[c].astype(str).str.strip().replace({"nan": ""}).ne("").any()]
    df = df[keep_cols]

    if df.shape[0] == 0 or df.shape[1] == 0:
        return None

    return df

def pick_best_sheet(path: Path) -> tuple[str, pd.DataFrame, int]:
    """Pick the worksheet of ``path`` that best matches the plan-level payment table.

    Each sheet is read with ``read_sheet_as_table_openpyxl`` and scored with
    ``score_cols``; the highest score wins and the earliest sheet wins ties.
    A sheet that raises while being read or scored is skipped, but its error
    is kept and reported if no sheet turns out to be usable.

    Args:
        path: Path of the ``.xlsx`` workbook.

    Returns:
        A tuple ``(sheet_name, table, score)`` for the best sheet.

    Raises:
        RuntimeError: If no sheet yields a table. The message includes the
            errors of any sheets that failed.
    """
    # Only the sheet names are needed here; close the file right away.
    with pd.ExcelFile(path, engine="openpyxl") as xls:
        sheet_names = list(xls.sheet_names)

    best = None  # (score, sheet, df)
    failures = []

    for sh in sheet_names:
        try:
            df = read_sheet_as_table_openpyxl(path, sh)
            if df is None:
                continue
            sc = score_cols(list(df.columns))
        except Exception as exc:
            # Best effort: one unreadable sheet must not hide a usable one.
            failures.append(f"{sh}: {type(exc).__name__}: {exc}")
            continue
        if best is None or sc > best[0]:
            best = (sc, sh, df)

    if best is None:
        detail = ("; sheet errors: " + " | ".join(failures)) if failures else ""
        raise RuntimeError(f"Could not find a usable sheet/table in {path.name}{detail}")

    return best[1], best[2], best[0]

In [5]:
def pick_partc_plan_workbook(year_dir: Path, year: int) -> Path:
    """Choose the Part C plan-level workbook among the ``.xlsx`` files under ``year_dir``.

    Files are searched recursively; names containing ``~$`` are ignored.
    Preference order:

    1. names containing the year, ``partc``/``part_c``, ``plan`` and ``level``;
    2. names containing the year, ``partc``/``part_c`` and ``plan``;
    3. any workbook.

    The keywords are matched case-insensitively. Within the first non-empty
    tier the file with the shortest name is returned, ties broken by the
    lowercased name.

    Args:
        year_dir: Directory holding the extracted files for one year.
        year: The payment year, which must appear in the file name for
            tiers 1 and 2.

    Returns:
        Path of the selected workbook.

    Raises:
        FileNotFoundError: If ``year_dir`` contains no usable ``.xlsx`` file.
    """
    workbooks = [f for f in year_dir.rglob("*.xlsx") if f.is_file() and "~$" not in f.name]
    if not workbooks:
        raise FileNotFoundError(f"No xlsx files in {year_dir}")

    year_tag = str(year)

    def name_rank(f: Path):
        return (len(f.name), f.name.lower())

    def is_partc_plan(f: Path) -> bool:
        low = f.name.lower()
        return year_tag in f.name and ("partc" in low or "part_c" in low) and "plan" in low

    def is_partc_plan_level(f: Path) -> bool:
        return is_partc_plan(f) and "level" in f.name.lower()

    # Try the strictest naming rule first, then relax it.
    for accepts in (is_partc_plan_level, is_partc_plan):
        matches = [f for f in workbooks if accepts(f)]
        if matches:
            return min(matches, key=name_rank)

    return min(workbooks, key=name_rank)

In [6]:
def load_planlevel_payment(year: int) -> pd.DataFrame:
    """Load the plan-level CMS payment table for one year in a standard layout.

    The year's archive is extracted (or reused from the cache), the Part C
    plan-level workbook and its best-matching sheet are selected, and the
    relevant columns are cleaned and renamed.

    The contract and plan columns are accepted under every name that
    ``score_cols`` treats as a key column, so a sheet that qualifies there
    is not rejected here.

    Args:
        year: The payment year to load.

    Returns:
        A frame with the columns ``contract_id``, ``plan_id``,
        ``rebate_pmpm``, ``ab_pmpm_payment``, ``risk_score``, ``year``,
        ``source_file`` and ``source_sheet``. ``ab_pmpm_payment`` and
        ``risk_score`` are all-NaN when the sheet has no such column. Rows
        without a contract or plan id are dropped, and only the first row is
        kept for each (contract_id, plan_id, year).

    Raises:
        KeyError: If the sheet lacks a contract column, a plan column or a
            rebate column.
    """
    year_dir = extract_payment_zip(year)
    wb = pick_partc_plan_workbook(year_dir, year)

    sh, df, _score = pick_best_sheet(wb)
    df = normalize_columns(df)
    cols = list(df.columns)
    cset = set(cols)

    def first_present(*candidates):
        """Return the first candidate that is a column of the sheet, else None."""
        return next((name for name in candidates if name in cset), None)

    # Candidates are listed in order of preference.
    contract_col = first_present("contract_number", "contract_id", "contract")
    plan_col = first_present("plan_benefit_package", "pbp", "plan_id", "planbenefitpackage")

    rebate_col = first_present("average_rebate_pmpm_payment", "rebate")
    abpay_col = first_present("average_ab_pmpm_payment")
    risk_col = first_present("average_part_c_risk_score", "average_part_risk_score")

    if contract_col is None or plan_col is None:
        raise KeyError(f"Missing contract or plan cols in {wb.name} sheet {sh}. Cols: {cols[:80]}")
    if rebate_col is None:
        raise KeyError(f"Missing rebate column in {wb.name} sheet {sh}. Cols: {cols[:80]}")

    out = pd.DataFrame()
    out["contract_id"] = clean_contract_id(df[contract_col])
    out["plan_id"]     = clean_plan_id(df[plan_col])
    out["rebate_pmpm"] = to_num(df[rebate_col])
    out["ab_pmpm_payment"] = to_num(df[abpay_col]) if abpay_col is not None else np.nan
    out["risk_score"] = to_num(df[risk_col]) if risk_col is not None else np.nan

    out["year"] = int(year)
    out["source_file"] = wb.name
    out["source_sheet"] = sh
    out = out.dropna(subset=["contract_id", "plan_id"])
    out = out.drop_duplicates(subset=["contract_id", "plan_id", "year"]).reset_index(drop=True)
    return out

In [7]:
# Load the plan-level payment table for every year in YEARS, report where
# each one came from, then stack them into a single frame and save it.
pay_list = []
for y in YEARS:
    print("Loading cms payment plan level:", y)
    d = load_planlevel_payment(y)
    first_row = d.iloc[0]
    print(" ", y, "rows", len(d), "file", first_row["source_file"], "sheet", first_row["source_sheet"])
    pay_list.append(d)
    gc.collect()

cms_pay_planlevel = pd.concat(pay_list, ignore_index=True)
print("cms_pay_planlevel shape:", cms_pay_planlevel.shape)

cms_pay_out = OUT_DIR / "cms_payment_planlevel_2014_2019.csv"
cms_pay_planlevel.to_csv(cms_pay_out, index=False)
print("Wrote:", cms_pay_out)

Loading cms payment plan level: 2014
  2014 rows 2825 file 2014PartCPlan Level.xlsx sheet result.srx
Loading cms payment plan level: 2015
  2015 rows 2742 file 2015PartCPlanLevel.xlsx sheet result.srx
Loading cms payment plan level: 2016
  2016 rows 2774 file 2016PartCPlanLevel.xlsx sheet 2016
Loading cms payment plan level: 2017
  2017 rows 2810 file 2017PartCPlanLevel.xlsx sheet 2017
Loading cms payment plan level: 2018
  2018 rows 3129 file 2018PartCPlanLevel.xlsx sheet result.srx
Loading cms payment plan level: 2019
  2019 rows 3617 file 2019PartCPlanLevel.xlsx sheet result.html
cms_pay_planlevel shape: (17897, 8)
Wrote: data/processed/cms_payment_planlevel_2014_2019.csv


In [8]:
# Derive the plan-level bid as the A/B payment minus the rebate (both PMPM),
# working on a copy so cms_pay_planlevel itself is left untouched.
b = cms_pay_planlevel.assign(
    ab_pmpm_payment=lambda frame: pd.to_numeric(frame["ab_pmpm_payment"], errors="coerce"),
    rebate_pmpm=lambda frame: pd.to_numeric(frame["rebate_pmpm"], errors="coerce"),
)
b["bid_pmpm"] = b["ab_pmpm_payment"] - b["rebate_pmpm"]

# Coverage check: share of rows with a value in each measure.
print("Bid nonmissing share:", b["bid_pmpm"].notna().mean())
print("Ab payment nonmissing share:", b["ab_pmpm_payment"].notna().mean())
print("Rebate nonmissing share:", b["rebate_pmpm"].notna().mean())

plan_bids_out = OUT_DIR / "plan_bids_pmpm_2014_2019.csv"
b.to_csv(plan_bids_out, index=False)
print("Wrote:", plan_bids_out)

Bid nonmissing share: 1.0
Ab payment nonmissing share: 1.0
Rebate nonmissing share: 1.0
Wrote: data/processed/plan_bids_pmpm_2014_2019.csv


In [9]:
# Write separate bid files for 2014 and 2018.
b2014_out = OUT_DIR / "plan_bids_pmpm_2014.csv"
b2018_out = OUT_DIR / "plan_bids_pmpm_2018.csv"

b2014 = b.loc[b["year"] == 2014].copy()
b2018 = b.loc[b["year"] == 2018].copy()

b2014.to_csv(b2014_out, index=False)
print("Wrote:", b2014_out, "rows", len(b2014), "bid nonmissing", b2014["bid_pmpm"].notna().mean())

b2018.to_csv(b2018_out, index=False)
print("Wrote:", b2018_out, "rows", len(b2018), "bid nonmissing", b2018["bid_pmpm"].notna().mean())

Wrote: data/processed/plan_bids_pmpm_2014.csv rows 2825 bid nonmissing 1.0
Wrote: data/processed/plan_bids_pmpm_2018.csv rows 3129 bid nonmissing 1.0


In [10]:
# Attach the 2018 plan-level bid to every plan-county row of the cached
# 2018 enrollment panel.
pc18_path = CACHE_DIR / "plan_county_year_2018.csv"
if not pc18_path.exists():
    raise FileNotFoundError(
        f"Missing {pc18_path}. Run hw2_enrollment first so this cache file exists."
    )

# Everything is read as text so identifiers keep their leading zeros.
pc18 = pd.read_csv(pc18_path, dtype=str, low_memory=False)
pc18 = normalize_columns(pc18)

for c in ["avg_enrollment", "dec_enrollment", "months_observed", "drop_hw2"]:
    if c in pc18.columns:
        pc18[c] = pd.to_numeric(pc18[c], errors="coerce")

# Pad FIPS codes to five characters, but keep missing codes missing:
# astype(str) would turn NaN into the text "nan" and zfill it into "00nan",
# which would then be aggregated as if it were a real county.
fips_txt = pc18["fips"].str.strip()
pc18["fips"] = fips_txt.where(fips_txt.str.len() > 0).str.zfill(5)
pc18["year"] = pd.to_numeric(pc18["year"], errors="coerce").astype(int)
pc18["contract_id"] = clean_contract_id(pc18["contract_id"])
pc18["plan_id"] = clean_plan_id(pc18["plan_id"])

pc18 = pc18[pc18["drop_hw2"] != 1].copy()

# Enrollment weight: December enrollment, else average enrollment, else 0.
e = pc18["dec_enrollment"].copy()
e = e.fillna(pc18["avg_enrollment"])
e = e.fillna(0.0)
pc18["enroll_for_weight"] = e

b18 = b[b["year"] == 2018].copy()
b18 = b18[["contract_id", "plan_id", "year", "bid_pmpm"]].copy()
b18["contract_id"] = clean_contract_id(b18["contract_id"])
b18["plan_id"] = clean_plan_id(b18["plan_id"])
b18["year"] = pd.to_numeric(b18["year"], errors="coerce").astype(int)

# Each plan-county row must match at most one bid row; validate= raises
# instead of silently duplicating rows if the bid table has repeated keys.
pc18m = pc18.merge(
    b18,
    on=["contract_id", "plan_id", "year"],
    how="left",
    validate="many_to_one",
)

print("pc18 rows:", pc18.shape, "merged rows:", pc18m.shape, "bid missing share:", pc18m["bid_pmpm"].isna().mean())

pc18 rows: (1366487, 12) merged rows: (1366487, 13) bid missing share: 0.6062465285070403


In [11]:
# Aggregate plan-county bids to county level: row count, simple mean, and
# enrollment-weighted mean (left missing where the total weight is zero).
tmp = (
    pc18m.dropna(subset=["bid_pmpm"])
    .assign(w=lambda rows: pd.to_numeric(rows["enroll_for_weight"], errors="coerce").fillna(0.0))
    .assign(wx=lambda rows: rows["w"] * pd.to_numeric(rows["bid_pmpm"], errors="coerce"))
)

county_bid_2018 = tmp.groupby(["fips", "year"], as_index=False).agg(
    n_plans=("bid_pmpm", "count"),
    bid_mean_unweighted=("bid_pmpm", "mean"),
    w_sum=("w", "sum"),
    wx_sum=("wx", "sum"),
)

# A zero total weight becomes NaN so the division yields NaN instead of inf.
weight_total = county_bid_2018["w_sum"].replace({0: np.nan})
county_bid_2018["bid_mean_weighted"] = county_bid_2018["wx_sum"] / weight_total

county_bid_out = OUT_DIR / "county_bid_2018.csv"
county_bid_2018.to_csv(county_bid_out, index=False)
print("Wrote:", county_bid_out, "rows", len(county_bid_2018))

Wrote: data/processed/county_bid_2018.csv rows 3225


In [12]:
# Merge the 2018 county bids with county HHI and label each county by its
# position in the HHI distribution (cut at the 33rd and 66th percentiles).
hhi_path = Path("data/processed/county_hhi_ma_share_2014_2019.csv")
if not hhi_path.exists():
    raise FileNotFoundError(f"Missing {hhi_path}. Run hw2_enrollment first.")

hhi = pd.read_csv(hhi_path, dtype=str, low_memory=False)
hhi = normalize_columns(hhi)
hhi["fips"] = hhi["fips"].astype(str).str.zfill(5)
hhi["year"] = pd.to_numeric(hhi["year"], errors="coerce").astype(int)
hhi["hhi"] = pd.to_numeric(hhi["hhi"], errors="coerce")

hhi18 = hhi[hhi["year"] == 2018].copy()

merged18 = county_bid_2018.merge(hhi18[["fips", "year", "hhi"]], on=["fips", "year"], how="left")

# Quantiles skip missing values, so the cutoffs describe counties with an HHI.
p33 = merged18["hhi"].quantile(0.33)
p66 = merged18["hhi"].quantile(0.66)

market_labels = np.where(
    merged18["hhi"] <= p33,
    "competitive",
    np.where(merged18["hhi"] >= p66, "uncompetitive", "middle")
)

# A missing HHI fails both comparisons above and would fall through to
# "middle". Leave market_type missing for those counties instead.
hhi_known = merged18["hhi"].notna()
merged18["market_type"] = pd.Series(market_labels, index=merged18.index, dtype="object").where(hhi_known)

out18 = OUT_DIR / "county_bid_hhi_2018.csv"
merged18.to_csv(out18, index=False)

print("Wrote:", out18, "rows", len(merged18))
print("HHI cutoffs p33:", p33, "p66:", p66)
print("Counties without HHI (market_type left missing):", int((~hhi_known).sum()))
print(merged18["market_type"].value_counts(dropna=False))

Wrote: data/processed/county_bid_hhi_2018.csv rows 3225
HHI cutoffs p33: 0.14846246259672544 p66: 0.23219048935982964
market_type
middle           1183
uncompetitive    1036
competitive      1006
Name: count, dtype: int64


In [13]:
# List the files this notebook writes, and the cache directory that should
# stay out of version control.
print("Notebook 1c outputs:")
for output_name in (
    "cms_payment_planlevel_2014_2019.csv",
    "plan_bids_pmpm_2014_2019.csv",
    "plan_bids_pmpm_2014.csv",
    "plan_bids_pmpm_2018.csv",
    "county_bid_2018.csv",
    "county_bid_hhi_2018.csv",
):
    print(" ", OUT_DIR / output_name)

print("Do not push:")
print(" ", CACHE_DIR / "cms_payment_extracted")

Notebook 1c outputs:
  data/processed/cms_payment_planlevel_2014_2019.csv
  data/processed/plan_bids_pmpm_2014_2019.csv
  data/processed/plan_bids_pmpm_2014.csv
  data/processed/plan_bids_pmpm_2018.csv
  data/processed/county_bid_2018.csv
  data/processed/county_bid_hhi_2018.csv
Do not push:
  data/cache/cms_payment_extracted
