In [2]:
from __future__ import annotations

import re
from pathlib import Path
import sys

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 180)

def pick_existing(paths: list[Path]) -> Path:
    for p in paths:
        if p.exists():
            return p
    raise FileNotFoundError("No candidate path exists\n" + "\n".join(map(str, paths)))

CANDIDATE_MA_ROOTS = [
    Path("/scion/5261/econ470001/ma-data/ma"),
    Path("/econ470/a0/work/ma-data/ma"),
    Path.cwd().parent / "ma-data" / "ma",
]

MA_ROOT = pick_existing(CANDIDATE_MA_ROOTS)
LANDSCAPE_DIR = pick_existing([MA_ROOT / "landscape" / "Extracted Data"])
FFSCOST_ROOT  = pick_existing([MA_ROOT.parent / "ffs-costs"])
CMSPAY_DIR    = MA_ROOT / "cms-payment"

OUTPUT_DIR = Path("data/output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("MA_ROOT", MA_ROOT)
print("LANDSCAPE_DIR", LANDSCAPE_DIR)
print("FFSCOST_ROOT", FFSCOST_ROOT)
print("CMSPAY_DIR exists", CMSPAY_DIR.exists())
print("OUTPUT_DIR", OUTPUT_DIR.resolve())

YEARS = list(range(2014, 2020))
print("Target years", YEARS)

MA_ROOT /scion/5261/econ470001/ma-data/ma
LANDSCAPE_DIR /scion/5261/econ470001/ma-data/ma/landscape/Extracted Data
FFSCOST_ROOT /scion/5261/econ470001/ma-data/ffs-costs
CMSPAY_DIR exists True
OUTPUT_DIR /home/rpat638/econ470/a0/work/hwk2/data/output
Target years [2014, 2015, 2016, 2017, 2018, 2019]


In [3]:
def norm_colname(s: str) -> str:
    s = str(s).strip().lower()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return s

def parse_year_from_name(p: Path) -> int | None:
    m = re.search(r"(20\d{2})", p.name)
    return int(m.group(1)) if m else None

def detect_header_row_csv(path: Path, nrows: int = 100) -> int:
    preview = pd.read_csv(
        path,
        header=None,
        nrows=nrows,
        dtype=str,
        encoding_errors="replace",
        engine="python",
        on_bad_lines="skip",
    ).fillna("").astype(str)

    keys = [
        "state", "county", "contract", "contract id",
        "plan", "plan id", "pbp",
        "organization", "bid", "benchmark", "premium",
        "code", "enrollment", "reimbursement", "per capita",
    ]

    best_i = 0
    best_hits = -1
    for i in range(preview.shape[0]):
        row = " ".join(preview.iloc[i].tolist()).lower()
        hits = sum(1 for k in keys if k in row)
        if hits > best_hits:
            best_hits = hits
            best_i = i
    return best_i

def read_csv_autoheader(path: Path) -> pd.DataFrame:
    header_row = detect_header_row_csv(path)
    df = pd.read_csv(
        path,
        skiprows=header_row,
        header=0,
        dtype=str,
        encoding_errors="replace",
        engine="python",
        on_bad_lines="skip",
    )
    df.columns = [norm_colname(c) for c in df.columns]
    df = df.dropna(how="all")
    return df

def first_col(cols: list[str], needles: list[str]) -> str | None:
    cols_l = [c.lower() for c in cols]
    for n in needles:
        for i, c in enumerate(cols_l):
            if n in c:
                return cols[i]
    return None

def to_num(s: pd.Series) -> pd.Series:
    return pd.to_numeric(
        s.astype(str)
         .str.replace(",", "", regex=False)
         .str.replace("$", "", regex=False)
         .str.replace("%", "", regex=False)
         .str.strip(),
        errors="coerce",
    )

In [4]:
def landscape_files_for_year(root: Path, year: int) -> list[Path]:
    files = sorted([p for p in root.rglob("*.csv") if p.is_file()])
    hits = []
    for p in files:
        y = parse_year_from_name(p)
        if y != year:
            continue
        n = p.name.lower()
        if "sanction" in n or "importantnotes" in n or "partd" in n or "part_d" in n or "premium" in n:
            continue
        hits.append(p)
    return hits

for y in YEARS:
    lf = landscape_files_for_year(LANDSCAPE_DIR, y)
    print(y, "landscape files", len(lf))
    for p in lf[:4]:
        print(" ", p.name)

2014 landscape files 4
  2014LandscapeSource file MA_AtoM 05292014.csv
  2014LandscapeSource file MA_NtoW 05292014.csv
  508_AlabamatoMontana 05292014.csv
  508_NebraskatoWyoming 05292014.csv
2015 landscape files 4
  2015LandscapeSource file MA_AtoM 11042014.csv
  2015LandscapeSource file MA_NtoW 11042014.csv
  508_AlabamatoMontana 03182015.csv
  508_NebraskatoWyoming 03182015.csv
2016 landscape files 6
  2016LandscapeSource file MA_AtoM 04222016.csv
  2016LandscapeSource file MA_NtoW 04222016.csv
  508_AlabamatoMontana 04222016.csv
  508_NebraskatoWyoming 04222016.csv
2017 landscape files 4
  2017LandscapeSource file MA_AtoM 10182016.csv
  2017LandscapeSource file MA_NtoW 10182016.csv
  508_AlabamatoMontana 09222017.csv
  508_NebraskatoWyoming 09222017.csv
2018 landscape files 4
  2018LandscapeSource file MA_AtoM 10142017.csv
  2018LandscapeSource file MA_NtoW 10142017.csv
  508_AlabamatoMontana 10012018.csv
  508_NebraskatoWyoming 10012018.csv
2019 landscape files 4
  2019LandscapeSo

In [5]:
def build_landscape_clean(years: list[int]) -> pd.DataFrame:
    out = []
    for y in years:
        files = landscape_files_for_year(LANDSCAPE_DIR, y)
        if not files:
            print("No landscape CSVs found for", y)
            continue

        chunks = []
        for p in files:
            df = read_csv_autoheader(p)
            df["year"] = y
            df["source_file"] = p.name
            chunks.append(df)
            print("loaded", y, p.name, "shape", df.shape)

        d = pd.concat(chunks, ignore_index=True)
        cols = list(d.columns)

        contract_col = first_col(cols, ["contract_id", "contractid", "contract"])
        planid_col   = first_col(cols, ["plan_id", "planid", "pbp"])
        if planid_col is None:
            planid_col = first_col(cols, ["plan_id_"])
        if planid_col is None:
            planid_col = first_col(cols, ["plan"])
        state_col  = first_col(cols, ["state"])
        county_col = first_col(cols, ["county"])
        bid_col    = first_col(cols, ["bid"])
        bmk_col    = first_col(cols, ["benchmark"])
        org_col    = first_col(cols, ["organization", "org"])
        pname_col  = first_col(cols, ["plan_name", "planname"])

        if contract_col is None or planid_col is None:
            print("Year", y, "missing contract or plan columns")
            print("First 80 columns", cols[:80])
            continue

        dd = pd.DataFrame()
        dd["contract_id"] = d[contract_col].astype(str).str.strip()
        dd["plan_id"]     = d[planid_col].astype(str).str.strip()
        dd["state"]       = d[state_col] if state_col is not None else np.nan
        dd["county"]      = d[county_col] if county_col is not None else np.nan
        dd["organization_name"] = d[org_col] if org_col is not None else np.nan
        dd["plan_name"]   = d[pname_col] if pname_col is not None else np.nan
        dd["bid"]         = to_num(d[bid_col]) if bid_col is not None else np.nan
        dd["benchmark"]   = to_num(d[bmk_col]) if bmk_col is not None else np.nan
        dd["year"]        = d["year"]
        dd["source_file"] = d["source_file"]

        out.append(dd)

    if not out:
        raise RuntimeError("No landscape data loaded")
    return pd.concat(out, ignore_index=True)

landscape_clean = build_landscape_clean(YEARS)
print("landscape_clean shape", landscape_clean.shape)
print("years present", sorted(landscape_clean["year"].unique().tolist()))
landscape_clean.head()

loaded 2014 2014LandscapeSource file MA_AtoM 05292014.csv shape (15444, 17)
loaded 2014 2014LandscapeSource file MA_NtoW 05292014.csv shape (19536, 17)
loaded 2014 508_AlabamatoMontana 05292014.csv shape (15855, 29)
loaded 2014 508_NebraskatoWyoming 05292014.csv shape (20301, 29)
loaded 2015 2015LandscapeSource file MA_AtoM 11042014.csv shape (15881, 17)
loaded 2015 2015LandscapeSource file MA_NtoW 11042014.csv shape (17695, 17)
loaded 2015 508_AlabamatoMontana 03182015.csv shape (16662, 28)
loaded 2015 508_NebraskatoWyoming 03182015.csv shape (5675, 28)
loaded 2016 2016LandscapeSource file MA_AtoM 04222016.csv shape (16096, 18)
loaded 2016 2016LandscapeSource file MA_NtoW 04222016.csv shape (18122, 18)
loaded 2016 508_AlabamatoMontana 04222016.csv shape (17046, 28)
loaded 2016 508_NebraskatoWyoming 04222016.csv shape (20396, 28)
loaded 2016 508_AlabamatoMontana 10182016.csv shape (17653, 30)
loaded 2016 508_NebraskatoWyoming 10182016.csv shape (20213, 28)
loaded 2017 2017LandscapeSour

Unnamed: 0,contract_id,plan_id,state,county,organization_name,plan_name,bid,benchmark,year,source_file
0,H0104,10,Alabama,Autauga,Blue Advantage (PPO),Blue Advantage Premier (PPO),,,2014,2014LandscapeSource file MA_AtoM 05292014.csv
1,H0104,11,Alabama,Autauga,Blue Advantage (PPO),Blue Advantage Complete (PPO),,,2014,2014LandscapeSource file MA_AtoM 05292014.csv
2,H0150,1,Alabama,Autauga,Cigna-HealthSpring,Cigna-HealthSpring Preferred (HMO),,,2014,2014LandscapeSource file MA_AtoM 05292014.csv
3,H0150,12,Alabama,Autauga,Cigna-HealthSpring,Cigna-HealthSpring Advantage (HMO),,,2014,2014LandscapeSource file MA_AtoM 05292014.csv
4,H0151,1,Alabama,Autauga,UnitedHealthcare,AARP MedicareComplete Plan 1 (HMO),,,2014,2014LandscapeSource file MA_AtoM 05292014.csv


In [6]:
landscape_out = OUTPUT_DIR / "landscape_clean_2014_2019.csv"
landscape_clean.to_csv(landscape_out, index=False)
print("Wrote", landscape_out, "rows", len(landscape_clean))

Wrote data/output/landscape_clean_2014_2019.csv rows 502809


In [7]:
try:
    import openpyxl  # noqa
    print("openpyxl already available")
except Exception as e:
    print("openpyxl missing, installing with pip")
    !{sys.executable} -m pip install --user openpyxl

openpyxl already available


In [8]:
def build_fips_from_code(code_series: pd.Series) -> pd.Series:
    code = code_series.astype(str).str.strip()
    code = code.str.replace(r"\.0$", "", regex=True)
    code = code.str.zfill(5)
    return code

def load_ffs_from_master(master_path: Path) -> pd.DataFrame:
    df = read_csv_autoheader(master_path)
    cols = list(df.columns)

    code_col = first_col(cols, ["code"])
    year_col = first_col(cols, ["year"])
    a_en_col = first_col(cols, ["part_a_enrollment"])
    b_en_col = first_col(cols, ["part_b_enrollment"])
    a_rb_col = first_col(cols, ["part_a_total_reimbursement"])
    b_rb_col = first_col(cols, ["part_b_total_reimbursement"])

    if code_col is None or year_col is None:
        raise KeyError("FFS master missing code or year")
    if a_en_col is None or b_en_col is None or a_rb_col is None or b_rb_col is None:
        raise KeyError("FFS master missing A or B enrollment or reimbursement columns")

    tmp = df.copy()
    tmp["fips"] = build_fips_from_code(tmp[code_col])
    tmp["year"] = pd.to_numeric(tmp[year_col], errors="coerce").astype("Int64")

    denom = (to_num(tmp[a_en_col]) + to_num(tmp[b_en_col])).replace(0, np.nan)
    tmp["ffs_cost"] = (to_num(tmp[a_rb_col]) + to_num(tmp[b_rb_col])) / denom

    out = tmp[["fips", "year", "ffs_cost"]].dropna(subset=["year", "ffs_cost"]).copy()
    out["year"] = out["year"].astype(int)
    return out

MASTER_PATH = FFSCOST_ROOT / "CMS FFS Costs.csv"
print("MASTER_PATH exists", MASTER_PATH.exists())

ffs_master_cost = None
if MASTER_PATH.exists():
    ffs_master_cost = load_ffs_from_master(MASTER_PATH)
    print("Master years", sorted(ffs_master_cost["year"].unique().tolist()))
else:
    print("No master found")

MASTER_PATH exists True
Master years [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]


In [9]:
def detect_header_row_excel(path: Path, nrows: int = 80) -> int:
    preview = pd.read_excel(path, header=None, nrows=nrows, engine="openpyxl").fillna("").astype(str)
    keys = ["code", "state", "county", "part a", "part b", "enrollment", "reimbursement", "per capita"]
    best_i = 0
    best_hits = -1
    for i in range(preview.shape[0]):
        row = " ".join(preview.iloc[i].tolist()).lower()
        hits = sum(1 for k in keys if k in row)
        if hits > best_hits:
            best_hits = hits
            best_i = i
    return best_i

def read_excel_autoheader(path: Path) -> pd.DataFrame:
    header_row = detect_header_row_excel(path)
    df = pd.read_excel(path, skiprows=header_row, header=0, engine="openpyxl", dtype=str)
    df.columns = [norm_colname(c) for c in df.columns]
    df = df.dropna(how="all")
    return df

def find_ffs_excel(ffscost_root: Path, year: int) -> Path | None:
    extracted = ffscost_root / "Extracted Data"
    yy = str(year)[-2:]
    pats = [f"*FFS{yy}*.xlsx", f"*ffs{yy}*.xlsx", f"*FFS{year}*.xlsx", f"*ffs{year}*.xlsx"]
    hits = []
    for pat in pats:
        hits.extend(list(extracted.rglob(pat)))
    hits = [p for p in hits if p.is_file() and "~$" not in p.name]
    if not hits:
        return None
    hits = sorted(hits, key=lambda p: (len(str(p)), str(p).lower()))
    return hits[0]

def load_ffs_from_excel_year(ffscost_root: Path, year: int) -> pd.DataFrame:
    xlsx_path = find_ffs_excel(ffscost_root, year)
    if xlsx_path is None:
        raise FileNotFoundError(f"Missing Excel for {year} under {ffscost_root}/Extracted Data")

    df = read_excel_autoheader(xlsx_path)
    cols = list(df.columns)

    code_col = first_col(cols, ["code"])
    a_en_col = first_col(cols, ["part_a_enrollment"])
    b_en_col = first_col(cols, ["part_b_enrollment"])
    a_rb_col = first_col(cols, ["part_a_total_reimbursement"])
    b_rb_col = first_col(cols, ["part_b_total_reimbursement"])

    if code_col is None:
        raise KeyError("Excel missing code column")
    if a_en_col is None or b_en_col is None or a_rb_col is None or b_rb_col is None:
        raise KeyError("Excel missing A or B enrollment or reimbursement columns")

    tmp = df.copy()
    tmp["fips"] = build_fips_from_code(tmp[code_col])
    tmp["year"] = year

    denom = (to_num(tmp[a_en_col]) + to_num(tmp[b_en_col])).replace(0, np.nan)
    tmp["ffs_cost"] = (to_num(tmp[a_rb_col]) + to_num(tmp[b_rb_col])) / denom

    out = tmp[["fips", "year", "ffs_cost"]].dropna(subset=["ffs_cost"]).copy()
    print("Loaded", year, "from", xlsx_path)
    print("Rows", len(out), "ffs_cost nonmissing share", out["ffs_cost"].notna().mean())
    return out

In [10]:
pieces = []

if ffs_master_cost is not None:
    pieces.append(ffs_master_cost[ffs_master_cost["year"].isin(YEARS)].copy())

for y in [2016, 2017, 2018, 2019]:
    pieces.append(load_ffs_from_excel_year(FFSCOST_ROOT, y))

ffs_cost = pd.concat(pieces, ignore_index=True)
ffs_cost = ffs_cost.drop_duplicates(subset=["fips", "year"]).copy()
ffs_cost = ffs_cost[ffs_cost["year"].isin(YEARS)].copy()

print("ffs_cost shape", ffs_cost.shape)
print("years present", sorted(ffs_cost["year"].unique().tolist()))
ffs_cost.head()

Loaded 2016 from /scion/5261/econ470001/ma-data/ffs-costs/Extracted Data/FFS16.xlsx
Rows 3220 ffs_cost nonmissing share 1.0
Loaded 2017 from /scion/5261/econ470001/ma-data/ffs-costs/Extracted Data/ffs2017/FFS17.xlsx
Rows 3221 ffs_cost nonmissing share 1.0
Loaded 2018 from /scion/5261/econ470001/ma-data/ffs-costs/Extracted Data/FFS2018/FFS18.xlsx
Rows 3221 ffs_cost nonmissing share 1.0
Loaded 2019 from /scion/5261/econ470001/ma-data/ffs-costs/Extracted Data/FFS2019/FFS19.xlsx
Rows 3221 ffs_cost nonmissing share 1.0
ffs_cost shape (19318, 3)
years present [2014, 2015, 2016, 2017, 2018, 2019]


Unnamed: 0,fips,year,ffs_cost
0,1000,2014,3740.899419
1,1010,2014,3654.810516
2,1020,2014,4060.929493
3,1030,2014,4338.15239
4,1040,2014,3776.224337


In [11]:
ffs_out = OUTPUT_DIR / "ffs_cost_2014_2019.csv"
ffs_cost.to_csv(ffs_out, index=False)
print("Wrote", ffs_out, "rows", len(ffs_cost))

Wrote data/output/ffs_cost_2014_2019.csv rows 19318


In [12]:
def quartile_codes(x: pd.Series) -> pd.Series:
    x = pd.to_numeric(x, errors="coerce").replace([np.inf, -np.inf], np.nan)
    cats = pd.qcut(x, 4, duplicates="drop")
    codes = cats.cat.codes.replace(-1, np.nan) + 1
    return codes.astype("Int64")

ffs18 = ffs_cost[ffs_cost["year"] == 2018].copy()
print("ffs18 rows", len(ffs18))

ffs18["ffs_quartile"] = quartile_codes(ffs18["ffs_cost"])
print(ffs18["ffs_quartile"].value_counts(dropna=False).sort_index())

ffs18_out = OUTPUT_DIR / "ffs_cost_2018_with_quartiles.csv"
ffs18[["fips", "year", "ffs_cost", "ffs_quartile"]].to_csv(ffs18_out, index=False)
print("Wrote", ffs18_out, "rows", len(ffs18))

ffs18 rows 3221
ffs_quartile
1    806
2    805
3    805
4    805
Name: count, dtype: Int64
Wrote data/output/ffs_cost_2018_with_quartiles.csv rows 3221


In [13]:
print("DataFrames you should have now")
objs = {k: v for k, v in globals().items() if isinstance(v, pd.DataFrame)}
print(sorted(objs.keys()))
print("Files in output")
print(sorted([p.name for p in OUTPUT_DIR.glob("*.csv")]))

DataFrames you should have now
['_', '_10', '_5', '__', 'ffs18', 'ffs_cost', 'ffs_master_cost', 'landscape_clean']
Files in output
['ffs_cost_2014_2019.csv', 'ffs_cost_2018_with_quartiles.csv', 'landscape_clean_2014_2019.csv']
