In [1]:
from __future__ import annotations

import re
from pathlib import Path

import numpy as np
import pandas as pd
import gc

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

YEARS = list(range(2014, 2020))

OUT_DIR = Path("data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

CANDIDATE_MA_ROOTS = [
    Path("/scion/5261/econ470001/ma-data/ma"),
    Path("/econ470/a0/work/ma-data/ma"),
    Path.cwd().parent / "ma-data" / "ma",
]

In [2]:
def pick_existing(paths: list[Path]) -> Path | None:
    for p in paths:
        if p.exists():
            return p
    return None

MA_ROOT = pick_existing(CANDIDATE_MA_ROOTS)
if MA_ROOT is None:
    raise FileNotFoundError("Could not find ma-data/ma. Update CANDIDATE_MA_ROOTS if needed.")

ENROLL_EXTRACTED  = MA_ROOT / "enrollment" / "Extracted Data"
SAREA_EXTRACTED   = MA_ROOT / "service-area" / "Extracted Data"
PEN_EXTRACTED     = MA_ROOT / "penetration" / "Extracted Data"

print("MA_ROOT", MA_ROOT)
print("ENROLL_EXTRACTED exists", ENROLL_EXTRACTED.exists())
print("SAREA_EXTRACTED exists", SAREA_EXTRACTED.exists())
print("PEN_EXTRACTED exists", PEN_EXTRACTED.exists())

MA_ROOT /scion/5261/econ470001/ma-data/ma
ENROLL_EXTRACTED exists True
SAREA_EXTRACTED exists True
PEN_EXTRACTED exists True


In [3]:
def norm_colname(s: str) -> str:
    s = s.strip().lower()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "", s)
    return s

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [norm_colname(c) for c in df.columns]
    return df

def first_existing_col(cols: list[str], candidates: list[str]) -> str | None:
    s = set(cols)
    for c in candidates:
        if c in s:
            return c
    return None

def coerce_id_series(x: pd.Series, width: int | None = None) -> pd.Series:
    x = x.astype(str).str.strip()
    x = x.replace({"nan": np.nan, "None": np.nan, "": np.nan})
    x = x.str.replace(r"\.0$", "", regex=True)
    if width is not None:
        x = x.str.zfill(width)
    return x

def to_numeric_clean(x: pd.Series) -> pd.Series:
    s = x.astype(str).str.replace(",", "", regex=False).str.strip()
    s = s.replace({"nan": np.nan, "None": np.nan, "": np.nan, "*": np.nan})
    return pd.to_numeric(s, errors="coerce")

In [4]:
def _digits_zfill(x: pd.Series, width: int) -> pd.Series:
    s = x.astype(str).str.strip()
    s = s.replace({"nan": np.nan, "None": np.nan, "": np.nan})
    s = s.str.replace(r"\.0$", "", regex=True)
    s = s.str.replace(r"[^0-9]", "", regex=True)
    s = s.where(s.str.len() > 0, np.nan)
    return s.str.zfill(width)

def coerce_fips(df: pd.DataFrame) -> pd.Series:
    cols = list(df.columns)

    fips_col = first_existing_col(
        cols,
        [
            "fips",
            "fips_state_county_code",   # your enrollment file
            "fipsstatecountycode",
            "county_fips",
            "cnty_fips",
            "county_fips_code",
            "fipscounty",
            "fipscnty",
        ],
    )
    if fips_col is not None:
        return _digits_zfill(df[fips_col], 5)

    st_col = first_existing_col(
        cols,
        ["state_fips", "statefips", "fipsst", "state_fipscode", "state_code"]
    )
    ct_col = first_existing_col(
        cols,
        ["county_fips", "countyfips", "fipscnty", "county_code", "cnty"]
    )

    if st_col is not None and ct_col is not None:
        st = _digits_zfill(df[st_col], 2)
        ct = _digits_zfill(df[ct_col], 3)
        out = (st.fillna("") + ct.fillna("")).replace({"": np.nan})
        return out

    return pd.Series([np.nan] * len(df), index=df.index, dtype="object")

In [5]:
def find_year_file(root: Path, year: int, kind: str) -> Path:
    files = [p for p in root.rglob("*") if p.is_file()]

    if kind == "penetration":
        pats = [rf".*{year}.*penet.*\.(csv|txt)$", rf".*penet.*{year}.*\.(csv|txt)$"]
    else:
        raise ValueError("unknown kind")

    for pat in pats:
        rx = re.compile(pat, flags=re.I)
        hits = [p for p in files if rx.match(p.name)]
        if hits:
            hits.sort(key=lambda p: len(p.name))
            return hits[0]

    rx = re.compile(rf".*{year}.*\.(csv|txt)$", flags=re.I)
    hits = [p for p in files if rx.match(p.name)]
    if hits:
        hits.sort(key=lambda p: len(p.name))
        return hits[0]

    raise FileNotFoundError(f"Could not find {kind} file for {year} under {root}")

def find_month_file(root: Path, year: int, month: int, kind: str) -> Path:
    m2 = f"{month:02d}"
    files = [p for p in root.rglob("*") if p.is_file()]

    if kind == "enroll":
        pats = [rf".*{year}.*{m2}.*enroll.*\.(csv|txt)$", rf".*enroll.*{year}.*{m2}.*\.(csv|txt)$"]
    elif kind == "sarea":
        pats = [rf".*{year}.*{m2}.*(service|sa|sarea).*?\.(csv|txt)$", rf".*(service|sa|sarea).*{year}.*{m2}.*?\.(csv|txt)$"]
    else:
        raise ValueError("unknown kind")

    for pat in pats:
        rx = re.compile(pat, flags=re.I)
        hits = [p for p in files if rx.match(p.name)]
        if hits:
            hits.sort(key=lambda p: len(p.name))
            return hits[0]

    rx = re.compile(rf".*{year}.*{m2}.*\.(csv|txt)$", flags=re.I)
    hits = [p for p in files if rx.match(p.name)]
    if hits:
        hits.sort(key=lambda p: len(p.name))
        return hits[0]

    raise FileNotFoundError(f"Could not find {kind} file for {year}-{m2} under {root}")

def read_csv_any(path: Path) -> pd.DataFrame:
    return pd.read_csv(
        path,
        dtype=str,
        na_values=["*", "NA", "N/A", ""],
        keep_default_na=True,
        encoding_errors="replace",
        low_memory=False,
    )

In [6]:
def standardize_keys(df: pd.DataFrame, require_plan_id: bool = True) -> pd.DataFrame:
    df = normalize_columns(df)
    cols = list(df.columns)

    contract_col = first_existing_col(cols, ["contract_id", "contractid", "contr_id", "contract", "contract_number"])
    plan_col     = first_existing_col(cols, ["plan_id", "planid", "plan", "plan_number", "pln_id"])

    if contract_col is None:
        raise KeyError(f"Missing contract id column. Sample columns {sorted(cols)[:40]}")

    out = df.copy()
    out["contract_id"] = coerce_id_series(out[contract_col], None)

    if require_plan_id:
        if plan_col is None:
            raise KeyError(f"Missing plan id column. Sample columns {sorted(cols)[:40]}")
        out["plan_id"] = coerce_id_series(out[plan_col], 3)
    else:
        if plan_col is None:
            out["plan_id"] = np.nan
        else:
            out["plan_id"] = coerce_id_series(out[plan_col], 3)

    out["fips"] = coerce_fips(out)
    return out

def pick_enrollment_col(df: pd.DataFrame) -> str:
    cols = list(df.columns)
    c = first_existing_col(
        cols,
        ["enrollment","enroll","total_enrollment","tot_enrollment","enrollment_cnt","enrollment_count","plan_enrollment"],
    )
    if c is None:
        raise KeyError(f"Missing enrollment column. Sample columns {sorted(cols)[:60]}")
    return c

def pick_plan_meta_cols(df: pd.DataFrame) -> list[str]:
    cols = list(df.columns)
    candidates = [
        "plan_type","plantype","contract_type","contracttype","organization_type","org_type",
        "snp","snp_type","segment","partd","part_d","partc","part_c","plan_name",
    ]
    return [c for c in candidates if c in cols]

In [7]:
def flag_exclusions(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    cols = list(df.columns)

    df["is_800_series"] = df["plan_id"].astype(str).str.strip().str.startswith("8")

    snp_col = first_existing_col(cols, ["snp", "snp_type"])
    if snp_col is None:
        df["is_snp"] = False
    else:
        s = df[snp_col].astype(str).str.lower().str.strip()
        df["is_snp"] = s.isin(["y", "yes", "1", "true"]) | (s.notna() & (s != "") & ~s.isin(["n", "no", "0", "false", "nan", "none"]))

    partc_col = first_existing_col(cols, ["partc", "part_c"])
    if partc_col is not None:
        s = df[partc_col].astype(str).str.lower().str.strip()
        df["is_pdp_only"] = s.isin(["n", "no", "0", "false"])
    else:
        plan_type_col = first_existing_col(cols, ["plan_type", "plantype", "contract_type", "contracttype"])
        if plan_type_col is None:
            df["is_pdp_only"] = False
        else:
            s = df[plan_type_col].astype(str).str.lower().str.strip()
            df["is_pdp_only"] = s.str.contains(r"\bpdp\b", regex=True) | s.str.contains("prescription drug", regex=False)

    df["drop_hw2"] = df["is_snp"] | df["is_800_series"] | df["is_pdp_only"]
    return df

In [8]:
year = 2018
month = 1
enroll_path = find_month_file(ENROLL_EXTRACTED, year, month, "enroll")
sarea_path  = find_month_file(SAREA_EXTRACTED,  year, month, "sarea")

en = standardize_keys(read_csv_any(enroll_path), require_plan_id=True)
sa = standardize_keys(read_csv_any(sarea_path), require_plan_id=False)

print("enroll fips non-missing share:", en["fips"].notna().mean())
print("sarea  fips non-missing share:", sa["fips"].notna().mean())
print(en[["contract_id","plan_id","fips"]].head(3))
print(sa[["contract_id","plan_id","fips"]].head(3))

enroll fips non-missing share: 0.9972787896000975
sarea  fips non-missing share: 0.9998615883637939
  contract_id plan_id fips
0       E0654     801  NaN
1       E0654     801  NaN
2       E0654     801  NaN
  contract_id  plan_id   fips
0       90091      NaN    NaN
1       H0022      NaN  39023
2       H0022      NaN  39035


In [9]:
paths_to_delete = [
    OUT_DIR / "plan_county_year_2014_2019.csv",
    OUT_DIR / "county_plan_counts_2014_2019.csv",
    OUT_DIR / "county_hhi_ma_share_2014_2019.csv",
]
for p in paths_to_delete:
    if p.exists():
        p.unlink()
        print("deleted", p)

for y in YEARS:
    yp = OUT_DIR / f"plan_county_year_{y}.csv"
    if yp.exists():
        yp.unlink()
        print("deleted", yp)

deleted data/processed/plan_county_year_2014_2019.csv
deleted data/processed/county_plan_counts_2014_2019.csv
deleted data/processed/county_hhi_ma_share_2014_2019.csv
deleted data/processed/plan_county_year_2014.csv
deleted data/processed/plan_county_year_2015.csv
deleted data/processed/plan_county_year_2016.csv
deleted data/processed/plan_county_year_2017.csv
deleted data/processed/plan_county_year_2018.csv
deleted data/processed/plan_county_year_2019.csv


In [10]:
def build_plan_county_year_one_year(year: int) -> Path:
    out_path = OUT_DIR / f"plan_county_year_{year}.csv"
    if out_path.exists():
        return out_path

    accum = None

    for month in range(1, 13):
        enroll_path = find_month_file(ENROLL_EXTRACTED, year, month, "enroll")
        sarea_path  = find_month_file(SAREA_EXTRACTED,  year, month, "sarea")

        enroll_raw = read_csv_any(enroll_path)
        sarea_raw  = read_csv_any(sarea_path)

        enroll = standardize_keys(enroll_raw, require_plan_id=True)
        sarea  = standardize_keys(sarea_raw, require_plan_id=False)

        enroll_col = pick_enrollment_col(enroll)
        meta_cols  = pick_plan_meta_cols(enroll)

        keep_enroll = ["contract_id", "plan_id", "fips", enroll_col] + meta_cols
        keep_enroll = [c for c in keep_enroll if c in enroll.columns]
        enroll = enroll[keep_enroll].copy()
        enroll = enroll.rename(columns={enroll_col: "enrollment"})
        enroll["enrollment"] = to_numeric_clean(enroll["enrollment"])

        sarea_keys = ["contract_id", "fips"]
        has_plan_in_sarea = sarea["plan_id"].notna().any()
        if has_plan_in_sarea:
            sarea_keys = ["contract_id", "plan_id", "fips"]
        sarea = sarea[sarea_keys].copy()

        enroll = enroll.dropna(subset=["contract_id", "plan_id", "fips"])
        sarea  = sarea.dropna(subset=["contract_id", "fips"])
        if has_plan_in_sarea:
            sarea = sarea.dropna(subset=["plan_id"])

        merge_keys = ["contract_id", "fips"] if not has_plan_in_sarea else ["contract_id", "plan_id", "fips"]

        merged = enroll.merge(
            sarea,
            on=merge_keys,
            how="inner",
        )

        merged["year"] = year
        merged["month"] = month
        merged = flag_exclusions(merged)

        group_keys = ["contract_id", "plan_id", "fips", "year"]
        agg = {
            "enrollment": "sum",
            "is_snp": "max",
            "is_800_series": "max",
            "is_pdp_only": "max",
            "drop_hw2": "max",
        }
        for c in meta_cols:
            if c in merged.columns:
                agg[c] = "first"

        g = merged.groupby(group_keys, as_index=False).agg(agg)
        g["months_observed"] = 1
        g["enroll_sum"] = g["enrollment"]
        g.drop(columns=["enrollment"], inplace=True)

        g["dec_enrollment"] = g["enroll_sum"] if month == 12 else np.nan

        if accum is None:
            accum = g
        else:
            keys = ["contract_id", "plan_id", "fips", "year"]
            accum = accum.merge(g, on=keys, how="outer", suffixes=("", "_new"))

            for col in ["enroll_sum", "months_observed"]:
                accum[col] = accum[col].fillna(0) + accum[f"{col}_new"].fillna(0)
                accum.drop(columns=[f"{col}_new"], inplace=True)

            for col in ["is_snp", "is_800_series", "is_pdp_only", "drop_hw2"]:
                accum[col] = np.maximum(
                    to_numeric_clean(accum[col]).fillna(0),
                    to_numeric_clean(accum[f"{col}_new"]).fillna(0),
                ).astype(int)
                accum.drop(columns=[f"{col}_new"], inplace=True)

            accum["dec_enrollment"] = accum["dec_enrollment"].combine_first(accum["dec_enrollment_new"])
            accum.drop(columns=["dec_enrollment_new"], inplace=True)

            for c in meta_cols:
                if c in accum.columns and f"{c}_new" in accum.columns:
                    accum[c] = accum[c].combine_first(accum[f"{c}_new"])
                    accum.drop(columns=[f"{c}_new"], inplace=True)

        del enroll_raw, sarea_raw, enroll, sarea, merged, g
        gc.collect()

    accum["avg_enrollment"] = accum["enroll_sum"] / accum["months_observed"].replace({0: np.nan})

    keep = ["contract_id", "plan_id", "fips", "year", "avg_enrollment", "dec_enrollment", "months_observed",
            "is_snp", "is_800_series", "is_pdp_only", "drop_hw2"]
    out = accum[keep].copy()

    out.to_csv(out_path, index=False)
    del accum, out
    gc.collect()
    return out_path

In [11]:
def build_plan_county_year_2014_2019_files() -> Path:
    combined_path = OUT_DIR / "plan_county_year_2014_2019.csv"
    if combined_path.exists():
        return combined_path

    wrote_header = False
    for y in YEARS:
        print("Building year", y)
        year_path = build_plan_county_year_one_year(y)

        chunk = pd.read_csv(year_path, dtype=str)
        chunk.to_csv(combined_path, mode="a", header=not wrote_header, index=False)
        wrote_header = True

        del chunk
        gc.collect()

    return combined_path

combined_csv = build_plan_county_year_2014_2019_files()
print("Wrote", combined_csv)
print(pd.read_csv(combined_csv, dtype=str, nrows=5))

Building year 2014
Building year 2015
Building year 2016
Building year 2017
Building year 2018
Building year 2019
Wrote data/processed/plan_county_year_2014_2019.csv
  contract_id plan_id   fips  year avg_enrollment dec_enrollment months_observed is_snp is_800_series is_pdp_only drop_hw2
0       H0022     001  39023  2014           43.2           81.0            10.0      0             0           0        0
1       H0022     001  39035  2014          264.1          344.0            10.0      0             0           0        0
2       H0022     001  39051  2014           17.2           24.0            10.0      0             0           0        0
3       H0022     001  39055  2014            0.0            0.0            10.0      0             0           0        0
4       H0022     001  39057  2014           72.5          135.0            10.0      0             0           0        0


In [12]:
y = 2018
yp = OUT_DIR / f"plan_county_year_{y}.csv"
print("exists:", yp.exists(), "bytes:", yp.stat().st_size)

print("first rows:")
print(pd.read_csv(yp, dtype=str, nrows=5))

# count lines without loading into memory
with open(yp, "r") as f:
    n = sum(1 for _ in f)
print("lines:", n)

exists: True bytes: 57910373
first rows:
  contract_id plan_id   fips  year      avg_enrollment dec_enrollment months_observed is_snp is_800_series is_pdp_only drop_hw2
0       H0022     001  39023  2018   598.4166666666666          622.0            12.0      0             0           0        0
1       H0022     001  39035  2018              3653.0         3657.0            12.0      0             0           0        0
2       H0022     001  39051  2018  115.83333333333333          126.0            12.0      0             0           0        0
3       H0022     001  39055  2018   77.33333333333333           80.0            12.0      0             0           0        0
4       H0022     001  39057  2018   571.0833333333334          601.0            12.0      0             0           0        0
lines: 1366488


In [13]:
combined_csv = OUT_DIR / "plan_county_year_2014_2019.csv"

with open(combined_csv, "r") as f:
    n = sum(1 for _ in f)
print("combined lines:", n)

print(pd.read_csv(combined_csv, dtype=str, nrows=5))
print("bytes:", combined_csv.stat().st_size)

combined lines: 7774410
  contract_id plan_id   fips  year avg_enrollment dec_enrollment months_observed is_snp is_800_series is_pdp_only drop_hw2
0       H0022     001  39023  2014           43.2           81.0            10.0      0             0           0        0
1       H0022     001  39035  2014          264.1          344.0            10.0      0             0           0        0
2       H0022     001  39051  2014           17.2           24.0            10.0      0             0           0        0
3       H0022     001  39055  2014            0.0            0.0            10.0      0             0           0        0
4       H0022     001  39057  2014           72.5          135.0            10.0      0             0           0        0
bytes: 329564529


In [14]:
PLAN_CY_PATH = OUT_DIR / "plan_county_year_2014_2019.csv"

def build_county_plan_counts(in_path: Path) -> Path:
    out_path = OUT_DIR / "county_plan_counts_2014_2019.csv"
    if out_path.exists():
        return out_path

    parts = []
    usecols = ["fips", "year", "drop_hw2"]

    for chunk in pd.read_csv(in_path, dtype=str, usecols=usecols, chunksize=600_000):
        drop = chunk["drop_hw2"].astype(str).str.lower().str.strip().isin(["1","true","t","yes","y"])
        chunk = chunk.loc[~drop, ["fips","year"]].dropna()

        tmp = chunk.groupby(["fips","year"], as_index=False).size()
        tmp = tmp.rename(columns={"size":"plan_count"})
        parts.append(tmp)

        del chunk, tmp
        gc.collect()

    out = pd.concat(parts, ignore_index=True)
    out = out.groupby(["fips","year"], as_index=False)["plan_count"].sum()
    out = out.sort_values(["year","fips"]).reset_index(drop=True)
    out.to_csv(out_path, index=False)
    return out_path

county_counts_path = build_county_plan_counts(PLAN_CY_PATH)
print("Wrote", county_counts_path)
print(pd.read_csv(county_counts_path, dtype=str, nrows=5))

Wrote data/processed/county_plan_counts_2014_2019.csv
    fips  year plan_count
0  01001  2014        386
1  01003  2014        431
2  01005  2014        370
3  01007  2014        384
4  01009  2014        387


In [15]:
def read_penetration_year(year: int) -> pd.DataFrame:
    path = find_year_file(PEN_EXTRACTED, year, "penetration")
    df = read_csv_any(path)
    df = normalize_columns(df)
    df["fips"] = coerce_fips(df)

    cols = list(df.columns)
    ma_col = first_existing_col(cols, ["ma_enrollment", "ma_enroll", "enrolled", "enroll"])
    elig_col = first_existing_col(cols, ["eligibles", "total_eligibles", "medicare_eligibles", "tot_eligibles"])

    if ma_col is None or elig_col is None:
        raise KeyError(f"Penetration file missing enrolled/eligibles columns. Columns: {sorted(cols)[:80]}")

    out = df[["fips", ma_col, elig_col]].copy()
    out = out.rename(columns={ma_col: "ma_enrollment", elig_col: "eligibles"})
    out["ma_enrollment"] = to_numeric_clean(out["ma_enrollment"])
    out["eligibles"] = to_numeric_clean(out["eligibles"])
    out["year"] = year
    out = out.dropna(subset=["fips"])
    return out

pen = pd.concat([read_penetration_year(y) for y in YEARS], ignore_index=True)
print(pen.shape)
print(pen.head())

(19494, 4)
    fips  ma_enrollment  eligibles  year
0  01001         3000.0       9438  2014
1  01003        13183.0      41640  2014
2  01005          859.0       6004  2014
3  01007         1665.0       4599  2014
4  01009         4680.0      11193  2014


In [17]:
pen = pen.copy()
pen["year"] = pen["year"].astype(str)
pen["fips"] = pen["fips"].astype(str).str.zfill(5)
print(pen.dtypes)

fips              object
ma_enrollment    float64
eligibles          int64
year              object
dtype: object


In [18]:
def build_county_hhi_ma_share(in_path: Path, pen: pd.DataFrame) -> Path:
    out_path = OUT_DIR / "county_hhi_ma_share_2014_2019.csv"
    if out_path.exists():
        return out_path

    parts = []
    usecols = ["fips","year","avg_enrollment","dec_enrollment","drop_hw2"]

    for chunk in pd.read_csv(in_path, dtype=str, usecols=usecols, chunksize=500_000):
        drop = chunk["drop_hw2"].astype(str).str.lower().str.strip().isin(["1","true","t","yes","y"])
        chunk = chunk.loc[~drop].dropna(subset=["fips","year"])

        chunk["dec_enrollment"] = pd.to_numeric(chunk["dec_enrollment"], errors="coerce")
        chunk["avg_enrollment"] = pd.to_numeric(chunk["avg_enrollment"], errors="coerce")

        e = chunk["dec_enrollment"].fillna(chunk["avg_enrollment"]).fillna(0.0)
        chunk["e"] = e
        chunk["e2"] = e * e

        tmp = (
            chunk.groupby(["fips","year"], as_index=False)
                 .agg(total_enroll=("e","sum"), sumsq_enroll=("e2","sum"))
        )
        parts.append(tmp)

        del chunk, tmp
        gc.collect()

    out = pd.concat(parts, ignore_index=True)
    out = out.groupby(["fips","year"], as_index=False).agg(
        total_enroll=("total_enroll","sum"),
        sumsq_enroll=("sumsq_enroll","sum"),
    )

    out["hhi"] = out["sumsq_enroll"] / (out["total_enroll"] * out["total_enroll"])
    out = out.rename(columns={"total_enroll":"ma_total_from_plans"})
    out = out.merge(pen, on=["fips","year"], how="left")
    out["ma_share"] = out["ma_enrollment"] / out["eligibles"]

    out = out.sort_values(["year","fips"]).reset_index(drop=True)
    out.to_csv(out_path, index=False)
    return out_path

county_hhi_path = build_county_hhi_ma_share(PLAN_CY_PATH, pen)
print("Wrote", county_hhi_path)
print(pd.read_csv(county_hhi_path, dtype=str, nrows=5))

Wrote data/processed/county_hhi_ma_share_2014_2019.csv
    fips  year ma_total_from_plans sumsq_enroll                  hhi ma_enrollment eligibles             ma_share
0  01001  2014              3027.0    1223071.0  0.13348326683022055        3000.0    9438.0   0.3178639542275906
1  01003  2014             13627.0   27941489.0  0.15046969271875732       13183.0   41640.0   0.3165946205571566
2  01005  2014               863.0     252081.0   0.3384687064042676         859.0    6004.0  0.14307128580946035
3  01007  2014              1661.0     484639.0  0.17566251443952183        1665.0    4599.0  0.36203522504892366
4  01009  2014              4653.0    3412299.0  0.15760898558544553        4680.0   11193.0   0.4181184668989547


In [19]:
for p in [
    OUT_DIR / "plan_county_year_2014_2019.csv",
    OUT_DIR / "county_plan_counts_2014_2019.csv",
    OUT_DIR / "county_hhi_ma_share_2014_2019.csv",
]:
    print(p, "exists", p.exists(), "bytes", p.stat().st_size if p.exists() else None)

data/processed/plan_county_year_2014_2019.csv exists True bytes 329564529
data/processed/county_plan_counts_2014_2019.csv exists True bytes 290270
data/processed/county_hhi_ma_share_2014_2019.csv exists True bytes 1484389
