In [1]:
from __future__ import annotations

import re
import warnings
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 220)

YEARS = list(range(2010, 2016))

HWK3_ROOT  = Path.cwd()
CACHE_DIR  = HWK3_ROOT / 'data' / 'cache'
OUTPUT_DIR = HWK3_ROOT / 'data' / 'output'
for d in [CACHE_DIR, OUTPUT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

_CANDIDATES = [
    Path('/scion/5261/econ470001/ma-data/ma'),
    Path('/home/rpat638/econ470/a0/work/ma-data/ma'),
    HWK3_ROOT.parent / 'ma-data' / 'ma',
    HWK3_ROOT.parent.parent / 'ma-data' / 'ma',
]

def _pick(paths):
    for p in paths:
        if p.exists():
            return p
    raise FileNotFoundError('No MA_ROOT found. Check _CANDIDATES:\n' + '\n'.join(str(p) for p in paths))

MA_ROOT    = _pick(_CANDIDATES)
ENROLL_DIR = MA_ROOT / 'enrollment'   / 'Extracted Data'
SAREA_DIR  = MA_ROOT / 'service-area' / 'Extracted Data'
PEN_DIR    = MA_ROOT / 'penetration'  / 'Extracted Data'
STARS_DIR  = MA_ROOT / 'star-ratings' / 'Extracted Star Ratings'
BENCH_DIR  = MA_ROOT / 'benchmarks'

TERRITORIES = {'VI', 'PR', 'MP', 'GU', 'AS'}

print('MA_ROOT :', MA_ROOT)
print('OUTPUT  :', OUTPUT_DIR)

MA_ROOT : /scion/5261/econ470001/ma-data/ma
OUTPUT  : /home/rpat638/econ470/a0/work/hwk3/data/output


In [2]:
RATING_VARS = {
    2010: [
        'contractid', 'org_type', 'contract_name', 'org_marketing',
        'breastcancer_screen', 'rectalcancer_screen', 'cv_diab_cholscreen',
        'glaucoma_test', 'monitoring', 'flu_vaccine', 'pn_vaccine',
        'physical_health', 'mental_health', 'osteo_test', 'physical_monitor',
        'primaryaccess', 'osteo_manage', 'diab_healthy', 'bloodpressure',
        'ra_manage', 'copd_test', 'bladder', 'falling',
        'nodelays', 'doctor_communicate', 'carequickly', 'customer_service',
        'overallrating_care', 'overallrating_plan',
        'complaints_plan', 'appeals_timely', 'appeals_review',
        'leave_plan', 'audit_problems', 'hold_times', 'info_accuracy', 'ttyt_available',
    ],
    2011: [
        'contractid', 'org_type', 'contract_name', 'org_marketing',
        'breastcancer_screen', 'rectalcancer_screen', 'cv_cholscreen', 'diab_cholscreen',
        'glaucoma_test', 'monitoring', 'flu_vaccine', 'pn_vaccine',
        'physical_health', 'mental_health', 'osteo_test', 'physical_monitor',
        'primaryaccess', 'osteo_manage',
        'diabetes_eye', 'diabetes_kidney', 'diabetes_bloodsugar', 'diabetes_chol',
        'bloodpressure', 'ra_manage', 'copd_test', 'bladder', 'falling',
        'nodelays', 'doctor_communicate', 'carequickly', 'customer_service',
        'overallrating_care', 'overallrating_plan',
        'complaints_plan', 'appeals_timely', 'appeals_review',
        'corrective_action', 'hold_times', 'info_accuracy', 'ttyt_available',
    ],
    2012: [
        'contractid', 'org_type', 'org_parent', 'org_marketing',
        'breastcancer_screen', 'rectalcancer_screen', 'cv_cholscreen', 'diab_cholscreen',
        'glaucoma_test', 'flu_vaccine', 'pn_vaccine',
        'physical_health', 'mental_health', 'physical_monitor',
        'primaryaccess', 'bmi_assess',
        'older_medication', 'older_function', 'older_pain',
        'osteo_manage', 'diabetes_eye', 'diabetes_kidney', 'diabetes_bloodsugar', 'diabetes_chol',
        'bloodpressure', 'ra_manage', 'bladder', 'falling', 'readmissions',
        'nodelays', 'carequickly', 'customer_service',
        'overallrating_care', 'overallrating_plan',
        'complaints_plan', 'access_problems', 'leave_plan',
        'appeals_timely', 'appeals_review', 'ttyt_available',
    ],
    2013: [
        'contractid', 'org_type', 'contract_name', 'org_marketing', 'org_parent',
        'breastcancer_screen', 'rectalcancer_screen', 'cv_cholscreen', 'diab_cholscreen',
        'glaucoma_test', 'flu_vaccine',
        'physical_health', 'mental_health', 'physical_monitor',
        'bmi_assess', 'older_medication', 'older_function', 'older_pain',
        'osteo_manage', 'diabetes_eye', 'diabetes_kidney', 'diabetes_bloodsugar', 'diabetes_chol',
        'bloodpressure', 'ra_manage', 'bladder', 'falling', 'readmissions',
        'nodelays', 'carequickly', 'customer_service',
        'overallrating_care', 'overallrating_plan',
        'coordination', 'complaints_plan', 'access_problems', 'leave_plan', 'improve',
        'appeals_timely', 'appeals_review', 'ttyt_available', 'enroll_timely',
    ],
    2014: [
        'contractid', 'org_type', 'contract_name', 'org_marketing', 'org_parent',
        'breastcancer_screen', 'rectalcancer_screen', 'cv_cholscreen', 'diab_cholscreen',
        'glaucoma_test', 'flu_vaccine',
        'physical_health', 'mental_health', 'physical_monitor',
        'bmi_assess', 'older_medication', 'older_function', 'older_pain',
        'osteo_manage', 'diabetes_eye', 'diabetes_kidney', 'diabetes_bloodsugar', 'diabetes_chol',
        'bloodpressure', 'ra_manage', 'bladder', 'falling', 'readmissions',
        'nodelays', 'carequickly', 'customer_service',
        'overallrating_care', 'overallrating_plan',
        'coordination', 'complaints_plan', 'access_problems', 'leave_plan', 'improve',
        'appeals_timely', 'appeals_review', 'ttyt_available',
    ],
    2015: [
        'contractid', 'org_type', 'contract_name', 'org_marketing', 'org_parent',
        'rectalcancer_screen', 'cv_cholscreen', 'diab_cholscreen', 'flu_vaccine',
        'physical_health', 'mental_health', 'physical_monitor',
        'bmi_assess', 'specialneeds_manage',
        'older_medication', 'older_function', 'older_pain',
        'osteo_manage', 'diabetes_eye', 'diabetes_kidney', 'diabetes_bloodsugar', 'diabetes_chol',
        'bloodpressure', 'ra_manage', 'bladder', 'falling', 'readmissions',
        'nodelays', 'carequickly', 'customer_service',
        'overallrating_care', 'overallrating_plan',
        'coordination', 'complaints_plan', 'leave_plan', 'improve',
        'appeals_timely', 'appeals_review',
    ],
}

_ID_COLS = {
    'contractid', 'org_type', 'contract_name', 'org_marketing',
    'org_parent', 'org_name', 'org_marketing_name', 'parent_org',
}

print('RATING_VARS loaded for years:', sorted(RATING_VARS))

RATING_VARS loaded for years: [2010, 2011, 2012, 2013, 2014, 2015]


In [3]:
def read_csv_safe(path: Path, **kw) -> pd.DataFrame:
    """Try utf-8 then latin-1; skip bad lines."""
    if kw.get('engine') == 'python':
        kw.pop('low_memory', None)
    else:
        kw.setdefault('low_memory', False)
    kw.setdefault('on_bad_lines', 'skip')
    for enc in ['utf-8', 'latin-1', 'cp1252']:
        try:
            return pd.read_csv(path, encoding=enc, **kw)
        except UnicodeDecodeError:
            continue
    return pd.read_csv(path, **kw)


_CONTRACT_RE = re.compile(r'^[A-Z]\d{4,5}$')

def probe_data_start(path: Path) -> int:
    """
    Scan the first 25 rows and return the 0-based row index where column 0
    first looks like a CMS contract ID (e.g. 'H1234' or 'H12345').
    Pass this as skiprows= so row 0 of the resulting DataFrame is real data.
    """
    try:
        text = path.read_bytes().decode('latin-1', errors='replace')
        for i, line in enumerate(text.splitlines()[:25]):
            first = line.split(',')[0].strip().strip('"').strip("'")
            if _CONTRACT_RE.match(first):
                return i
    except Exception:
        pass
    return 0


def find_file(directory: Path, patterns: list) -> Optional[Path]:
    for pat in patterns:
        hits = sorted(directory.glob(pat))
        if hits:
            return hits[0]
    return None


def clean_contractid(s: pd.Series) -> pd.Series:
    return s.astype(str).str.strip().str.extract(r'([A-Z]\d{4,5})', expand=False)


def clean_planid(x) -> Optional[str]:
    d = re.sub(r'\D', '', '' if pd.isna(x) else str(x))
    return d[-3:].zfill(3) if d else None


def clean_fips(x) -> Optional[str]:
    d = re.sub(r'\D', '', '' if pd.isna(x) else str(x))
    if not d:
        return None
    d = d[-5:].zfill(5)
    return d if len(d) == 5 else None


def clean_ssa(x) -> Optional[str]:
    d = re.sub(r'\D', '', '' if pd.isna(x) else str(x))
    if not d:
        return None
    d = d[-5:].zfill(5)
    return d if len(d) == 5 else None


def to_float(s: pd.Series) -> pd.Series:
    """Strip commas/asterisks and coerce to float."""
    t = s.astype(str).str.replace(',', '', regex=False).str.strip()
    return pd.to_numeric(t.str.extract(r'(\d+(?:\.\d+)?)', expand=False), errors='coerce')


def raw_to_star(raw: pd.Series) -> pd.Series:
    """
    Convert a continuous raw domain score to a half-star CMS rating.
    Used as a fallback when partc_score is not parseable from the summary file.

    Boundaries (right-exclusive):
      < 1.25 → 1.0 | 1.25-1.75 → 1.5 | 1.75-2.25 → 2.0 | 2.25-2.75 → 2.5
      2.75-3.25 → 3.0 | 3.25-3.75 → 3.5 | 3.75-4.25 → 4.0 | 4.25-4.75 → 4.5
      >= 4.75 → 5.0
    """
    bounds = [-np.inf, 1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, np.inf]
    labels = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
    s = pd.to_numeric(raw, errors='coerce')
    return pd.cut(s, bins=bounds, labels=labels, right=False).astype(float)


print('Utilities loaded.')

Utilities loaded.


In [4]:
def load_contract_info(year: int) -> pd.DataFrame:
    """
    Code.
    """
    p = find_file(ENROLL_DIR, [
        f'CPSC_Contract_Info_{year}_01.csv',
        f'CPSC_Contract_Info_{year}_1.csv',
        f'CPSC_Contract_Info_{year}*.csv',
    ])
    if p is None:
        raise FileNotFoundError(f'No contract-info file for {year} in {ENROLL_DIR}')

    df = read_csv_safe(p, dtype=str)
    df.columns = [
        re.sub(r'[^\w]+', '_', c.strip().lower()).strip('_') for c in df.columns
    ]

    renames = {
        'contract_id': 'contractid', 'contract_number': 'contractid',
        'plan_id': 'planid',
        'organization_type': 'org_type',
        'offers_part_d': 'partd',
        'snp_plan': 'snp',
        'organization_name': 'org_name',
        'organization_marketing_name': 'org_marketing_name',
        'parent_organization': 'parent_org',
    }
    df = df.rename(columns={k: v for k, v in renames.items() if k in df.columns})

    df['contractid'] = clean_contractid(df['contractid'])
    df['planid']     = df['planid'].apply(clean_planid)
    df = df.dropna(subset=['contractid', 'planid'])

    if 'snp' in df.columns:
        df = df[df['snp'].astype(str).str.strip() == 'No']
    planid_n = pd.to_numeric(df['planid'], errors='coerce')
    df = df[~planid_n.between(800, 899)]
    if 'plan_type' in df.columns:
        df = df[~df['plan_type'].astype(str).str.upper().str.contains('PDP', na=False)]

    keep = [c for c in ['contractid', 'planid', 'org_type', 'plan_type',
                         'partd', 'snp', 'eghp', 'org_name',
                         'org_marketing_name', 'plan_name', 'parent_org']
            if c in df.columns]
    return df[keep].drop_duplicates()


def build_plan_county_year(year: int) -> pd.DataFrame:
    """
    
    """
    cache = CACHE_DIR / f'enroll_{year}.csv'
    if cache.exists():
        out = pd.read_csv(cache, dtype=str)
        print(f'  enroll {year}: (cached) {out.shape[0]} rows')
        return out

    contract    = load_contract_info(year)
    month_files = sorted(ENROLL_DIR.glob(f'CPSC_Enrollment_Info_{year}_*.csv'))
    if not month_files:
        raise FileNotFoundError(f'No enrollment files for {year}')

    frames = []
    for p in month_files:
        m_match = re.search(rf'{year}_(\d+)\.csv$', p.name)
        if not m_match:
            continue
        m = int(m_match.group(1))

        dfm = read_csv_safe(p, dtype=str)
        dfm.columns = [
            re.sub(r'[^\w]+', '_', c.strip().lower()).strip('_') for c in dfm.columns
        ]
        for old, new in [('contract_number', 'contractid'), ('contract_id', 'contractid'),
                          ('plan_id', 'planid'), ('fips_state_county_code', 'fips')]:
            if old in dfm.columns and new not in dfm.columns:
                dfm = dfm.rename(columns={old: new})

        keep = [c for c in ['contractid', 'planid', 'fips', 'state', 'county', 'enrollment']
                if c in dfm.columns]
        dfm = dfm[keep].copy()
        dfm['contractid'] = clean_contractid(dfm['contractid'])
        dfm['planid']     = dfm['planid'].apply(clean_planid)
        dfm['fips']       = dfm['fips'].apply(clean_fips)
        dfm['enrollment'] = pd.to_numeric(
            dfm['enrollment'].astype(str).str.replace(',', '', regex=False), errors='coerce')
        dfm['month'] = m

        
        dfm = dfm.merge(contract[['contractid', 'planid']], on=['contractid', 'planid'], how='inner')
        frames.append(dfm)

    if not frames:
        raise RuntimeError(f'No usable enrollment data for {year}')

    df = pd.concat(frames, ignore_index=True)
    df = df.dropna(subset=['fips', 'contractid', 'planid'])

    
    if 'state' in df.columns and 'county' in df.columns:
        df = df.sort_values(['state', 'county', 'month'])
        df['fips'] = (df.groupby(['state', 'county'])['fips']
                        .transform(lambda x: x.ffill().bfill()))

    
    out = (
        df.groupby(['contractid', 'planid', 'fips'], as_index=False)
          .agg(n_months=('month', 'count'),
               avg_enrollment=('enrollment', 'mean'),
               last_enrollment=('enrollment', 'last'))
    )

    
    if 'state' in df.columns:
        loc = (
            df.dropna(subset=['state', 'county'])
              .sort_values('month')
              .groupby(['contractid', 'planid', 'fips'], as_index=False)
              .last()[['contractid', 'planid', 'fips', 'state', 'county']]
        )
        out = out.merge(loc, on=['contractid', 'planid', 'fips'], how='left')

    out = out.merge(contract, on=['contractid', 'planid'], how='left')
    out['year'] = year

    out.to_csv(cache, index=False)
    months_found = sorted(df['month'].unique().tolist())
    print(f'  enroll {year}: {out.shape[0]} rows  months={months_found}')
    return out


print('Enrollment functions defined.')

Enrollment functions defined.


In [5]:
def load_service_area(year: int) -> pd.DataFrame:
    cache = CACHE_DIR / f'sarea_{year}.csv'
    if cache.exists():
        out = pd.read_csv(cache, dtype=str)
        print(f'  sarea  {year}: (cached) {out.shape[0]} rows')
        return out

    
    sa_files = sorted(SAREA_DIR.glob(f'MA_Cnty_SA_{year}_*.csv'))
    if not sa_files:
        p = find_file(SAREA_DIR, [
            f'MA_Cnty_SA_{year}_01.csv', f'MA_Cnty_SA_{year}01.csv',
            f'MA_Cnty_SA_{year}.csv', f'*{year}*SA*.csv',
        ])
        sa_files = [p] if p else []

    if not sa_files:
        print(f'  sarea  {year}: FILE NOT FOUND — returning empty')
        return pd.DataFrame(columns=['contractid', 'fips', 'ssa'])

    frames = []
    for p in sa_files:
        m_match = re.search(rf'{year}_(\d+)\.csv$', p.name)
        m = int(m_match.group(1)) if m_match else 0

        dfm = read_csv_safe(p, dtype=str)
        dfm.columns = [
            re.sub(r'[^\w]+', '_', c.strip().lower()).strip('_') for c in dfm.columns
        ]
        for old, new in [('contract_id', 'contractid'), ('contract', 'contractid'),
                          ('fips_state_county_code', 'fips')]:
            if old in dfm.columns and new not in dfm.columns:
                dfm = dfm.rename(columns={old: new})

        dfm['contractid'] = clean_contractid(dfm['contractid'])
        dfm['fips']       = dfm['fips'].apply(clean_fips)
        if 'ssa' in dfm.columns:
            dfm['ssa'] = dfm['ssa'].apply(clean_ssa)
        dfm['month'] = m
        frames.append(dfm)

    df = pd.concat(frames, ignore_index=True)
    df = df.dropna(subset=['contractid', 'fips'])

    if 'state' in df.columns and 'county' in df.columns:
        df = df.sort_values(['state', 'county', 'month'])
        df['fips'] = (df.groupby(['state', 'county'])['fips']
                        .transform(lambda x: x.ffill().bfill()))

    
    out = (
        df.sort_values('month')
          .groupby(['contractid', 'fips'], as_index=False)
          .last()
    )
    keep = [c for c in ['contractid', 'fips', 'ssa'] if c in out.columns]
    out  = out[keep].drop_duplicates(subset=['contractid', 'fips'])

    out.to_csv(cache, index=False)
    print(f'  sarea  {year}: {out.shape[0]} rows')
    return out


print('Service-area functions defined.')

Service-area functions defined.


In [6]:
def load_penetration(year: int) -> pd.DataFrame:
    cache = CACHE_DIR / f'pen_{year}.csv'
    if cache.exists():
        out = pd.read_csv(cache, dtype=str)
        print(f'  pen    {year}: (cached) {out.shape[0]} counties')
        return out

    pen_files = sorted(PEN_DIR.glob(f'State_County_Penetration_MA_{year}_*.csv'))
    if not pen_files:
        raise FileNotFoundError(f'No penetration files for {year} in {PEN_DIR}')

    frames = []
    for p in pen_files:
        m_match = re.search(rf'{year}_(\d+)\.csv$', p.name)
        m = int(m_match.group(1)) if m_match else 0

        dfm = read_csv_safe(p, dtype=str)
        dfm.columns = [
            re.sub(r'[^\w]+', '_', c.strip().lower()).strip('_') for c in dfm.columns
        ]
        for old, new in [('state_name', 'state'), ('county_name', 'county')]:
            if old in dfm.columns:
                dfm = dfm.rename(columns={old: new})

        keep = [c for c in ['state', 'county', 'fips', 'ssa', 'eligibles', 'enrolled']
                if c in dfm.columns]
        dfm = dfm[keep].copy()
        dfm['fips']      = dfm['fips'].apply(clean_fips)
        dfm['ssa']       = dfm['ssa'].apply(clean_ssa)
        dfm['eligibles'] = pd.to_numeric(
            dfm['eligibles'].astype(str).str.replace(',', '', regex=False), errors='coerce')
        dfm['enrolled']  = pd.to_numeric(
            dfm['enrolled'].astype(str).str.replace(',', '', regex=False), errors='coerce')
        dfm['month'] = m
        frames.append(dfm)

    df = pd.concat(frames, ignore_index=True)
    df = df.dropna(subset=['fips'])

    if 'state' in df.columns and 'county' in df.columns:
        df = df.sort_values(['state', 'county', 'month'])
        df['fips'] = (df.groupby(['state', 'county'])['fips']
                        .transform(lambda x: x.ffill().bfill()))

    
    out = (
        df.groupby('fips', as_index=False)
          .agg(avg_eligibles=('eligibles', 'mean'),
               avg_enrolled=('enrolled', 'mean'),
               ssa=('ssa', 'last'))
    )

    
    fips_counts = out['fips'].value_counts()
    out = out[out['fips'].isin(fips_counts[fips_counts == 1].index)].copy()

    out['year'] = year
    out.to_csv(cache, index=False)
    print(f'  pen    {year}: {out.shape[0]} counties')
    return out


print('Penetration functions defined.')

Penetration functions defined.


In [13]:
STAR_YEAR_DIRS = {
    2010: STARS_DIR / '2010',
    2011: STARS_DIR / '2011',
    2012: STARS_DIR / 'Part C 2012 Fall',
    2013: STARS_DIR / 'Part C 2013 Fall',
    2014: STARS_DIR / 'Part C 2014 Fall',
    2015: STARS_DIR / '2015 Fall',
}


def find_star_files(year: int):
    """
    Return (domain_path, summary_path) for a given year.

    For 2010-2011: files contain 'domain' / 'summary' in their names.
    For 2012-2015: files live in 'Part C YYYY Fall' or 'YYYY Fall' folders;
                   pick by column count (domain file has far more columns).
    """
    year_dir = STAR_YEAR_DIRS.get(year)
    if year_dir is None or not year_dir.exists():
        raise FileNotFoundError(
            f'Star-rating directory for {year} not found.\n'
            f'Expected: {year_dir}\n'
            f'Contents of STARS_DIR: {[p.name for p in sorted(STARS_DIR.iterdir())]}'
        )

    all_csvs = sorted(year_dir.glob('*.csv'))
    if not all_csvs:
        raise FileNotFoundError(
            f'No CSVs in {year_dir}.\n'
            f'Contents: {[p.name for p in sorted(year_dir.iterdir())]}'
        )

    
    dom  = find_file(year_dir, [f'*[Dd]omain*.csv'])
    summ = find_file(year_dir, [f'*[Ss]ummary*.csv'])

    if dom is None or summ is None:
        def col_count(p):
            try:
                skip = probe_data_start(p)
                lines = p.read_bytes().decode('latin-1', errors='replace').splitlines()
                if skip < len(lines):
                    return len(lines[skip].split(','))
                return 0
            except Exception:
                return 0

        sized = sorted([(col_count(p), p) for p in all_csvs], reverse=True)
        
        if dom  is None: dom  = sized[0][1]  if len(sized) >= 1 else None
        if summ is None: summ = sized[-1][1] if len(sized) >= 2 else dom

    if dom  is None: raise FileNotFoundError(f'Cannot find domain file for {year} in {year_dir}')
    if summ is None: raise FileNotFoundError(f'Cannot find summary file for {year} in {year_dir}')

    return dom, summ


def _load_domain(year: int, path: Path) -> pd.DataFrame:
    skip      = probe_data_start(path)
    col_names = RATING_VARS[year]

    df = read_csv_safe(path, skiprows=skip, header=None, dtype=str, on_bad_lines='skip')

    n     = df.shape[1]
    names = col_names[:n] + [f'_x{i}' for i in range(len(col_names), n)]
    df.columns = names

    df['contractid'] = clean_contractid(df['contractid'])
    df = df.dropna(subset=['contractid'])

    measure_cols = [c for c in col_names if c not in _ID_COLS and c in df.columns]
    for c in measure_cols:
        df[c] = to_float(df[c])

    df['raw_score'] = df[measure_cols].mean(axis=1, skipna=True)
    return df[['contractid', 'raw_score']].drop_duplicates(subset='contractid')


_NEW_FLAG = re.compile(r'too new|new to rate|plan too new', re.IGNORECASE)

def _load_summary(year: int, path: Path) -> pd.DataFrame:
    skip = probe_data_start(path)
    df   = read_csv_safe(path, skiprows=skip, header=None, dtype=str, on_bad_lines='skip')

    base = ['contractid', 'org_type', 'contract_name', 'org_marketing', 'partc_score']
    n    = df.shape[1]
    df.columns = base[:n] + [f'col_{i}' for i in range(5, n)]

    df['contractid'] = clean_contractid(df['contractid'])
    df = df.dropna(subset=['contractid'])

    
    is_new = df['partc_score'].astype(str).str.contains(_NEW_FLAG, na=False)
    df['new_contract'] = is_new.astype(int)
    df['partc_score']  = to_float(df['partc_score'])
    df.loc[df['new_contract'] == 1, 'partc_score'] = np.nan

    
    df['partcd_score'] = np.nan
    if 'col_5' in df.columns:
        cand    = to_float(df['col_5'])
        valid_n = int(cand.notna().sum())
        if valid_n > 20 and float(((cand >= 1) & (cand <= 5)).sum() / valid_n) > 0.50:
            df['partcd_score'] = cand

    df.loc[df['new_contract'] == 1, 'partcd_score'] = np.nan

    return (
        df[['contractid', 'partc_score', 'partcd_score', 'new_contract']]
        .drop_duplicates(subset='contractid')
    )


def load_star_ratings(year: int) -> pd.DataFrame:
    cache = CACHE_DIR / f'stars_{year}.csv'
    if cache.exists():
        out = pd.read_csv(cache, dtype=str)
        pc  = pd.to_numeric(out['partc_score'], errors='coerce')
        rs  = pd.to_numeric(out['raw_score'],   errors='coerce')
        print(f'  stars  {year}: (cached)  partc={pc.notna().sum()}  raw={rs.notna().sum()}')
        return out

    dom_path, sum_path = find_star_files(year)
    print(f'  stars  {year}: domain  = {dom_path.name}')
    print(f'           summary = {sum_path.name}')

    domain  = _load_domain(year,  dom_path)
    summary = _load_summary(year, sum_path)

    out = domain.merge(summary, on='contractid', how='outer')
    out['new_contract'] = out['new_contract'].fillna(0).astype(int)
    out['year'] = year

   
    pc = pd.to_numeric(out['partc_score'], errors='coerce')
    if pc.notna().sum() == 0:
        rs = pd.to_numeric(out['raw_score'], errors='coerce')
        out['partc_score'] = raw_to_star(rs)
        print(f'    INFO: partc_score for {year} derived from raw_score via CMS rounding')

    for col in ['partc_score', 'partcd_score', 'raw_score']:
        if col in out.columns:
            out.loc[out['new_contract'] == 1, col] = np.nan

    n_pc  = int(pd.to_numeric(out['partc_score'], errors='coerce').notna().sum())
    n_raw = int(pd.to_numeric(out['raw_score'],   errors='coerce').notna().sum())
    print(f'           contracts={out.shape[0]}  partc_nonmiss={n_pc}  raw_nonmiss={n_raw}')

    out.to_csv(cache, index=False)
    return out


print('Star-ratings functions defined.')
print('Year → directory mapping:')
for y, d in STAR_YEAR_DIRS.items():
    print(f'  {y}: {d.name}  ({"exists" if d.exists() else "MISSING"})')

Star-ratings functions defined.
Year → directory mapping:
  2010: 2010  (exists)
  2011: 2011  (exists)
  2012: Part C 2012 Fall  (exists)
  2013: Part C 2013 Fall  (exists)
  2014: Part C 2014 Fall  (exists)
  2015: 2015 Fall  (exists)


In [14]:
_BENCH_COLS = [
    'ssa', 'state', 'county_name',
    'aged_parta', 'aged_partb',
    'disabled_parta', 'disabled_partb',
    'esrd_ab', 'risk_ab',
]


def load_benchmark(year: int) -> pd.DataFrame:
    cache = CACHE_DIR / f'bench_{year}.csv'
    if cache.exists():
        out = pd.read_csv(cache, dtype=str)
        print(f'  bench  {year}: (cached) {out.shape[0]} rows')
        return out

    p = find_file(BENCH_DIR, [
        f'ratebook{year}/CountyRate{year}.csv',
        f'ratebook{year}/countyrate{year}.csv',
        f'CountyRate{year}.csv',
        f'*{year}*.csv',
    ])
    if p is None:
        print(f'  bench  {year}: FILE NOT FOUND')
        return pd.DataFrame(columns=['ssa', 'risk_ab', 'aged_parta', 'aged_partb', 'year'])

    skip = probe_data_start(p)

    df = read_csv_safe(p, skiprows=skip, header=None, dtype=str, on_bad_lines='skip')

    n  = df.shape[1]
    df.columns = _BENCH_COLS[:n] + [f'_b{i}' for i in range(len(_BENCH_COLS), n)]

    df['ssa']        = df['ssa'].apply(clean_ssa)
    df['risk_ab']    = to_float(df['risk_ab'])    if 'risk_ab'    in df.columns else np.nan
    df['aged_parta'] = to_float(df['aged_parta']) if 'aged_parta' in df.columns else np.nan
    df['aged_partb'] = to_float(df['aged_partb']) if 'aged_partb' in df.columns else np.nan

    out = (
        df[['ssa', 'risk_ab', 'aged_parta', 'aged_partb']]
        .dropna(subset=['ssa'])
        .drop_duplicates(subset='ssa')
        .copy()
    )
    out['year'] = year

    out.to_csv(cache, index=False)
    print(f'  bench  {year}: {out.shape[0]} rows  (skip={skip})')
    return out


print('Benchmark functions defined.')

Benchmark functions defined.


In [15]:
all_years = []

for year in YEARS:
    print(f'\n{"="*60}\nYEAR {year}')

    enroll = build_plan_county_year(year)
    sa     = load_service_area(year)
    pen    = load_penetration(year)
    stars  = load_star_ratings(year)
    bench  = load_benchmark(year)

    
    df = enroll.merge(
        sa[['contractid', 'fips']].drop_duplicates(),
        on=['contractid', 'fips'], how='inner',
    )

   
    if 'state' in df.columns:
        df = df[~df['state'].isin(TERRITORIES)]
        df = df[df['state'].astype(str).str.strip() != '']

    
    if 'snp' in df.columns:
        df = df[df['snp'].astype(str).str.strip() == 'No']

    planid_n = pd.to_numeric(df['planid'], errors='coerce')
    df = df[~planid_n.between(800, 899)]
    df = df.dropna(subset=['planid', 'fips'])

    
    pen_merge = pen[['fips', 'avg_eligibles', 'avg_enrolled', 'ssa']].drop_duplicates('fips')
    df = df.merge(pen_merge, on='fips', how='left', suffixes=('_sa', '_pen'))

    
    if 'ssa_sa' in df.columns and 'ssa_pen' in df.columns:
        df['ssa'] = df['ssa_sa'].where(df['ssa_sa'].notna(), df['ssa_pen'])
        df.drop(columns=['ssa_sa', 'ssa_pen'], inplace=True)
    elif 'ssa_sa' in df.columns:
        df.rename(columns={'ssa_sa': 'ssa'}, inplace=True)
    elif 'ssa_pen' in df.columns:
        df.rename(columns={'ssa_pen': 'ssa'}, inplace=True)

    
    star_cols = [c for c in ['contractid', 'partc_score', 'partcd_score',
                              'raw_score', 'new_contract'] if c in stars.columns]
    df = df.merge(stars[star_cols], on='contractid', how='left')

    for c in ['partc_score', 'partcd_score', 'raw_score']:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

   
    partd_col = df.get('partd', pd.Series(['No'] * len(df), index=df.index))
    partd_no  = partd_col.astype(str).str.strip() == 'No'

    pc  = df['partc_score']  if 'partc_score'  in df.columns else pd.Series(np.nan, index=df.index)
    pcd = df['partcd_score'] if 'partcd_score' in df.columns else pd.Series(np.nan, index=df.index)

    df['Star_Rating'] = np.where(partd_no, pc, np.where(pcd.notna(), pcd, pc))

    
    bench_cols = [c for c in ['ssa', 'risk_ab', 'aged_parta', 'aged_partb']
                  if c in bench.columns]
    df = df.merge(
        bench[bench_cols].dropna(subset=['ssa']).drop_duplicates('ssa'),
        on='ssa', how='left',
    )

    
    for c in ['avg_enrollment', 'last_enrollment', 'avg_enrolled', 'avg_eligibles', 'risk_ab']:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    
    df['enrollment_used'] = df.get('last_enrollment', pd.Series(np.nan, index=df.index))
    df['enrollment_used'] = df['enrollment_used'].where(
        df['enrollment_used'].notna(),
        df.get('avg_enrollment', pd.Series(np.nan, index=df.index))
    )

    
    df['mkt_share'] = df['enrollment_used'] / df['avg_enrolled']
    invalid = ~np.isfinite(df['mkt_share'].astype(float, errors='ignore'))
    df.loc[invalid, 'mkt_share'] = np.nan

    
    df['ma_rate'] = df.get('risk_ab', pd.Series(np.nan, index=df.index))

    df['hmo']       = df.get('plan_type', '').astype(str).str.upper().str.contains('HMO').astype(int)
    df['partd_ind'] = (~partd_no).astype(int)
    df['plan_key']  = df['contractid'].astype(str) + '-' + df['planid'].astype(str)
    df['year']      = year

    
    keep = [
        'year', 'contractid', 'planid', 'plan_key',
        'fips', 'state', 'county',
        'enrollment_used', 'avg_enrollment', 'last_enrollment',
        'avg_enrolled', 'avg_eligibles', 'mkt_share',
        'Star_Rating', 'raw_score', 'new_contract',
        'partd', 'partd_ind', 'plan_type', 'hmo', 'org_type',
        'ssa', 'risk_ab', 'aged_parta', 'aged_partb', 'ma_rate',
    ]
    keep = [c for c in keep if c in df.columns]
    df   = df[keep].copy()

    for c in ['Star_Rating', 'raw_score', 'mkt_share', 'enrollment_used',
               'avg_enrollment', 'last_enrollment', 'avg_enrolled',
               'avg_eligibles', 'risk_ab', 'ma_rate']:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    all_years.append(df)

    print(f'  FINAL rows       = {df.shape[0]}')
    print(f'  Star_Rating ≠ NA = {int(df["Star_Rating"].notna().sum())}')
    print(f'  raw_score   ≠ NA = {int(df["raw_score"].notna().sum())}')
    print(f'  mkt_share   ≠ NA = {int(df["mkt_share"].notna().sum())}')


YEAR 2010
  enroll 2010: (cached) 675013 rows
  sarea  2010: (cached) 435122 rows
  pen    2010: (cached) 3280 counties
  stars  2010: (cached)  partc=367  raw=537
  bench  2010: (cached) 4 rows
  FINAL rows       = 109948
  Star_Rating ≠ NA = 60519
  raw_score   ≠ NA = 95248
  mkt_share   ≠ NA = 30662

YEAR 2011
  enroll 2011: (cached) 507548 rows
  sarea  2011: (cached) 380785 rows
  pen    2011: (cached) 3228 counties
  stars  2011: (cached)  partc=507  raw=507
  bench  2011: (cached) 5 rows
  FINAL rows       = 67967
  Star_Rating ≠ NA = 57318
  raw_score   ≠ NA = 57318
  mkt_share   ≠ NA = 24200

YEAR 2012
  enroll 2012: (cached) 498019 rows
  sarea  2012: (cached) 374145 rows
  pen    2012: (cached) 3224 counties
  stars  2012: domain  = 2012_Part_C_Report_Card_Master_Table_2011_11_01_Domain.csv
           summary = 2012_Part_C_Report_Card_Master_Table_2011_11_01_Summary.csv
           contracts=569  partc_nonmiss=446  raw_nonmiss=484
  bench  2012: 2 rows  (skip=0)
  FINAL rows

In [16]:
ma_data = pd.concat(all_years, ignore_index=True)

out_panel = OUTPUT_DIR / 'ma_data_2010_2015.csv'
ma_data.to_csv(out_panel, index=False)

out_2010 = OUTPUT_DIR / 'ma_data_2010.csv'
ma_data[ma_data['year'] == 2010].to_csv(out_2010, index=False)

contract_ratings = (
    ma_data
    .drop_duplicates(subset=['year', 'contractid'])
    [['year', 'contractid', 'Star_Rating', 'raw_score', 'new_contract',
      'org_type', 'partd_ind', 'hmo']]
    .copy()
)
out_cr = OUTPUT_DIR / 'contract_ratings_2010_2015.csv'
contract_ratings.to_csv(out_cr, index=False)

plan_year = (
    ma_data
    .groupby(['year', 'contractid', 'planid'], as_index=False)
    .agg(
        total_enrollment=('enrollment_used', 'sum'),
        Star_Rating=('Star_Rating', 'first'),
        raw_score=('raw_score', 'first'),
        mkt_share=('mkt_share', 'mean'),
        hmo=('hmo', 'first'),
        partd_ind=('partd_ind', 'first'),
        org_type=('org_type', 'first'),
    )
)
out_py = OUTPUT_DIR / 'plan_year_2010_2015.csv'
plan_year.to_csv(out_py, index=False)

print('Files written:')
for f in [out_panel, out_2010, out_cr, out_py]:
    n = pd.read_csv(f).shape[0]
    print(f'  {f.name}  ({n:,} rows)')

Files written:
  ma_data_2010_2015.csv  (440,740 rows)
  ma_data_2010.csv  (109,948 rows)
  contract_ratings_2010_2015.csv  (3,504 rows)
  plan_year_2010_2015.csv  (13,555 rows)


In [17]:
print('=== Row counts by year ===')
print(ma_data.groupby('year').size().rename('n_rows').to_string())

print('\n=== Star_Rating non-missing by year ===')
print(ma_data.groupby('year')['Star_Rating']
             .apply(lambda x: x.notna().sum()).rename('n_star').to_string())

print('\n=== raw_score non-missing by year ===')
print(ma_data.groupby('year')['raw_score']
             .apply(lambda x: x.notna().sum()).rename('n_raw').to_string())

print('\n=== Star_Rating distribution (plan-county rows) ===')
dist = (
    ma_data.dropna(subset=['Star_Rating'])
           .groupby(['year', 'Star_Rating'])
           .size()
           .unstack('Star_Rating')
           .fillna(0).astype(int)
)
print(dist.to_string())

print('\n=== Market-share summary by year ===')
print(ma_data.groupby('year')['mkt_share'].describe().round(4).to_string())

print('\n=== 2010 contract-level checks (RD analysis) ===')
d10   = ma_data[ma_data['year'] == 2010]
d10_c = d10.drop_duplicates('contractid')
print(f'  unique contracts         : {d10_c.shape[0]}')
print(f'  Star_Rating non-missing  : {int(d10_c["Star_Rating"].notna().sum())}')
print(f'  raw_score   non-missing  : {int(d10_c["raw_score"].notna().sum())}')
if d10_c['raw_score'].notna().any():
    lo = d10_c['raw_score'].min()
    hi = d10_c['raw_score'].max()
    print(f'  raw_score range          : [{lo:.3f}, {hi:.3f}]')

print('\nDataset build complete.')

=== Row counts by year ===
year
2010    109948
2011     67967
2012     67212
2013     67789
2014     62333
2015     65491

=== Star_Rating non-missing by year ===
year
2010    60519
2011    57318
2012    58518
2013    50536
2014      391
2015      311

=== raw_score non-missing by year ===
year
2010    95248
2011    57318
2012    59512
2013    65076
2014    59557
2015    61730

=== Star_Rating distribution (plan-county rows) ===
Star_Rating  1.0  2.0    2.5    3.0    3.5    4.0   4.5   5.0
year                                                         
2010           0  475  32422  10162   8262   6224  2869   105
2011           0  268   9070  20679  11495  10679  4784   343
2012           0  154   8208  20545  13040   7158  7323  2090
2013         107  160   4054  10449  19097   9791  3723  3155
2014         391    0      0      0      0      0     0     0
2015         311    0      0      0      0      0     0     0

=== Market-share summary by year ===
        count    mean     std  mi