In [1]:
import os
import copy
import zipfile
import requests
import json
from io import BytesIO
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# The CMS Medicare DE-SynPUF public use files are published in 20 numbered parts.
length = 20
# 1-based part numbers (1..length); each one is appended to a file-name stub
# to build the name of an individual sample file.
len_range = range(1,length+1)

In [3]:
# GCM stands for General Claims Model, the name given to this collection of pipelines and extras

In [4]:
# Directory layout for raw and GCM data (General Claims Model, name picked out of a hat).
# Every path is kept as a string ending in '/', because the rest of the notebook
# builds file paths by plain string concatenation.
cwd = os.getcwd() + '/'
data_folder = 'data/'
data_path = cwd + data_folder

# raw path takes the downloaded and extracted raw csv files in a pre-named folder, e.g.
# data/raw/beneficiary2008/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv
raw_path = data_path + 'raw/'

# 'proc' files, e.g. beneficiary2008.csv, are dumped here after being assembled from data/raw
gcm_path = data_path + 'gcm/'

# model pickles are dumped here (this folder is not created by this cell)
models_path = cwd + 'models/'

# makedirs(exist_ok=True) is idempotent, so re-running the cell is safe, and it
# avoids the check-then-create race of "if not exists: mkdir".
for _required_dir in (data_path, raw_path, gcm_path):
    os.makedirs(_required_dir, exist_ok=True)


In [5]:
# def gen_names(name):
#     names_list = [name + str(i) for i in len_range]
#     return names_list

In [6]:
def get_names_dict(data_sets, parts=None):
    """Build the extension-less sample file names for every data set.

    Parameters
    ----------
    data_sets : dict
        Maps a folder key (e.g. 'inpatient') to the file-name stub that the
        part number is appended to.
    parts : iterable, optional
        Part numbers to generate names for. Defaults to the notebook-level
        ``len_range`` so existing calls behave as before.

    Returns
    -------
    dict
        Maps each folder key to its list of file names (no extension).
    """
    if parts is None:
        parts = len_range
    # materialise once so a one-shot iterator can be reused for every data set
    parts = list(parts)

    names_dict = {}
    for k, v in data_sets.items():
        file_names = [v + str(i) for i in parts]
        if k == 'carriers':
            # if used, carriers is split into A and B slices, e.g. 11A, 11B
            file_names = [name + postfix
                          for name in file_names
                          for postfix in ('A', 'B')]
        elif k == 'beneficiary2010' and len(v) > 0:
            # Per the original author's note, 2010 beneficiary file 1 is a copy of
            # file 20, so it is skipped. It is dropped by name rather than by
            # position so that a custom `parts` (or an empty one) cannot remove
            # the wrong entry or raise IndexError.
            duplicate = v + '1'
            file_names = [name for name in file_names if name != duplicate]
        names_dict[k] = file_names

    return names_dict

In [7]:
def get_csv_names(names_dict):
    """Return a new dict mapping each folder key to its csv member names.

    Every name gets a '.csv' suffix. The 2010 beneficiary sample 17 is first
    given a ' - Copy' suffix to match the misnamed file it refers to.
    The input dict and its lists are left untouched.
    """
    misnamed = 'DE1_0_2010_Beneficiary_Summary_File_Sample_17'

    def _csv_name(folder, stem):
        # file 17 of beneficiary2010 has a naming error
        if folder == 'beneficiary2010' and stem == misnamed:
            stem = stem + ' - Copy'
        return stem + '.csv'

    return {
        folder: [_csv_name(folder, stem) for stem in stems]
        for folder, stems in names_dict.items()
    }

In [8]:
def get_zip_names(names_dict):
    """Return a new dict mapping each folder key to its zip download URLs.

    'prescription' and 'carriers' archives use the downloads.cms.gov base URL;
    every other data set uses the www.cms.gov SynPUFs base URL. The carrier
    sample 11A gets an extra '.csv' before '.zip'.
    The input dict and its lists are left untouched.
    """
    ext_url = 'http://downloads.cms.gov/files/'
    base_url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/SynPUFs/Downloads/'
    externally_hosted = ('prescription', 'carriers')
    odd_carrier = 'DE1_0_2008_to_2010_Carrier_Claims_Sample_11A'

    zip_names_dict = {}
    for folder, stems in names_dict.items():
        # pick the URL stub that this data set is downloaded from
        prefix = ext_url if folder in externally_hosted else base_url

        urls = []
        for stem in stems:
            if folder == 'carriers' and stem == odd_carrier:
                stem = stem + '.csv'
            urls.append(prefix + stem + '.zip')
        zip_names_dict[folder] = urls

    return zip_names_dict

In [9]:
# Download each zip and unpack its csv into the matching folder under data/raw
def output_raw_data(names_dict, csv_dict, zip_dict):
    """Download every archive and write its csv member to data/raw/<key>/.

    Parameters
    ----------
    names_dict : dict
        Only its keys are used; they select which data sets are processed.
    csv_dict : dict
        Maps each key to the csv member names to read from the archives.
    zip_dict : dict
        Maps each key to the download URLs, index-aligned with ``csv_dict``.

    Raises
    ------
    ValueError
        If a key has a different number of csv names and zip URLs.
    requests.HTTPError
        If a download returns an HTTP error status.
    """
    for k in names_dict:
        csv_names = csv_dict.get(k) or []
        zip_names = zip_dict.get(k) or []
        if len(csv_names) != len(zip_names):
            # fail before downloading anything rather than part-way through the loop
            raise ValueError(
                "csv/zip name lists for '%s' differ in length (%d vs %d)"
                % (k, len(csv_names), len(zip_names))
            )
        if not csv_names:
            continue

        out_path = raw_path + k + '/'
        # do not rely on another cell having created the target folder
        os.makedirs(out_path, exist_ok=True)

        for file_name, zip_url in tqdm(list(zip(csv_names, zip_names)), desc=k):
            response = requests.get(zip_url)
            # surface HTTP errors here instead of as a confusing BadZipFile later
            response.raise_for_status()

            # close both the archive and the member once the csv has been parsed
            with zipfile.ZipFile(BytesIO(response.content)) as archive:
                with archive.open(file_name) as member:
                    data = pd.read_csv(member)

            data.to_csv(out_path + file_name, index=False)
    

In [10]:
# Turns one raw prescription file from data/raw into per-beneficiary GCM totals
def proc_presc_file(file_path):
    """Aggregate one prescription-events csv to one row per DESYNPUF_ID.

    Returns a DataFrame holding DESYNPUF_ID plus:
    PTNT_PAY_AMT_total   - sum of PTNT_PAY_AMT
    TOT_RX_CST_AMT_total - sum of TOT_RX_CST_AMT
    presc_num            - number of rows for that id
    """
    wanted = ['DESYNPUF_ID', 'PTNT_PAY_AMT', 'TOT_RX_CST_AMT']
    events = pd.read_csv(file_path, usecols=wanted)

    per_beneficiary = events.groupby(['DESYNPUF_ID'], as_index=False)
    named_aggs = {
        'PTNT_PAY_AMT_total': ('PTNT_PAY_AMT', 'sum'),
        'TOT_RX_CST_AMT_total': ('TOT_RX_CST_AMT', 'sum'),
        'presc_num': ('DESYNPUF_ID', 'size'),
    }
    return per_beneficiary.agg(**named_aggs)

In [11]:
# On closer examination, I do not understand the data well enough to reliably interpret the line amount data.
# The carriers processing code below is kept, commented out, just in case.

# def proc_carriers_file(file_path, nch_amt, dgns_codes):
#     cols = ['DESYNPUF_ID'] + nch_amt + dgns_codes  
#     data = pd.read_csv(file_path,usecols=cols)
    
    
#     data['LINE_NCH_PMT_AMT_sum'] = data[nch_amt].sum(axis=1)
#     data['LINE_NCH_PMT_AMT_count'] = data[data[nch_amt] > 0].count(axis=1)
#     data = data.drop(nch_amt,axis=1)

#     data['ICD9_DGNS_CD_count'] = data[data[dgns_codes] != 0].count(axis=1)
#     data = data.drop(dgns_codes,axis=1)
    
#     # this is an aggregation of ID records where each ID has multiple "lines"
#     data = data.groupby(
#         ['DESYNPUF_ID'],as_index=False
#     ).agg(LINE_NCH_PMT_AMT_total=('LINE_NCH_PMT_AMT_sum','sum'),
#          lines_num=('DESYNPUF_ID','size')
          
        
#         )

#     return data

In [12]:
# Takes the data in data/raw/<folder> and concatenates/processes it into one
# intermediate 'proc' csv per folder, ready for assembly into the single csv the
# models use. Each output is named after the raw folder it came from, e.g.
# data/raw/beneficiary2008/*.csv -> data/gcm/beneficiary2008.csv
def raw_to_gcm(folders, raw_dir=None, gcm_dir=None):
    """Write one 'proc' csv per raw folder.

    Parameters
    ----------
    folders : iterable of str
        Folder names to look for under the raw directory. The folder name
        decides how its files are processed ('beneficiary', 'carriers',
        'prescription' or 'patient' must appear in it).
    raw_dir : str, optional
        Directory holding the raw folders. Defaults to the notebook-level
        ``raw_path``.
    gcm_dir : str, optional
        Directory the proc files are written to. Defaults to the
        notebook-level ``gcm_path``.

    Raises
    ------
    ValueError
        If a folder contains csv files but its name matches no known data set.
    """
    raw_dir = raw_path if raw_dir is None else raw_dir
    gcm_dir = gcm_path if gcm_dir is None else gcm_dir

    for folder in folders:
        # os.scandir yields entries in arbitrary order; sort so the concatenated
        # output has the same row order on every run and every machine.
        with os.scandir(os.path.join(raw_dir, folder)) as entries:
            file_list = sorted(entry.path for entry in entries
                               if entry.path.endswith('.csv'))
        if not file_list:
            continue

        if 'beneficiary' in folder:
            frames = [pd.read_csv(file_path) for file_path in file_list]
        elif 'carriers' in folder:
            # NOTE: proc_carriers_file is commented out in this notebook, so this
            # branch raises NameError until that helper is restored.
            nch_amt = ['LINE_NCH_PMT_AMT_' + str(i) for i in range(1, 10)]
            dgns_codes = ['ICD9_DGNS_CD_' + str(i) for i in range(1, 9)]
            frames = [proc_carriers_file(file_path, nch_amt, dgns_codes)
                      for file_path in file_list]
        elif 'prescription' in folder:
            frames = [proc_presc_file(file_path) for file_path in file_list]
        elif 'patient' in folder:
            # matches both 'inpatient' and 'outpatient'
            cols = ['DESYNPUF_ID', 'CLM_PMT_AMT']
            frames = [pd.read_csv(file_path, usecols=cols) for file_path in file_list]
        else:
            # previously this fell through and crashed on None.to_csv
            raise ValueError("do not know how to process raw folder '%s'" % folder)

        data = pd.concat(frames)
        data.to_csv(os.path.join(gcm_dir, folder + '.csv'), index=False)


In [13]:
def _summarise_patient_claims(inpat, outpat):
    """Return one row per DESYNPUF_ID with inpatient/outpatient totals and counts."""
    inpat = inpat.rename(columns={'CLM_PMT_AMT': 'INCLM_PMT_AMT'}).fillna(0)
    outpat = outpat.rename(columns={'CLM_PMT_AMT': 'OUTCLM_PMT_AMT'}).fillna(0)

    # Missing values are filled per file, BEFORE the concat. Filling after it
    # turned the "other kind" column of every row into 0, so 'count' saw every
    # row as a claim of both kinds and each per-kind count became the combined
    # number of claims.
    claims = pd.concat([inpat, outpat])

    patients = claims.groupby(['DESYNPUF_ID'], as_index=False).agg(
        outpat_total_claimed=('OUTCLM_PMT_AMT', 'sum'),
        outpat_no_of_claims=('OUTCLM_PMT_AMT', 'count'),
        inpat_total_claimed=('INCLM_PMT_AMT', 'sum'),
        inpat_no_of_claims=('INCLM_PMT_AMT', 'count'))

    # marks every id that appears in at least one claims file
    patients['claimed'] = 1
    return patients


def proc_gcm_data(folders, gcm_dir=None, renames=None):
    """Assemble the intermediate 'proc' files into gcm_data.csv.

    Reads inpatient.csv, outpatient.csv, prescription.csv and every
    beneficiary*.csv from the GCM directory, merges them on DESYNPUF_ID and
    writes the result to gcm_data.csv in the same directory.

    Parameters
    ----------
    folders : list
        Not used by this function; kept so existing calls keep working.
    gcm_dir : str, optional
        Directory holding the proc files. Defaults to the notebook-level
        ``gcm_path``.
    renames : dict, optional
        Column rename mapping. Defaults to the notebook-level ``rename_dict``.
        It must map the SP_* flag columns, BENE_SEX_IDENT_CD and BENE_BIRTH_DT
        to the names used below.
    """
    gcm_dir = gcm_path if gcm_dir is None else gcm_dir
    renames = rename_dict if renames is None else renames

    # less pythonic & automated, but this control is needed at the intermediate step
    patients = _summarise_patient_claims(
        pd.read_csv(os.path.join(gcm_dir, 'inpatient.csv')),
        pd.read_csv(os.path.join(gcm_dir, 'outpatient.csv')),
    )

    # Match on the file NAME: matching on the full path would pick up every file
    # whenever a parent directory happens to contain 'beneficiary'.
    with os.scandir(gcm_dir) as entries:
        bene_list = sorted(entry.path for entry in entries
                           if 'beneficiary' in entry.name
                           and entry.name.endswith('.csv'))
    bene = pd.concat([pd.read_csv(path) for path in bene_list])
    bene = bene.drop(['PPPYMT_IP', 'PPPYMT_OP', 'MEDREIMB_CAR', 'BENRES_CAR', 'PPPYMT_CAR'], axis=1)

    # beneficiaries without any claim get 0 for every claim column, incl. 'claimed'
    data = bene.merge(patients, on='DESYNPUF_ID', how='left').fillna(0)
    del bene, patients

    # carriers are no longer processed due to a lack of insurance knowledge

    # NOTE: no fillna here, so beneficiaries without prescriptions keep NaN in
    # the prescription columns.
    presc = pd.read_csv(os.path.join(gcm_dir, 'prescription.csv'))
    data = data.merge(presc, on='DESYNPUF_ID', how='left')
    del presc

    data['total_claimed'] = data['inpat_total_claimed'] + data['outpat_total_claimed']
    data['no_of_claims'] = data['inpat_no_of_claims'] + data['outpat_no_of_claims']

    data['claimed'] = data['claimed'].astype(int)

    data = data.drop('BENE_ESRD_IND', axis=1)

    # keep rows that have a claim, or whose death date is 0 (a missing death
    # date was filled with 0 above)
    keep = (data['claimed'] == 1) | (data['BENE_DEATH_DT'] == 0)
    data = data[keep].reset_index(drop=True)
    data = data.drop('BENE_DEATH_DT', axis=1)

    # Days between the birth date and 2011-01-01. errors='ignore' is deprecated
    # in pandas and, on a parse failure, only deferred the error to the
    # subtraction; let to_datetime raise directly instead.
    reference_date = dt.datetime.strptime('20110101', '%Y%m%d')
    birth_dates = pd.to_datetime(data['BENE_BIRTH_DT'], format='%Y%m%d')
    data['BENE_BIRTH_DT'] = (reference_date - birth_dates).dt.days

    data = data.rename(columns=renames)

    illnesses = ['alzheimers', 'heart_failure', 'kidney_disease', 'cancer',
                 'pulmonary_disease', 'depression', 'diabetes', 'ischemic_heart_disease',
                 'osteoporosis', 'arthritis', 'stroke']
    # recode 2 -> 0 (presumably 1 = has condition, 2 = does not; verify against
    # the CMS codebook). regex=True was dropped: it only applies to string patterns.
    data[illnesses] = data[illnesses].replace(2, 0)

    data['sex'] = data['sex'] - 1

    # unweighted share of the listed illness flags that are set
    data['uw_illness_score'] = data[illnesses].sum(axis=1) / len(illnesses)

    data.to_csv(os.path.join(gcm_dir, 'gcm_data.csv'), index=False)

In [14]:
def clean_gcm(gcm_dir=None):
    """Delete the intermediate 'proc' csv files, keeping the final gcm_data.csv.

    Parameters
    ----------
    gcm_dir : str, optional
        Directory to clean. Defaults to the notebook-level ``gcm_path``.
    """
    gcm_dir = gcm_path if gcm_dir is None else gcm_dir

    # Collect the targets first and close the directory iterator, so nothing is
    # deleted while the directory is still being scanned. Sub-directories are
    # skipped because os.remove cannot delete them.
    with os.scandir(gcm_dir) as entries:
        stale = [entry.path for entry in entries
                 if entry.is_file()
                 and entry.name.endswith('.csv')
                 and not entry.name.endswith('gcm_data.csv')]

    for path in stale:
        os.remove(path)

In [15]:
# Raw CMS column name -> readable column name used in the final gcm_data.csv.
rename_dict = {
    # chronic-condition flags
    'SP_ALZHDMTA': 'alzheimers',
    'SP_CHF': 'heart_failure',
    'SP_CHRNKIDN': 'kidney_disease',
    'SP_CNCR': 'cancer',
    'SP_COPD': 'pulmonary_disease',
    'SP_DEPRESSN': 'depression',
    'SP_DIABETES': 'diabetes',
    'SP_ISCHMCHT': 'ischemic_heart_disease',
    'SP_OSTEOPRS': 'osteoporosis',
    'SP_RA_OA': 'arthritis',
    'SP_STRKETIA': 'stroke',
    # demographics
    'BENE_SEX_IDENT_CD': 'sex',
    'BENE_RACE_CD': 'race',
    'SP_STATE_CODE': 'state_code',
    'BENE_COUNTY_CD': 'county_code',
    'BENE_BIRTH_DT': 'age',
    # coverage durations
    'BENE_HI_CVRAGE_TOT_MONS': 'in_cover_dur',
    'BENE_SMI_CVRAGE_TOT_MONS': 'out_cover_dur',
    'BENE_HMO_CVRAGE_TOT_MONS': 'carrier_cover_dur',
    'PLAN_CVRG_MOS_NUM': 'drug_cover_dur',
    # amounts
    'MEDREIMB_IP': 'in_cover_amt',
    'MEDREIMB_OP': 'out_cover_amt',
    'BENRES_IP': 'in_excess_amt',
    'BENRES_OP': 'out_excess_amt',
}

# Folder key -> file-name stub; the part number is appended to the stub.
data_sets = dict([
    ('inpatient', 'DE1_0_2008_to_2010_Inpatient_Claims_Sample_'),
    ('outpatient', 'DE1_0_2008_to_2010_Outpatient_Claims_Sample_'),
    ('beneficiary2008', 'DE1_0_2008_Beneficiary_Summary_File_Sample_'),
    ('beneficiary2009', 'DE1_0_2009_Beneficiary_Summary_File_Sample_'),
    ('beneficiary2010', 'DE1_0_2010_Beneficiary_Summary_File_Sample_'),
    ('prescription', 'DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_'),
])

# Give every data set its own folder under data/raw.
for key in data_sets:
    out_path = raw_path + key
    if os.path.exists(out_path):
        continue
    os.mkdir(out_path)

# This looks superfluous, and in a sense it is. However, the file names change in
# different places at different times, and folder names as well as file names in
# two domains (csv members and zip URLs) must be tracked. The mutual references
# make it possible to correct errors and link files, folders, and zips.
names_dict = get_names_dict(data_sets)
csv_dict = get_csv_names(names_dict)
zip_dict = get_zip_names(names_dict)

# Folders to process, in csv_dict order; carriers has been removed.
folders = [
    name for name in csv_dict
    if name in ['inpatient',
                'outpatient',
                'beneficiary2008',
                'beneficiary2009',
                'beneficiary2010',
                'prescription']
]

In [16]:
# Download every archive listed in zip_dict and write the csv read from it to
# data/raw/<folder>/. This re-downloads everything each time the cell is run.
output_raw_data(names_dict, csv_dict, zip_dict)

inpatient:   0%|          | 0/20 [00:00<?, ?it/s]

outpatient:   0%|          | 0/20 [00:00<?, ?it/s]

  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(reque

beneficiary2008:   0%|          | 0/20 [00:00<?, ?it/s]

beneficiary2009:   0%|          | 0/20 [00:00<?, ?it/s]

beneficiary2010:   0%|          | 0/19 [00:00<?, ?it/s]

prescription:   0%|          | 0/20 [00:00<?, ?it/s]

  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(requests.get(zip_file).content)).open(file_name))
  data = pd.read_csv(zipfile.ZipFile(BytesIO(reque

In [17]:
# Build one intermediate 'proc' csv per folder in data/gcm from the files in data/raw.
raw_to_gcm(folders)

In [18]:
# Merge the intermediate 'proc' files into the final data/gcm/gcm_data.csv.
proc_gcm_data(folders)

In [19]:
# clean_gcm()