In [1]:
import os
import copy
import zipfile
import requests
import json
from io import BytesIO
from tqdm.notebook import tqdm

import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Number of numbered sample files generated per data set.
length = 20
# 1-based sample indices (1..length) that gen_names appends to a name prefix.
len_range = range(1, 1 + length)

In [3]:
# Directory layout, rooted at the current working directory.
cwd = os.getcwd() + '/'
data_folder = 'data/'
data_path = cwd + data_folder
raw_path = data_path + 'raw/'   # one sub-folder of raw CSVs per data set
gcm_path = data_path + 'gcm/'   # combined / processed CSVs

models_path = cwd + 'models/'   # only defined here; not created in this cell

# makedirs(exist_ok=True) is a no-op for existing directories and creates any
# missing parents, so no separate (and racy) os.path.exists check is needed.
for _dir in (data_path, raw_path, gcm_path):
    os.makedirs(_dir, exist_ok=True)
    

In [4]:
# Source column name -> column name used in the final gcm_data.csv.
# Applied with DataFrame.rename in proc_gcm_data.
rename_dict = {
    # Illness indicator columns (the ones summed into uw_illness_score).
    'SP_ALZHDMTA': 'alzheimers',
    'SP_CHF': 'heart_failure',
    'SP_CHRNKIDN': 'kidney_disease',
    'SP_CNCR': 'cancer',
    'SP_COPD': 'pulmonary_disease',
    'SP_DEPRESSN': 'depression',
    'SP_DIABETES': 'diabetes',
    'SP_ISCHMCHT': 'ischemic_heart_disease',
    'SP_OSTEOPRS': 'osteoporosis',
    'SP_RA_OA': 'arthritis',
    'SP_STRKETIA': 'stroke',
    # Beneficiary identity / demographics.
    'DESYNPUF_ID': 'patient_ID',
    'BENE_SEX_IDENT_CD': 'sex',
    'BENE_RACE_CD': 'race',
    'SP_STATE_CODE': 'state_code',
    'BENE_COUNTY_CD': 'county_code',
    # proc_gcm_data converts the birth date to a day count before renaming.
    'BENE_BIRTH_DT': 'age',
    # Coverage-duration columns.
    'BENE_HI_CVRAGE_TOT_MONS': 'in_cover_dur',
    'BENE_SMI_CVRAGE_TOT_MONS': 'out_cover_dur',
    'BENE_HMO_CVRAGE_TOT_MONS': 'carrier_cover_dur',
    'PLAN_CVRG_MOS_NUM': 'drug_cover_dur',
    # Amount columns.
    'MEDREIMB_IP': 'in_cover_amt',
    'MEDREIMB_OP': 'out_cover_amt',
    'BENRES_IP': 'in_excess_amt',
    'BENRES_OP': 'out_excess_amt',
}

In [5]:
# Data-set key -> file-name prefix. get_names_dict appends the sample number
# (and, for 'carriers', an A/B suffix) to each prefix.
data_sets = {
    'inpatient': 'DE1_0_2008_to_2010_Inpatient_Claims_Sample_',
    'outpatient': 'DE1_0_2008_to_2010_Outpatient_Claims_Sample_',
    'beneficiary2008': 'DE1_0_2008_Beneficiary_Summary_File_Sample_',
    'beneficiary2009': 'DE1_0_2009_Beneficiary_Summary_File_Sample_',
    'beneficiary2010': 'DE1_0_2010_Beneficiary_Summary_File_Sample_',
    'prescription': 'DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_',
    'carriers': 'DE1_0_2008_to_2010_Carrier_Claims_Sample_',
}

In [6]:
# One raw-data sub-folder per data set. makedirs(exist_ok=True) replaces the
# exists-then-mkdir pair: the cell can be re-run safely and no longer fails
# when raw_path itself has not been created yet.
for key in data_sets:
    out_path = raw_path + key
    os.makedirs(out_path, exist_ok=True)

In [7]:
def gen_names(name):
    """Return ``name`` suffixed with every index in the module-level ``len_range``.

    Parameters
    ----------
    name : str
        Prefix to which each index is appended.

    Returns
    -------
    list of str
        One entry per index, in ``len_range`` order.
    """
    numbered = []
    for index in len_range:
        numbered.append(name + str(index))
    return numbered

In [8]:
def get_names_dict(data_sets):
    """Build the list of sample file names (no extension) for each data set.

    Parameters
    ----------
    data_sets : dict
        Maps a data-set key to its file-name prefix.

    Returns
    -------
    dict
        Maps each key to the names produced by ``gen_names`` for its prefix,
        with two special cases:

        * ``'carriers'``: every name is expanded into an ``A`` and a ``B``
          variant, in that order.
        * ``'beneficiary2010'``: the first generated name is dropped.
    """
    car_postfixes = ('A', 'B')
    names_dict = {}
    for k, v in data_sets.items():
        file_names = gen_names(v)
        if k == 'carriers':
            file_names = [name + postfix
                          for name in file_names
                          for postfix in car_postfixes]
        elif k == 'beneficiary2010' and file_names:
            # Original author's note: file 1 is actually a copy of file 20,
            # so it is skipped. The guard tests the generated list (not the
            # prefix string, which is always non-empty) so that pop() cannot
            # raise IndexError when no names were generated.
            file_names.pop(0)
        names_dict[k] = file_names

    return names_dict

In [9]:
def get_csv_names(names_dict):
    """Return a new dict with every base name turned into a CSV file name.

    ``'.csv'`` is appended to each name. One ``'beneficiary2010'`` entry
    (sample 17) additionally gets ``' - Copy'`` inserted before the
    extension. The result is later used as the member name opened inside the
    downloaded zip, so presumably that archive member carries this suffix
    (confirm against the archive).

    Parameters
    ----------
    names_dict : dict
        Maps a data-set key to a list of base names. It is not modified.

    Returns
    -------
    dict
        Same keys, in the same order, each mapped to a new list of names.
    """
    odd_member = 'DE1_0_2010_Beneficiary_Summary_File_Sample_17'

    def to_csv_name(key, base):
        if key == 'beneficiary2010' and base == odd_member:
            base = base + ' - Copy'
        return base + '.csv'

    return {
        key: [to_csv_name(key, base) for base in bases]
        for key, bases in copy.deepcopy(names_dict).items()
    }

In [10]:
def get_zip_names(names_dict):
    """Return a new dict mapping each data set to its zip download URLs.

    ``'prescription'`` and ``'carriers'`` names are prefixed with
    ``ext_url``; every other data set uses ``base_url``. ``'.zip'`` is
    appended to each name, and one carriers entry (sample 11A) gets ``'.csv'``
    inserted before ``'.zip'``.

    Parameters
    ----------
    names_dict : dict
        Maps a data-set key to a list of base names. It is not modified.

    Returns
    -------
    dict
        Same keys, in the same order, each mapped to a new list of URLs that
        is index-aligned with the input list.
    """
    # NOTE(review): this host is addressed over plain HTTP; confirm whether an
    # HTTPS endpoint is available before relying on these downloads.
    ext_url = 'http://downloads.cms.gov/files/'
    base_url = 'https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/SynPUFs/Downloads/'
    ext_hosted = ('prescription', 'carriers')
    odd_archive = 'DE1_0_2008_to_2010_Carrier_Claims_Sample_11A'

    zip_names_dict = {}
    for key, bases in copy.deepcopy(names_dict).items():
        prefix = ext_url if key in ext_hosted else base_url
        urls = []
        for base in bases:
            if key == 'carriers' and base == odd_archive:
                base = base + '.csv'
            urls.append(prefix + base + '.zip')
        zip_names_dict[key] = urls
    return zip_names_dict

In [11]:
# Base names (no extension) for every sample file of every data set.
names_dict = get_names_dict(data_sets)
# CSV file names and zip download URLs derived from the same base names;
# both keep the key order and list order of names_dict.
csv_dict = get_csv_names(names_dict)
zip_dict = get_zip_names(names_dict)

In [12]:
# Display the generated CSV file names as a quick visual check.
csv_dict

{'inpatient': ['DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_2.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_3.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_4.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_5.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_6.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_7.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_8.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_9.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_10.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_11.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_12.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_13.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_14.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_15.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_16.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_17.csv',
  'DE1_0_2008_to_2010_Inpatient_Claims_Sample_18.csv',
  'DE1

In [13]:
def output_raw_data(names_dict, csv_dict, zip_dict):
    """Download every sample archive and save its CSV under ``raw_path``.

    For each key of ``names_dict``, the i-th URL in ``zip_dict[key]`` is
    downloaded, the member named ``csv_dict[key][i]`` is read from the zip
    with pandas, and the frame is written to ``raw_path/<key>/<csv name>``
    without the index. Keys with no CSV names are skipped.

    Parameters
    ----------
    names_dict : dict
        Only its keys are used, to decide which data sets to process.
    csv_dict : dict
        Maps each key to the CSV member names inside the archives.
    zip_dict : dict
        Maps each key to the archive URLs, index-aligned with ``csv_dict``.

    Raises
    ------
    ValueError
        If a key has a different number of CSV names and URLs.
    requests.HTTPError
        If a download answers with an HTTP error status.
    """
    for k in names_dict:
        csv_names = csv_dict.get(k) or []
        zip_names = zip_dict.get(k) or []
        if not csv_names:
            continue
        if len(csv_names) != len(zip_names):
            raise ValueError(
                'csv/zip name lists differ in length for %r: %d vs %d'
                % (k, len(csv_names), len(zip_names)))

        out_path = raw_path + k + '/'
        os.makedirs(out_path, exist_ok=True)

        jobs = list(zip(csv_names, zip_names))
        for file_name, zip_url in tqdm(jobs, desc=k):
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(zip_url, timeout=120)
            # Fail here on an HTTP error instead of later with a confusing
            # BadZipFile raised on the body of an error page.
            response.raise_for_status()

            # Context managers close the archive and the member handle.
            with zipfile.ZipFile(BytesIO(response.content)) as archive:
                with archive.open(file_name) as member:
                    data = pd.read_csv(member)

            data.to_csv(out_path + file_name, index=False)
    

In [14]:
def raw_to_gcm(folders):
    """Combine the raw CSVs of each folder into ``gcm_path/<folder>.csv``.

    For every folder name, all ``.csv`` files found in ``raw_path/<folder>``
    are read and concatenated. Folders whose name contains ``'beneficiary'``
    keep all columns; every other folder keeps only ``DESYNPUF_ID`` and
    ``CLM_PMT_AMT``. Folders without any CSV file are skipped.

    Parameters
    ----------
    folders : iterable of str
        Sub-folder names under ``raw_path``.
    """
    claim_cols = ['DESYNPUF_ID', 'CLM_PMT_AMT']

    for folder in folders:
        # os.scandir yields entries in arbitrary order; sorting makes the row
        # order of the combined file reproducible. The with-block closes the
        # directory iterator.
        with os.scandir(raw_path + folder) as entries:
            file_list = sorted(entry.path for entry in entries
                               if entry.path.endswith('.csv'))
        if not file_list:
            continue

        # usecols=None is the read_csv default, i.e. "read every column".
        usecols = None if 'beneficiary' in folder else claim_cols
        data = pd.concat([pd.read_csv(file_path, usecols=usecols)
                          for file_path in file_list])

        data.to_csv(gcm_path + folder + '.csv', index=False)


In [15]:
def proc_gcm_data(folders):
    """Merge the combined beneficiary and claim files into ``gcm_data.csv``.

    Steps, all reading from and writing to ``gcm_path``:

    1. Concatenate the ``<folder>.csv`` files of every folder whose name
       contains ``'beneficiary'``, and, restricted to ``DESYNPUF_ID`` and
       ``CLM_PMT_AMT``, of every folder whose name contains ``'patient'``.
    2. Aggregate the claims per ``DESYNPUF_ID`` into ``total_claimed`` (sum)
       and ``no_of_claims`` (count) and flag those ids with ``claimed``.
       No filter on the payment amount is applied.
    3. Left-join the aggregates onto the beneficiary rows, filling every
       missing value with 0.
    4. Keep rows that have a claim or whose ``BENE_DEATH_DT`` is 0, then drop
       ``BENE_ESRD_IND`` and ``BENE_DEATH_DT``.
    5. Replace ``BENE_BIRTH_DT`` with the number of days between the birth
       date and 2008-01-01 (the column is renamed to ``age``).
    6. Rename the columns with ``rename_dict``, recode the illness columns
       and add ``uw_illness_score``, the mean of the illness columns.

    Parameters
    ----------
    folders : iterable of str
        Data-set names whose combined files should be used.

    Raises
    ------
    ValueError
        If no combined beneficiary file or no combined claim file is found,
        or if a birth date cannot be parsed as ``YYYYMMDD``.
    """
    def _combined_paths(marker):
        # Combined files are named '<folder>.csv'; match on the exact file
        # name and sort so the concatenation order is reproducible.
        wanted = {folder + '.csv' for folder in folders if marker in folder}
        with os.scandir(gcm_path) as entries:
            return sorted(entry.path for entry in entries
                          if entry.name in wanted)

    bene_paths = _combined_paths('beneficiary')
    pat_paths = _combined_paths('patient')
    if not bene_paths or not pat_paths:
        # pd.concat would raise a bare "No objects to concatenate" here.
        raise ValueError(
            'No combined beneficiary and/or claim CSV found in ' + gcm_path)

    bene_file = pd.concat([pd.read_csv(path) for path in bene_paths])
    claim_file = pd.concat(
        [pd.read_csv(path, usecols=['DESYNPUF_ID', 'CLM_PMT_AMT'])
         for path in pat_paths])

    claim_file = claim_file.groupby(['DESYNPUF_ID'], as_index=False).agg(
        total_claimed=('CLM_PMT_AMT', 'sum'),
        no_of_claims=('CLM_PMT_AMT', 'count'))
    claim_file['claimed'] = 1

    # Beneficiaries without a claim get 0 for the aggregates and for 'claimed';
    # fillna(0) also zero-fills any other missing value in the frame.
    data = bene_file.merge(claim_file, on='DESYNPUF_ID', how='left').fillna(0)
    data['claimed'] = data['claimed'].astype(bool)

    keep = data['claimed'] | (data['BENE_DEATH_DT'] == 0)
    data = (data[keep]
            .drop(columns=['BENE_ESRD_IND', 'BENE_DEATH_DT'])
            .reset_index(drop=True))

    # errors='ignore' is deprecated and would hand back the unparsed input,
    # which then fails in the subtraction; let unparsable dates raise here.
    birth_dates = pd.to_datetime(data['BENE_BIRTH_DT'], format='%Y%m%d')
    data['BENE_BIRTH_DT'] = (dt.datetime(2008, 1, 1) - birth_dates).dt.days

    data = data.rename(columns=rename_dict)

    illness_list = ['alzheimers', 'heart_failure', 'kidney_disease', 'cancer',
                    'pulmonary_disease', 'depression', 'diabetes',
                    'ischemic_heart_disease', 'osteoporosis', 'arthritis',
                    'stroke']
    # Flags equal to 2 are recoded to 0 before averaging (presumably 1 means
    # the condition is present and 2 that it is absent; confirm against the
    # data dictionary).
    data[illness_list] = data[illness_list].replace(2, 0)
    data['uw_illness_score'] = data[illness_list].sum(axis=1) / len(illness_list)

    data.to_csv(gcm_path + 'gcm_data.csv', index=False)

In [16]:
def clean_gcm_proc():
    """Delete every CSV in ``gcm_path`` whose path does not end in ``gcm_data.csv``.

    Entries that are not ``.csv`` files are left untouched.
    """
    for entry in os.scandir(gcm_path):
        is_csv = entry.path.endswith('.csv')
        is_final_output = entry.path.endswith('gcm_data.csv')
        if is_csv and not is_final_output:
            os.remove(entry)

In [17]:
def output_gcm_data(csv_dict):
    """Run the full GCM pipeline for the claim and beneficiary data sets.

    Combines the raw files per data set (``raw_to_gcm``), merges them into
    ``gcm_data.csv`` (``proc_gcm_data``) and removes the intermediate CSVs
    (``clean_gcm_proc``).

    Parameters
    ----------
    csv_dict : dict
        Only its keys are used; the data sets are processed in this dict's
        key order, restricted to the five names listed below.
    """
    gcm_sources = ('inpatient',
                   'outpatient',
                   'beneficiary2008',
                   'beneficiary2009',
                   'beneficiary2010')

    selected = [key for key in csv_dict.keys() if key in gcm_sources]

    raw_to_gcm(selected)
    proc_gcm_data(selected)
    clean_gcm_proc()

In [18]:
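# Download step, left commented out so that re-running the notebook does not
# fetch every archive again. Uncomment and run it once to populate the raw
# data folders that output_gcm_data reads from.
# output_raw_data(names_dict, csv_dict, zip_dict)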

In [19]:
# Build gcm_data.csv in gcm_path from the raw CSVs already present under raw_path.
output_gcm_data(csv_dict)