In [1]:
import pandas as pd

In [2]:
file_path = "391211629_aurora-medical-center-manitowoc-county_standardcharges.xml"
url = "https://www.aurorahealthcare.org/assets/documents/billing-insurance/pricing-transparency/391211629_aurora-medical-center-manitowoc-county_standardcharges.xml"

In [3]:
from enum import Enum

class FileFormat(Enum):
    XML = "XML"
    CSV = "CSV"
    JSON = "JSON"
    XLSX = "XLSX"

class FileSubtype(Enum):
    AURORA = "AURORA"
    
class TypeRecognizer(object):
    def _looks_like_aurora_xml(self, file_path):
        try:
            df = pd.read_xml(file_path)
        except:
            return False
        
        columns = df.columns.to_list()
        
        if len(columns) > 10:
            check_colnames = ['Facility', 'Type', 'Chargecode_DRG_CPT', 'Description', 
                              'Rev', 'CPT', 'NDC', 'Self_Pay', 'Min', 'Max']
            for col_name in check_colnames:
                if not col_name in columns:
                    return False
            
            remaining_colnames = list(set(columns) - set(check_colnames))
            for col_name in remaining_colnames:
                if " " in col_name:
                    return False
                
                if not col_name.startswith("_"):
                    return False
                
                components = col_name.split("_")
                
                if len(components) < 3:
                    return False
                
                if len(components[1]) != 4 and components[-1] != 'Fee':
                    return False
                
                return True
        
        return False
    
    def recognize_format_and_subtype(self, file_path):
        file_format = None
        subtype = None
        
        if file_path.endswith(".xml") or file_path.endswith(".XML"):
            file_format = FileFormat.XML
            
            if self._looks_like_aurora_xml(file_path):
                subtype = FileSubtype.AURORA
        
        return file_format, subtype

In [4]:
TARGET_COLUMNS = ['filename', 'file_last_updated', 'hospital_ccn', 'hospital_ein', 'code_meta', 
                  'description', 'procedure_code', 'code_type', 'code', 'rev_code',
                  'modifier', 'ndc', 'apc', 'billing_class', 'patient_class', 'billed_quantity',
                  'rev_desc', 'quantity_desc', 'payer_desc', 'payer_category', 'payer_name',
                  'plan_name', 'plan_id', 'plan_type', 'is_medicare_adv', 'rate', 'rate_method',
                  'rate_desc', 'url', 'permalink']

class AbstractStandardChargesConverter(object):
    def convert(self, url, file_path, ccn):
        pass

In [5]:
def cleanup_dollar_value(value):
    if type(value) == str:
        return value.replace(",", "").replace("$", "")

    return value

def cleanup_values(values):
    return list(map(lambda value: cleanup_dollar_value(value), values))

def pad_rev_code_if_needed(rev_code):
    if type(rev_code) == str and rev_code != 'na':
        if len(rev_code) == 3:
            return '0' + rev_code
        elif len(rev_code) == 2:
            return '00' + rev_code
        elif len(rev_code) == 1:
            return '000' + rev_code
    
    return rev_code

pad_rev_code_if_needed('111')

'0111'

In [6]:
class AuroraXMLConverter(AbstractStandardChargesConverter):
    def __init__(self):
        super().__init__()
    
    def convert(self, url, file_path, ccn):
        df_out = pd.DataFrame(columns=TARGET_COLUMNS)
        
        df_in = pd.read_xml(file_path)
        # HACK: https://stackoverflow.com/a/50132405
        df_in['NDC'] = df_in['NDC'].fillna('na')
        df_in['NDC'] = df_in['NDC'].astype(str)
        df_in['Rev'] = df_in['Rev'].fillna('na')
        df_in['Rev'] = df_in['Rev'].astype(str)
        df_in['Chargecode_DRG_CPT'] = df_in['Chargecode_DRG_CPT'].astype(str)
    
        columns = df_in.columns.to_list()
        money_columns = list(filter(lambda c: c.startswith('_'), columns)) + ['Self_Pay', 'Min', 'Max']

        df_in[money_columns] = df_in[money_columns].apply(lambda values: cleanup_values(values))

        remaining_cols = list(set(columns) - set(money_columns))

        df_intermediate = pd.DataFrame(df_in)
        df_intermediate = pd.melt(df_intermediate, id_vars=remaining_cols)
        df_intermediate = df_intermediate.rename(columns={
            'variable': 'payer_desc',
            'value': 'rate',
            'Description': 'procedure_desc',
            'Rev': 'rev_code',
            'NDC': 'ndc',
            'Chargecode_DRG_CPT': 'code'
        })
        del df_intermediate['Facility']
        del df_intermediate['CPT']

        df_intermediate['ndc'] = df_intermediate['ndc'].apply(lambda ndc: ndc.replace('-', ''))
        df_intermediate['rev_code'] = df_intermediate['rev_code'].apply(lambda rev_code: rev_code.split('.')[0])
        df_intermediate['rev_code'] = df_intermediate['rev_code'].apply(pad_rev_code_if_needed)
        df_intermediate['patient_class'] = df_intermediate['Type'].replace(
            'CHARGE', 'na').replace(
            'IP DRG*', 'inpatient').replace(
            'OP PROC*', 'outpatient')
        df_intermediate['code_type'] = df_intermediate['Type'].replace(
            'IP DRG*', 'ms-drg').replace(
            'OP PROC*', 'hcpcs_cpt').replace(
            'CHARGE', 'cdm')
        df_intermediate['code_meta'] = df_intermediate['Type'].replace(
            'IP DRG*', 'drg').replace(
            'OP PROC*', 'cpt').replace(
            'CHARGE', 'cdm')
        
        del df_intermediate['Type']

        def get_payer_category_from_payer_desc(payer_desc):
            if payer_desc == "Min":
                return "min"
            elif payer_desc == "Max":
                return "max"
            elif payer_desc == "Self_Pay":
                return "cash"
            elif payer_desc.endswith("_Fee"):
                return "gross"
            
            return "payer"

        df_intermediate['payer_category'] = df_intermediate['payer_desc'].apply(get_payer_category_from_payer_desc)
        
        def get_payer_name_from_payer_desc(payer_desc):
            if 'Common_Ground' in payer_desc:
                return 'Common Ground'
            elif 'Health_EOS' in payer_desc:
                return 'Health EOS'
            elif 'Aetna' in payer_desc:
                return 'Aetna'
            elif 'Anthem' in payer_desc:
                return 'Anthem'
            elif 'Aurora' in payer_desc:
                return 'Aurora'
            elif 'Centivo' in payer_desc:
                return 'Centivo'
            elif 'Cigna' in payer_desc:
                return 'Cigna'
            elif 'Common_Ground' in payer_desc:
                return 'Common Ground'
            elif 'Everpointe' in payer_desc:
                return 'Everpointe'
            elif 'HealthPartners' in payer_desc:
                return 'HealthPartners'
            elif 'HPS' in payer_desc:
                return 'HPS'
            elif 'HST' in payer_desc:
                return 'HST'
            elif 'Humana' in payer_desc:
                return 'Humana'
            elif 'Molina' in payer_desc:
                return 'Molina'
            elif 'Quartz_One' in payer_desc:
                return 'Quartz One'
            elif 'Trilogy' in payer_desc:
                return 'Trilogy'
            elif 'UHC' in payer_desc:
                return 'UHC'
            elif 'WPS' in payer_desc:
                return 'WPS'
        
            return ''

        df_intermediate['payer_name'] = df_intermediate['payer_desc'].apply(get_payer_name_from_payer_desc)

        filename = file_path.split("/")[0]
        hospital_ein = filename.split("_")[0]

        df_intermediate['filename'] = filename
        df_intermediate['hospital_ein'] = hospital_ein
        df_intermediate['hospital_ccn'] = ccn
        df_intermediate['url'] = url
        df_intermediate['file_last_updated'] = '2023-01-01' # FIXME: refrain from hardcoding this; determine this field from _Fee column name
        df_intermediate['unique_procedure_id'] = 'na'
        df_intermediate['internal_code'] = 'na'
        df_intermediate['billing_class'] = 'na'
        df_intermediate['procedure_code'] = 'na'
        df_intermediate['modifier'] = 'na'
        df_intermediate['apc'] = 'na'
        df_intermediate['billed_quantity'] = -1

        def get_plan_type_from_payer_desc(payer_desc):
            components = payer_desc.split('_')
            last_component = components[-1]
            if last_component in ["HMO", "PPO", "HPN", "EPO"]:
                return last_component
            
            return ''
            
        df_intermediate['plan_type'] = df_intermediate['payer_desc'].apply(get_plan_type_from_payer_desc)
        
        df_out = pd.DataFrame(columns=TARGET_COLUMNS)

        df_out = df_out.append(df_intermediate)
        
        return df_out


In [7]:
recognizer = TypeRecognizer()
recognizer

<__main__.TypeRecognizer at 0x7f4f4b8affd0>

In [8]:
recognizer.recognize_format_and_subtype(file_path)

(<FileFormat.XML: 'XML'>, <FileSubtype.AURORA: 'AURORA'>)

In [9]:
converter = AuroraXMLConverter()
df_out = converter.convert(url, file_path, '520034')
df_out

  df_out = df_out.append(df_intermediate)


Unnamed: 0,filename,file_last_updated,hospital_ccn,hospital_ein,code_meta,description,procedure_code,code_type,code,rev_code,...,plan_type,is_medicare_adv,rate,rate_method,rate_desc,url,permalink,procedure_desc,unique_procedure_id,internal_code
0,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cdm,,na,cdm,10000002,0121,...,,,1770.00,,,https://www.aurorahealthcare.org/assets/docume...,,ROOM CHARGE MED SURG,na,na
1,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cdm,,na,cdm,10000003,0122,...,,,1770.00,,,https://www.aurorahealthcare.org/assets/docume...,,ROOM CHARGE WOMEN'S HEALTH,na,na
2,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cdm,,na,cdm,10000004,0123,...,,,1770.00,,,https://www.aurorahealthcare.org/assets/docume...,,ROOM CHARGE PEDIATRICS,na,na
3,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cdm,,na,cdm,10000005,0171,...,,,1650.00,,,https://www.aurorahealthcare.org/assets/docume...,,ROOM CHARGE NURSERY LEVEL 1,na,na
4,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cdm,,na,cdm,10000008,0200,...,,,4070.00,,,https://www.aurorahealthcare.org/assets/docume...,,ROOM CHARGE ICU OR CCU,na,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111809,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cpt,,na,hcpcs_cpt,66984,na,...,,,12331.01,,,https://www.aurorahealthcare.org/assets/docume...,,XCAPSL CTRC RMVL W/O ECP,na,na
111810,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cpt,,na,hcpcs_cpt,69436,na,...,,,11606.12,,,https://www.aurorahealthcare.org/assets/docume...,,CREATE EARDRUM OPENING,na,na
111811,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cpt,,na,hcpcs_cpt,G0105,na,...,,,6453.61,,,https://www.aurorahealthcare.org/assets/docume...,,COLORECTAL SCRN; HI RISK IND,na,na
111812,391211629_aurora-medical-center-manitowoc-coun...,2023-01-01,520034,391211629,cpt,,na,hcpcs_cpt,G0121,na,...,,,6476.92,,,https://www.aurorahealthcare.org/assets/docume...,,COLON CA SCRN NOT HI RSK IND,na,na


In [10]:
df_out.to_csv('520034.csv', index=False)