In [1]:
import pandas as pd

In [2]:
# Remote location of the standard-charges XML file.
url = "https://www.aurorahealthcare.org/assets/documents/billing-insurance/pricing-transparency/390872192_aurora-psychiatric-hospital_standardcharges.xml"

# Path handed to the recognizer and converter in later cells; it carries the
# same file name as the last segment of `url`.
file_path = "390872192_aurora-psychiatric-hospital_standardcharges.xml"

In [3]:
from enum import Enum

class FileFormat(Enum):
    """File formats known to this notebook; every member's value equals its name."""

    XML = "XML"    # assigned by TypeRecognizer for paths ending in .xml
    CSV = "CSV"    # declared only; nothing in this notebook assigns it
    JSON = "JSON"  # declared only; nothing in this notebook assigns it
    XLSX = "XLSX"  # declared only; nothing in this notebook assigns it

class FileSubtype(Enum):
    """Publisher-specific layouts that can be detected within a file format."""

    AURORA = "AURORA"  # layout detected by TypeRecognizer._looks_like_aurora_xml
    
class TypeRecognizer(object):
    """Guesses the file format and, where possible, the publisher-specific subtype."""

    def _looks_like_aurora_xml(self, file_path):
        """Return True when the XML at ``file_path`` has the Aurora column layout.

        The file must parse with ``pd.read_xml``, contain more than ten
        columns, include every fixed Aurora column, and every *other* column
        must pass the name checks below.

        Args:
            file_path: Anything ``pd.read_xml`` accepts as its first argument.

        Returns:
            bool: True only if all checks pass for all columns.
        """
        try:
            df = pd.read_xml(file_path)
        except Exception:
            # Best effort: a file that cannot be parsed is simply "not Aurora".
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return False

        columns = df.columns.to_list()

        if len(columns) <= 10:
            return False

        check_colnames = ['Facility', 'Type', 'Chargecode_DRG_CPT', 'Description',
                          'Rev', 'CPT', 'NDC', 'Self_Pay', 'Min', 'Max']
        if any(col_name not in columns for col_name in check_colnames):
            return False

        fixed_columns = set(check_colnames)
        for col_name in columns:
            if col_name in fixed_columns:
                continue

            # Every extra column must look like "_<part>_<part>[...]".
            if " " in col_name:
                return False

            if not col_name.startswith("_"):
                return False

            components = col_name.split("_")

            if len(components) < 3:
                return False

            # NOTE(review): condition kept exactly as originally written; it
            # rejects a column only when the first component is not four
            # characters long AND the name does not end in "Fee". Confirm
            # this is the intended rule.
            if len(components[1]) != 4 and components[-1] != 'Fee':
                return False

        # Reached only after *every* extra column passed the checks. Returning
        # inside the loop would validate a single column only.
        return True

    def recognize_format_and_subtype(self, file_path):
        """Return a ``(file_format, subtype)`` tuple for ``file_path``.

        Either element is None when it could not be determined. Only the
        ``.xml`` extension (matched case-insensitively) is recognised here.
        """
        file_format = None
        subtype = None

        if file_path.lower().endswith(".xml"):
            file_format = FileFormat.XML

            if self._looks_like_aurora_xml(file_path):
                subtype = FileSubtype.AURORA

        return file_format, subtype

In [4]:
# Column names, in order, used to build the converter's output DataFrame.
TARGET_COLUMNS = [
    'filename',
    'hospital_ccn',
    'hospital_ein',
    'code_meta',
    'unique_procedure_id',
    'internal_code',
    'billing_class',
    'patient_class',
    'rev_code',
    'rev_desc',
    'code_type',
    'code',
    'code_modifier',
    'procedure_desc',
    'cdm',
    'hcpcs_cpt',
    'ndc',
    'ms_drg',
    'icd_10',
    'eapg',
    'apc',
    'cmg',
    'quantity_desc',
    'quantity_number',
    'quantity_type',
    'payer_category',
    'payer_desc',
    'payer_name',
    'plan_name',
    'plan_id',
    'plan_type',
    'is_medicare_adv',
    'rate',
    'rate_method',
    'rate_desc',
    'file_last_updated',
    'url',
    'permalink',
]

class AbstractStandardChargesConverter(object):
    """Base class for standard-charges converters; subclasses override ``convert``."""

    def convert(self, url, file_path, ccn):
        """Convert the file at ``file_path`` and return the result.

        Args:
            url: Source URL of the file.
            file_path: Path of the file to convert.
            ccn: Hospital CCN to attach to the output.

        Raises:
            NotImplementedError: Always. The base class has no conversion
                logic, and failing loudly is safer than silently returning
                None when a subclass forgets to override this method.
        """
        raise NotImplementedError(
            type(self).__name__ + " must implement convert(url, file_path, ccn)"
        )

In [5]:
def cleanup_dollar_value(value):
    """Strip thousands separators and dollar signs from a string value.

    Args:
        value: Any object. Strings (including ``str`` subclasses) are cleaned;
            everything else is passed through untouched.

    Returns:
        The string with every "," and "$" removed, or ``value`` itself when
        it is not a string.
    """
    # isinstance (rather than an exact type comparison) also covers str
    # subclasses, which would otherwise slip through uncleaned.
    if isinstance(value, str):
        return value.replace(",", "").replace("$", "")

    return value

def cleanup_values(values):
    """Return a new list with ``cleanup_dollar_value`` applied to each item of ``values``."""
    return [cleanup_dollar_value(item) for item in values]

def pad_rev_code_if_needed(rev_code):
    """Left-pad a one- to three-character revenue code with zeros to four characters.

    Args:
        rev_code: Any object. Only strings (including ``str`` subclasses)
            other than the placeholder ``'nan'`` are padded.

    Returns:
        The zero-padded four-character string, or ``rev_code`` unchanged when
        it is not a string, equals ``'nan'``, is empty, or already has four
        or more characters.
    """
    # 'nan' is three characters long, so it has to be excluded explicitly.
    if isinstance(rev_code, str) and rev_code != 'nan' and 1 <= len(rev_code) <= 3:
        return rev_code.rjust(4, '0')

    return rev_code

# Quick check: a three-character code gains a single leading zero.
pad_rev_code_if_needed('111')

'0111'

In [6]:
class AuroraXMLConverter(AbstractStandardChargesConverter):
    """Converts an Aurora-layout standard-charges XML file into the TARGET_COLUMNS layout.

    The input is read with ``pd.read_xml``. Every price column (names starting
    with "_" plus 'Self_Pay', 'Min' and 'Max') is unpivoted into one row per
    (charge item, price column) pair.
    """

    # Ordered (substring, payer name) pairs. The first substring found in a
    # price-column name wins, so the order is significant.
    _PAYER_NAMES = (
        ('Common_Ground', 'Common Ground'),
        ('Health_EOS', 'Health EOS'),
        ('Aetna', 'Aetna'),
        ('Anthem', 'Anthem'),
        ('Aurora', 'Aurora'),
        ('Centivo', 'Centivo'),
        ('Cigna', 'Cigna'),
        ('Everpointe', 'Everpointe'),
        ('HealthPartners', 'HealthPartners'),
        ('HPS', 'HPS'),
        ('HST', 'HST'),
        ('Humana', 'Humana'),
        ('Molina', 'Molina'),
        ('Quartz_One', 'Quartz One'),
        ('Trilogy', 'Trilogy'),
        ('UHC', 'UHC'),
        ('WPS', 'WPS'),
    )

    # Trailing column-name components that are reported as the plan type.
    _PLAN_TYPES = ("HMO", "PPO", "HPN", "EPO")

    def __init__(self):
        super().__init__()

    @staticmethod
    def _payer_category_from_payer_desc(payer_desc):
        """Map a price-column name to 'min', 'max', 'cash', 'gross' or 'payer'."""
        if payer_desc == "Min":
            return "min"
        elif payer_desc == "Max":
            return "max"
        elif payer_desc == "Self_Pay":
            return "cash"
        elif payer_desc.endswith("_Fee"):
            return "gross"

        return "payer"

    @classmethod
    def _payer_name_from_payer_desc(cls, payer_desc):
        """Return the first known payer whose marker occurs in the column name, else ''."""
        for marker, payer_name in cls._PAYER_NAMES:
            if marker in payer_desc:
                return payer_name

        return ''

    @classmethod
    def _plan_type_from_payer_desc(cls, payer_desc):
        """Return the last "_"-separated component if it is a known plan type, else ''."""
        last_component = payer_desc.split('_')[-1]
        if last_component in cls._PLAN_TYPES:
            return last_component

        return ''

    def convert(self, url, file_path, ccn):
        """Read the XML at ``file_path`` and return it in the TARGET_COLUMNS layout.

        Args:
            url: Stored verbatim in the output's 'url' column.
            file_path: Path of the XML file. Its final "/"-separated component
                becomes 'filename', and the part of that name before the
                first "_" becomes 'hospital_ein'.
            ccn: Stored verbatim in the output's 'hospital_ccn' column.

        Returns:
            pandas.DataFrame: one row per (input row, price column) pair, with
            the TARGET_COLUMNS columns first.
        """
        df_in = pd.read_xml(file_path)
        # HACK: https://stackoverflow.com/a/50132405
        df_in['NDC'] = df_in['NDC'].fillna('').astype(str)
        df_in['Rev'] = df_in['Rev'].fillna('nan').astype(str)
        df_in['Chargecode_DRG_CPT'] = df_in['Chargecode_DRG_CPT'].astype(str)

        columns = df_in.columns.to_list()
        money_columns = [c for c in columns if c.startswith('_')] + ['Self_Pay', 'Min', 'Max']

        df_in[money_columns] = df_in[money_columns].apply(lambda values: cleanup_values(values))

        # Keep the original column order so the melt is deterministic.
        id_columns = [c for c in columns if c not in money_columns]

        df_intermediate = pd.melt(df_in, id_vars=id_columns)
        df_intermediate = df_intermediate.rename(columns={
            'variable': 'payer_desc',
            'value': 'rate',
            'Description': 'procedure_desc',
            'CPT': 'hcpcs_cpt',
            'Rev': 'rev_code',
            'NDC': 'ndc',
            'Chargecode_DRG_CPT': 'code'
        })
        del df_intermediate['Facility']

        # Drop anything after a "." (e.g. a float-style suffix), then zero-pad.
        df_intermediate['rev_code'] = df_intermediate['rev_code'].apply(lambda rev_code: rev_code.split('.')[0])
        df_intermediate['rev_code'] = df_intermediate['rev_code'].apply(pad_rev_code_if_needed)

        # Missing CPT values go through a '-1' sentinel so they end up as ''
        # instead of the string 'nan'; the result is cut to five characters.
        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].fillna('-1')
        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].astype(str)
        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].replace('-1', '')
        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].apply(lambda cpt: cpt[:5])

        # The 'Type' column drives three derived columns.
        df_intermediate['patient_class'] = df_intermediate['Type'].replace(
            'CHARGE', 'nan').replace(
            'IP DRG*', 'inpatient').replace(
            'OP PROC*', 'outpatient')
        df_intermediate['code_type'] = df_intermediate['Type'].replace(
            'IP DRG*', 'ms-drg').replace(
            'OP PROC*', 'hcpcs_cpt').replace(
            'CHARGE', '')
        df_intermediate['code_meta'] = df_intermediate['Type'].replace(
            'IP DRG*', 'drg').replace(
            'OP PROC*', 'cpt').replace(
            'CHARGE', 'cdm')

        del df_intermediate['Type']

        # Route 'code' into the column that matches its kind.
        # https://stackoverflow.com/a/60264415
        df_intermediate['hcpcs_cpt'] = df_intermediate.apply(lambda row: row['code'] if row['code_meta'] == 'cpt' else row['hcpcs_cpt'], axis=1)
        df_intermediate['ms_drg'] = df_intermediate.apply(lambda row: row['code'] if row['code_meta'] == 'drg' else None, axis=1)
        df_intermediate['cdm'] = df_intermediate.apply(lambda row: row['code'] if row['code_meta'] == 'cdm' else None, axis=1)

        df_intermediate['payer_category'] = df_intermediate['payer_desc'].apply(self._payer_category_from_payer_desc)
        df_intermediate['payer_name'] = df_intermediate['payer_desc'].apply(self._payer_name_from_payer_desc)

        # Take the LAST path component: index 0 would be the first directory
        # whenever file_path contains one.
        filename = file_path.split("/")[-1]
        hospital_ein = filename.split("_")[0]

        df_intermediate['filename'] = filename
        df_intermediate['hospital_ein'] = hospital_ein
        df_intermediate['hospital_ccn'] = ccn
        df_intermediate['url'] = url
        df_intermediate['file_last_updated'] = '2023-01-01' # FIXME: refrain from hardcoding this; determine this field from _Fee column name
        df_intermediate['unique_procedure_id'] = 'nan'
        df_intermediate['internal_code'] = 'nan'
        df_intermediate['billing_class'] = 'nan'

        df_intermediate['plan_type'] = df_intermediate['payer_desc'].apply(self._plan_type_from_payer_desc)

        # DataFrame.append was removed in pandas 2.0; concatenating onto an
        # empty frame with the target columns gives the same column order.
        df_out = pd.concat([pd.DataFrame(columns=TARGET_COLUMNS), df_intermediate])

        return df_out


In [7]:
# Create the recognizer; the bare name on the last line displays its repr.
recognizer = TypeRecognizer()
recognizer

<__main__.TypeRecognizer at 0x7f02cef2d4f0>

In [8]:
# Detect the format and subtype of the file at file_path; the (format, subtype) tuple is displayed.
recognizer.recognize_format_and_subtype(file_path)

(<FileFormat.XML: 'XML'>, <FileSubtype.AURORA: 'AURORA'>)

In [9]:
# Convert the file; '524000' is passed as the ccn argument and the result is displayed.
converter = AuroraXMLConverter()
df_out = converter.convert(url, file_path, '524000')
df_out

  df_out = df_out.append(df_intermediate)


Unnamed: 0,filename,hospital_ccn,hospital_ein,code_meta,unique_procedure_id,internal_code,billing_class,patient_class,rev_code,rev_desc,...,plan_name,plan_id,plan_type,is_medicare_adv,rate,rate_method,rate_desc,file_last_updated,url,permalink
0,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,cdm,,,,,0300,,...,,,,,35.00,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
1,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,cdm,,,,,0307,,...,,,,,30.00,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
2,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,cdm,,,,,0307,,...,,,,,50.00,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
3,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,cdm,,,,,0307,,...,,,,,40.00,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
4,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,cdm,,,,,0307,,...,,,,,105.00,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24933,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,drg,,,,inpatient,,,...,,,,,17258.87,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
24934,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,drg,,,,inpatient,,,...,,,,,12017.65,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
24935,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,drg,,,,inpatient,,,...,,,,,5750.14,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,
24936,390872192_aurora-psychiatric-hospital_standard...,524000,390872192,drg,,,,inpatient,,,...,,,,,11505.47,,,2023-01-01,https://www.aurorahealthcare.org/assets/docume...,


In [10]:
# Write the converted frame to CSV, leaving out the index column.
df_out.to_csv('524000.csv', index=False)