In [1]:
import pandas as pd

In [2]:
file_path = "391211629_aurora-medical-center-manitowoc-county_standardcharges.xml"
url = "https://www.aurorahealthcare.org/assets/documents/billing-insurance/pricing-transparency/391211629_aurora-medical-center-manitowoc-county_standardcharges.xml"

In [3]:
from enum import Enum

class FileFormat(Enum):
    XML = "XML"
    CSV = "CSV"
    JSON = "JSON"
    XLSX = "XLSX"

class FileSubtype(Enum):
    AURORA = "AURORA"
    
class TypeRecognizer(object):
    def _looks_like_aurora_xml(self, file_path):
        try:
            df = pd.read_xml(file_path)
        except:
            return False
        
        columns = df.columns.to_list()
        
        if len(columns) > 10:
            check_colnames = ['Facility', 'Type', 'Chargecode_DRG_CPT', 'Description', 
                              'Rev', 'CPT', 'NDC', 'Self_Pay', 'Min', 'Max']
            for col_name in check_colnames:
                if not col_name in columns:
                    return False
            
            remaining_colnames = list(set(columns) - set(check_colnames))
            for col_name in remaining_colnames:
                if " " in col_name:
                    return False
                
                if not col_name.startswith("_"):
                    return False
                
                components = col_name.split("_")
                
                if len(components) < 3:
                    return False
                
                if len(components[1]) != 4 and components[-1] != 'Fee':
                    return False
                
                return True
        
        return False
    
    def recognize_format_and_subtype(self, file_path):
        file_format = None
        subtype = None
        
        if file_path.endswith(".xml") or file_path.endswith(".XML"):
            file_format = FileFormat.XML
            
            if self._looks_like_aurora_xml(file_path):
                subtype = FileSubtype.AURORA
        
        return file_format, subtype

In [4]:
TARGET_COLUMNS = ['filename', 'hospital_ccn', 'hospital_ein', 'code_meta', 'unique_procedure_id',
                      'internal_code', 'billing_class', 'patient_class', 'rev_code',
                      'rev_desc', 'code', 'code_modifier', 'procedure_desc',
                      'cdm', 'hcpcs_cpt', 'ndc', 'ms_drg', 'icd_10',
                      'eapg', 'apc', 'cmg', 'quantity_desc', 'quantity_number',
                      'quantity_type', 'payer_category', 'payer_desc', 'payer_name',
                      'plan_name', 'plan_id', 'plan_type', 'rate', 'rate_method',
                      'rate_desc', 'file_last_updated', 'url', 'permalink']

class AbstractStandardChargesConverter(object):
    def convert(self, url, file_path, ccn):
        pass

In [5]:
def cleanup_dollar_value(value):
    if type(value) == str:
        return value.replace(",", "").replace("$", "")

    return value

def cleanup_values(values):
    return list(map(lambda value: cleanup_dollar_value(value), values))

In [6]:
class AuroraXMLConverter(AbstractStandardChargesConverter):
    def __init__(self):
        super().__init__()
    
    def convert(self, url, file_path, ccn):
        df_out = pd.DataFrame(columns=TARGET_COLUMNS)
        
        df_in = pd.read_xml(file_path)
        # HACK: https://stackoverflow.com/a/50132405
        df_in['NDC'] = df_in['NDC'].fillna('-1')
        df_in['NDC'] = df_in['NDC'].astype(str)
        df_in['Rev'] = df_in['Rev'].fillna(-1)
        df_in['Rev'] = df_in['Rev'].astype(int)
    
        columns = df_in.columns.to_list()
        money_columns = list(filter(lambda c: c.startswith('_'), columns)) + ['Self_Pay', 'Min', 'Max']

        df_in[money_columns] = df_in[money_columns].apply(lambda values: cleanup_values(values))

        remaining_cols = list(set(columns) - set(money_columns))

        df_intermediate = pd.DataFrame(df_in)
        df_intermediate = pd.melt(df_intermediate, id_vars=remaining_cols)
        df_intermediate = df_intermediate.rename(columns={
            'variable': 'payer_desc',
            'value': 'rate',
            'Description': 'procedure_desc',
            'CPT': 'hcpcs_cpt',
            'Rev': 'rev_code',
            'Type': 'rev_desc',
            'NDC': 'ndc',
            'Chargecode_DRG_CPT': 'code'
        })
        del df_intermediate['Facility']

        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].fillna('-1')
        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].astype(str)
        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].replace('-1', '')
        df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].apply(lambda cpt: cpt[:5])
        df_intermediate['rev_code'] = df_intermediate['rev_code'].replace(-1, 'nan')
        df_intermediate['ndc'] = df_intermediate['ndc'].replace('-1', 'nan')
        df_intermediate['quantity_desc'] = 'nan'
        df_intermediate['patient_class'] = df_intermediate['rev_desc'].replace(
            'CHARGE', 'nan').replace(
            'IP DRG*', 'inpatient').replace(
            'OP PROC*', 'outpatient')
        df_intermediate['code_type'] = df_intermediate['rev_desc'].replace(
            'IP DRG*', 'ms-drg').replace(
            'OP PROC*', 'hcpcs_cpt').replace(
            'CHARGE', 'local')

        def get_payer_category_from_payer_desc(payer_desc):
            if payer_desc == "Min":
                return "min"
            elif payer_desc == "Max":
                return "max"
            elif payer_desc == "Self_Pay":
                return "cash"
            elif payer_desc.endswith("_Fee"):
                return "gross"
            
            return "payer"

        df_intermediate['payer_category'] = df_intermediate['payer_desc'].apply(get_payer_category_from_payer_desc)
        
        def get_payer_name_from_payer_desc(payer_desc):
            if 'Common_Ground' in payer_desc:
                return 'Common Ground'

            if 'Health_EOS' in payer_desc:
                return 'Health EOS'

            components = payer_desc.split('_')
            if len(components) < 3 or components[-1] == 'Fee':
                return ''

            components = components[2:]
            return components[0]

        df_intermediate['payer_name'] = df_intermediate['payer_desc'].apply(get_payer_name_from_payer_desc)

        filename = file_path.split("/")[0]
        hospital_ein = filename.split("_")[0]

        df_intermediate['filename'] = filename
        df_intermediate['hospital_ein'] = hospital_ein
        df_intermediate['hospital_ccn'] = '520034'
        df_intermediate['url'] = url
        df_intermediate['unique_procedure_id'] = 'nan'
        df_intermediate['internal_code'] = 'nan'
        df_intermediate['billing_class'] = 'nan'

        def get_plan_type_from_payer_desc(payer_desc):
            components = payer_desc.split('_')
            last_component = components[-1]
            if last_component in ["HMO", "PPO", "HPN", "GPPO", "EPO"]:
                return last_component
            
            return ''

        def get_plan_name_from_payer_desc(payer_desc):
            if payer_desc.endswith("Fee"):
                return ''
            
            payer_name = get_payer_name_from_payer_desc(payer_desc)
            components = payer_desc.split('_')
            components = components[2:]
            plan_type = get_plan_type_from_payer_desc(payer_desc)
            return ' '.join(components).replace(plan_type, '').replace(payer_name, '').strip()
            
        df_intermediate['plan_type'] = df_intermediate['payer_desc'].apply(get_plan_type_from_payer_desc)
        df_intermediate['plan_name'] = df_intermediate['payer_desc'].apply(get_plan_name_from_payer_desc)
        
        df_out = pd.DataFrame(columns=TARGET_COLUMNS)

        df_out = df_out.append(df_intermediate)
        
        return df_out


In [7]:
recognizer = TypeRecognizer()
recognizer

<__main__.TypeRecognizer at 0x7fc013fad700>

In [8]:
recognizer.recognize_format_and_subtype(file_path)

(<FileFormat.XML: 'XML'>, <FileSubtype.AURORA: 'AURORA'>)

In [9]:
converter = AuroraXMLConverter()
df_out = converter.convert(url, file_path, '520034')
df_out

  df_out = df_out.append(df_intermediate)


Unnamed: 0,filename,hospital_ccn,hospital_ein,code_meta,unique_procedure_id,internal_code,billing_class,patient_class,rev_code,rev_desc,...,plan_name,plan_id,plan_type,rate,rate_method,rate_desc,file_last_updated,url,permalink,code_type
0,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,,121,CHARGE,...,,,,1770.00,,,,https://www.aurorahealthcare.org/assets/docume...,,local
1,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,,122,CHARGE,...,,,,1770.00,,,,https://www.aurorahealthcare.org/assets/docume...,,local
2,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,,123,CHARGE,...,,,,1770.00,,,,https://www.aurorahealthcare.org/assets/docume...,,local
3,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,,171,CHARGE,...,,,,1650.00,,,,https://www.aurorahealthcare.org/assets/docume...,,local
4,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,,200,CHARGE,...,,,,4070.00,,,,https://www.aurorahealthcare.org/assets/docume...,,local
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111809,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,outpatient,,OP PROC*,...,,,,12331.01,,,,https://www.aurorahealthcare.org/assets/docume...,,hcpcs_cpt
111810,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,outpatient,,OP PROC*,...,,,,11606.12,,,,https://www.aurorahealthcare.org/assets/docume...,,hcpcs_cpt
111811,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,outpatient,,OP PROC*,...,,,,6453.61,,,,https://www.aurorahealthcare.org/assets/docume...,,hcpcs_cpt
111812,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,,,,,outpatient,,OP PROC*,...,,,,6476.92,,,,https://www.aurorahealthcare.org/assets/docume...,,hcpcs_cpt


In [10]:
df_out.to_csv('520034.csv', index=False)