DB column names
---------------

* `filename` - file name without extensions
* `hospital_ccn` from other DB
* `hospital_ein` from file name (numeric prefix)
* `code_type` is one of: `sup`, `erx`, `eap`, `drg` - how to determine if not in data?
* `unique_id` - ??? `nan`?
* `internal_code` - internal code of procedure; tends to be in data
* `billing_class` - `professional`, `facility` or `nan`
* `patient_class` - `inpatient`, `outpatient`, `both`, `emergency`, `nan`
* `rev_code` - internal revenue code; tends to be in data
* `rev_desc` - revenue code description; sometimes available in the data
* `billing_code_prefix` - ???
* `billing_code_desc` - ???
* `description` - procedure description
* `cdm` - ???
* `hcpcs_cpt` - HCPCS/CPT code from data
* `ndc` - National Drug Code from data
* `ms_drg` - MS-DRG from data
* `icd_10` - ICD 10 (disease classification code) from input
* `eapg` - Enhanced Ambulatory Patient Grouping (probably available in some files)
* `apc` - Ambulatory Payment Classification code (probably available in some files)
* `modifier` - code modifier (sometimes in the data)
* `quantity_desc` - quantity description
* `quantity_number` - quantity number (numberic value)
* `quantity_type` - something like "5 mg. tablet"
* `payer_category` - one of `payer`, `gross`, `cash`, `min`, `max`
* `payer_desc` - payer description from input
* `payer_name` - payer name from input
* `plan_name` - plan name from input
* `plan_id` - plan ID
* `plan_type` - plan type (e.g. "medicaid")
* `rate` - price in dollars
* `rate_method` - "fee schedule" or "percent of charges"
* `rate_desc` - rate description from input
* `is_placeholder_rate` - is placeholder rate? 1 or 0
* `updated_date` - updated at (if known from input data)
* `url` I'm getting from other DB or from scraper.


In [18]:
import pandas as pd

In [2]:
file_path = "391211629_aurora-medical-center-manitowoc-county_standardcharges.xml"
url = "https://www.aurorahealthcare.org/assets/documents/billing-insurance/pricing-transparency/391211629_aurora-medical-center-manitowoc-county_standardcharges.xml"

In [3]:
from enum import Enum

class FileFormat(Enum):
    XML = "XML"
    CSV = "CSV"
    JSON = "JSON"
    XLSX = "XLSX"

class FileSubtype(Enum):
    AURORA = "AURORA"
    
class TypeRecognizer(object):
    def _looks_like_aurora_xml(self, file_path):
        try:
            df = pd.read_xml(file_path)
        except:
            return False
        
        columns = df.columns.to_list()
        
        if len(columns) > 10:
            check_colnames = ['Facility', 'Type', 'Chargecode_DRG_CPT', 'Description', 
                              'Rev', 'CPT', 'NDC', 'Self_Pay', 'Min', 'Max']
            for col_name in check_colnames:
                if not col_name in columns:
                    return False
            
            remaining_colnames = list(set(columns) - set(check_colnames))
            for col_name in remaining_colnames:
                if " " in col_name:
                    return False
                
                if not col_name.startswith("_"):
                    return False
                
                components = col_name.split("_")
                
                if len(components) < 3:
                    return False
                
                if len(components[1]) != 4 and components[-1] != 'Fee':
                    return False
                
                return True
        
        return False
    
    def recognize_format_and_subtype(self, file_path):
        file_format = None
        subtype = None
        
        if file_path.endswith(".xml") or file_path.endswith(".XML"):
            file_format = FileFormat.XML
            
            if self._looks_like_aurora_xml(file_path):
                subtype = FileSubtype.AURORA
        
        return file_format, subtype

In [4]:
TARGET_COLUMNS = ['filename', 'hospital_ccn', 'hospital_ein', 'code_type', 'unique_id', 
                      'internal_code', 'billing_class', 'patient_class', 'rev_code',
                      'rev_desc', 'billing_code_prefix', 'billing_code_desc',
                      'description', 'cdm', 'hcpcs_cpt', 'ndc', 'ms_drg', 'icd_10',
                      'eapg', 'apc', 'cmg', 'modifier', 'quantity_desc', 'quantity_number',
                      'quantity_type', 'payer_category', 'payer_desc', 'payer_name',
                      'plan_name', 'plan_id', 'plan_type', 'rate', 'rate_method', 
                      'rate_desc', 'is_placeholder_rate', 'updated_date', 'url']

class AbstractStandardChargesConverter(object):
    def convert(self, url, file_path, ccn):
        pass

In [5]:
class AuroraXMLConverter(object):
    def __init__(self):
        super().__init__()
    
    def convert(self, url, file_path, ccn):
        df_out = pd.DataFrame(columns=TARGET_COLUMNS)
        
        df_in = pd.read_xml(file_path)
        # HACK: https://stackoverflow.com/a/50132405
        df_in['Rev'] = df_in['Rev'].fillna(-1)
        df_in['Rev'] = df_in['Rev'].astype(int)
        df_in['Rev'] = df_in['Rev'].replace(-1, None)
        
        return df_out

In [6]:
df_in = pd.read_xml(file_path)
# HACK: https://stackoverflow.com/a/50132405
# Do I need this though?
#df_in['Rev'] = df_in['Rev'].fillna(-1)
#df_in['Rev'] = df_in['Rev'].astype(int)
#df_in['Rev'] = df_in['Rev'].replace(-1, None)
df_in['NDC'] = df_in['NDC'].fillna('-1')
df_in['NDC'] = df_in['NDC'].astype(str)
df_in['Rev'] = df_in['Rev'].fillna(-1)
df_in['Rev'] = df_in['Rev'].astype(int)

def cleanup_dollar_value(value):
    if type(value) == str:
        return value.replace(",", "").replace("$", "")

    return value

def cleanup_values(values):
    return list(map(lambda value: cleanup_dollar_value(value), values))

columns = df_in.columns.to_list()
money_columns = list(filter(lambda c: c.startswith('_'), columns)) + ['Self_Pay', 'Min', 'Max']

df_in[money_columns] = df_in[money_columns].apply(lambda values: cleanup_values(values))
df_in

Unnamed: 0,Facility,Type,Chargecode_DRG_CPT,Description,Rev,CPT,NDC,_1_1_23_Fee,_2023_Aetna_W,_2023_Aetna_PPO,...,_2023_Trilogy,_2023_UHC_Charter,_2023_UHC_Nexus,_2023_UHC_HMO,_2023_UHC_PPO,_2023_WPS_Arise,_2023_WPS_Statewide,Self_Pay,Min,Max
0,MANITOWOC,CHARGE,10000002,ROOM CHARGE MED SURG,121,,-1,1770.00,1122.39,1471.10,...,1335.14,1332.39,1332.39,1426.36,1497.42,1235.34,1396.45,973.50,1079.70,1504.50
1,MANITOWOC,CHARGE,10000003,ROOM CHARGE WOMEN'S HEALTH,122,,-1,1770.00,778.73,1483.26,...,920.84,963.36,963.36,1040.03,1497.42,891.65,1059.77,973.50,755.66,1504.50
2,MANITOWOC,CHARGE,10000004,ROOM CHARGE PEDIATRICS,123,,-1,1770.00,840.12,1483.26,...,1111.48,1010.75,1010.75,1101.19,1497.42,840.12,1012.80,973.50,812.02,1770.00
3,MANITOWOC,CHARGE,10000005,ROOM CHARGE NURSERY LEVEL 1,171,,-1,1650.00,969.48,1382.70,...,1239.59,1159.05,1159.05,1229.76,1395.90,1004.20,1163.04,907.50,969.48,1402.51
4,MANITOWOC,CHARGE,10000008,ROOM CHARGE ICU OR CCU,200,,-1,4070.00,2417.19,3410.66,...,2787.26,2968.97,2968.97,3162.02,3443.22,2610.12,2895.14,2238.50,2381.53,3459.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3017,MANITOWOC,OP PROC*,66984,XCAPSL CTRC RMVL W/O ECP,-1,,-1,12409.28,3211.22,10398.98,...,4117.54,2950.00,2950.00,3214.00,10498.25,3401.22,4089.22,6825.10,2741.57,12331.01
3018,MANITOWOC,OP PROC*,69436,CREATE EARDRUM OPENING,-1,,-1,12013.22,9509.09,10067.08,...,5970.52,4196.51,4196.51,4571.73,10163.18,3586.59,4351.73,6607.27,2929.89,11606.12
3019,MANITOWOC,OP PROC*,G0105,COLORECTAL SCRN; HI RISK IND,-1,,-1,6453.61,3119.00,5408.13,...,2163.00,2360.00,2360.00,2571.00,5459.76,1790.00,2172.00,3549.49,1060.00,6453.61
3020,MANITOWOC,OP PROC*,G0121,COLON CA SCRN NOT HI RSK IND,-1,,-1,6476.92,3119.00,5427.66,...,2163.00,2360.00,2360.00,2571.00,5479.47,1790.00,2172.00,3562.31,1060.00,6476.92


In [7]:
remaining_cols = list(set(columns) - set(money_columns))

df_intermediate = pd.DataFrame(df_in)
df_intermediate = pd.melt(df_intermediate, id_vars=remaining_cols)
df_intermediate = df_intermediate.rename(columns={
    'variable': 'payer_desc',
    'value': 'rate',
    'Description': 'description',
    'CPT': 'hcpcs_cpt',
    'Rev': 'rev_code',
    'Type': 'rev_desc',
    'NDC': 'ndc',
    'Chargecode_DRG_CPT': 'internal_code'
})
del df_intermediate['Facility']

df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].fillna('-1')
df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].astype(str)
df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].replace('-1', '')
df_intermediate['hcpcs_cpt'] = df_intermediate['hcpcs_cpt'].apply(lambda cpt: cpt[:5])
df_intermediate['rev_code'] = df_intermediate['rev_code'].replace(-1, 'nan')
df_intermediate['ndc'] = df_intermediate['ndc'].replace('-1', 'nan')
df_intermediate['quantity_desc'] = 'nan'
df_intermediate['patient_class'] = df_intermediate['rev_desc'].replace(
    'CHARGE', 'nan').replace(
    'IP DRG*', 'inpatient').replace(
    'OP PROC*', 'outpatient')
df_intermediate

Unnamed: 0,hcpcs_cpt,description,ndc,rev_code,rev_desc,internal_code,payer_desc,rate,quantity_desc,patient_class
0,,ROOM CHARGE MED SURG,,121,CHARGE,10000002,_1_1_23_Fee,1770.00,,
1,,ROOM CHARGE WOMEN'S HEALTH,,122,CHARGE,10000003,_1_1_23_Fee,1770.00,,
2,,ROOM CHARGE PEDIATRICS,,123,CHARGE,10000004,_1_1_23_Fee,1770.00,,
3,,ROOM CHARGE NURSERY LEVEL 1,,171,CHARGE,10000005,_1_1_23_Fee,1650.00,,
4,,ROOM CHARGE ICU OR CCU,,200,CHARGE,10000008,_1_1_23_Fee,4070.00,,
...,...,...,...,...,...,...,...,...,...,...
111809,,XCAPSL CTRC RMVL W/O ECP,,,OP PROC*,66984,Max,12331.01,,outpatient
111810,,CREATE EARDRUM OPENING,,,OP PROC*,69436,Max,11606.12,,outpatient
111811,,COLORECTAL SCRN; HI RISK IND,,,OP PROC*,G0105,Max,6453.61,,outpatient
111812,,COLON CA SCRN NOT HI RSK IND,,,OP PROC*,G0121,Max,6476.92,,outpatient


In [8]:
def get_payer_category_from_payer_desc(payer_desc):
    if payer_desc == "Min":
        return "min"
    elif payer_desc == "Max":
        return "max"
    elif payer_desc == "Self_Pay":
        return "cash"
    elif payer_desc.endswith("_Fee"):
        return "gross"
    
    return "payer"

df_intermediate['payer_category'] = df_intermediate['payer_desc'].apply(get_payer_category_from_payer_desc)
df_intermediate

Unnamed: 0,hcpcs_cpt,description,ndc,rev_code,rev_desc,internal_code,payer_desc,rate,quantity_desc,patient_class,payer_category
0,,ROOM CHARGE MED SURG,,121,CHARGE,10000002,_1_1_23_Fee,1770.00,,,gross
1,,ROOM CHARGE WOMEN'S HEALTH,,122,CHARGE,10000003,_1_1_23_Fee,1770.00,,,gross
2,,ROOM CHARGE PEDIATRICS,,123,CHARGE,10000004,_1_1_23_Fee,1770.00,,,gross
3,,ROOM CHARGE NURSERY LEVEL 1,,171,CHARGE,10000005,_1_1_23_Fee,1650.00,,,gross
4,,ROOM CHARGE ICU OR CCU,,200,CHARGE,10000008,_1_1_23_Fee,4070.00,,,gross
...,...,...,...,...,...,...,...,...,...,...,...
111809,,XCAPSL CTRC RMVL W/O ECP,,,OP PROC*,66984,Max,12331.01,,outpatient,max
111810,,CREATE EARDRUM OPENING,,,OP PROC*,69436,Max,11606.12,,outpatient,max
111811,,COLORECTAL SCRN; HI RISK IND,,,OP PROC*,G0105,Max,6453.61,,outpatient,max
111812,,COLON CA SCRN NOT HI RSK IND,,,OP PROC*,G0121,Max,6476.92,,outpatient,max


In [9]:
def get_payer_name_from_payer_desc(payer_desc):
    if 'Common_Ground' in payer_desc:
        return 'Common Ground'
    
    if 'Health_EOS' in payer_desc:
        return 'Health EOS'
    
    components = payer_desc.split('_')
    if len(components) < 3 or components[-1] == 'Fee':
        return ''
    
    components = components[2:]
    return components[0]

df_intermediate['payer_name'] = df_intermediate['payer_desc'].apply(get_payer_name_from_payer_desc)
df_intermediate

Unnamed: 0,hcpcs_cpt,description,ndc,rev_code,rev_desc,internal_code,payer_desc,rate,quantity_desc,patient_class,payer_category,payer_name
0,,ROOM CHARGE MED SURG,,121,CHARGE,10000002,_1_1_23_Fee,1770.00,,,gross,
1,,ROOM CHARGE WOMEN'S HEALTH,,122,CHARGE,10000003,_1_1_23_Fee,1770.00,,,gross,
2,,ROOM CHARGE PEDIATRICS,,123,CHARGE,10000004,_1_1_23_Fee,1770.00,,,gross,
3,,ROOM CHARGE NURSERY LEVEL 1,,171,CHARGE,10000005,_1_1_23_Fee,1650.00,,,gross,
4,,ROOM CHARGE ICU OR CCU,,200,CHARGE,10000008,_1_1_23_Fee,4070.00,,,gross,
...,...,...,...,...,...,...,...,...,...,...,...,...
111809,,XCAPSL CTRC RMVL W/O ECP,,,OP PROC*,66984,Max,12331.01,,outpatient,max,
111810,,CREATE EARDRUM OPENING,,,OP PROC*,69436,Max,11606.12,,outpatient,max,
111811,,COLORECTAL SCRN; HI RISK IND,,,OP PROC*,G0105,Max,6453.61,,outpatient,max,
111812,,COLON CA SCRN NOT HI RSK IND,,,OP PROC*,G0121,Max,6476.92,,outpatient,max,


In [10]:
filename = file_path.split("/")[0]
hospital_ein = filename.split("_")[0]

df_intermediate['filename'] = filename
df_intermediate['hospital_ein'] = hospital_ein
df_intermediate['hospital_ccn'] = '520034'
df_intermediate['rate_method'] = 'fee schedule'
df_intermediate['url'] = url
df_intermediate['is_placeholder_rate'] = 0
df_intermediate['code_type'] = 'cdm' # XXX: how do I determine this?
df_intermediate['billing_class'] = 'nan'
df_intermediate['unique_id'] = 'nan'
df_intermediate['billing_code_desc'] = 'nan'
df_intermediate

Unnamed: 0,hcpcs_cpt,description,ndc,rev_code,rev_desc,internal_code,payer_desc,rate,quantity_desc,patient_class,...,filename,hospital_ein,hospital_ccn,rate_method,url,is_placeholder_rate,code_type,billing_class,unique_id,billing_code_desc
0,,ROOM CHARGE MED SURG,,121,CHARGE,10000002,_1_1_23_Fee,1770.00,,,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
1,,ROOM CHARGE WOMEN'S HEALTH,,122,CHARGE,10000003,_1_1_23_Fee,1770.00,,,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
2,,ROOM CHARGE PEDIATRICS,,123,CHARGE,10000004,_1_1_23_Fee,1770.00,,,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
3,,ROOM CHARGE NURSERY LEVEL 1,,171,CHARGE,10000005,_1_1_23_Fee,1650.00,,,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
4,,ROOM CHARGE ICU OR CCU,,200,CHARGE,10000008,_1_1_23_Fee,4070.00,,,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111809,,XCAPSL CTRC RMVL W/O ECP,,,OP PROC*,66984,Max,12331.01,,outpatient,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
111810,,CREATE EARDRUM OPENING,,,OP PROC*,69436,Max,11606.12,,outpatient,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
111811,,COLORECTAL SCRN; HI RISK IND,,,OP PROC*,G0105,Max,6453.61,,outpatient,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,
111812,,COLON CA SCRN NOT HI RSK IND,,,OP PROC*,G0121,Max,6476.92,,outpatient,...,391211629_aurora-medical-center-manitowoc-coun...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,


In [11]:
def get_plan_name_from_payer_desc(payer_desc):
    payer_name = get_payer_name_from_payer_desc(payer_desc)
    components = payer_desc.split('_')
    
    if payer_name == '':
        return ''
    elif payer_name == 'Common Ground':
        return ' '.join(components[-2:])
    elif payer_name in ["UHC", "Humana", "Health EOS", "WPS", "Molina", "Aetna", "Anthem", "Aurora", "Cigna", "Quartz"]:
        return components[-1]
    
    return ''

df_intermediate['plan_name'] = df_intermediate['payer_desc'].apply(get_plan_name_from_payer_desc)
df_intermediate                

Unnamed: 0,hcpcs_cpt,description,ndc,rev_code,rev_desc,internal_code,payer_desc,rate,quantity_desc,patient_class,...,hospital_ein,hospital_ccn,rate_method,url,is_placeholder_rate,code_type,billing_class,unique_id,billing_code_desc,plan_name
0,,ROOM CHARGE MED SURG,,121,CHARGE,10000002,_1_1_23_Fee,1770.00,,,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
1,,ROOM CHARGE WOMEN'S HEALTH,,122,CHARGE,10000003,_1_1_23_Fee,1770.00,,,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
2,,ROOM CHARGE PEDIATRICS,,123,CHARGE,10000004,_1_1_23_Fee,1770.00,,,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
3,,ROOM CHARGE NURSERY LEVEL 1,,171,CHARGE,10000005,_1_1_23_Fee,1650.00,,,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
4,,ROOM CHARGE ICU OR CCU,,200,CHARGE,10000008,_1_1_23_Fee,4070.00,,,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111809,,XCAPSL CTRC RMVL W/O ECP,,,OP PROC*,66984,Max,12331.01,,outpatient,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
111810,,CREATE EARDRUM OPENING,,,OP PROC*,69436,Max,11606.12,,outpatient,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
111811,,COLORECTAL SCRN; HI RISK IND,,,OP PROC*,G0105,Max,6453.61,,outpatient,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,
111812,,COLON CA SCRN NOT HI RSK IND,,,OP PROC*,G0121,Max,6476.92,,outpatient,...,391211629,520034,fee schedule,https://www.aurorahealthcare.org/assets/docume...,0,cdm,,,,


In [12]:
set(df_intermediate['plan_name'].to_list())

{'',
 'Arise',
 'Caregiver',
 'Charter',
 'ETF Network',
 'Exchange',
 'Exchange Envision',
 'GPPO',
 'Group',
 'Group Envision',
 'HMO',
 'HPN',
 'Nexus',
 'One',
 'PPO',
 'Plus',
 'Preferred',
 'Priority',
 'Statewide',
 'W'}

In [13]:
df_out = pd.DataFrame(columns=TARGET_COLUMNS)

df_out = df_out.append(df_intermediate)

df_out 

  df_out = df_out.append(df_intermediate)


Unnamed: 0,filename,hospital_ccn,hospital_ein,code_type,unique_id,internal_code,billing_class,patient_class,rev_code,rev_desc,...,payer_name,plan_name,plan_id,plan_type,rate,rate_method,rate_desc,is_placeholder_rate,updated_date,url
0,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,10000002,,,121,CHARGE,...,,,,,1770.00,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
1,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,10000003,,,122,CHARGE,...,,,,,1770.00,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
2,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,10000004,,,123,CHARGE,...,,,,,1770.00,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
3,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,10000005,,,171,CHARGE,...,,,,,1650.00,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
4,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,10000008,,,200,CHARGE,...,,,,,4070.00,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111809,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,66984,,outpatient,,OP PROC*,...,,,,,12331.01,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
111810,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,69436,,outpatient,,OP PROC*,...,,,,,11606.12,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
111811,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,G0105,,outpatient,,OP PROC*,...,,,,,6453.61,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...
111812,391211629_aurora-medical-center-manitowoc-coun...,520034,391211629,cdm,,G0121,,outpatient,,OP PROC*,...,,,,,6476.92,fee schedule,,0,,https://www.aurorahealthcare.org/assets/docume...


In [14]:
df_out.to_csv('520034.csv', index=False)

In [15]:
recognizer = TypeRecognizer()
recognizer

<__main__.TypeRecognizer at 0x7fe13c725400>

In [16]:
recognizer.recognize_format_and_subtype(file_path)

(<FileFormat.XML: 'XML'>, <FileSubtype.AURORA: 'AURORA'>)

In [17]:
converter = AuroraXMLConverter()
df_out = converter.convert(url, file_path, '520034')
df_out

Unnamed: 0,filename,hospital_ccn,hospital_ein,code_type,unique_id,internal_code,billing_class,patient_class,rev_code,rev_desc,...,payer_name,plan_name,plan_id,plan_type,rate,rate_method,rate_desc,is_placeholder_rate,updated_date,url
