In [3]:
from urllib.parse import urlparse

import pandas as pd

In [4]:
# Remote location of the standard-charges XML document.
url = "https://www.aurorahealthcare.org/assets/documents/billing-insurance/pricing-transparency/391211629_aurora-medical-center-manitowoc-county_standardcharges.xml"
# The local file carries the same name as the last segment of the URL path
# (presumably a downloaded copy kept in the working directory - confirm).
file_path = urlparse(url).path.rsplit("/", 1)[-1]

In [5]:
from enum import Enum

class FileFormat(Enum):
    """Container formats a file can be tagged with.

    Each member's value is its own upper-case name.
    """

    XML = "XML"
    CSV = "CSV"
    JSON = "JSON"
    XLSX = "XLSX"

class FileSubtype(Enum):
    """Publisher-specific layouts that can be tagged within a file format.

    AURORA is the only layout defined so far.
    """

    AURORA = "AURORA"
    
class TypeRecognizer(object):
    """Identify the container format and publisher-specific layout of a file.

    The format is decided from the file extension alone; the subtype is decided
    by loading the file with pandas and inspecting its column names.
    """

    # Columns that must all be present for a table to count as Aurora-style.
    _AURORA_REQUIRED_COLUMNS = ('Facility', 'Type', 'Chargecode_DRG_CPT', 'Description',
                                'Rev', 'CPT', 'NDC', 'Self_Pay', 'Min', 'Max')

    @staticmethod
    def _is_aurora_extra_column(col_name):
        """Return True if a non-required column name fits the Aurora pattern.

        An accepted name contains no spaces, starts with an underscore, splits
        on "_" into at least three components, and either has a 4-character
        second component (e.g. "_2023_Aetna_W") or ends in "Fee"
        (e.g. "_1_1_23_Fee").
        """
        if " " in col_name or not col_name.startswith("_"):
            return False

        components = col_name.split("_")
        if len(components) < 3:
            return False

        return len(components[1]) == 4 or components[-1] == 'Fee'

    def _looks_like_aurora_xml(self, file_path):
        """Return True if the XML at ``file_path`` has Aurora-style columns.

        The file must parse with ``pd.read_xml``, have more columns than the
        required set, contain every required column, and every remaining
        column must satisfy ``_is_aurora_extra_column``.

        Any failure to read or parse the file yields False rather than raising.
        """
        try:
            df = pd.read_xml(file_path)
        except Exception:
            # Best effort: an unreadable or unparsable file is simply "not Aurora".
            # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            return False

        columns = df.columns.to_list()
        required = TypeRecognizer._AURORA_REQUIRED_COLUMNS

        # There must be at least one column beyond the required ones.
        if len(columns) <= len(required):
            return False

        if any(col_name not in columns for col_name in required):
            return False

        # Validate *every* extra column; the verdict must not depend on which
        # element of the set happens to be visited first.
        extra_columns = set(columns) - set(required)
        return bool(extra_columns) and all(
            TypeRecognizer._is_aurora_extra_column(col_name)
            for col_name in extra_columns
        )

    def recognize_format_and_subtype(self, file_path):
        """Return a ``(file_format, subtype)`` pair for ``file_path``.

        ``file_format`` is ``FileFormat.XML`` when the path ends in ".xml"
        (compared case-insensitively), otherwise None. ``subtype`` is
        ``FileSubtype.AURORA`` when the file is XML and its columns look
        Aurora-style, otherwise None.
        """
        file_format = None
        subtype = None

        # str() lets path-like objects through; lower() also accepts mixed case.
        if str(file_path).lower().endswith(".xml"):
            file_format = FileFormat.XML

            if self._looks_like_aurora_xml(file_path):
                subtype = FileSubtype.AURORA

        return file_format, subtype

In [6]:
class AbstractStandardChargesConverter:
    """Empty placeholder: no attributes or methods are defined yet.

    NOTE(review): the name suggests this is meant to become a common base for
    standard-charges converters - confirm before building on it.
    """

In [7]:
class AuroraXMLConvert:
    """Empty placeholder: no attributes or methods are defined yet.

    NOTE(review): presumably intended to hold the conversion logic for
    Aurora-style XML files - confirm; nothing is implemented here.
    """

In [8]:
# Load the charges document into a DataFrame.
df_in = pd.read_xml(file_path)
# 'Rev' holds whole-number codes but has missing entries. The nullable 'Int64'
# dtype keeps the column integer-typed with <NA> for the gaps, instead of
# round-tripping through a -1 sentinel (which leaves an object column and
# would also erase any genuine -1 value).
df_in['Rev'] = df_in['Rev'].astype('Int64')
df_in

Unnamed: 0,Facility,Type,Chargecode_DRG_CPT,Description,Rev,CPT,NDC,_1_1_23_Fee,_2023_Aetna_W,_2023_Aetna_PPO,...,_2023_Trilogy,_2023_UHC_Charter,_2023_UHC_Nexus,_2023_UHC_HMO,_2023_UHC_PPO,_2023_WPS_Arise,_2023_WPS_Statewide,Self_Pay,Min,Max
0,MANITOWOC,CHARGE,10000002,ROOM CHARGE MED SURG,121,,,1770.00,1122.39,1471.10,...,1335.14,1332.39,1332.39,1426.36,1497.42,1235.34,1396.45,973.50,1079.70,1504.50
1,MANITOWOC,CHARGE,10000003,ROOM CHARGE WOMEN'S HEALTH,122,,,1770.00,778.73,1483.26,...,920.84,963.36,963.36,1040.03,1497.42,891.65,1059.77,973.50,755.66,1504.50
2,MANITOWOC,CHARGE,10000004,ROOM CHARGE PEDIATRICS,123,,,1770.00,840.12,1483.26,...,1111.48,1010.75,1010.75,1101.19,1497.42,840.12,1012.80,973.50,812.02,1770.00
3,MANITOWOC,CHARGE,10000005,ROOM CHARGE NURSERY LEVEL 1,171,,,1650.00,969.48,1382.70,...,1239.59,1159.05,1159.05,1229.76,1395.90,1004.20,1163.04,907.50,969.48,1402.51
4,MANITOWOC,CHARGE,10000008,ROOM CHARGE ICU OR CCU,200,,,4070.00,2417.19,3410.66,...,2787.26,2968.97,2968.97,3162.02,3443.22,2610.12,2895.14,2238.50,2381.53,3459.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3017,MANITOWOC,OP PROC*,66984,XCAPSL CTRC RMVL W/O ECP,,,,12409.28,3211.22,10398.98,...,4117.54,2950.00,2950.00,3214.00,10498.25,3401.22,4089.22,6825.10,2741.57,12331.01
3018,MANITOWOC,OP PROC*,69436,CREATE EARDRUM OPENING,,,,12013.22,9509.09,10067.08,...,5970.52,4196.51,4196.51,4571.73,10163.18,3586.59,4351.73,6607.27,2929.89,11606.12
3019,MANITOWOC,OP PROC*,G0105,COLORECTAL SCRN; HI RISK IND,,,,6453.61,3119.00,5408.13,...,2163.00,2360.00,2360.00,2571.00,5459.76,1790.00,2172.00,3549.49,1060.00,6453.61
3020,MANITOWOC,OP PROC*,G0121,COLON CA SCRN NOT HI RSK IND,,,,6476.92,3119.00,5427.66,...,2163.00,2360.00,2360.00,2571.00,5479.47,1790.00,2172.00,3562.31,1060.00,6476.92


In [9]:
# Build a recognizer instance; the bare name on the last line makes the
# notebook display its repr as the cell output.
recognizer = TypeRecognizer()
recognizer

<__main__.TypeRecognizer at 0x7f6e19675580>

In [10]:
# Classify the local file; the result is a (file_format, subtype) pair.
recognizer.recognize_format_and_subtype(file_path)

(<FileFormat.XML: 'XML'>, <FileSubtype.AURORA: 'AURORA'>)