In [1]:
import subprocess
from urllib.parse import urljoin
from io import StringIO
import re

from dateutil.parser import parse as parse_datetime
import pandas as pd
import requests
from lxml import html
import js2xml

from helpers import derive_ein_from_filename

In [2]:
# Output schema: the columns, in order, that the converted DataFrame is
# projected onto (`df_mid[TARGET_COLUMNS]`) before it is written to CSV.
TARGET_COLUMNS = [
    'hospital_id',
    #'row_id',
    'line_type',
    'description',
    'rev_code',
    'local_code',
    'code',
    'ms_drg',
    'apr_drg',
    'eapg',
    'hcpcs_cpt',
    'modifiers',
    'alt_hcpcs_cpt',
    'thru',
    'apc',
    'icd',
    'ndc',
    'drug_hcpcs_multiplier',
    'drug_quantity',
    'drug_unit_of_measurement',
    'drug_type_of_measurement',
    'billing_class',
    'setting',
    'payer_category',
    'payer_name',
    'plan_name',
    'standard_charge',
    'standard_charge_percent',
    'contracting_method',
    'additional_generic_notes',
    'additional_payer_specific_notes'
]

In [3]:
# Hospital-specific inputs for this run.
# `transparency_page` is only recorded in the generated SQL statement.
transparency_page = "https://www.bartletthospital.org/"
# `ccn` fills the `hospital_id` column and is the `hospital.id` key in the generated SQL.
ccn = "020008"
# `app_url` is the page fetched below to discover the database name for the report download.
app_url = "https://apps.para-hcfs.com/PTT/FinalLinks/Bartlett.aspx"

In [4]:
resp = requests.get(app_url)
resp

<Response [200]>

In [5]:
# Locate the script tag whose src contains the ExtNet init path; `[0]` assumes at least one match.
tree = html.fromstring(resp.text)
js_link = tree.xpath('//script[contains(@src, "/PTT/extnet/extnet-init-js")]/@src')[0]
# The src may be relative, so resolve it against the final (post-redirect) page URL.
js_url = urljoin(resp.url, js_link)
resp1 = requests.get(js_url)
# js2xml turns the JavaScript source into an XML tree that can be queried with XPath.
parsed = js2xml.parse(resp1.text)
# Take the "value" property of the object literal that mentions "App.hdnDB_Container".
db_name = parsed.xpath('//object[./property/string[text()="App.hdnDB_Container"]]/property[@name="value"]/string/text()')[0]
db_name

'dbBRHJUNEAUAK'

In [6]:
# Query-string parameters for the report endpoint; `db_name` was scraped from the init script above.
params = {
    'dbName': db_name,
    'type': 'CDMWithoutLabel'
}

# Hand-written multipart/form-data body with three fields (__EVENTTARGET, __EVENTARGUMENT,
# __ExtNetDirectEventMarker). It is sent as a raw string; no Content-Type header is set here.
# NOTE(review): presumably captured from a browser session — confirm if the endpoint changes.
data = '------WebKitFormBoundarysVqstz3xq11k5yBT\r\nContent-Disposition: form-data; name="__EVENTTARGET"\r\n\r\nResourceManager\r\n------WebKitFormBoundarysVqstz3xq11k5yBT\r\nContent-Disposition: form-data; name="__EVENTARGUMENT"\r\n\r\n-|public|DownloadReport\r\n------WebKitFormBoundarysVqstz3xq11k5yBT\r\nContent-Disposition: form-data; name="__ExtNetDirectEventMarker"\r\n\r\ndelta=true\r\n------WebKitFormBoundarysVqstz3xq11k5yBT--\r\n'

resp2 = requests.post('https://apps.para-hcfs.com/PTT/FinalLinks/Reports.aspx', params=params, data=data)
print(resp2)

<Response [200]>


In [7]:
filename = resp2.headers['content-disposition'].split('"')[1]
filename

'920118538_bartlett-regional-hospital_standardcharges.csv'

In [8]:
chunks = resp2.text.split("\r\n\r\n")
len(chunks)

4

In [9]:
chunks[0]

'Run Date: 5/3/2023 9:08:35 AM\r\nCDM Date Stamp: 1/24/2023'

In [10]:
# The header chunk ends with "CDM Date Stamp: 1/24/2023", so the last
# space-separated token is the date string.
date_str = chunks[0].split(" ")[-1]
date_str

# Normalize to YYYY-MM-DD by keeping only the date part of the ISO timestamp.
last_updated = parse_datetime(date_str).isoformat().split("T")[0]
last_updated

'2023-01-24'

In [11]:
chunks[1][:100]

'Procedure Code,Procedure Description,Price Tier,Revenue Code,CPT HCPCS Code,Modifier1,Modifier2,NDC '

In [12]:
chunks[2][:100]

'Diagnosis Related Group Code,Diagnosis Related Group Description,Price Tier,Revenue Code,CPT HCPCS D'

In [13]:
# Keep a copy of the downloaded report on disk under the server-supplied name.
# The context manager closes the handle even if the write raises.
with open(filename, "w") as csv_f:
    csv_f.write(resp2.text)

In [14]:
csv_buf = StringIO(chunks[1])

df_in = pd.read_csv(csv_buf, dtype=str)
df_in

Unnamed: 0,Procedure Code,Procedure Description,Price Tier,Revenue Code,CPT HCPCS Code,Modifier1,Modifier2,NDC Code,Rx Unit Multiplier,Gross Charge,Discounted Cash Price,De-identified minimum negotiated charge,De-identified maximum negotiated charge,AETNA-ALL_PLANS,BLUE_CROSS-ALL_PLANS,MEDICAID_10_1_20,MEDICARE_1_1_22,MODA-ALL_PLANS,MULTIPLAN-ALL_PLANS,UHC-ALL_PLANS
0,12001002,Daily Service MS Acuity 3,Inpatient,0120,,,,,0,2984.00,2536.40,2536.40,3739.50,2984.00,2924.32,3739.50,,2924.32,2939.24,2924.32
1,12001004,Daily Service OB Acuity 1,Inpatient,0120,,,,,0,2047.00,1739.95,1739.95,3739.50,2047.00,2006.06,3739.50,,2006.06,2016.30,2006.06
2,12001005,Daily Service OB Acuity 2,Inpatient,0120,,,,,0,2430.50,2065.93,2065.93,3739.50,2430.50,2381.89,3739.50,,2381.89,2394.04,2381.89
3,12001006,Daily Service OB Acuity 3,Inpatient,0120,,,,,0,3159.00,2685.15,2685.15,3739.50,3159.00,3095.82,3739.50,,3095.82,3111.62,3095.82
4,12001007,Daily Service Isolation OB,Inpatient,0120,,,,,0,4042.00,3435.70,3435.70,4042.00,4042.00,3961.16,3739.50,,3961.16,3981.37,3961.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17856,96110067,Psycho Tx Indivdual 38-52min,ProFee,0961,90834,,,,0,307.25,261.16,126.27,261.16,,,,126.27,,,
17857,96110069,Psycho Tx Family W/O Pt 60 Min,ProFee,0961,90846,,,,0,370.75,315.14,136.10,315.14,,,,136.10,,,
17858,96110090,HC PRO PHONE CALL 5-10min,ProFee,0961,98966,,,,0,25.00,21.25,15.52,21.25,,,,15.52,,,
17859,96110091,HC PRO PHONE CALL 11-20 MIN,ProFee,0961,98967,,,,0,48.75,41.44,30.67,41.44,,,,30.67,,,


In [15]:
# Map the report's column headers onto target-schema names.
# The mapping also lists headers that do not occur in this section (the
# "Diagnosis Related Group ..." and "Shoppable Services ..." entries);
# `rename` leaves the frame untouched for keys that are not present.
df_mid = pd.DataFrame(df_in)
df_mid = df_mid.rename(columns={
    'Procedure Code': 'local_code',
    'Procedure Description': 'description',
    'Price Tier': 'setting',
    'Revenue Code': 'rev_code',
    'CPT HCPCS Code': 'hcpcs_cpt',
    'NDC Code': 'ndc',
    'Rx Unit Multiplier': 'drug_hcpcs_multiplier',
    'Modifier1': 'modifiers',
    'Diagnosis Related Group Code': 'ms_drg',
    'Diagnosis Related Group Description': 'description',
    'CPT HCPCS DRG Code': 'code',
    'Shoppable Services Code': 'local_code',
    'Shoppable Services Description': 'description'
})

df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,Modifier2,ndc,drug_hcpcs_multiplier,Gross Charge,Discounted Cash Price,De-identified minimum negotiated charge,De-identified maximum negotiated charge,AETNA-ALL_PLANS,BLUE_CROSS-ALL_PLANS,MEDICAID_10_1_20,MEDICARE_1_1_22,MODA-ALL_PLANS,MULTIPLAN-ALL_PLANS,UHC-ALL_PLANS
0,12001002,Daily Service MS Acuity 3,Inpatient,0120,,,,,0,2984.00,2536.40,2536.40,3739.50,2984.00,2924.32,3739.50,,2924.32,2939.24,2924.32
1,12001004,Daily Service OB Acuity 1,Inpatient,0120,,,,,0,2047.00,1739.95,1739.95,3739.50,2047.00,2006.06,3739.50,,2006.06,2016.30,2006.06
2,12001005,Daily Service OB Acuity 2,Inpatient,0120,,,,,0,2430.50,2065.93,2065.93,3739.50,2430.50,2381.89,3739.50,,2381.89,2394.04,2381.89
3,12001006,Daily Service OB Acuity 3,Inpatient,0120,,,,,0,3159.00,2685.15,2685.15,3739.50,3159.00,3095.82,3739.50,,3095.82,3111.62,3095.82
4,12001007,Daily Service Isolation OB,Inpatient,0120,,,,,0,4042.00,3435.70,3435.70,4042.00,4042.00,3961.16,3739.50,,3961.16,3981.37,3961.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17856,96110067,Psycho Tx Indivdual 38-52min,ProFee,0961,90834,,,,0,307.25,261.16,126.27,261.16,,,,126.27,,,
17857,96110069,Psycho Tx Family W/O Pt 60 Min,ProFee,0961,90846,,,,0,370.75,315.14,136.10,315.14,,,,136.10,,,
17858,96110090,HC PRO PHONE CALL 5-10min,ProFee,0961,98966,,,,0,25.00,21.25,15.52,21.25,,,,15.52,,,
17859,96110091,HC PRO PHONE CALL 11-20 MIN,ProFee,0961,98967,,,,0,48.75,41.44,30.67,41.44,,,,30.67,,,


In [16]:
def unify_modifiers(m1, m2):
    """Combine up to two modifier values into one pipe-delimited string.

    Parameters
    ----------
    m1, m2 : str or None
        First and second modifier. ``None`` and float NaN (the marker pandas
        uses for empty cells) both count as "missing".

    Returns
    -------
    str or None
        ``"m1|m2"`` when both are present, the single present value when only
        one is, and ``None`` when neither is.
    """
    def _is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    present = [value for value in (m1, m2) if not _is_missing(value)]
    if not present:
        return None

    # A lone second modifier is kept rather than dropped.
    return "|".join(present)

assert unify_modifiers(None, None) is None
assert unify_modifiers("TC", None) == "TC"
assert unify_modifiers("TC", "TC") == "TC|TC"

In [17]:
# Replace missing values in both modifier columns with None before merging,
# since unify_modifiers tests its arguments with `is None`.
df_mid.loc[df_mid['modifiers'].isnull(), 'modifiers'] = None
df_mid.loc[df_mid['Modifier2'].isnull(), 'Modifier2'] = None

# Collapse the two columns row by row into a single 'modifiers' value, then drop the second column.
df_mid['modifiers'] = df_mid[['modifiers', 'Modifier2']].apply(lambda row: unify_modifiers(row['modifiers'], row['Modifier2']), axis=1)
del df_mid['Modifier2']
df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,Gross Charge,Discounted Cash Price,De-identified minimum negotiated charge,De-identified maximum negotiated charge,AETNA-ALL_PLANS,BLUE_CROSS-ALL_PLANS,MEDICAID_10_1_20,MEDICARE_1_1_22,MODA-ALL_PLANS,MULTIPLAN-ALL_PLANS,UHC-ALL_PLANS
0,12001002,Daily Service MS Acuity 3,Inpatient,0120,,,,0,2984.00,2536.40,2536.40,3739.50,2984.00,2924.32,3739.50,,2924.32,2939.24,2924.32
1,12001004,Daily Service OB Acuity 1,Inpatient,0120,,,,0,2047.00,1739.95,1739.95,3739.50,2047.00,2006.06,3739.50,,2006.06,2016.30,2006.06
2,12001005,Daily Service OB Acuity 2,Inpatient,0120,,,,0,2430.50,2065.93,2065.93,3739.50,2430.50,2381.89,3739.50,,2381.89,2394.04,2381.89
3,12001006,Daily Service OB Acuity 3,Inpatient,0120,,,,0,3159.00,2685.15,2685.15,3739.50,3159.00,3095.82,3739.50,,3095.82,3111.62,3095.82
4,12001007,Daily Service Isolation OB,Inpatient,0120,,,,0,4042.00,3435.70,3435.70,4042.00,4042.00,3961.16,3739.50,,3961.16,3981.37,3961.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17856,96110067,Psycho Tx Indivdual 38-52min,ProFee,0961,90834,,,0,307.25,261.16,126.27,261.16,,,,126.27,,,
17857,96110069,Psycho Tx Family W/O Pt 60 Min,ProFee,0961,90846,,,0,370.75,315.14,136.10,315.14,,,,136.10,,,
17858,96110090,HC PRO PHONE CALL 5-10min,ProFee,0961,98966,,,0,25.00,21.25,15.52,21.25,,,,15.52,,,
17859,96110091,HC PRO PHONE CALL 11-20 MIN,ProFee,0961,98967,,,0,48.75,41.44,30.67,41.44,,,,30.67,,,


In [18]:
# Charge columns start at 'Gross Charge'. Locate that column by name rather
# than by a fixed position so a changed column layout cannot shift the split.
all_columns = df_mid.columns.to_list()
money_columns = all_columns[all_columns.index('Gross Charge'):]
money_columns

['Gross Charge',
 'Discounted Cash Price',
 'De-identified minimum negotiated charge',
 'De-identified maximum negotiated charge',
 'AETNA-ALL_PLANS',
 'BLUE_CROSS-ALL_PLANS',
 'MEDICAID_10_1_20',
 'MEDICARE_1_1_22',
 'MODA-ALL_PLANS',
 'MULTIPLAN-ALL_PLANS',
 'UHC-ALL_PLANS']

In [19]:
# Identifier columns are everything left of 'Gross Charge'; the split point is
# looked up by name instead of being hard-coded as position 8.
all_columns = df_mid.columns.to_list()
remaining_columns = all_columns[:all_columns.index('Gross Charge')]
# Wide -> long: one row per (item, charge column); the charge column's header becomes 'payer_name'.
df_mid = pd.melt(df_mid, id_vars=remaining_columns, var_name='payer_name', value_name='standard_charge')
df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge
0,12001002,Daily Service MS Acuity 3,Inpatient,0120,,,,0,Gross Charge,2984.00
1,12001004,Daily Service OB Acuity 1,Inpatient,0120,,,,0,Gross Charge,2047.00
2,12001005,Daily Service OB Acuity 2,Inpatient,0120,,,,0,Gross Charge,2430.50
3,12001006,Daily Service OB Acuity 3,Inpatient,0120,,,,0,Gross Charge,3159.00
4,12001007,Daily Service Isolation OB,Inpatient,0120,,,,0,Gross Charge,4042.00
...,...,...,...,...,...,...,...,...,...,...
196466,96110067,Psycho Tx Indivdual 38-52min,ProFee,0961,90834,,,0,UHC-ALL_PLANS,
196467,96110069,Psycho Tx Family W/O Pt 60 Min,ProFee,0961,90846,,,0,UHC-ALL_PLANS,
196468,96110090,HC PRO PHONE CALL 5-10min,ProFee,0961,98966,,,0,UHC-ALL_PLANS,
196469,96110091,HC PRO PHONE CALL 11-20 MIN,ProFee,0961,98967,,,0,UHC-ALL_PLANS,


In [20]:
set(df_mid['setting'].to_list())

{'Ambulatory Surgical',
 'Emergency',
 'Inpatient',
 'Observation',
 'Outpatient',
 'ProFee'}

In [21]:
# Preserve the original "Price Tier" text before 'setting' is overwritten.
df_mid['additional_generic_notes'] = df_mid['setting']
# Only the 'ProFee' tier maps to a billing class; every other row stays None.
df_mid['billing_class'] = None
df_mid.loc[df_mid['setting'] == 'ProFee', 'billing_class'] = 'professional'
# Rebuild 'setting' from the saved tier. Only 'Inpatient' and 'Outpatient' are mapped;
# the remaining tiers (e.g. 'Emergency', 'Observation') keep setting = None.
df_mid['setting'] = None
df_mid.loc[df_mid['additional_generic_notes'] == 'Inpatient', 'setting'] = 'inpatient'
df_mid.loc[df_mid['additional_generic_notes'] == 'Outpatient', 'setting'] = 'outpatient'
df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,additional_generic_notes,billing_class
0,12001002,Daily Service MS Acuity 3,inpatient,0120,,,,0,Gross Charge,2984.00,Inpatient,
1,12001004,Daily Service OB Acuity 1,inpatient,0120,,,,0,Gross Charge,2047.00,Inpatient,
2,12001005,Daily Service OB Acuity 2,inpatient,0120,,,,0,Gross Charge,2430.50,Inpatient,
3,12001006,Daily Service OB Acuity 3,inpatient,0120,,,,0,Gross Charge,3159.00,Inpatient,
4,12001007,Daily Service Isolation OB,inpatient,0120,,,,0,Gross Charge,4042.00,Inpatient,
...,...,...,...,...,...,...,...,...,...,...,...,...
196466,96110067,Psycho Tx Indivdual 38-52min,,0961,90834,,,0,UHC-ALL_PLANS,,ProFee,professional
196467,96110069,Psycho Tx Family W/O Pt 60 Min,,0961,90846,,,0,UHC-ALL_PLANS,,ProFee,professional
196468,96110090,HC PRO PHONE CALL 5-10min,,0961,98966,,,0,UHC-ALL_PLANS,,ProFee,professional
196469,96110091,HC PRO PHONE CALL 11-20 MIN,,0961,98967,,,0,UHC-ALL_PLANS,,ProFee,professional


In [22]:
def payer_name_to_payer_category(payer_name):
    """Classify a melted charge-column header into a payer category.

    The four fixed report headers map to 'cash', 'gross', 'max' and 'min';
    any other value is treated as a named payer and yields 'payer'.
    """
    fixed_headers = (
        ('Discounted Cash Price', 'cash'),
        ('Gross Charge', 'gross'),
        ('De-identified maximum negotiated charge', 'max'),
        ('De-identified minimum negotiated charge', 'min'),
    )
    for header, category in fixed_headers:
        if payer_name == header:
            return category

    return 'payer'

df_mid['payer_category'] = df_mid['payer_name'].apply(payer_name_to_payer_category)
df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,additional_generic_notes,billing_class,payer_category
0,12001002,Daily Service MS Acuity 3,inpatient,0120,,,,0,Gross Charge,2984.00,Inpatient,,gross
1,12001004,Daily Service OB Acuity 1,inpatient,0120,,,,0,Gross Charge,2047.00,Inpatient,,gross
2,12001005,Daily Service OB Acuity 2,inpatient,0120,,,,0,Gross Charge,2430.50,Inpatient,,gross
3,12001006,Daily Service OB Acuity 3,inpatient,0120,,,,0,Gross Charge,3159.00,Inpatient,,gross
4,12001007,Daily Service Isolation OB,inpatient,0120,,,,0,Gross Charge,4042.00,Inpatient,,gross
...,...,...,...,...,...,...,...,...,...,...,...,...,...
196466,96110067,Psycho Tx Indivdual 38-52min,,0961,90834,,,0,UHC-ALL_PLANS,,ProFee,professional,payer
196467,96110069,Psycho Tx Family W/O Pt 60 Min,,0961,90846,,,0,UHC-ALL_PLANS,,ProFee,professional,payer
196468,96110090,HC PRO PHONE CALL 5-10min,,0961,98966,,,0,UHC-ALL_PLANS,,ProFee,professional,payer
196469,96110091,HC PRO PHONE CALL 11-20 MIN,,0961,98967,,,0,UHC-ALL_PLANS,,ProFee,professional,payer


In [23]:
df_mid = df_mid[df_mid['standard_charge'].notnull()]
df_mid.shape

(171363, 13)

In [24]:
if not 'hcpcs_cpt' in df_mid.columns:
    df_mid['hcpcs_cpt'] = None

def split_off_modifiers(row):
    """Move modifier suffixes glued onto 'hcpcs_cpt' into 'modifiers'.

    `row` is a mapping-like record (it must support `.get` and item
    assignment). When 'hcpcs_cpt' is a str of length 7, the last two
    characters become the modifier; when it is length 9, the last four
    characters become two modifiers joined by "|". In both cases the code is
    cut back to its first five characters and any existing 'modifiers' value
    is overwritten. Other values are left alone. The row is modified in place
    and returned.
    """
    raw_code = row.get('hcpcs_cpt')
    if type(raw_code) == str:
        suffix_length = len(raw_code) - 5

        if suffix_length == 2:
            row['modifiers'] = raw_code[-2:]
            row['hcpcs_cpt'] = raw_code[:5]
        elif suffix_length == 4:
            suffix = raw_code[-4:]
            row['modifiers'] = suffix[:2] + "|" + suffix[-2:]
            row['hcpcs_cpt'] = raw_code[:5]

    return row

df_mid = df_mid.apply(lambda row: split_off_modifiers(row), axis=1)
df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,additional_generic_notes,billing_class,payer_category
0,12001002,Daily Service MS Acuity 3,inpatient,0120,,,,0,Gross Charge,2984.00,Inpatient,,gross
1,12001004,Daily Service OB Acuity 1,inpatient,0120,,,,0,Gross Charge,2047.00,Inpatient,,gross
2,12001005,Daily Service OB Acuity 2,inpatient,0120,,,,0,Gross Charge,2430.50,Inpatient,,gross
3,12001006,Daily Service OB Acuity 3,inpatient,0120,,,,0,Gross Charge,3159.00,Inpatient,,gross
4,12001007,Daily Service Isolation OB,inpatient,0120,,,,0,Gross Charge,4042.00,Inpatient,,gross
...,...,...,...,...,...,...,...,...,...,...,...,...,...
196452,94301962,Cardiac Rehab Initial Assess,inpatient,0943,,,,0,UHC-ALL_PLANS,327.32,Inpatient,,payer
196453,94800001,OP Pulm Rehab WO cont Oximetry,inpatient,0948,94625,,,0,UHC-ALL_PLANS,354.76,Inpatient,,payer
196454,94800001,OP Pulm Rehab WO cont Oximetry,outpatient,0948,94625,,,0,UHC-ALL_PLANS,354.76,Outpatient,,payer
196455,94800002,OP Pulm Rehab W/cont Oximetry,inpatient,0948,94626,,,0,UHC-ALL_PLANS,354.76,Inpatient,,payer


In [25]:
set(df_mid['modifiers'].to_list())

{'GO', 'GP', 'GY', 'GY|TB', None}

In [26]:
# Clean up 'hcpcs_cpt': values that do not look like real 5-character codes are
# moved to the generic 'code' column (or discarded) and the cell is nulled.
# The statements below are order-dependent.
df_mid = pd.DataFrame(df_mid) # XXX

# Work on strings only so the .str accessors below see no missing values.
df_mid.loc[df_mid['hcpcs_cpt'].isnull(), 'hcpcs_cpt'] = ''
# The literal 'WC003' is moved to 'code' and blanked.
df_mid.loc[df_mid['hcpcs_cpt'] == 'WC003', 'code'] = 'WC003'
df_mid.loc[df_mid['hcpcs_cpt'] == 'WC003', 'hcpcs_cpt'] = ''
# Anything starting with 'CS' is moved to 'code' and blanked.
df_mid.loc[df_mid['hcpcs_cpt'].str.startswith('CS'), 'code'] = df_mid[df_mid['hcpcs_cpt'].str.startswith('CS')]['hcpcs_cpt']
df_mid.loc[df_mid['hcpcs_cpt'].str.startswith('CS'), 'hcpcs_cpt'] = ''
# Purely alphabetic values are copied to 'code' (blank strings are not alphabetic, so they are skipped).
df_mid.loc[df_mid['hcpcs_cpt'].str.isalpha(), 'code'] = df_mid[df_mid['hcpcs_cpt'].str.isalpha()]['hcpcs_cpt']
# Keep only values that are exactly five characters long.
df_mid['hcpcs_cpt'] = df_mid['hcpcs_cpt'].apply(lambda cpt: '' if len(cpt) != 5 else cpt)
# Five-letter all-alphabetic leftovers and blanks both become None.
df_mid.loc[df_mid['hcpcs_cpt'].str.isalpha(), 'hcpcs_cpt'] = None
df_mid.loc[df_mid['hcpcs_cpt'] == '', 'hcpcs_cpt'] = None

set(df_mid['hcpcs_cpt'].to_list())

{'J0637',
 '0002A',
 '85613',
 '88172',
 '84112',
 '86226',
 '80358',
 '73000',
 '87653',
 '87425',
 'J7195',
 '99204',
 '70330',
 'C1726',
 'J7169',
 '82175',
 'C8908',
 '51600',
 '84402',
 '95811',
 '36556',
 '86677',
 '74178',
 'J8510',
 '84295',
 'G0109',
 '82627',
 '87088',
 '78278',
 'J9033',
 '95992',
 'C1758',
 '70542',
 '77086',
 '72156',
 'G0239',
 'J1644',
 '70160',
 '70250',
 '93286',
 '83655',
 '73630',
 '85362',
 '73020',
 '98966',
 '71048',
 '83021',
 '0001A',
 'J1885',
 'T1023',
 '83002',
 'J2354',
 '83605',
 '86235',
 '72147',
 '97124',
 '10030',
 '96416',
 'J2370',
 '80346',
 '29580',
 '85651',
 '84590',
 '19082',
 '97034',
 '80349',
 'J3370',
 '81050',
 'Q9992',
 'G0390',
 '96368',
 '86318',
 '88381',
 'J1335',
 '87507',
 '82550',
 'G0010',
 '51703',
 '74022',
 '73202',
 '71130',
 '86359',
 'C9290',
 '80188',
 '49465',
 '87252',
 'V2787',
 '96372',
 'J7040',
 '36430',
 '84442',
 '86904',
 '82947',
 'J2690',
 '64494',
 '32408',
 '49082',
 'A6407',
 '82523',
 'J0640',


In [27]:
# Populate the remaining TARGET_COLUMNS. Columns that an earlier step may
# already have produced (local_code, code, ms_drg, icd) are only created when
# absent; the others are assigned None unconditionally.
df_mid['hospital_id'] = ccn
df_mid['line_type'] = None
if not 'local_code' in df_mid.columns:
    df_mid['local_code'] = None
if not 'code' in df_mid.columns:
    df_mid['code'] = None
if not 'ms_drg' in df_mid.columns:
    df_mid['ms_drg'] = None
df_mid['apr_drg'] = None
df_mid['eapg'] = None
df_mid['alt_hcpcs_cpt'] = None
df_mid['thru'] = None
df_mid['apc'] = None
if not 'icd' in df_mid.columns:
    df_mid['icd'] = None
df_mid['drug_quantity'] = None
df_mid['drug_unit_of_measurement'] = None
df_mid['drug_type_of_measurement'] = None
df_mid['plan_name'] = None
df_mid['standard_charge_percent'] = None
df_mid['contracting_method'] = None
df_mid['additional_payer_specific_notes'] = None

df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,...,thru,apc,icd,drug_quantity,drug_unit_of_measurement,drug_type_of_measurement,plan_name,standard_charge_percent,contracting_method,additional_payer_specific_notes
0,12001002,Daily Service MS Acuity 3,inpatient,0120,,,,0,Gross Charge,2984.00,...,,,,,,,,,,
1,12001004,Daily Service OB Acuity 1,inpatient,0120,,,,0,Gross Charge,2047.00,...,,,,,,,,,,
2,12001005,Daily Service OB Acuity 2,inpatient,0120,,,,0,Gross Charge,2430.50,...,,,,,,,,,,
3,12001006,Daily Service OB Acuity 3,inpatient,0120,,,,0,Gross Charge,3159.00,...,,,,,,,,,,
4,12001007,Daily Service Isolation OB,inpatient,0120,,,,0,Gross Charge,4042.00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196452,94301962,Cardiac Rehab Initial Assess,inpatient,0943,,,,0,UHC-ALL_PLANS,327.32,...,,,,,,,,,,
196453,94800001,OP Pulm Rehab WO cont Oximetry,inpatient,0948,94625,,,0,UHC-ALL_PLANS,354.76,...,,,,,,,,,,
196454,94800001,OP Pulm Rehab WO cont Oximetry,outpatient,0948,94625,,,0,UHC-ALL_PLANS,354.76,...,,,,,,,,,,
196455,94800002,OP Pulm Rehab W/cont Oximetry,inpatient,0948,94626,,,0,UHC-ALL_PLANS,354.76,...,,,,,,,,,,


In [28]:
# https://regexlib.com/REDetails.aspx?regexp_id=3084&AspxAutoDetectCookieSupport=1
def code_is_cpt(code):
    """Return True when `code` is a str shaped like four digits plus one digit or uppercase letter.

    Non-str inputs (None, NaN, numbers) always yield False. This is a shape
    check only; it does not look the code up anywhere.
    """
    return type(code) == str and re.match(r'^\d{4,4}[A-Z0-9]$', code) is not None

# https://regex101.com/library/sY0wA0
def code_is_hcpcs(code):
    """Return True when `code` is a str shaped like one ASCII letter followed by four digits.

    Either letter case is accepted. Non-str inputs always yield False.
    """
    if type(code) == str:
        return re.match(r'^[a-zA-Z]\d{4}$', code) is not None

    return False

# https://www.johndcook.com/blog/2019/05/05/regex_icd_codes/
def code_is_icd9(code):
    """Return True when the whole of `code` has an ICD-9-like shape.

    Three shapes are accepted, each with an optional dot:
      * three digits plus up to two more digits (e.g. '280.1'),
      * 'E' plus three digits plus at most one more digit,
      * 'V' plus two digits plus up to two more digits.

    The entire string must match. A longer string that merely begins with
    such a shape (for example an 8-digit charge-master number) is rejected.
    Non-str inputs always yield False.
    """
    if type(code) != str:
        return False

    # Raw strings: "\d" in a normal string literal is an invalid escape sequence.
    numeric = r"\d{3}\.?\d{0,2}"
    e_code = r"E\d{3}\.?\d?"
    v_code = r"V\d{2}\.?\d{0,2}"
    icd9_regex = "|".join([numeric, e_code, v_code])

    # fullmatch anchors every alternative at both ends of the string.
    return re.fullmatch(icd9_regex, code) is not None

def code_is_icd10(code):
    """Return True when the whole of `code` has one of two ICD-10-like shapes.

    * a letter other than 'U', a digit, then a digit or 'A'/'B', an optional
      dot, and up to four further characters (digits or letters other than
      'U'), e.g. 'I25.110';
    * exactly seven characters drawn from digits and the letters A-H, J-N,
      P-Z (no 'I' or 'O'), e.g. '0Y6D0Z3'.
      NOTE(review): presumably diagnosis (CM) vs. procedure (PCS) codes — confirm.

    Both shapes must cover the entire string, so a valid-looking prefix
    followed by extra text is rejected. Non-str inputs always yield False.
    """
    if type(code) != str:
        return False

    # fullmatch: the original prefix-only match accepted strings like 'A00 anything'.
    m1 = re.fullmatch(r'[A-TV-Z][0-9][0-9AB]\.?[0-9A-TV-Z]{0,4}', code)
    # https://stackoverflow.com/a/68761242
    m2 = re.fullmatch(r'[A-HJ-NP-Z\d]{7}', code)
    return m1 is not None or m2 is not None

def code_is_ms_drg(code):
    """Return True when `code` is a str of exactly three digits (e.g. '026').

    Non-str inputs always yield False.
    """
    is_text = type(code) == str
    return is_text and re.match(r'^\d{3}$', code) is not None

assert code_is_cpt('99214')
assert not code_is_cpt('123')
assert code_is_icd9('280.1')
assert code_is_icd10('I25.110')
assert code_is_icd10('0Y6D0Z3')
assert code_is_ms_drg('026')
assert not code_is_ms_drg('25')
assert not code_is_ms_drg('2500')

In [29]:
def fix_codes(row):
    """Back-fill typed code columns from the row's generic 'code' value.

    The value is classified by shape, first match wins: CPT/HCPCS goes to
    'hcpcs_cpt', MS-DRG to 'ms_drg', ICD-9/ICD-10 to 'icd'. A target column is
    only written when it currently holds None, so existing values are never
    overwritten. The row is modified in place and returned.
    """
    def fill_if_empty(column, value):
        # Never clobber a value that an earlier step already set.
        if row[column] is None:
            row[column] = value

    def match_and_set(candidate):
        if candidate is None:
            return

        if code_is_cpt(candidate) or code_is_hcpcs(candidate):
            fill_if_empty('hcpcs_cpt', candidate)
        elif code_is_ms_drg(candidate):
            fill_if_empty('ms_drg', candidate)
        elif code_is_icd9(candidate) or code_is_icd10(candidate):
            fill_if_empty('icd', candidate)

    generic_code = row.get('code')
    match_and_set(generic_code)

    # NOTE(review): 'local_code' is fetched but the generic code is classified a
    # second time (a no-op), so local codes are never inspected. This looks like
    # a copy/paste slip. Before switching the argument to `local_code`, confirm
    # that the ICD matchers reject long numeric charge-master numbers; otherwise
    # every such local code would land in 'icd'.
    local_code = row.get('local_code')
    match_and_set(generic_code)

    return row

df_mid = df_mid.apply(fix_codes, axis=1)
df_mid

Unnamed: 0,local_code,description,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,...,thru,apc,icd,drug_quantity,drug_unit_of_measurement,drug_type_of_measurement,plan_name,standard_charge_percent,contracting_method,additional_payer_specific_notes
0,12001002,Daily Service MS Acuity 3,inpatient,0120,,,,0,Gross Charge,2984.00,...,,,,,,,,,,
1,12001004,Daily Service OB Acuity 1,inpatient,0120,,,,0,Gross Charge,2047.00,...,,,,,,,,,,
2,12001005,Daily Service OB Acuity 2,inpatient,0120,,,,0,Gross Charge,2430.50,...,,,,,,,,,,
3,12001006,Daily Service OB Acuity 3,inpatient,0120,,,,0,Gross Charge,3159.00,...,,,,,,,,,,
4,12001007,Daily Service Isolation OB,inpatient,0120,,,,0,Gross Charge,4042.00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196452,94301962,Cardiac Rehab Initial Assess,inpatient,0943,,,,0,UHC-ALL_PLANS,327.32,...,,,,,,,,,,
196453,94800001,OP Pulm Rehab WO cont Oximetry,inpatient,0948,94625,,,0,UHC-ALL_PLANS,354.76,...,,,,,,,,,,
196454,94800001,OP Pulm Rehab WO cont Oximetry,outpatient,0948,94625,,,0,UHC-ALL_PLANS,354.76,...,,,,,,,,,,
196455,94800002,OP Pulm Rehab W/cont Oximetry,inpatient,0948,94626,,,0,UHC-ALL_PLANS,354.76,...,,,,,,,,,,


In [30]:
df_out = pd.DataFrame(df_mid[TARGET_COLUMNS])
df_out

Unnamed: 0,hospital_id,line_type,description,rev_code,local_code,code,ms_drg,apr_drg,eapg,hcpcs_cpt,...,billing_class,setting,payer_category,payer_name,plan_name,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes,additional_payer_specific_notes
0,020008,,Daily Service MS Acuity 3,0120,12001002,,,,,,...,,inpatient,gross,Gross Charge,,2984.00,,,Inpatient,
1,020008,,Daily Service OB Acuity 1,0120,12001004,,,,,,...,,inpatient,gross,Gross Charge,,2047.00,,,Inpatient,
2,020008,,Daily Service OB Acuity 2,0120,12001005,,,,,,...,,inpatient,gross,Gross Charge,,2430.50,,,Inpatient,
3,020008,,Daily Service OB Acuity 3,0120,12001006,,,,,,...,,inpatient,gross,Gross Charge,,3159.00,,,Inpatient,
4,020008,,Daily Service Isolation OB,0120,12001007,,,,,,...,,inpatient,gross,Gross Charge,,4042.00,,,Inpatient,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196452,020008,,Cardiac Rehab Initial Assess,0943,94301962,,,,,,...,,inpatient,payer,UHC-ALL_PLANS,,327.32,,,Inpatient,
196453,020008,,OP Pulm Rehab WO cont Oximetry,0948,94800001,,,,,94625,...,,inpatient,payer,UHC-ALL_PLANS,,354.76,,,Inpatient,
196454,020008,,OP Pulm Rehab WO cont Oximetry,0948,94800001,,,,,94625,...,,outpatient,payer,UHC-ALL_PLANS,,354.76,,,Outpatient,
196455,020008,,OP Pulm Rehab W/cont Oximetry,0948,94800002,,,,,94626,...,,inpatient,payer,UHC-ALL_PLANS,,354.76,,,Inpatient,


In [None]:
df_out.to_csv("tmp.csv", index=False)

In [None]:
def convert_chunk(chunk, ccn):
    """Convert one CSV section of the downloaded report into the target schema.

    Parameters
    ----------
    chunk : str
        CSV text of a single report section (header row plus data rows).
    ccn : str
        Value written to the 'hospital_id' column of every output row.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with one row per (input row, charge column) pair
        whose charge is not null, restricted to and ordered by TARGET_COLUMNS.

    Raises
    ------
    ValueError
        If the section has no 'Gross Charge' column.
    KeyError
        If a column the pipeline reads unconditionally (e.g. 'Price Tier' /
        'setting', 'NDC Code' / 'ndc') is absent from the section.
    """
    csv_buf = StringIO(chunk)

    # Read everything as text so codes keep their leading zeros.
    df_in = pd.read_csv(csv_buf, dtype=str)

    # One mapping serves every section layout; keys missing from this section are ignored.
    df_mid = pd.DataFrame(df_in)
    df_mid = df_mid.rename(columns={
        'Procedure Code': 'local_code',
        'Procedure Description': 'description',
        'Price Tier': 'setting',
        'Revenue Code': 'rev_code',
        'CPT HCPCS Code': 'hcpcs_cpt',
        'NDC Code': 'ndc',
        'Rx Unit Multiplier': 'drug_hcpcs_multiplier',
        'Modifier1': 'modifiers',
        'Diagnosis Related Group Code': 'ms_drg',
        'Diagnosis Related Group Description': 'description',
        'CPT HCPCS DRG Code': 'code',
        'Shoppable Services Code': 'local_code',
        'Shoppable Services Description': 'description',
        'Description': 'description',
    })

    # Guarantee a 'modifiers' column whose missing values are None.
    if 'modifiers' in df_mid.columns:
        df_mid.loc[df_mid['modifiers'].isnull(), 'modifiers'] = None
    else:
        df_mid['modifiers'] = None

    # Fold the second modifier in when the section has one. When it does not,
    # the existing 'modifiers' values are kept (they used to be wiped to None).
    if 'Modifier2' in df_mid.columns:
        df_mid.loc[df_mid['Modifier2'].isnull(), 'Modifier2'] = None
        df_mid['modifiers'] = df_mid[['modifiers', 'Modifier2']].apply(lambda row: unify_modifiers(row['modifiers'], row['Modifier2']), axis=1)
        del df_mid['Modifier2']

    # Identifier columns are everything left of 'Gross Charge', found by name
    # rather than assumed to be the first eight. A 'modifiers' column created
    # above is appended at the far right, so it is added explicitly to keep it
    # from being melted as if it were a payer.
    all_columns = df_mid.columns.to_list()
    money_idx = all_columns.index('Gross Charge')
    remaining_columns = all_columns[:money_idx]
    if 'modifiers' not in remaining_columns:
        remaining_columns.append('modifiers')
    df_mid = pd.melt(df_mid, id_vars=remaining_columns, var_name='payer_name', value_name='standard_charge')

    # Keep the original price-tier text, then derive billing_class and setting from it.
    df_mid['additional_generic_notes'] = df_mid['setting']
    df_mid['billing_class'] = None
    df_mid.loc[df_mid['setting'] == 'ProFee', 'billing_class'] = 'professional'
    df_mid['setting'] = None
    df_mid.loc[df_mid['additional_generic_notes'] == 'Inpatient', 'setting'] = 'inpatient'
    df_mid.loc[df_mid['additional_generic_notes'] == 'Outpatient', 'setting'] = 'outpatient'

    df_mid['payer_category'] = df_mid['payer_name'].apply(payer_name_to_payer_category)

    # Drop (item, payer) pairs that carry no charge.
    df_mid = df_mid[df_mid['standard_charge'].notnull()]

    # Peel modifier suffixes glued onto the procedure code.
    df_mid = df_mid.apply(lambda row: split_off_modifiers(row), axis=1)

    df_mid = pd.DataFrame(df_mid) # XXX

    if 'hcpcs_cpt' not in df_mid.columns:
        df_mid['hcpcs_cpt'] = None

    # Work on strings only so the .str accessors below see no missing values.
    df_mid.loc[df_mid['hcpcs_cpt'].isnull(), 'hcpcs_cpt'] = ''

    # Values starting with one of these prefixes are moved to the generic 'code' column.
    for bad_prefix in ['WC', 'CS', 'ED', 'GO', 'CC', 'JO', 'AA', '1-', 'CI', '2N', '3R']:
        df_mid.loc[df_mid['hcpcs_cpt'].str.startswith(bad_prefix), 'code'] = df_mid[df_mid['hcpcs_cpt'].str.startswith(bad_prefix)]['hcpcs_cpt']
        df_mid.loc[df_mid['hcpcs_cpt'].str.startswith(bad_prefix), 'hcpcs_cpt'] = ''

    # Purely alphabetic values are copied to 'code'; only 5-character values
    # survive (upper-cased), and all-letter or blank leftovers become None.
    df_mid.loc[df_mid['hcpcs_cpt'].str.isalpha(), 'code'] = df_mid[df_mid['hcpcs_cpt'].str.isalpha()]['hcpcs_cpt']
    df_mid['hcpcs_cpt'] = df_mid['hcpcs_cpt'].apply(lambda cpt: '' if len(cpt) != 5 else cpt.upper())
    df_mid.loc[df_mid['hcpcs_cpt'].str.isalpha(), 'hcpcs_cpt'] = None
    df_mid.loc[df_mid['hcpcs_cpt'] == '', 'hcpcs_cpt'] = None

    # Strip a '.0' float artifact from DRG codes and left-pad them to three characters.
    if 'ms_drg' in df_mid.columns:
        df_mid.loc[df_mid['ms_drg'].isnull(), 'ms_drg'] = ''
        df_mid.loc[df_mid['ms_drg'] != '', 'ms_drg'] = df_mid[df_mid['ms_drg'] != '']['ms_drg'].apply(lambda drg: str(drg).replace('.0', '').zfill(3))

    # Truncate over-long NDC strings to their first 13 characters.
    # e.g. 12870-0001-1-99-100
    df_mid['ndc'] = df_mid['ndc'].apply(lambda ndc: ndc[:13] if type(ndc) == str and len(ndc) >= 14 else ndc)

    # Fill in the rest of the schema. Columns an earlier step may have produced
    # are only created when absent; the others are set to None unconditionally.
    df_mid['hospital_id'] = ccn
    df_mid['line_type'] = None
    if 'local_code' not in df_mid.columns:
        df_mid['local_code'] = None
    if 'code' not in df_mid.columns:
        df_mid['code'] = None
    if 'ms_drg' not in df_mid.columns:
        df_mid['ms_drg'] = None
    if 'modifiers' not in df_mid.columns:
        df_mid['modifiers'] = None
    df_mid['apr_drg'] = None
    df_mid['eapg'] = None
    df_mid['alt_hcpcs_cpt'] = None
    df_mid['thru'] = None
    df_mid['apc'] = None
    if 'icd' not in df_mid.columns:
        df_mid['icd'] = None
    df_mid['drug_quantity'] = None
    df_mid['drug_unit_of_measurement'] = None
    df_mid['drug_type_of_measurement'] = None
    df_mid['plan_name'] = None
    df_mid['standard_charge_percent'] = None
    df_mid['contracting_method'] = None
    df_mid['additional_payer_specific_notes'] = None

    # Back-fill typed code columns from the generic 'code' value where possible.
    df_mid = df_mid.apply(fix_codes, axis=1)

    df_out = pd.DataFrame(df_mid[TARGET_COLUMNS])
    return df_out

In [None]:
def perform_task(h_f, ccn, app_url, transparency_page):
    """Download a hospital's report, convert it, and emit the CSV plus one SQL statement.

    Steps: fetch `app_url`, follow its ExtNet init script to find the database
    name, POST to the report endpoint, convert every section after the header
    with `convert_chunk`, and write the combined rows to 'rate_{ccn}.csv' in
    the working directory. Finally one `UPDATE hospital ...` statement plus a
    newline is written to `h_f`.

    Parameters
    ----------
    h_f : writable text file-like object
        Receives the generated SQL statement.
    ccn : str
        Hospital identifier; used in the output file name, the 'hospital_id'
        column and the SQL WHERE clause.
    app_url : str
        Page to scrape for the database name; also stored as `mrf_url`.
    transparency_page : str
        Stored verbatim in the SQL statement.

    Raises
    ------
    requests.HTTPError
        If any of the three HTTP requests returns an error status.
    IndexError
        If the init script or the database name cannot be found in the page.
    """
    def _sql_text(value):
        # The filename comes from a response header, so embedded double quotes
        # are doubled to keep the value from terminating the quoted literal.
        # NOTE(review): the target SQL dialect is not visible here — confirm
        # that doubling is the right escape (and whether backslashes need care).
        return str(value).replace('"', '""')

    resp = requests.get(app_url)
    # Fail on HTTP errors here instead of on a confusing parse error further down.
    resp.raise_for_status()
    print(resp.url)

    tree = html.fromstring(resp.text)
    js_link = tree.xpath('//script[contains(@src, "/PTT/extnet/extnet-init-js")]/@src')[0]
    js_url = urljoin(resp.url, js_link)
    resp1 = requests.get(js_url)
    resp1.raise_for_status()
    parsed = js2xml.parse(resp1.text)
    db_name = parsed.xpath('//object[./property/string[text()="App.hdnDB_Container"]]/property[@name="value"]/string/text()')[0]

    params = { 'dbName': db_name, 'type': 'CDMWithoutLabel' }

    # Hand-written multipart body; must be sent byte-for-byte.
    data = '------WebKitFormBoundarysVqstz3xq11k5yBT\r\nContent-Disposition: form-data; name="__EVENTTARGET"\r\n\r\nResourceManager\r\n------WebKitFormBoundarysVqstz3xq11k5yBT\r\nContent-Disposition: form-data; name="__EVENTARGUMENT"\r\n\r\n-|public|DownloadReport\r\n------WebKitFormBoundarysVqstz3xq11k5yBT\r\nContent-Disposition: form-data; name="__ExtNetDirectEventMarker"\r\n\r\ndelta=true\r\n------WebKitFormBoundarysVqstz3xq11k5yBT--\r\n'

    resp2 = requests.post('https://apps.para-hcfs.com/PTT/FinalLinks/Reports.aspx', params=params, data=data)
    resp2.raise_for_status()
    print(resp2.url)

    # Sections are separated by a blank line; the first one is the run/date header.
    chunks = resp2.text.split("\r\n\r\n")

    dfs = [convert_chunk(chunk, ccn) for chunk in chunks[1:]]

    df_out = pd.concat(dfs)

    df_out.to_csv('rate_' + ccn + '.csv', index=False)

    # The download name is the first double-quoted token of the Content-Disposition header.
    filename = resp2.headers['content-disposition'].split('"')[1]
    ein = derive_ein_from_filename(filename)

    # The last space-separated token of the header section is the date stamp.
    date_str = chunks[0].split(" ")[-1]

    last_updated_at = parse_datetime(date_str).isoformat().split("T")[0]
    query = 'UPDATE hospital SET ein = "{}", last_updated = "{}", file_name = "{}", mrf_url = "{}", transparency_page = "{}" WHERE id = "{}";'.format(
        _sql_text(ein), _sql_text(last_updated_at), _sql_text(filename),
        _sql_text(app_url), _sql_text(transparency_page), _sql_text(ccn))

    h_f.write(query)
    h_f.write("\n")

In [None]:
# The context manager flushes and closes the SQL file even if perform_task raises.
with open("hospitals.sql", "w") as h_f:
    perform_task(h_f, ccn, app_url, transparency_page)