In [1]:
import subprocess
from urllib.parse import urljoin
from io import StringIO
import re

from dateutil.parser import parse as parse_datetime
import pandas as pd
import requests
from lxml import html
import js2xml

from helpers import derive_ein_from_filename

In [2]:
TARGET_COLUMNS = [
    'hospital_id',
    #'row_id',
    'line_type',
    'description',
    'rev_code',
    'local_code',
    'code',
    'ms_drg',
    'apr_drg',
    'eapg',
    'hcpcs_cpt',
    'modifiers',
    'alt_hcpcs_cpt',
    'thru',
    'apc',
    'icd',
    'ndc',
    'drug_hcpcs_multiplier',
    'drug_quantity',
    'drug_unit_of_measurement',
    'drug_type_of_measurement',
    'billing_class',
    'setting',
    'payer_category',
    'payer_name',
    'plan_name',
    'standard_charge',
    'standard_charge_percent',
    'contracting_method',
    'additional_generic_notes',
    'additional_payer_specific_notes'
]

In [3]:
transparency_page = "https://www.bhset.net/"
ccn = "460346"
app_url = "https://apps.para-hcfs.com/PTT/FinalLinks/BHSet_V3.aspx"

In [4]:
# HACK
filename = "741303720_BAPTIST_HOSPITAL_-_ALL_standardCharges.csv"
filename

'741303720_BAPTIST_HOSPITAL_-_ALL_standardCharges.csv'

In [5]:
in_f = open(filename, 'r')
csv_str = in_f.read()
in_f.close()

chunks = csv_str.split("\r\n\r\n")
len(chunks)

1

In [6]:
date_str = chunks[0].split("\n")[1].split(" ")[-1]
date_str

last_updated = parse_datetime(date_str).isoformat().split("T")[0]
last_updated

'2023-02-03'

In [7]:
df_in = pd.read_csv(filename, dtype=str, header=2)
df_in

Unnamed: 0,Code,Description,Code Type,Price Tier,Revenue Code,CPT HCPCS Code,Modifier1,Modifier2,NDC Code,Rx Unit Multiplier,...,SIGNATURE_HEALTH_-_ALL_PLANS,SUPERIOR_HEALTH_PLAN_COMMERCIAL_-_ALL_OTHER_PLANS,SUPERIOR_HEALTH_PLAN_MCR_ADV,SUPERIOR_HEALTH_PLAN_MEDICAID,TCHP_CHIPS_-_ALL_PLANS,TEXAN_PLUS,TEXAS_EXCHANGE_PLANS_-_ALL_PLANS,TEXAS_SCHOOL_HEALTH_BENEFITS_-_ALL_PLANS,UNITED_HEALTHCARE_HMO,UNITED_HEALTHCARE_PPO
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,BAPTIST HOSPITAL - ALL,0723,,,,,0,...,176.26,,,28.20,28.20,,,,,
1,1700018_1,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - ALL,0270,,,,,0,...,133.76,,,21.40,21.40,,,,,
2,1700018_10,PHOTOTHERAPY,Procedure Code,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,0270,,,,,0,...,66.88,,,21.40,21.40,,,,,
3,1700018_12,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - LUMBERTON INFUSION,0270,,,,,0,...,66.88,,,21.40,21.40,,,,,
4,1700018_14,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL OUTPATIENT CENTER,0270,,,,,0,...,66.88,,,21.40,21.40,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71450,987,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,987,,,,,111276.1060,...,23775.84,20674.63,8004.11,8004.11,20674.63,23775.84,31011.94,21645.92,30748.29,
71451,988,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,988,,,,,50846.9328,...,12154.45,10569.08,8004.11,8004.11,10569.08,12154.45,15853.62,11065.62,15718.84,
71452,989,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,989,,,,,46587.6000,...,7898.53,6868.28,8004.11,8004.11,6868.28,7898.53,10302.42,7190.96,10214.84,
71453,998,PRINCIPAL DIAGNOSIS INVALID AS DISCHARGE DIAGN...,Inpatient,,998,,,,,1972.0000,...,,,,,,,,6523.00,9266.00,


In [8]:
df_mid = pd.DataFrame(df_in)
df_mid = df_mid.rename(columns={
    'Procedure Code': 'local_code',
    'Procedure Description': 'description',
    'Price Tier': 'setting',
    'Revenue Code': 'rev_code',
    'CPT HCPCS Code': 'hcpcs_cpt',
    'NDC Code': 'ndc',
    'Rx Unit Multiplier': 'drug_hcpcs_multiplier',
    'Modifier1': 'modifiers',
    'Diagnosis Related Group Code': 'ms_drg',
    'Diagnosis Related Group Description': 'description',
    'CPT HCPCS DRG Code': 'code',
    'Shoppable Services Code': 'local_code',
    'Shoppable Services Description': 'description',
    'Description': 'description'
})

from helpers import pad_rev_code_if_needed

df_mid['rev_code'] = df_mid['rev_code'].apply(pad_rev_code_if_needed)

df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,Modifier2,ndc,drug_hcpcs_multiplier,...,SIGNATURE_HEALTH_-_ALL_PLANS,SUPERIOR_HEALTH_PLAN_COMMERCIAL_-_ALL_OTHER_PLANS,SUPERIOR_HEALTH_PLAN_MCR_ADV,SUPERIOR_HEALTH_PLAN_MEDICAID,TCHP_CHIPS_-_ALL_PLANS,TEXAN_PLUS,TEXAS_EXCHANGE_PLANS_-_ALL_PLANS,TEXAS_SCHOOL_HEALTH_BENEFITS_-_ALL_PLANS,UNITED_HEALTHCARE_HMO,UNITED_HEALTHCARE_PPO
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,BAPTIST HOSPITAL - ALL,0723,,,,,0,...,176.26,,,28.20,28.20,,,,,
1,1700018_1,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - ALL,0270,,,,,0,...,133.76,,,21.40,21.40,,,,,
2,1700018_10,PHOTOTHERAPY,Procedure Code,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,0270,,,,,0,...,66.88,,,21.40,21.40,,,,,
3,1700018_12,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - LUMBERTON INFUSION,0270,,,,,0,...,66.88,,,21.40,21.40,,,,,
4,1700018_14,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL OUTPATIENT CENTER,0270,,,,,0,...,66.88,,,21.40,21.40,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71450,987,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0987,,,,,111276.1060,...,23775.84,20674.63,8004.11,8004.11,20674.63,23775.84,31011.94,21645.92,30748.29,
71451,988,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0988,,,,,50846.9328,...,12154.45,10569.08,8004.11,8004.11,10569.08,12154.45,15853.62,11065.62,15718.84,
71452,989,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0989,,,,,46587.6000,...,7898.53,6868.28,8004.11,8004.11,6868.28,7898.53,10302.42,7190.96,10214.84,
71453,998,PRINCIPAL DIAGNOSIS INVALID AS DISCHARGE DIAGN...,Inpatient,,0998,,,,,1972.0000,...,,,,,,,,6523.00,9266.00,


In [9]:
def unify_modifiers(m1, m2):
    if m1 is None:
        return None

    if m1 is not None:
        if m2 is not None:
            return m1 + "|" + m2

        return m1

assert unify_modifiers(None, None) is None
assert unify_modifiers("TC", None) == "TC"
assert unify_modifiers("TC", "TC") == "TC|TC"

In [10]:
df_mid.loc[df_mid['modifiers'].isnull(), 'modifiers'] = None
df_mid.loc[df_mid['Modifier2'].isnull(), 'Modifier2'] = None

df_mid['modifiers'] = df_mid[['modifiers', 'Modifier2']].apply(lambda row: unify_modifiers(row['modifiers'], row['Modifier2']), axis=1)
del df_mid['Modifier2']
df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,Gross Charge,...,SIGNATURE_HEALTH_-_ALL_PLANS,SUPERIOR_HEALTH_PLAN_COMMERCIAL_-_ALL_OTHER_PLANS,SUPERIOR_HEALTH_PLAN_MCR_ADV,SUPERIOR_HEALTH_PLAN_MEDICAID,TCHP_CHIPS_-_ALL_PLANS,TEXAN_PLUS,TEXAS_EXCHANGE_PLANS_-_ALL_PLANS,TEXAS_SCHOOL_HEALTH_BENEFITS_-_ALL_PLANS,UNITED_HEALTHCARE_HMO,UNITED_HEALTHCARE_PPO
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,BAPTIST HOSPITAL - ALL,0723,,,,0,282.00,...,176.26,,,28.20,28.20,,,,,
1,1700018_1,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - ALL,0270,,,,0,214.00,...,133.76,,,21.40,21.40,,,,,
2,1700018_10,PHOTOTHERAPY,Procedure Code,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,0270,,,,0,214.00,...,66.88,,,21.40,21.40,,,,,
3,1700018_12,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - LUMBERTON INFUSION,0270,,,,0,214.00,...,66.88,,,21.40,21.40,,,,,
4,1700018_14,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL OUTPATIENT CENTER,0270,,,,0,214.00,...,66.88,,,21.40,21.40,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71450,987,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0987,,,,111276.1060,14465.89,...,23775.84,20674.63,8004.11,8004.11,20674.63,23775.84,31011.94,21645.92,30748.29,
71451,988,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0988,,,,50846.9328,6610.10,...,12154.45,10569.08,8004.11,8004.11,10569.08,12154.45,15853.62,11065.62,15718.84,
71452,989,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0989,,,,46587.6000,6056.39,...,7898.53,6868.28,8004.11,8004.11,6868.28,7898.53,10302.42,7190.96,10214.84,
71453,998,PRINCIPAL DIAGNOSIS INVALID AS DISCHARGE DIAGN...,Inpatient,,0998,,,,1972.0000,256.36,...,,,,,,,,6523.00,9266.00,


In [11]:
money_columns = df_mid.columns.to_list()[9:]
money_columns

['Gross Charge',
 'Discounted Cash Price',
 'De-identified minimum negotiated charge',
 'De-identified maximum negotiated charge',
 'AETNA_HMO',
 'AETNA_MCR_ADV',
 'AETNA_PPO',
 'AMERICHOICE_-_ALL_PLANS',
 'AMERIGROUP_-_ALL_PLANS',
 'BCBS_BLUE_ADVANTAGE_HMO',
 'BCBS_BLUE_ESSENTIALS',
 'BCBS_MCR_ADV',
 'BCBS_PPO_POS',
 'BEACON_HEALTH_-_ALL_PLANS',
 'BLUE_BELL_-_ALL_PLANS',
 'BLUE_CROSS_BLUE_CHOICE',
 'BROOKSHIRE_BROTHERS_-_ALL_PLANS',
 'CENTRAL_HEALTHCARE_SERVICES_-_ALL_PLANS',
 'CIGNA_-_ALL_PLANS',
 'CIGNA_HEALTHSPRING_-_ALL_OTHER_PLANS',
 'COMMUNITY_HEALTH_CHOICE_-_ALL_PLANS',
 'CORRECTCARE_-_ALL_PLANS',
 'FIRST_HEALTH_-_ALL_PLANS',
 'HEALTHSMART_-_ALL_PLANS',
 'HUMANA_HMO',
 'HUMANA_PPO',
 'IMAGINE_HEALTHCARE__SMARTCARE__-_ALL_PLANS',
 'MANAGED_HEALTHCARE_INC_-_ALL_PLANS',
 'MOLINA_MEDICAID_-_ALL_PLANS',
 'MULTIPLAN_-_ALL_PLANS',
 'NAPHCARE_INC_-_ALL_PLANS',
 'PHCS_-_ALL_PLANS',
 'PPONEXT_-_ALL_PLANS',
 'PREFERRED_CARE_SERVICES_-_ALL_PLANS',
 'SIGNATURE_HEALTH_-_ALL_PLANS',
 'SUPERIO

In [12]:
remaining_columns = df_mid.columns.to_list()[:9]
df_mid = pd.melt(df_mid, id_vars=remaining_columns, var_name='payer_name', value_name='standard_charge')
df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,BAPTIST HOSPITAL - ALL,0723,,,,0,Gross Charge,282.00
1,1700018_1,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - ALL,0270,,,,0,Gross Charge,214.00
2,1700018_10,PHOTOTHERAPY,Procedure Code,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,0270,,,,0,Gross Charge,214.00
3,1700018_12,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL - LUMBERTON INFUSION,0270,,,,0,Gross Charge,214.00
4,1700018_14,PHOTOTHERAPY,Procedure Code,BAPTIST HOSPITAL OUTPATIENT CENTER,0270,,,,0,Gross Charge,214.00
...,...,...,...,...,...,...,...,...,...,...,...
3144015,987,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0987,,,,111276.1060,UNITED_HEALTHCARE_PPO,
3144016,988,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0988,,,,50846.9328,UNITED_HEALTHCARE_PPO,
3144017,989,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0989,,,,46587.6000,UNITED_HEALTHCARE_PPO,
3144018,998,PRINCIPAL DIAGNOSIS INVALID AS DISCHARGE DIAGN...,Inpatient,,0998,,,,1972.0000,UNITED_HEALTHCARE_PPO,


In [13]:
set(df_mid['setting'].to_list())

{'ALTUS CANCER CENTER',
 'BAPTIST BEAUMONT HOSPITAL - DAUPHIN',
 'BAPTIST BEAUMONT HOSPITAL - FANNIN',
 'BAPTIST BEAUMONT HOSPITAL - JBR CANCER CENTER',
 'BAPTIST BEAUMONT HOSPITAL - ORANGE',
 'BAPTIST BEAUMONT HOSPITAL - OUTPATIENT INFUSION',
 'BAPTIST BEAUMONT HOSPITAL - SLEEP LAB',
 'BAPTIST BEAUMONT HOSPITAL - WOUND CARE',
 'BAPTIST BEAUMONT HSPT 740 CARDIAC DIA',
 'BAPTIST HOSPITAL - ALL',
 'BAPTIST HOSPITAL - LUMBERTON INFUSION',
 'BAPTIST HOSPITAL BARIATRIC, REFLUX',
 'BAPTIST HOSPITAL OUTPATIENT CENTER',
 'BAPTIST HOSPITAL OUTPATIENT CENTER - CT',
 'BAPTIST HOSPITAL RESIDENCY CLINIC',
 'BAPTIST HOSPITALS OF SOUTHEAST TEXAS - SLEEP LAB BROADWAY',
 'BAPTIST HOSPITALS OF SOUTHEAST TEXAS - SLEEP LAB EASTEX',
 'BAPTIST HOSPITALS OF SOUTHEAST TEXAS - SLEEP LAB N 11TH',
 'Baptist OP Center Jasper',
 'CANCER CENTER OF SOUTHEAST TEXAS',
 nan}

In [14]:
df_mid['additional_generic_notes'] = df_mid['setting']
df_mid['billing_class'] = None
df_mid.loc[df_mid['setting'] == 'ProFee', 'billing_class'] = 'professional'
df_mid['setting'] = None
df_mid.loc[df_mid['additional_generic_notes'] == 'Inpatient', 'setting'] = 'inpatient'
df_mid.loc[df_mid['additional_generic_notes'] == 'Outpatient', 'setting'] = 'outpatient'
df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,additional_generic_notes,billing_class
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,,0723,,,,0,Gross Charge,282.00,BAPTIST HOSPITAL - ALL,
1,1700018_1,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL - ALL,
2,1700018_10,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,
3,1700018_12,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL - LUMBERTON INFUSION,
4,1700018_14,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL OUTPATIENT CENTER,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3144015,987,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0987,,,,111276.1060,UNITED_HEALTHCARE_PPO,,,
3144016,988,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0988,,,,50846.9328,UNITED_HEALTHCARE_PPO,,,
3144017,989,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0989,,,,46587.6000,UNITED_HEALTHCARE_PPO,,,
3144018,998,PRINCIPAL DIAGNOSIS INVALID AS DISCHARGE DIAGN...,Inpatient,,0998,,,,1972.0000,UNITED_HEALTHCARE_PPO,,,


In [15]:
def payer_name_to_payer_category(payer_name):
    if payer_name == 'Discounted Cash Price':
        return 'cash'
    elif payer_name == 'Gross Charge':
        return 'gross'
    elif payer_name == 'De-identified maximum negotiated charge':
        return 'max'
    elif payer_name == 'De-identified minimum negotiated charge':
        return 'min'

    return 'payer'

df_mid['payer_category'] = df_mid['payer_name'].apply(payer_name_to_payer_category)
df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,additional_generic_notes,billing_class,payer_category
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,,0723,,,,0,Gross Charge,282.00,BAPTIST HOSPITAL - ALL,,gross
1,1700018_1,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL - ALL,,gross
2,1700018_10,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,,gross
3,1700018_12,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL - LUMBERTON INFUSION,,gross
4,1700018_14,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL OUTPATIENT CENTER,,gross
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3144015,987,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0987,,,,111276.1060,UNITED_HEALTHCARE_PPO,,,,payer
3144016,988,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0988,,,,50846.9328,UNITED_HEALTHCARE_PPO,,,,payer
3144017,989,NON-EXTENSIVE O.R. PROCEDURES UNRELATED TO PRI...,Inpatient,,0989,,,,46587.6000,UNITED_HEALTHCARE_PPO,,,,payer
3144018,998,PRINCIPAL DIAGNOSIS INVALID AS DISCHARGE DIAGN...,Inpatient,,0998,,,,1972.0000,UNITED_HEALTHCARE_PPO,,,,payer


In [16]:
df_mid = df_mid[df_mid['standard_charge'].notnull()]
df_mid.shape

(2437603, 14)

In [17]:
if not 'hcpcs_cpt' in df_mid.columns:
    df_mid['hcpcs_cpt'] = None

def split_off_modifiers(row):
    hcpcs_cpt = row.get('hcpcs_cpt')
    if type(hcpcs_cpt) != str:
        return row

    if len(hcpcs_cpt) == 7:
        row['modifiers'] = hcpcs_cpt[-2:]
        row['hcpcs_cpt'] = hcpcs_cpt[:5]

    if len(hcpcs_cpt) == 9:
        row['modifiers'] = hcpcs_cpt[-4:]
        row['modifiers'] = row['modifiers'][:2] + "|" + row['modifiers'][-2:]
        row['hcpcs_cpt'] = hcpcs_cpt[:5]
    
    return row

df_mid = df_mid.apply(lambda row: split_off_modifiers(row), axis=1)
df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,standard_charge,additional_generic_notes,billing_class,payer_category
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,,0723,,,,0,Gross Charge,282.00,BAPTIST HOSPITAL - ALL,,gross
1,1700018_1,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL - ALL,,gross
2,1700018_10,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,,gross
3,1700018_12,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL - LUMBERTON INFUSION,,gross
4,1700018_14,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,214.00,BAPTIST HOSPITAL OUTPATIENT CENTER,,gross
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3143421,9906542_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,Procedure Code,,0126,,,,0,UNITED_HEALTHCARE_PPO,0.00,BAPTIST HOSPITAL - ALL,,payer
3143424,9906570_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - PSYCH...,Procedure Code,,0124,,,,0,UNITED_HEALTHCARE_PPO,0.00,BAPTIST HOSPITAL - ALL,,payer
3143426,9906572_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,Procedure Code,,0126,,,,0,UNITED_HEALTHCARE_PPO,0.00,BAPTIST HOSPITAL - ALL,,payer
3143429,9906590_1,NURSERY - NEWBORN - LEVEL II,Procedure Code,,0172,,,,0,UNITED_HEALTHCARE_PPO,3728.00,BAPTIST HOSPITAL - ALL,,payer


In [18]:
set(df_mid['modifiers'].to_list())

{None}

In [19]:
df_mid = pd.DataFrame(df_mid) # XXX

df_mid.loc[df_mid['hcpcs_cpt'].isnull(), 'hcpcs_cpt'] = ''
df_mid.loc[df_mid['hcpcs_cpt'] == 'WC003', 'code'] = 'WC003'
df_mid.loc[df_mid['hcpcs_cpt'] == 'WC003', 'hcpcs_cpt'] = ''
df_mid.loc[df_mid['hcpcs_cpt'].str.startswith('CS'), 'code'] = df_mid[df_mid['hcpcs_cpt'].str.startswith('CS')]['hcpcs_cpt']
df_mid.loc[df_mid['hcpcs_cpt'].str.startswith('CS'), 'hcpcs_cpt'] = ''
df_mid.loc[df_mid['hcpcs_cpt'].str.isalpha(), 'code'] = df_mid[df_mid['hcpcs_cpt'].str.isalpha()]['hcpcs_cpt']
df_mid['hcpcs_cpt'] = df_mid['hcpcs_cpt'].apply(lambda cpt: '' if len(cpt) != 5 else cpt)
df_mid.loc[df_mid['hcpcs_cpt'].str.isalpha(), 'hcpcs_cpt'] = None
df_mid.loc[df_mid['hcpcs_cpt'] == '', 'hcpcs_cpt'] = None

set(df_mid['hcpcs_cpt'].to_list())

{'83036',
 '96164',
 '77338',
 'J1800',
 'J1580',
 '87176',
 '82030',
 '72285',
 'Q4101',
 'J2795',
 '86972',
 '87556',
 'J1160',
 'J2250',
 '77373',
 '82951',
 '87904',
 '90674',
 'J9303',
 '75822',
 'G0379',
 '78071',
 '93930',
 '92975',
 '86985',
 'J0282',
 'C1766',
 'J9351',
 '70549',
 '84446',
 '73620',
 '84586',
 '86146',
 '85410',
 'J2700',
 '86335',
 'J2315',
 'J0180',
 'Q4173',
 '83003',
 '87177',
 '70200',
 '83030',
 '85250',
 'S0028',
 '84588',
 '86905',
 'J1602',
 '81276',
 '81279',
 'C1780',
 '73650',
 '70490',
 'C1782',
 '85270',
 '73223',
 '78227',
 'J0207',
 '82550',
 '86360',
 '84132',
 'J0515',
 'Q0139',
 'J0595',
 'J9280',
 '83593',
 '80192',
 'J1120',
 '11104',
 'J7402',
 'J7644',
 'L1832',
 '77280',
 '76800',
 'G0269',
 'J3411',
 '88377',
 '82610',
 'J0348',
 '97116',
 'J7627',
 'Q5121',
 '90834',
 '77336',
 '73220',
 'J9144',
 '86361',
 '93503',
 'J9045',
 '70540',
 'J0280',
 '70300',
 'J1631',
 'J9268',
 'J1190',
 'C2615',
 '72158',
 'J9266',
 '85245',
 'G0279',


In [20]:
df_mid['hospital_id'] = ccn
df_mid['line_type'] = None
if not 'local_code' in df_mid.columns:
    df_mid['local_code'] = None
if not 'code' in df_mid.columns:
    df_mid['code'] = None
if not 'ms_drg' in df_mid.columns:
    df_mid['ms_drg'] = None
df_mid['apr_drg'] = None
df_mid['eapg'] = None
df_mid['alt_hcpcs_cpt'] = None
df_mid['thru'] = None
df_mid['apc'] = None
if not 'icd' in df_mid.columns:
    df_mid['icd'] = None
df_mid['drug_quantity'] = None
df_mid['drug_unit_of_measurement'] = None
df_mid['drug_type_of_measurement'] = None
df_mid['plan_name'] = None
df_mid['standard_charge_percent'] = None
df_mid['contracting_method'] = None
df_mid['additional_payer_specific_notes'] = None

df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,...,thru,apc,icd,drug_quantity,drug_unit_of_measurement,drug_type_of_measurement,plan_name,standard_charge_percent,contracting_method,additional_payer_specific_notes
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,,0723,,,,0,Gross Charge,...,,,,,,,,,,
1,1700018_1,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
2,1700018_10,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
3,1700018_12,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
4,1700018_14,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3143421,9906542_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,Procedure Code,,0126,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,
3143424,9906570_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - PSYCH...,Procedure Code,,0124,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,
3143426,9906572_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,Procedure Code,,0126,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,
3143429,9906590_1,NURSERY - NEWBORN - LEVEL II,Procedure Code,,0172,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,


In [21]:
# https://regexlib.com/REDetails.aspx?regexp_id=3084&AspxAutoDetectCookieSupport=1
def code_is_cpt(code):
    if type(code) != str:
        return False
        
    m = re.match(r'^\d{4,4}[A-Z0-9]$', code)
    return m is not None

# https://regex101.com/library/sY0wA0
def code_is_hcpcs(code):
    if type(code) != str:
        return False
    
    m = re.match(r'^[a-zA-Z]\d{4}$', code)
    return m is not None

# https://www.johndcook.com/blog/2019/05/05/regex_icd_codes/
def code_is_icd9(code):
    if type(code) != str:
        return False
    
    N = "\d{3}\.?\d{0,2}"
    E = "E\d{3}\.?\d?"
    V = "V\d{2}\.?\d{0,2}"
    icd9_regex = "|".join([N, E, V])

    m = re.match(icd9_regex, code)
    return m is not None

def code_is_icd10(code):
    if type(code) != str:
        return False

    m1 = re.match(r'[A-TV-Z][0-9][0-9AB]\.?[0-9A-TV-Z]{0,4}', code)
    # https://stackoverflow.com/a/68761242
    m2 = re.match(r'^[A-HJ-NP-Z\d]{7}$', code)
    return m1 is not None or m2 is not None

def code_is_ms_drg(code):
    if type(code) != str:
        return False

    m = re.match(r'^\d{3}$', code)
    return m is not None

assert code_is_cpt('99214')
assert not code_is_cpt('123')
assert code_is_icd9('280.1')
assert code_is_icd10('I25.110')
assert code_is_icd10('0Y6D0Z3')
assert code_is_ms_drg('026')
assert not code_is_ms_drg('25')
assert not code_is_ms_drg('2500')

In [22]:
def fix_codes(row):
    def match_and_set(row, code):
        if code is None:
            return
        
        if code_is_cpt(code) or code_is_hcpcs(code):
            if row['hcpcs_cpt'] is None:
                row['hcpcs_cpt'] = code
        elif code_is_ms_drg(code):
            if row['ms_drg'] is None:
                row['ms_drg'] = code
        elif code_is_icd9(code) or code_is_icd10(code):
            if row['icd'] is None:
                row['icd'] = code
    
    code = row.get('code')
    match_and_set(row, code)

    local_code = row.get('local_code')
    match_and_set(row, code)

    return row

df_mid = df_mid.apply(fix_codes, axis=1)
df_mid

Unnamed: 0,Code,description,Code Type,setting,rev_code,hcpcs_cpt,modifiers,ndc,drug_hcpcs_multiplier,payer_name,...,thru,apc,icd,drug_quantity,drug_unit_of_measurement,drug_type_of_measurement,plan_name,standard_charge_percent,contracting_method,additional_payer_specific_notes
0,1700013_1,LABOR ROOM/DELIVERY - CIRCUMCISION,Procedure Code,,0723,,,,0,Gross Charge,...,,,,,,,,,,
1,1700018_1,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
2,1700018_10,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
3,1700018_12,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
4,1700018_14,PHOTOTHERAPY,Procedure Code,,0270,,,,0,Gross Charge,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3143421,9906542_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,Procedure Code,,0126,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,
3143424,9906570_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - PSYCH...,Procedure Code,,0124,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,
3143426,9906572_1,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,Procedure Code,,0126,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,
3143429,9906590_1,NURSERY - NEWBORN - LEVEL II,Procedure Code,,0172,,,,0,UNITED_HEALTHCARE_PPO,...,,,,,,,,,,


In [23]:
df_out = pd.DataFrame(df_mid[TARGET_COLUMNS])
df_out

Unnamed: 0,hospital_id,line_type,description,rev_code,local_code,code,ms_drg,apr_drg,eapg,hcpcs_cpt,...,billing_class,setting,payer_category,payer_name,plan_name,standard_charge,standard_charge_percent,contracting_method,additional_generic_notes,additional_payer_specific_notes
0,460346,,LABOR ROOM/DELIVERY - CIRCUMCISION,0723,,,,,,,...,,,gross,Gross Charge,,282.00,,,BAPTIST HOSPITAL - ALL,
1,460346,,PHOTOTHERAPY,0270,,,,,,,...,,,gross,Gross Charge,,214.00,,,BAPTIST HOSPITAL - ALL,
2,460346,,PHOTOTHERAPY,0270,,,,,,,...,,,gross,Gross Charge,,214.00,,,BAPTIST BEAUMONT HOSPITAL - WOUND CARE,
3,460346,,PHOTOTHERAPY,0270,,,,,,,...,,,gross,Gross Charge,,214.00,,,BAPTIST HOSPITAL - LUMBERTON INFUSION,
4,460346,,PHOTOTHERAPY,0270,,,,,,,...,,,gross,Gross Charge,,214.00,,,BAPTIST HOSPITAL OUTPATIENT CENTER,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3143421,460346,,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,0126,,,,,,,...,,,payer,UNITED_HEALTHCARE_PPO,,0.00,,,BAPTIST HOSPITAL - ALL,
3143424,460346,,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - PSYCH...,0124,,,,,,,...,,,payer,UNITED_HEALTHCARE_PPO,,0.00,,,BAPTIST HOSPITAL - ALL,
3143426,460346,,ROOM & BOARD - SEMI-PRIVATE (TWO BEDS) - DETOX...,0126,,,,,,,...,,,payer,UNITED_HEALTHCARE_PPO,,0.00,,,BAPTIST HOSPITAL - ALL,
3143429,460346,,NURSERY - NEWBORN - LEVEL II,0172,,,,,,,...,,,payer,UNITED_HEALTHCARE_PPO,,3728.00,,,BAPTIST HOSPITAL - ALL,


In [24]:
df_out.to_csv("tmp.csv", index=False)