# ICU and ED Module Preprocessing
Preprocessing of ICU and ED tables, with some additional descriptions of design decisions

In [137]:
import psycopg2
from psycopg2 import OperationalError, DatabaseError, sql
import pandas as pd
import csv
import numpy as np
import seaborn as sns
# for fuzzy matching on the item ids
from fuzzywuzzy import process
import os

### Environment Variables for Connection ###
DB_NAME = 'smcdougall'
USERNAME = 'postgres'
PASSWORD = 'postgres'
HOST = 'localhost'
PORT = 5432 

def connect_to_postgres(db_name, username, password, host, port):
    connection = None
    try:
        connection = psycopg2.connect(
            dbname=db_name,
            user=username,
            password=password,
            host=host,
            port=port
        )
        print('Connected to db:', db_name)
        return connection
    except OperationalError as e:
        print('Received the following error:', e)
        return None

def verify_postgres_connection(connection):
    if connection is not None:
        try:
            cur = connection.cursor()
            cur.execute('SELECT version();')
            db_version = cur.fetchone()
            print('The Postgres database version is:', db_version)
            cur.close()
        except DatabaseError as e:
            print('Received the following error:', e)
    else:
        print('Connection to Postgres failed.')

def close_connection(connection):
    if connection is not None:
        connection.close()
        print('Postgres connection has been closed.')

connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
verify_postgres_connection(connection)
close_connection(connection)

Connected to db: smcdougall
The Postgres database version is: ('PostgreSQL 14.5 on aarch64-apple-darwin20.6.0, compiled by Apple clang version 12.0.5 (clang-1205.0.22.9), 64-bit',)
Postgres connection has been closed.


In [138]:
"""
General function for loading data from an existing SQL table that retrieves all of the fields
for the table.
"""
def load_table(connection, table_name):
    cur = connection.cursor()
    query = f'SELECT * FROM "{table_name}"'
    cur.execute(query)
    rows = cur.fetchall()
    cur.close()
    
    # get column names
    columns = [desc[0] for desc in cur.description]
    # use column names retrieved from the cursor
    df = pd.DataFrame(rows, columns=columns)
    return df

In [139]:
"""
Saves pandas DataFrame as a CSV file.
"""
def save_df_as_csv(df, csv_name, directory='dataframes'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    file_path = os.path.join(directory, csv_name)
    df.to_csv(file_path, index=False)

    print(f'DataFrame has been saved as {file_path}')

# ICU Pre-processing - Non Vital-Signs Tables

## Relevant tables to start with
- chartevents
- datetimeevents
- icustays
- inputevents
- outputevents
- procedureevents

**NOTE**: the filtering of "event"-related tables (`chartevents`, `datetimeevents`, etc.) is performed in vital_signs_and_hosp_preprocessing.ipynb - any importing of those tables here is for additional preprocessing/feature engineering on the existing columns

### icustays
- derived from `transfers` (just the ICU-related rows) -- **so don't preprocess separately**
- for first care unit and last care unit columns, see the consolidated values from `transfers` because they should be the same
- keep length of stay

### ingredientevents
- neither project pulls from this table, so omit from analysis

### procedureevents
- not sure how to process - not sure if ICD

In [140]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
procedureevents_df = load_table(connection, 'mimiciv_icu.filtered_procedureevents')
print(procedureevents_df.shape)
procedureevents_df.head()

Connected to db: smcdougall
(37436, 22)


Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,value,valueuom,...,orderid,linkorderid,ordercategoryname,ordercategorydescription,patientweight,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate
0,10001884,26184834,37510196,31763.0,2131-01-12 21:30:00,2131-01-13 04:00:00,2131-01-15 04:07:00,225794,390.0,min,...,4809276,4809276,Ventilation,ContinuousProcess,65.0,1,0,FinishedRunning,390.0,1.0
1,10001884,26184834,37510196,35966.0,2131-01-12 17:40:00,2131-01-12 17:41:00,2131-01-12 17:50:00,227194,1.0,,...,6470885,6470885,Intubation/Extubation,Task,65.0,0,0,FinishedRunning,1.0,0.0
2,10001884,26184834,37510196,36121.0,2131-01-19 18:44:00,2131-01-19 18:45:00,2131-01-19 18:44:00,228128,1.0,,...,9459863,9459863,Communication,Task,65.0,0,0,FinishedRunning,1.0,0.0
3,10001884,26184834,37510196,36518.0,2131-01-13 16:14:00,2131-01-13 16:15:00,2131-01-13 16:14:00,225401,1.0,,...,4595950,4595950,Procedures,Task,65.0,0,0,FinishedRunning,1.0,0.0
4,10001884,26184834,37510196,36518.0,2131-01-13 16:14:00,2131-01-13 16:15:00,2131-01-13 16:14:00,225454,1.0,,...,5410081,5410081,Procedures,Task,65.0,0,0,FinishedRunning,1.0,0.0


In [141]:
procedureevents_df['location'].value_counts()

location
Right IJ                 635
Right Antecubital        586
Left Antecubital         563
Right Radial             509
Left Radial              465
                        ... 
L Hand Lateral             2
LU Ant Forearm             2
Right Foot                 1
Ostomy                     1
LU Ant Forearm Medial      1
Name: count, Length: 94, dtype: int64

In [142]:
procedureevents_df['locationcategory'].value_counts()

locationcategory
Peripheral             4357
Invasive Venous        1285
Invasive Arterial      1157
Peripheral - old        523
Catheter, GU            146
Dialysis                 86
ICP Line                 16
Intraosseous             15
Peritoneal Dialysis      11
Name: count, dtype: int64

In [143]:
procedureevents_df['ordercategorydescription'].value_counts()

ordercategorydescription
ContinuousProcess    19675
Task                 17761
Name: count, dtype: int64

In [144]:
cols_to_drop = [
    'caregiver_id',
    # when the event was stored in the system
    'storetime',
    # use location category instead
    'location',
    'isopenbag',
    # from the docs - "These fields are present in the table and never null, but have no clear meaning."
    'originalamount',
    'originalrate'
]
procedureevents_df = procedureevents_df.drop(columns=cols_to_drop)

In [145]:
procedureevents_df.isna().sum()

subject_id                      0
hadm_id                         0
stay_id                         0
starttime                       0
endtime                         0
itemid                          0
value                           0
valueuom                        0
locationcategory            29840
orderid                         0
linkorderid                     0
ordercategoryname               0
ordercategorydescription        0
patientweight                   0
continueinnextdept              0
statusdescription               0
dtype: int64

In [146]:
procedureevents_df['locationcategory'] = procedureevents_df['locationcategory'].fillna('Unknown')

In [147]:
procedureevents_df['itemid'].nunique()

143

In [148]:
save_df_as_csv(procedureevents_df, 'icu_procedureevents.csv')

DataFrame has been saved as dataframes/icu_procedureevents.csv


We are working with 143 different procedures, which isn't terrible to start out with. Unfortunately they are not mapped to ICD-9 or ICD-10 but use an itemid instead... Keep as-is for now and see if it causes an issue down the line?

# ED Pre-processing

## Relevant tables to start with
- diagnosis
- edstays (minus demographic fields -- already captured)
- medrecon - will probably filter on ndc
- pyxis
- triage
- vitalsign

### diagnosis

In [149]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
ed_diagnosis_df = load_table(connection, 'mimiciv_ed.filtered_diagnosis')
print(ed_diagnosis_df.shape)
ed_diagnosis_df.head()

Connected to db: smcdougall
(91603, 6)


Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,10001884,31306678,1,J45901,10,Unspecified asthma with (acute) exacerbation
1,10001884,31742950,1,J441,10,Chronic obstructive pulmonary disease w (acute...
2,10001884,33281437,1,78650,9,CHEST PAIN NOS
3,10001884,33281437,2,4019,9,HYPERTENSION NOS
4,10001884,33281437,3,49390,9,"ASTHMA, UNSPECIFIED"


In [150]:
ed_diagnosis_df['icd_code'].nunique()

4818

In [151]:
PREG_ICD9_PREFIXES = ['V22', 'V23', 'V24', 'V27', 'V28', '63', '64', '65', '66', '67', '7651', '7650']
PREG_ICD10_PREFIXES = ['Z33', 'Z34', 'Z3A', 'O0', 'O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9']

def does_not_start_with_prefixes(string, prefixes):
    for prefix in prefixes:
        if string.startswith(prefix):
            return False
    return True

mapping_file = 'ICD9_to_ICD10_mapping.txt'

### NOTE: The following functions come from https://github.com/healthylaife/MIMIC-IV-Data-Pipeline ###
def read_icd_mapping(map_path: str) -> pd.DataFrame:
    """Reads in mapping table for converting ICD9 to ICD10 codes"""

    mapping = pd.read_csv(map_path, header=0, delimiter="\t")
    mapping.diagnosis_description = mapping.diagnosis_description.apply(str.lower)
    return mapping

def standardize_icd(
    mapping: pd.DataFrame, diag: pd.DataFrame, map_code_col="diagnosis_code", root=True
) -> str:
    """Takes an ICD9 -> ICD10 mapping table and a diagnosis dataframe;
    adds column with converted ICD10 column"""

    count = 0
    code_cols = mapping.columns
    errors = []

    def icd_9to10(icd):
        """Function use to apply over the diag DataFrame for ICD9->ICD10 conversion"""
        # If root is true, only map an ICD 9 -> 10 according to the
        # ICD9's root (first 3 digits)

        # NOTE - modified from the original code
        # if root AND not a pregnancy-related code
        if root and does_not_start_with_prefixes(icd, PREG_ICD9_PREFIXES):
            icd = icd[:3]

        if map_code_col not in code_cols:
            errors.append(f"ICD NOT FOUND: {icd}")
            return np.nan

        matches = mapping.loc[mapping[map_code_col] == icd]
        if matches.shape[0] == 0:
            errors.append(f"ICD NOT FOUND: {icd}")
            return np.nan

        return mapping.loc[mapping[map_code_col] == icd].icd10cm.iloc[0]

    # Create new column with original codes as default
    col_name = "root_icd10_convert"
    diag[col_name] = diag["icd_code"].values

    # Group identical ICD9 codes, then convert all ICD9 codes within
    # a group to ICD10
    for code, group in diag.loc[diag.icd_version == 9].groupby(by="icd_code"):
        new_code = icd_9to10(code)
        for idx in group.index.values:
            # Modify values of original df at the indexes in the groups
            diag.at[idx, col_name] = new_code

        count += group.shape[0]
        #print(f"{count}/{diag.shape[0]} rows processed")

    # Column for just the roots of the converted ICD10 column
    diag["root"] = diag[col_name].apply(lambda x: x[:3] if type(x) is str and does_not_start_with_prefixes(x, PREG_ICD10_PREFIXES)
                                        else x if type(x)
                                        else np.nan)

###
###
### TODO: may need to edit this to just work with existing diagnosis df that I have
def preproc_icd_module(h_ids,
    module_path: str, ICD10_code: str, icd_map_path: str
) -> tuple:
    """Takes an module dataset with ICD codes and puts it in long_format,
    mapping ICD-codes by a mapping table path"""

    diag = get_diagnosis_icd(module_path)
    icd_map = read_icd_mapping(icd_map_path)

    standardize_icd(icd_map, diag, root=True)

    # patient ids that have at least 1 record of the given ICD10 code category
    diag.dropna(subset=["root"], inplace=True)
    pos_ids = pd.DataFrame(
        diag.loc[diag.root.str.contains(ICD10_code)].hadm_id.unique(),
        columns=["hadm_id"]
    )
    return pos_ids

In [152]:
icd_mapping = read_icd_mapping(mapping_file)
icd_mapping.head()

Unnamed: 0,diagnosis_type,diagnosis_code,diagnosis_description,icd9cm,icd10cm,flags
0,ICD9,996.76,other complications due to genitourinary devic...,99676,T8384XA,10000
1,ICD9,V54.12,aftercare for healing traumatic fracture of lo...,V5412,S52602D,10000
2,ICD9,730.06,acute osteomyelitis involving lower leg,73006,M86169,10000
3,ICD9,345.61,"infantile spasms, with intractable epilepsy",34561,G40824,10000
4,ICD9,989.5,toxic effect of venom,9895,T63421A,10000


In [153]:
standardize_icd(icd_mapping, ed_diagnosis_df, root=True)

# patient ids that have at least 1 record of the given ICD10 code category
ed_diagnosis_df.dropna(subset=["root"], inplace=True)

In [154]:
ed_diagnosis_df = ed_diagnosis_df.assign(icd_code=ed_diagnosis_df['root']).drop(columns=['root_icd10_convert', 'root', 'icd_version'])
ed_diagnosis_df.head()

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_title
0,10001884,31306678,1,J45,Unspecified asthma with (acute) exacerbation
1,10001884,31742950,1,J44,Chronic obstructive pulmonary disease w (acute...
2,10001884,33281437,1,R06,CHEST PAIN NOS
3,10001884,33281437,2,I10,HYPERTENSION NOS
4,10001884,33281437,3,J45,"ASTHMA, UNSPECIFIED"


In [155]:
ed_diagnosis_df['icd_code'].nunique()

1193

In [156]:
ed_diagnosis_df.isna().sum()

subject_id    0
stay_id       0
seq_num       0
icd_code      0
icd_title     0
dtype: int64

In [157]:
save_df_as_csv(ed_diagnosis_df, 'ed_diagnosis.csv')

DataFrame has been saved as dataframes/ed_diagnosis.csv


### edstays

In [158]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
edstays_df = load_table(connection, 'mimiciv_ed.filtered_edstays')
print(edstays_df.shape)
edstays_df.head()

Connected to db: smcdougall
(43198, 9)


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10001884,21192799.0,38708413,2130-10-05 11:58:00,2130-10-06 15:05:00,F,BLACK/AFRICAN AMERICAN,WALK IN,HOME
1,10001884,22532141.0,38021228,2130-10-13 21:00:00,2130-10-14 13:57:00,F,BLACK/AFRICAN AMERICAN,WALK IN,HOME
2,10001884,24325811.0,33281437,2126-11-03 19:15:00,2126-11-04 12:49:00,F,BLACK/AFRICAN AMERICAN,WALK IN,HOME
3,10001884,24746267.0,35329716,2130-12-27 15:48:00,2130-12-27 22:30:00,F,BLACK/AFRICAN AMERICAN,WALK IN,ADMITTED
4,10001884,24962904.0,31742950,2130-12-06 16:46:00,2130-12-06 22:05:00,F,BLACK/AFRICAN AMERICAN,WALK IN,ADMITTED


In [159]:
edstays_df.isna().sum()

subject_id               0
hadm_id              22444
stay_id                  0
intime                   0
outtime                  0
gender                   0
race                     0
arrival_transport        0
disposition              0
dtype: int64

In [160]:
# remove gender and race since we used it for patient demographics pre-processing previously
edstays_df = edstays_df.drop(columns=['gender', 'race'])
edstays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,arrival_transport,disposition
0,10001884,21192799.0,38708413,2130-10-05 11:58:00,2130-10-06 15:05:00,WALK IN,HOME
1,10001884,22532141.0,38021228,2130-10-13 21:00:00,2130-10-14 13:57:00,WALK IN,HOME
2,10001884,24325811.0,33281437,2126-11-03 19:15:00,2126-11-04 12:49:00,WALK IN,HOME
3,10001884,24746267.0,35329716,2130-12-27 15:48:00,2130-12-27 22:30:00,WALK IN,ADMITTED
4,10001884,24962904.0,31742950,2130-12-06 16:46:00,2130-12-06 22:05:00,WALK IN,ADMITTED


#### Arrival transport
- Possible values: WALKIN, AMBULANCE, UNKNOWN, OTHER, HELICOPTER
- in order to simplify this for the machine learning model, we can have this field be indicative of the level of urgency for the ED arrival. We can combine AMBULANCE and HELICOPTER, versus the other values
- Do a transformation on this column

In [161]:
urgent_transport_values = ['AMBULANCE', 'HELICOPTER']
edstays_df['arrived_by_urgent_transport'] = edstays_df['arrival_transport'].apply(lambda x: 1 if x.lower() in urgent_transport_values else 0)
edstays_df.drop(columns=['arrival_transport'], inplace=True)
edstays_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,disposition,arrived_by_urgent_transport
0,10001884,21192799.0,38708413,2130-10-05 11:58:00,2130-10-06 15:05:00,HOME,0
1,10001884,22532141.0,38021228,2130-10-13 21:00:00,2130-10-14 13:57:00,HOME,0
2,10001884,24325811.0,33281437,2126-11-03 19:15:00,2126-11-04 12:49:00,HOME,0
3,10001884,24746267.0,35329716,2130-12-27 15:48:00,2130-12-27 22:30:00,ADMITTED,0
4,10001884,24962904.0,31742950,2130-12-06 16:46:00,2130-12-06 22:05:00,ADMITTED,0


#### Disposition
- Possible values: HOME, ADMITTED, TRANSFER, LEFT WITHOUT BEING SEEN, OTHER, LEFT AGAINST MEDICAL ADVICE, ELOPED, EXPIRED
- eloped == patient leaves prematurely (so, similar to leaving against medical advice)
- To start, we can consolidate some of the categories in regard to leaving -- and I wonder if eventually we should just filter those out to begin with...

In [162]:
disposition_mapping = {
    'HOME': 'HOME',
    'ADMITTED': 'ADMITTED',
    'TRANSFER': 'TRANSFER',
    'LEFT WITHOUT BEING SEEN': 'LEFT',
    'OTHER': 'OTHER',
    'LEFT AGAINST MEDICAL ADVICE': 'LEFT',
    'ELOPED': 'LEFT',
    'EXPIRED': 'EXPIRED'
}

# Apply the mapping to create a new column
edstays_df['disposition'] = edstays_df['disposition'].map(disposition_mapping)
edstays_df['disposition'].value_counts()

disposition
HOME        22690
ADMITTED    17506
TRANSFER     1323
LEFT         1224
OTHER         438
EXPIRED        17
Name: count, dtype: int64

In [163]:
save_df_as_csv(edstays_df, 'ed_edstays.csv')

DataFrame has been saved as dataframes/ed_edstays.csv


### medrecon

In [164]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
medrecon_df = load_table(connection, 'mimiciv_ed.filtered_medrecon')
print(medrecon_df.shape)
medrecon_df.head()

Connected to db: smcdougall
(391033, 9)


Unnamed: 0,subject_id,stay_id,charttime,name,gsn,ndc,etc_rn,etccode,etcdescription
0,10001884,31306678,2130-10-19 14:38:00,acetaminophen,4489,10135012301,1,577,Analgesic or Antipyretic Non-Opioid
1,10001884,31306678,2130-10-19 14:38:00,albuterol sulfate,48698,49502069203,1,5970,Asthma/COPD Therapy - Beta 2-Adrenergic Agents...
2,10001884,31306678,2130-10-19 14:38:00,albuterol sulfate [ProAir HFA],28090,21695042308,1,5970,Asthma/COPD Therapy - Beta 2-Adrenergic Agents...
3,10001884,31306678,2130-10-19 14:38:00,amiodarone,266,13107005605,1,2734,Antiarrhythmic - Class III
4,10001884,31306678,2130-10-19 14:38:00,Aspirin,16995,10135017301,1,575,Salicylate Analgesics


In [165]:
medrecon_df.isna().sum()

subject_id          0
stay_id             0
charttime           0
name                0
gsn                 0
ndc                 0
etc_rn              0
etccode           947
etcdescription    947
dtype: int64

For hospital-related data preprocessing, we consolidated the meds based on the NDC mapping. Use similar logic here:

In [166]:
medrecon_df['ndc'].nunique()

4788

In [167]:
ndc_mapping = pd.read_csv('ndc_product.txt', header=0, delimiter="\t", encoding='cp1252')

In [168]:
##### from MIMIC preprocessing -- but modified to work with the ED table instead of Hosp
# returned fields like hadm_id are replaced with stay_id, which is used across the ED modules (same with charttime)
def read_ndc_mapping(map_path):
    ndc_map = pd.read_csv(map_path, header=0, delimiter='\t', encoding='latin1')
    ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.fillna("")
    ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.apply(str.lower)
    ndc_map.columns = list(map(str.lower, ndc_map.columns))
    return ndc_map

def read_prescriptions_table(mimic4_path):
    meds = dataframe_from_csv(os.path.join(mimic4_path, 'hosp/prescriptions.csv.gz'))
    meds = meds.reset_index()
    return meds[['subject_id', 'stay_id', 'charttime', 'ndc', 'gsn']]

### NOTE - not used anywhere...
def get_generic_drugs(mapping, df):
    """Takes NDC product table and prescriptions dataframe; adds column with NDC table's corresponding generic name"""

    def brand_to_generic(ndc):
        # We only want the first 2 sections of the NDC code: xxxx-xxxx-xx
        matches = list(re.finditer(r"-", ndc))
        if len(matches) > 1:
            ndc = ndc[:matches[1].start()]
        try:
            return mapping.loc[mapping.PRODUCTNDC == ndc].NONPROPRIETARYNAME.iloc[0]
        except:
            print("Error: ", ndc)
            return np.nan

    df['generic_drug_name'] = df['ndc'].apply(brand_to_generic)

def preproc_meds(module_path:str, adm_cohort_path:str, mapping:str) -> pd.DataFrame:
  
    adm = pd.read_csv(adm_cohort_path, usecols=['stay_id', 'charttime'], parse_dates = ['charttime'])
    med = pd.read_csv(module_path, compression='gzip', usecols=['subject_id', 'stay_id', 'charttime', 'ndc'], parse_dates = ['charttime'])
    med = med.merge(adm, left_on = 'stay_id', right_on = 'stay_id', how = 'inner')
    
    # Normalize drug strings and remove potential duplicates

    med.drug = med.drug.fillna("").astype(str)
    med.drug = med.drug.apply(lambda x: x.lower().strip().replace(" ", "_") if not "" else "")
    med.drug=med.drug.dropna().apply(lambda x: x.lower().strip())
    
    #meds.to_csv(output_path, compression='gzip', index=False)
    med = ndc_meds(med,mapping)
    
    print("Number of unique type of drug: ", med.drug.nunique())
    print("Number of unique type of drug (after grouping to use Non propietary names): ", med.nonproprietaryname.nunique())
    print("Total number of rows: ", med.shape[0])
    print("# Admissions:  ", med.hadm_id.nunique())
        
    return med
    
    
def ndc_meds(med, mapping:str) -> pd.DataFrame:
    
    # Convert any nan values to a dummy value
    med.ndc = med.ndc.fillna(-1)

    # Ensures the decimal is removed from the ndc col
    med.ndc = med.ndc.astype("Int64")
    
    # The NDC codes in the prescription dataset is the 11-digit NDC code, although codes are missing
    # their leading 0's because the column was interpreted as a float then integer; this function restores
    # the leading 0's, then obtains only the PRODUCT and MANUFACTUERER parts of the NDC code (first 9 digits)
    def to_str(ndc):
        if ndc < 0:         # dummy values are < 0
            return np.nan
        ndc = str(ndc)
        return (("0"*(11 - len(ndc))) + ndc)[0:-2]

    # The mapping table is ALSO incorrectly formatted for 11 digit NDC codes. An 11 digit NDC is in the
    # form of xxxxx-xxxx-xx for manufacturer-product-dosage. The hyphens are in the correct spots, but
    # the number of digits within each section may not be 5-4-2, in which case we add leading 0's to each
    # to restore the 11 digit format. However, we only take the 5-4 sections, just like the to_str function
    def format_ndc_table(ndc):
        parts = ndc.split("-")
        return ("0"*(5 - len(parts[0])) + parts[0]) + ("0"*(4 - len(parts[1])) + parts[1])
    
    def read_ndc_mapping2(map_path):
        ndc_map = pd.read_csv(map_path, header=0, delimiter='\t', encoding = 'latin1')
        ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.fillna("")
        ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.apply(str.lower)
        ndc_map.columns = list(map(str.lower, ndc_map.columns))
        return ndc_map
        # Read in NDC mapping table
    
    ndc_map = read_ndc_mapping2(mapping)[['productndc', 'nonproprietaryname', 'pharm_classes']]
    
    # Normalize the NDC codes in the mapping table so that they can be merged
    ndc_map['new_ndc'] = ndc_map.productndc.apply(format_ndc_table)
    ndc_map.drop_duplicates(subset=['new_ndc', 'nonproprietaryname'], inplace=True)
    med['new_ndc'] = med.ndc.apply(to_str)  
    
    # Left join the med dataset to the mapping information
    med = med.merge(ndc_map, how='inner', left_on='new_ndc', right_on='new_ndc')
    
    # In NDC mapping table, the pharm_class col is structured as a text string, separating different pharm classes from eachother
    # This can be [PE], [EPC], and others, but we're interested in EPC. Luckily, between each commas, it states if a phrase is [EPC]
    # So, we just string split by commas and keep phrases containing "[EPC]"
    def get_EPC(s):
        """Gets the Established Pharmacologic Class (EPC) from the mapping table"""
        if type(s) != str:
            return np.nan
        words = s.split(",")
        return [x for x in words if "[EPC]" in x]
    
    # Function generates a list of EPCs, as a drug can have multiple EPCs
    med['EPC'] = med.pharm_classes.apply(get_EPC)
    
    return med

In [169]:
ndc_mapping = read_ndc_mapping('ndc_product.txt')
med = medrecon_df
# Normalize drug strings and remove potential duplicates

# med.dropna(inplace=True)

med = ndc_meds(med,'ndc_product.txt')

print("Number of unique type of drug (after grouping to use Non propietary names): ", med.nonproprietaryname.nunique())
print("Total number of rows: ", med.shape[0])
print("# Admissions:  ", med.stay_id.nunique())

Number of unique type of drug (after grouping to use Non propietary names):  605
Total number of rows:  77323
# Admissions:   22935


In [170]:
# we will likely be working with the nonproprietary name
cols_to_drop= ['gsn', 'etc_rn', 'etccode', 'etcdescription', 'new_ndc', 'ndc', 'productndc', 'pharm_classes', 'EPC']
med.drop(columns=cols_to_drop, inplace=True)
med.drop(columns=['name'], inplace=True)
med.head()

Unnamed: 0,subject_id,stay_id,charttime,nonproprietaryname
0,10001884,31306678,2130-10-19 14:38:00,acetaminophen
1,10001884,31306678,2130-10-19 14:38:00,oxycodone hydrochloride
2,10001884,31306678,2130-10-19 14:38:00,aspirin
3,10001884,31306678,2130-10-19 14:38:00,aspirin
4,10001884,31306678,2130-10-19 14:38:00,diltiazem hydrochloride


In [171]:
save_df_as_csv(med, 'ed_medrecon.csv')

DataFrame has been saved as dataframes/ed_medrecon.csv


### pyxis

In [172]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
pyxis_df = load_table(connection, 'mimiciv_ed.filtered_pyxis')
print(pyxis_df.shape)
pyxis_df.head()

Connected to db: smcdougall
(173941, 7)


Unnamed: 0,subject_id,stay_id,charttime,med_rn,name,gsn_rn,gsn
0,10001884,31306678,2130-10-19 13:50:00,1,MethylPREDNISolone Sodium Succ,1,6730
1,10001884,31306678,2130-10-19 13:50:00,1,MethylPREDNISolone Sodium Succ,2,51555
2,10001884,31306678,2130-10-19 13:50:00,1,MethylPREDNISolone Sodium Succ,3,65978
3,10001884,31306678,2130-10-19 13:56:00,2,Aspirin,1,4380
4,10001884,31306678,2130-10-19 15:01:00,3,Azithromycin,1,31452


- medrecon == medicine reconcilation (what the patient is already taking)
- pyxis - refers to a med station - assumed that these are dispensed medications that are then given to the patient
- note that it is challenging to map NDC to GSN - no easily identifiable mapping for completing this
- GSN - Generic Sequence Number (GSN), a sequential number assigned to each drug based on its chemical composition and strength

From the docs: "Note that as the same medication may have multiple gsn values, each row does not necessarily indicate a unique dispensation. The med_rn column allows for subselecting to individual dispensations."

The `gsn_rn` column differentiates the many different gsn's that the medicine may correspond to... Should consolidate this if possible

In [173]:
pyxis_df['name'].nunique()

930

In [174]:
pyxis_df['gsn'].nunique()

636

In [175]:
pyxis_df['gsn'].value_counts()

gsn
015869    9431
061716    9431
004490    7111
062823    6138
074851    4283
          ... 
048492       1
070644       1
041440       1
006611       1
059794       1
Name: count, Length: 636, dtype: int64

In [176]:
pyxis_df.shape

(173941, 7)

Steps for pre-processing: (1) Remove the dosage from the name, (2) lowercase everything

In [178]:
pyxis_df['name'] = pyxis_df['name'].str.lower()
print(pyxis_df['name'].nunique())

880


In [179]:
import re
# regex pattern to detect different dosages within the medication name
# also remove slashes, percent signs, and anything in parentheses
MED_DOSAGE_PATTERN = r'^([^\d/%()]+).*' #r'(\d+(\.\d+)?\s*(mg|g|mL|L|units|mcg)?|\s*/\s*|\s*%\s*|\s*\([^)]*\))'

def remove_dosage_from_med(med_name):
    match = re.match(MED_DOSAGE_PATTERN, med_name)
    if match:
        match = match.group(1).strip().lower()
        # remove any trailing asterisks
        match = re.sub(r'\*+', '', match).strip()
        # remove standalone "so" (for "solution")
        match = re.sub(r'\bso\b', '', match).strip()
        return match
    else:
        return med_name.strip().lower()
    # return re.sub(MED_DOSAGE_PATTERN, '', med_name, flags=re.IGNORECASE).strip()

pyxis_df['name'] = pyxis_df['name'].apply(remove_dosage_from_med)
print(pyxis_df['name'].nunique())

415


there are a few medications that need to be manually mapped


Rule here is that if the medication says the way it is used (ex. methadone vs. methadone oral vs. methadone oral liquid), consolidate into a single value (ex. methadone)
- acetaminophen (if it starts with acetaminophen, convert to acetaminophen)
- albuterol (if it starts with albuterol, convert to albuterol)
- aluminum-magnesium hy (if it ...)
- amoxicillin
- amphetamine-dextroamphetamine
- aspirin
- balanced salt opth.
- diltiazem
- epinephrine
- fluorescein strip
- lidocaine
- methadone
- tranexamic ac

In [181]:
prefixes = [
    'acetaminophen',
    'albuterol',
    'aluminum-magnesium hy',
    'amoxicillin',
    'amphetamine-dextroamphetamine',
    'aspirin',
    'balanced salt opth.',
    'diltiazem',
    'epinephrine',
    'fluorescein strip',
    'lidocaine',
    'methadone',
    'tranexamic ac'
]
def apply_prefix_check(name):
    for prefix in prefixes:
        if name.lower().startswith(prefix):
            return prefix
    return name

# Apply function to 'name' column
pyxis_df['name'] = pyxis_df['name'].apply(apply_prefix_check)
print(pyxis_df['name'].nunique())

388


In [182]:
save_df_as_csv(pyxis_df, 'ed_pyxis.csv')

DataFrame has been saved as dataframes/ed_pyxis.csv


### vitalsign
- numeric fields were originally free-text
- Free-text entries which could not be converted trivially were removed
- for the most part, missing data indicates that no information was documented
- routine vital signs taken ever 1-4 hours

In [183]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
vitalsign_df = load_table(connection, 'mimiciv_ed.filtered_vitalsign')
print(vitalsign_df.shape)
vitalsign_df.head()

Connected to db: smcdougall
(154819, 11)


Unnamed: 0,subject_id,stay_id,charttime,temperature,heartrate,resprate,o2sat,sbp,dbp,rhythm,pain
0,10001884,31306678,2130-10-19 13:34:00,,,,79.0,,,,unable
1,10001884,31306678,2130-10-19 14:31:00,,75.0,15.0,100.0,133.0,68.0,,
2,10001884,31306678,2130-10-19 14:44:00,98.2,76.0,16.0,99.0,139.0,70.0,,0
3,10001884,31306678,2130-10-19 15:50:00,98.2,76.0,20.0,97.0,138.0,72.0,,0
4,10001884,31742950,2130-12-06 16:46:00,97.6,67.0,22.0,97.0,132.0,82.0,,0


In [184]:
vitalsign_df.isna().sum()

subject_id          0
stay_id             0
charttime           0
temperature     52369
heartrate        7199
resprate         9008
o2sat           13803
sbp              8119
dbp              8119
rhythm         149170
pain            44618
dtype: int64

In [185]:
save_df_as_csv(vitalsign_df, 'ed_vitalsign.csv')

DataFrame has been saved as dataframes/ed_vitalsign.csv


### triage
- all fields were originally free-text
- no time associated with the observations
- missing data in the numeric columns indicates either deidentified data or no data recorded


**Design Decision** - We are missing a good bit of data... and we capture temp, heart rate, resprate, etc. in the `vitalsign` table. One route to take is to exclude the triage data since it is not time-stamped and we have access to time-stamped data in the `vitalsign` table. On the other hand, we can also impute missing values in this table with the first value that is captured for the stay in the `vitalsign` table (?) - that way we can still retain useful information from this table where we have it defined already

In [186]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
triage_df = load_table(connection, 'mimiciv_ed.filtered_triage')
print(triage_df.shape)
triage_df.head()

Connected to db: smcdougall
(43198, 11)


Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint
0,10001884,31306678,,,,79.0,,,unable,1.0,Hypoxia
1,10001884,31742950,97.6,67.0,22.0,97.0,132.0,82.0,0,2.0,Dyspnea
2,10001884,33281437,98.2,72.0,20.0,97.0,157.0,66.0,2,3.0,CP/SOB
3,10001884,33478776,98.4,74.0,16.0,100.0,142.0,69.0,0,3.0,"Dyspnea, Fatigue"
4,10001884,34226385,98.2,65.0,18.0,97.0,113.0,91.0,0,2.0,Dyspnea


In [187]:
triage_df.isna().sum()

subject_id           0
stay_id              0
temperature       1431
heartrate          847
resprate          1223
o2sat             1211
sbp                956
dbp               1036
pain               869
acuity             273
chiefcomplaint       1
dtype: int64

In [188]:
triage_df['stay_id'].nunique()

43198

In [189]:
triage_df.shape

(43198, 11)

In [190]:
save_df_as_csv(triage_df, 'ed_triage.csv')

DataFrame has been saved as dataframes/ed_triage.csv


Each row in this dataframe corresponds to a unique stay - so there will be a temp, heart rate, etc. associated with the stay if it is present