# Vital signs and Hospital Preprocessing
This notebook applies feature engineering to select fields within the MIMIC-IV dataset's "hosp" module. This notebook also applies preprocessing to vital signs data by reverse-mapping the vitals signs labels to their MIMIC-IV itemids, and then filtering the vital signs to those specific labels.

In [1]:
import psycopg2
from psycopg2 import OperationalError, DatabaseError, sql
import pandas as pd
import csv
import numpy as np
import seaborn as sns
# for fuzzy matching on the item ids
from fuzzywuzzy import process
import os

### Environment Variables for Connection ###
DB_NAME = 'smcdougall'
USERNAME = 'postgres'
PASSWORD = 'postgres'
HOST = 'localhost'
PORT = 5432 

def connect_to_postgres(db_name, username, password, host, port):
    connection = None
    try:
        connection = psycopg2.connect(
            dbname=db_name,
            user=username,
            password=password,
            host=host,
            port=port
        )
        print('Connected to db:', db_name)
        return connection
    except OperationalError as e:
        print('Received the following error:', e)
        return None

def verify_postgres_connection(connection):
    if connection is not None:
        try:
            cur = connection.cursor()
            cur.execute('SELECT version();')
            db_version = cur.fetchone()
            print('The Postgres database version is:', db_version)
            cur.close()
        except DatabaseError as e:
            print('Received the following error:', e)
    else:
        print('Connection to Postgres failed.')

def close_connection(connection):
    if connection is not None:
        connection.close()
        print('Postgres connection has been closed.')

connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
verify_postgres_connection(connection)
close_connection(connection)

Connected to db: smcdougall
The Postgres database version is: ('PostgreSQL 14.5 on aarch64-apple-darwin20.6.0, compiled by Apple clang version 12.0.5 (clang-1205.0.22.9), 64-bit',)
Postgres connection has been closed.


In [2]:
"""
General function for loading data from an existing SQL table that retrieves all of the fields
for the table.
"""
def load_table(connection, table_name):
    cur = connection.cursor()
    query = f'SELECT * FROM "{table_name}"'
    cur.execute(query)
    rows = cur.fetchall()
    cur.close()
    
    # get column names
    columns = [desc[0] for desc in cur.description]
    # use column names retrieved from the cursor
    df = pd.DataFrame(rows, columns=columns)
    return df

In [3]:
"""
Saves pandas DataFrame as a CSV file.
"""
def save_df_as_csv(df, csv_name, directory='dataframes'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    file_path = os.path.join(directory, csv_name)
    df.to_csv(file_path, index=False)

    print(f'DataFrame has been saved as {file_path}')

## Vital Signs Data

The vital signs below originate from Appendix A of [MIMIC-Extract](https://arxiv.org/pdf/1907.08322) - these vital sign strings refer to the labels in the dataset.

In [182]:
mimic_extract_vital_signs = [
    'alanine aminotransferase', 'albumin', 'albumin ascites', 'albumin pleural', 'albumin urine', 'alkaline phosphate', 
    'anion gap', 'asparate aminotransferase', 'basophils', 'bicarbonate', 'bilirubin', 'blood urea nitrogen', 'calcium',
    'calcium ionized', 'calcium urine', 'cardiac index', 'cardiac output fick', 'cardiac output thermodilution',
    'central venous pressure', 'chloride', 'chloride urine', 'cholesterol', 'cholesterol hdl', 'cholesterol ldl', 'co2',
    'co2 (etco2, pco2, etc.)', 'creatinine', 'creatinine ascites', 'creatinine body fluid', 'creatinine pleural',
    'creatinine urine', 'diastolic blood pressure', 'eosinophils', 'fibrinogen', 'fraction inspired oxygen', 
    'fraction inspired oxygen set', 'glascow coma scale total', 'glucose', 'heart rate', 'height', 'hematocrit', 
    'hemoglobin', 'lactate', 'lactate dehydrogenase', 'lactate dehydrogenase pleural', 'lactic acid', 'lymphocytes',
    'lymphocytes ascites', 'lymphocytes atypical', 'lymphocytes atypical csl', 'lymphocytes body fluid',
    'lymphocytes percent', 'lymphocytes pleural', 'magnesium', 'mean blood pressure', 'mean corpuscular hemoglobin',
    'mean corpuscular hemoglobin concentration', 'mean corpuscular volume', 'monocytes', 'monocytes csl', 'neutrophils',
    'oxygen saturation', 'partial pressure of carbon dioxide', 'partial pressure of oxygen',
    'partial thromboplastin time', 'peak inspiratory pressure', 'ph', 'ph urine', 'phosphate', 'phosphorous',
    'plateau pressure', 'platelets', 'positive end-expiratory pressure', 'positive end-expiratory pressure set',
    'post void residual', 'potassium', 'potassium serum', 'prothrombin time inr', 'prothrombin time pt',
    'pulmonary artery pressure mean', 'pulmonary artery pressure systolic', 'pulmonary capillary wedge pressure',
    'red blood cell count', 'red blood cell count ascites', 'red blood cell count csf', 'red blood cell count pleural',
    'red blood cell count urine', 'respiratory rate', 'respiratory rate set', 'sodium', 'systemic vascular resistance',
    'systolic blood pressure', 'temperature', 'tidal volume observed', 'tidal volume set', 'tidal volume spontaneous',
    'total protein', 'total protein urine', 'troponin-i', 'troponin-t', 'venous pvo2', 'weight', 'white blood cell count', 
    'white blood cell count urine'
]

In [183]:
len(mimic_extract_vital_signs)

104

The additional vital signs originate from personal research on vital signs that are commonly measured during pregnancy.

In [184]:
additional_vital_signs = [
    'leukocyte',
    'blood glucose level',
    'proteineuria',
    'fetal heart rate',
    'amniotic fluid',
    'umbilical artery doppler'
]

#### Definitions of the additional vital signs (and how they relate to pregnancy)
- Leukocyte - type of white blood cell (deemed useful feature in related works, couldn’t find a direct blood cell mapping from MIMIC-III to MIMIC-IV - can help detect infection)
- Blood glucose level - concentration of glucose in blood, important to track for detection of gestational diabetes or other complications (preterm birth, preeclampsia)
- Proteinuria - presence of abnormal amounts of protein in urine, could indicate preeclampsia
- Amniotic fluid - fluid that surrounds fetus during pregnancy, abnormal amounts can be indicative of fetal abnormalities or pregnancy complications
- Umbilical artery doppler - ultrasound technique to assess blood flow in the umbilical artery, abnormalities may indicate complications with growing the fetus


For each of the above vital signs, try mapping them to an item_id from the `hosp.d_labitems` table:

In [4]:
def load_lab_items_table(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT itemid, label
        FROM mimiciv_hosp.d_labitems
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["itemid", "label"])
    return df

In [5]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
lab_item_df = load_lab_items_table(connection)
lab_item_df.head()

Connected to db: smcdougall


Unnamed: 0,itemid,label
0,50801,Alveolar-arterial Gradient
1,50802,Base Excess
2,50803,"Calculated Bicarbonate, Whole Blood"
3,50804,Calculated Total CO2
4,50805,Carboxyhemoglobin


In [7]:
print(lab_item_df.shape)

(1622, 2)


In [187]:
# drop the three NA rows from the dataframe
lab_item_df = lab_item_df.dropna()

In [188]:
# use fuzzy matching to find close matches when matching the lists of vital signs to their labels (and item ids)
# try matching against the exact term
matched_labels = []
for label in mimic_extract_vital_signs:
    match = lab_item_df[lab_item_df['label'].str.contains(label, case=False)]
    if not match.empty:
        matched_labels.append((label, match.iloc[0]['itemid']))
    else:
        matched_labels.append((label, None))  # handle unmatched labels

# try using fuzzy matching for better accuracy
fuzzy_matched_labels = []
for label in mimic_extract_vital_signs:
    print(process.extractOne(label, lab_item_df['label']))
    match, score, _ = process.extractOne(label, lab_item_df['label'])
    if score >= 80:  # threshold set at 80
        fuzzy_matched_labels.append((label, lab_item_df[lab_item_df['label'] == match]['itemid'].iloc[0]))
    else:
        fuzzy_matched_labels.append((label, None))
print(matched_labels)
print(fuzzy_matched_labels)

  match = lab_item_df[lab_item_df['label'].str.contains(label, case=False)]


('Alanine Aminotransferase', 100, 1558)
('Albumin', 100, 60)
('Albumin, Ascites', 97, 33)
('Albumin, Pleural', 97, 234)
('Albumin, Urine', 96, 257)
('Alkaline Phosphatase', 95, 61)
('Anion Gap', 100, 66)
('Asparate Aminotransferase', 100, 1562)
('Basophils', 100, 300)
('Bicarbonate', 100, 80)
('Bilirubin', 100, 624)
('Urea Nitrogen', 95, 202)
('Free Calcium', 90, 6)
('% Ionized Calcium', 95, 1122)
('Calcium, Urine', 96, 265)
('Calculated Thyroxine (T4) Index', 86, 94)
('I', 60, 144)
('Other', 72, 311)
('Cben', 68, 1063)
('Chloride', 100, 100)
('Chloride, Urine', 97, 266)
('Cholesterol, HDL', 95, 102)
('Cholesterol, HDL', 97, 102)
('Cholesterol, HDL', 90, 102)
('Calculated Total CO2', 90, 3)
('pCO2', 90, 16)
('Creatinine', 100, 110)
('Creatinine, Ascites', 97, 39)
('Creatinine, Body Fluid', 98, 220)
('Creatinine, Pleural', 97, 240)
('Creatinine, Urine', 97, 270)
('Blood', 90, 626)
('Eosinophils', 100, 302)
('Fibrinogen', 100, 745)
('Oxygen', 90, 14)
('Oxygen', 90, 14)
('Bilirubin, Total

In [189]:
# Exact matching
matched_labels = []
for label in mimic_extract_vital_signs:
    match = lab_item_df[lab_item_df['label'].str.contains(label, case=False)]
    if not match.empty:
        matched_labels.append((label, match.iloc[0]['label'], match.iloc[0]['itemid']))
    else:
        matched_labels.append((label, None, None))  # handle unmatched labels

# Fuzzy matching
fuzzy_matched_labels = []
for label in mimic_extract_vital_signs:
    match, score, _ = process.extractOne(label, lab_item_df['label'])
    if score >= 80:  # threshold set at 80
        fuzzy_matched_labels.append((label, match, lab_item_df[lab_item_df['label'] == match]['itemid'].iloc[0]))
    else:
        fuzzy_matched_labels.append((label, None, None))

# Merge the two lists based on the original labels
merged_labels = []
for exact, fuzzy in zip(matched_labels, fuzzy_matched_labels):
    original_label = exact[0]
    exact_label = exact[1]
    exact_id = exact[2]
    fuzzy_label = fuzzy[1]
    fuzzy_id = fuzzy[2]
    
    # Prefer fuzzy match results if available
    if fuzzy_label:
        matched_label = fuzzy_label
        matched_id = fuzzy_id
    else:
        matched_label = exact_label
        matched_id = exact_id
    
    merged_labels.append({
        "original_label": original_label,
        "matched_label": matched_label,
        "id": matched_id,
        "table_name": "mimiciv_hosp.d_labitems"
    })

  match = lab_item_df[lab_item_df['label'].str.contains(label, case=False)]


## Observations
- the item id's don't match the item id's for MIMIC-III, so we can't use the existing mapping
- "These raw ItemIDs are not robust to changes in software or human data entry practices. For example, “HeartRate” may be recorded under ItemID 211 (using CareVue EHR systems before 2008) or under ItemID 220045 (using MetaVision EHR software after 2008)."
- the MIMIC Extract paper developed a manually curated clinical taxonomy designed to group semantically equivalent ItemIDs together into more robust “clinical aggregate” features
- also note that the paper used the ICU items table and not the hospital one -- so look at that one as well

In [190]:
lab_item_df.head()

Unnamed: 0,itemid,label
0,50801,Alveolar-arterial Gradient
1,50802,Base Excess
2,50803,"Calculated Bicarbonate, Whole Blood"
3,50804,Calculated Total CO2
4,50805,Carboxyhemoglobin


From here:
- Inspect all of the above and reset to None if the match doesn't seem right
- Compile all of the matches that are 'None' and see if I can search for them in the csv file from github (maybe download it from github and do a search that way)
- Repeat the same for the second list of vital signs

In [191]:
none_labels = [label for label in merged_labels if label['matched_label'] is None]
print(len(none_labels))

12


The following labels do not have a strong enough match and will be re-matched using the ICU labitems table, which contains additional lab values.

In [193]:
labels_to_reset = [
    'cardiac index',
    'cholesterol',
    'cholesterol hdl',
    'cholesterol ldl',
    'diastolic blood pressure',
    'fraction inspired oxygen',
    'fraction inspired oxygen set',
    'glascow coma scale total',
    'lactic acid',
    'lymphocytes atypical csl',
    'lymphocytes body fluid',
    'lymphocytes pleural',
    'mean blood pressure',
    'mean corpuscular hemoglobin',
    'mean corpuscular hemoglobin concentration',
    'mean corpuscular volume',
    'monocytes csl',
    'partial pressure of carbon dioxide',
    'partial thromboplastin time',
    'phosphorous',
    'potassium serum',
    'prothrombin time inr',
    'prothrombin time pt',
    'red blood cell count',
    'red blood cell count ascites',
    'red blood cell count csf',
    'red blood cell count pleural',
    'red blood cell count urine',
    'respiratory rate',
    'respiratory rate set',
    'systolic blood pressure',
    'tidal volume observed',
    'tidal volume set',
    'tidal volume spontaneous',
    'weight',
    'white blood cell count',
    'white blood cell count urine'
]

In [194]:
for label in labels_to_reset:
    found_dict = next((d for d in merged_labels if d['original_label'] == label), None)
    found_dict['matched_label'] = None
    found_dict['id'] = None
    found_dict['table_name'] = None
    none_labels.append(found_dict)
print(len(none_labels))
print(none_labels[:5])

49
[{'original_label': 'cardiac output fick', 'matched_label': None, 'id': None, 'table_name': 'mimiciv_hosp.d_labitems'}, {'original_label': 'cardiac output thermodilution', 'matched_label': None, 'id': None, 'table_name': 'mimiciv_hosp.d_labitems'}, {'original_label': 'central venous pressure', 'matched_label': None, 'id': None, 'table_name': 'mimiciv_hosp.d_labitems'}, {'original_label': 'peak inspiratory pressure', 'matched_label': None, 'id': None, 'table_name': 'mimiciv_hosp.d_labitems'}, {'original_label': 'plateau pressure', 'matched_label': None, 'id': None, 'table_name': 'mimiciv_hosp.d_labitems'}]


In [195]:
def load_icu_lab_table(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT itemid, label
        FROM mimiciv_icu.d_items
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["itemid", "label"])
    return df

In [196]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
item_df = load_icu_lab_table(connection)
print(item_df.head())
print(item_df.shape)

Connected to db: smcdougall
   itemid                    label
0  220001             Problem List
1  220003       ICU Admission date
2  220045               Heart Rate
3  220046  Heart rate Alarm - High
4  220047   Heart Rate Alarm - Low
(4014, 2)


In [197]:
remaining_items = [d['original_label'] for d in none_labels]
print(remaining_items[:5])

['cardiac output fick', 'cardiac output thermodilution', 'central venous pressure', 'peak inspiratory pressure', 'plateau pressure']


In [198]:
# Exact matching
matched_icu_labels = []
for label in remaining_items:
    match = item_df[item_df['label'].str.contains(label, case=False)]
    if not match.empty:
        matched_icu_labels.append((label, match.iloc[0]['label'], match.iloc[0]['itemid']))
    else:
        matched_icu_labels.append((label, None, None))  # handle unmatched labels

# Fuzzy matching
fuzzy_matched_icu_labels = []
for label in remaining_items:
    match, score, _ = process.extractOne(label, item_df['label'])
    if score >= 80:  # threshold set at 80
        fuzzy_matched_icu_labels.append((label, match, item_df[item_df['label'] == match]['itemid'].iloc[0]))
    else:
        fuzzy_matched_icu_labels.append((label, None, None))

# Merge the two lists based on the original labels
merged_icu_labels = []
for exact, fuzzy in zip(matched_icu_labels, fuzzy_matched_icu_labels):
    original_label = exact[0]
    exact_label = exact[1]
    exact_id = exact[2]
    fuzzy_label = fuzzy[1]
    fuzzy_id = fuzzy[2]
    
    # Prefer fuzzy match results if available
    if fuzzy_label:
        matched_label = fuzzy_label
        matched_id = fuzzy_id
    else:
        matched_label = exact_label
        matched_id = exact_id
    
    merged_icu_labels.append({
        "original_label": original_label,
        "matched_label": matched_label,
        "id": matched_id,
        "table_name": "mimiciv_icu.d_items"
    })

In [199]:
# for label in merged_icu_labels:
#     print(label)

- Keep "Cardiac Output Fick" and "Cardiac Output (thermodilution)" as synonymous
- Keep 'pulmonary capillary wedge pressure' and 'Left Artrial Pressure' as synonymous
- HDL considered "good cholesterol," LDL considered "bad cholesterol", MIMIC Extract study had both measured separately but we only have a general "Cholesterol" label, so use that
- Assume 'mean corpuscular hemoglobin concentration' and 'Hemoglobin' mean the same thing

In [201]:
item_df[item_df['label'].str.lower().str.contains('void')]

Unnamed: 0,itemid,label
1878,226560,Void
1942,226713,Incontinent/voids (estimate)


In [202]:
print(item_df['label'].unique())

['Problem List' 'ICU Admission date' 'Heart Rate' ...
 'Documented on DC Summary (Thora)' 'Replacement Fluid (Pre-Filter)'
 'Replacement Fluid (Post-Filter)']


In [203]:
# these labels do not have strong enough matches across either of the itemid-related tables
labels_to_remove = [
    'peak inspiratory pressure',
    'positive end-expiratory pressure',
    'positive end-expiratory pressure set',
    'post void residual',
    'venous pvo2',
    'cholesterol ldl',
    'glascow coma scale total',
    'lymphocytes atypical csl',
    'lymphocytes body fluid',
    'monocytes csl',
    'mean corpuscular volume',
    'partial pressure of carbon dioxide',
    'red blood cell count',
    'red blood cell count ascites',
    'red blood cell count csf',
    'red blood cell count pleural',
    'red blood cell count urine',
     'white blood cell count',
    'white blood cell count urine'
]

In [204]:
print(len(merged_icu_labels))
for label in labels_to_remove:
    for index, d in enumerate(merged_icu_labels):
        if d.get('original_label') == label:
            del merged_icu_labels[index]

for label in labels_to_remove:
    for index, d in enumerate(merged_labels):
        if d.get('original_label') == label:
            del merged_labels[index]

print(len(merged_icu_labels))

49
30


In [205]:
# merge the two together
merged_labels.extend(merged_icu_labels)
print(len(merged_labels))

115


In [206]:
for label in merged_labels:
    for index, d in enumerate(merged_labels):
        if d.get('matched_label') == None:
            del merged_labels[index]
print(len(merged_labels))

85


List of pregnancy-related vital signs to try including:

In [207]:
preg_vital_signs = [
    'fetal heart rate', # can't find any that match :(
    'uterine contractions', # can't find any that match :(
    'amniotic fluid levels',
    'umbilical artery doppler'
    'Fetal Fibronectin (fFN)' # used for predicting preterm labor,
    'FETAL LUNG MATURITY - LBC' # used to predict timing of delivery
]

In [208]:
item_df[item_df['label'].str.lower().str.contains('maternal')]

Unnamed: 0,itemid,label


In [209]:
lab_item_df[lab_item_df['label'].str.lower().str.contains('maternal')]

Unnamed: 0,itemid,label
58,50860,"AFP, Maternal Screen"
134,50936,"HCG, Maternal Screen"
200,51004,"UE3, Maternal Screen"
730,51607,"Dimeric Inhibin A, Maternal Screen"


In [210]:
lab_item_df[lab_item_df['itemid'] == 51876]

Unnamed: 0,itemid,label
979,51876,"L/S Ratio, Amniotic Fluid"


In [211]:
preg_labels = [
    {'original_label': 'FetalFN', 'matched_label': 'FetalFN', 'id': 51033, 'table_name': 'mimiciv_hosp.d_labitems'},
    {'original_label': 'FETAL LUNG MATURITY - LBC', 'matched_label': 'FETAL LUNG MATURITY - LBC', 'id': 52352, 
     'table_name': 'mimiciv_hosp.d_labitems'},
    {'original_label': 'AFP, Amniotic Fluid', 'matched_label': 'AFP, Amniotic Fluid', 'id': 51833, 'table_name': 'mimiciv_hosp.d_labitems'},
    {'original_label': 'DSPC, Amniotic Fluid', 'matched_label': 'DSPC, Amniotic Fluid', 'id': 51855, 'table_name': 'mimiciv_hosp.d_labitems'},
    {'original_label': 'L/S Ratio, Amniotic Fluid', 'matched_label': 'L/S Ratio, Amniotic Fluid', 'id': 51876,
    'table_name': 'mimiciv_hosp.d_labitems'}
]

In [212]:
merged_labels.extend(preg_labels)
print(len(merged_labels))

90


In [213]:
# save as JSON file so these labels can be used for determining outliers based on variable ranges
import json

mimiciv_labels = {str(d['id']): d['matched_label'] for d in merged_labels}
with open('mimiciv_vitalsigns_labels.json', 'w') as file:
    json.dump(mimiciv_labels, file, indent=4)

In [214]:
print(len(mimiciv_labels))

84


From here:
- filter the relevant tables according to these ids
- match up the measurements with the admissions

### Relevant Tables:
- `hosp.labevents` (on `itemid`)
- `icu.chartevents`
- `icu.datetimeevents`
- `icu.inputevents`
- `icu.outputevents`
- `icu.procedureevents` (need to look into this one some more)

In [35]:
vital_signs_tables = [
    'mimiciv_hosp.filtered_labevents',
    'mimiciv_icu.filtered_chartevents',
    'mimiciv_icu.filtered_datetimeevents',
    'mimiciv_icu.filtered_inputevents',
    'mimiciv_icu.filtered_outputevents'
]

In [36]:
# compile the item ids into a list
relevant_item_ids = [d['id'] for d in merged_labels]
relevant_item_ids = [int(itemid) for itemid in relevant_item_ids]

print(relevant_item_ids[:5])

[53084, 50862, 50835, 51046, 51069]


In [37]:
"""
General function for loading data from an existing SQL table that retrieves all of the fields
for the table and filters on the itemid field.
"""
def filter_vital_signs_table(connection, table_name, relevant_item_ids):
    cur = connection.cursor()
    sql_query = f'SELECT * FROM "{table_name}" WHERE itemid IN %s'
    cur.execute(sql_query, (tuple(relevant_item_ids),))
    rows = cur.fetchall()
    cur.close()
    
    # get column names
    columns = [desc[0] for desc in cur.description]
    # use column names retrieved from the cursor
    df = pd.DataFrame(rows, columns=columns)
    return df

### Filtering the hospital labevents table (mimiciv_hosp.filtered_labevents):

In [38]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
filtered_labevents = filter_vital_signs_table(connection, 'mimiciv_hosp.filtered_labevents', relevant_item_ids)

Connected to db: smcdougall


In [39]:
filtered_labevents.head()

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
0,2437,10000719,,70783909,51221,P30FVI,2139-09-14 15:10:00,2139-09-14 20:08:00,32.1,32.1,%,36.0,48.0,abnormal,ROUTINE,
1,2444,10000719,,70783909,51256,P30FVI,2139-09-14 15:10:00,2139-09-14 20:08:00,75.6,75.6,%,50.0,70.0,abnormal,ROUTINE,
2,2449,10000719,24558333.0,9035511,51221,,2140-04-15 00:22:00,2140-04-15 01:01:00,31.4,31.4,%,36.0,48.0,abnormal,STAT,
3,2458,10000719,24558333.0,93908058,51221,,2140-04-16 06:40:00,2140-04-16 07:54:00,32.6,32.6,%,36.0,48.0,abnormal,ROUTINE,
4,2464,10000719,,99456512,51221,P484YY,2140-11-14 17:08:00,2140-11-14 20:01:00,35.4,35.4,%,36.0,48.0,abnormal,ROUTINE,


In [40]:
print(len(filtered_labevents))

3533581


In [41]:
filtered_labevents = filtered_labevents[['labevent_id', 'subject_id', 'hadm_id', 'specimen_id', 'itemid', 'order_provider_id', 
                                    'charttime', 'storetime', 'value', 'valuenum', 'valueuom', 'ref_range_lower',
                                    'ref_range_upper', 'flag', 'priority', 'comments']]
filtered_labevents.head()

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
0,2437,10000719,,70783909,51221,P30FVI,2139-09-14 15:10:00,2139-09-14 20:08:00,32.1,32.1,%,36.0,48.0,abnormal,ROUTINE,
1,2444,10000719,,70783909,51256,P30FVI,2139-09-14 15:10:00,2139-09-14 20:08:00,75.6,75.6,%,50.0,70.0,abnormal,ROUTINE,
2,2449,10000719,24558333.0,9035511,51221,,2140-04-15 00:22:00,2140-04-15 01:01:00,31.4,31.4,%,36.0,48.0,abnormal,STAT,
3,2458,10000719,24558333.0,93908058,51221,,2140-04-16 06:40:00,2140-04-16 07:54:00,32.6,32.6,%,36.0,48.0,abnormal,ROUTINE,
4,2464,10000719,,99456512,51221,P484YY,2140-11-14 17:08:00,2140-11-14 20:01:00,35.4,35.4,%,36.0,48.0,abnormal,ROUTINE,


In [61]:
save_df_as_csv(filtered_labevents, 'hosp_labevents.csv')

DataFrame has been saved as dataframes/hosp_labevents.csv


### chartevents
ventilator settings, lab values, code status, mental status
- sometimes lab values from labevents are duplicated here
- need to see if there are specific values worth keeping aside from the labevent ones - ventilator maybe?

In [42]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
filtered_chartevents = filter_vital_signs_table(connection, 'mimiciv_icu.filtered_chartevents', relevant_item_ids)
print(filtered_chartevents.shape)
filtered_chartevents.head()

Connected to db: smcdougall
(994991, 11)


Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
0,10001884,26184834,37510196,10221.0,2131-01-18 19:00:00,2131-01-18 21:05:00,220210,17,17.0,insp/min,0
1,10001884,26184834,37510196,10221.0,2131-01-18 20:00:00,2131-01-18 21:05:00,220210,16,16.0,insp/min,0
2,10001884,26184834,37510196,10221.0,2131-01-18 21:00:00,2131-01-18 21:05:00,220210,15,15.0,insp/min,0
3,10001884,26184834,37510196,10221.0,2131-01-18 22:00:00,2131-01-19 00:36:00,220210,13,13.0,insp/min,0
4,10001884,26184834,37510196,10221.0,2131-01-18 23:00:00,2131-01-19 00:36:00,220210,12,12.0,insp/min,0


In [47]:
filtered_chartevents = filtered_chartevents.drop(columns=['caregiver_id', 'storetime', 'warning'])
filtered_chartevents.head()

KeyError: "['caregiver_id', 'storetime', 'warning'] not found in axis"

In [62]:
save_df_as_csv(filtered_chartevents, 'icu_chartevents.csv')

DataFrame has been saved as dataframes/icu_chartevents.csv


### datetimeevents
- also needs filtering on itemid

In [45]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
filtered_dateevents = filter_vital_signs_table(connection, 'mimiciv_icu.filtered_datetimeevents', relevant_item_ids)
print(filtered_dateevents.shape)
filtered_dateevents.head()

Connected to db: smcdougall
(0, 10)


Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom,warning


Consensus - do not include datetimeevents table - also looking at the itemid mapping from MIMIC-Extract, a lot of the datetime events are not specific to our pregnancy study and (based on my subjective opinion) would not yield substantial results. Examples of datetime events include: cap change, tubing change, catheter change, catheter dressing change

### inputevents
- MIMIC extract - has inputevents listed in `itemid_to_variable` map
- same with outputevents

In [48]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
filtered_inputevents = filter_vital_signs_table(connection, 'mimiciv_icu.filtered_inputevents', relevant_item_ids)
print(filtered_inputevents.shape)
filtered_inputevents.head()

Connected to db: smcdougall
(0, 26)


Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,...,ordercomponenttypedescription,ordercategorydescription,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate


Same consensus as datetimevents - no results. The MIMIC-Extract mapping shows the following examples of inputevents: insulin drop, vitamin K, magnesium, IV Fluid. For a first pass at the model, so not include this table.

### outputevents

In [49]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
filtered_outputevents = filter_vital_signs_table(connection, 'mimiciv_icu.filtered_outputevents', relevant_item_ids)
print(filtered_outputevents.shape)
filtered_outputevents.head()

Connected to db: smcdougall
(338, 9)


Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom
0,10553084,28481755,35401987,72655,2199-10-17 20:06:00,2199-10-18 00:06:00,226590,400.0,ml
1,10553084,28481755,35401987,72655,2199-10-17 22:00:00,2199-10-18 05:08:00,226590,450.0,ml
2,10553084,28481755,35401987,72655,2199-10-18 04:00:00,2199-10-18 05:08:00,226590,50.0,ml
3,10553084,28481755,35401987,72655,2199-10-18 06:00:00,2199-10-18 06:41:00,226590,100.0,ml
4,10553084,28481755,35401987,72687,2199-10-18 12:00:00,2199-10-18 12:50:00,226590,100.0,ml


In [51]:
filtered_outputevents['subject_id'].nunique()

25

In [52]:
filtered_outputevents['itemid'].nunique()

1

In [53]:
item_df[item_df['itemid'] == 226590]

Unnamed: 0,itemid,label
1901,226590,L Pleural #1


In [56]:
filtered_outputevents = filtered_outputevents.drop(columns=['caregiver_id', 'storetime'])

In [162]:
save_df_as_csv(filtered_outputevents, 'icu_outputevents.csv')

DataFrame has been saved as dataframes/icu_outputevents.csv


We could decide to include this in the final table... only tracks a single item - L Pleural #1

# OMR Table Preprocessing

In [217]:
def load_omr_table(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT *
        FROM "mimiciv_hosp.filtered_omr"
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "chartdate", "seq_num", "result_name", "result_value"])
    return df

In [218]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
omr_df = load_omr_table(connection)
omr_df.head()

Connected to db: smcdougall


Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
0,10000719,2140-11-14,1,Blood Pressure,144/88
1,10000719,2140-11-14,1,BMI (kg/m2),37.0
2,10000719,2140-11-14,1,Height (Inches),67
3,10000719,2140-11-14,1,Weight (Lbs),236
4,10001472,2185-10-13,1,Blood Pressure,130/72


In [219]:
omr_df['result_name'].value_counts()

result_name
Blood Pressure                      339041
Weight (Lbs)                        307525
BMI (kg/m2)                         253243
Height (Inches)                      77958
Blood Pressure Sitting                 276
Blood Pressure Lying                   242
Blood Pressure Standing (1 min)        230
BMI                                     71
Blood Pressure Standing (3 mins)        58
Weight                                  45
Blood Pressure Standing                 42
eGFR                                    24
Height                                   1
Name: count, dtype: int64

Relevant values to keep:
- Blood Pressure
- Weight (Lbs)
- BMI (kg/m2)
- BMI
- Weight

Rename "Weight" to "Weight (Lbs)" and "BMI" to "BMI (kg/m2)"

In [220]:
omr_df['result_name'] = omr_df['result_name'].replace('BMI', 'BMI (kg/m2)')
omr_df['result_name'] = omr_df['result_name'].replace('Weight', 'Weight (Lbs)')
omr_df['result_name'].value_counts()

result_name
Blood Pressure                      339041
Weight (Lbs)                        307570
BMI (kg/m2)                         253314
Height (Inches)                      77958
Blood Pressure Sitting                 276
Blood Pressure Lying                   242
Blood Pressure Standing (1 min)        230
Blood Pressure Standing (3 mins)        58
Blood Pressure Standing                 42
eGFR                                    24
Height                                   1
Name: count, dtype: int64

In [221]:
print(len(omr_df))
relevant_result_names = ['Blood Pressure', 'Weight (Lbs)', 'BMI (kg/m2)']
filtered_omr_df = omr_df[omr_df['result_name'].isin(relevant_result_names)]
print(len(filtered_omr_df))

978756
899925


In [222]:
save_df_as_csv(filtered_omr_df, 'hosp_omr.csv')

DataFrame has been saved as dataframes/hosp_omr.csv


## Services Table

In [71]:
def load_services_table(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT *
        FROM "mimiciv_hosp.filtered_services"
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "hadm_id", "transfertime", "prev service", "curr_service"])
    return df

In [72]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
services_df = load_services_table(connection)
services_df.head()

Connected to db: smcdougall


Unnamed: 0,subject_id,hadm_id,transfertime,prev service,curr_service
0,10000719,24558333,2140-04-15 00:15:12,,OBS
1,10001319,23005466,2135-07-20 03:53:25,,OBS
2,10001319,24591241,2138-11-09 20:30:59,,OBS
3,10001319,29230609,2134-04-15 08:01:20,,OBS
4,10001472,23506139,2186-01-10 00:26:41,,OBS


In [73]:
services_df.isna().sum()

subject_id          0
hadm_id             0
transfertime        0
prev service    56712
curr_service        0
dtype: int64

In [74]:
services_df.shape

(60107, 5)

In [75]:
services_df['curr_service'].value_counts()

curr_service
MED      24983
OBS      19239
SURG      4130
OMED      3030
CMED      2325
GYN       1652
ORTHO     1006
NMED       959
VSURG      848
PSYCH      371
PSURG      335
NSURG      326
TRAUM      252
CSURG      236
TSURG      194
GU         165
ENT         54
EYE          1
DENT         1
Name: count, dtype: int64

In [76]:
services_df['prev service'].value_counts()

prev service
MED      1571
SURG      404
OMED      313
CMED      224
OBS       209
GYN       160
ORTHO     141
NMED      104
VSURG      65
NSURG      48
TRAUM      46
TSURG      33
CSURG      31
PSURG      22
GU         15
ENT         8
EYE         1
Name: count, dtype: int64

There are so many different categories - see if we can narrow them down at all
- Previous service is null for so many of them - can make binary for whether patient came from another service or not
- We can make the assumption that if previous service is null, there really is no previous service -- so make it it's own categorical variable

In [77]:
services_df = services_df.fillna('None')

In [78]:
save_df_as_csv(services_df, 'hosp_services.csv')

DataFrame has been saved as dataframes/hosp_services.csv


## Transfers

In [79]:
def load_transfers_table(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT *
        FROM "mimiciv_hosp.filtered_transfers"
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "hadm_id", "transfer_id", "eventtype", "careunit", "intime", "outtime"])
    return df

In [80]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
transfers_df = load_transfers_table(connection)
transfers_df.head()

Connected to db: smcdougall


Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime
0,10000719,24558333.0,31719052,discharge,,2140-04-18 12:41:26,NaT
1,10000719,24558333.0,32323060,admit,Labor & Delivery,2140-04-15 00:15:12,2140-04-16 02:43:48
2,10000719,24558333.0,35042205,transfer,Obstetrics (Postpartum & Antepartum),2140-04-16 02:43:48,2140-04-18 12:41:26
3,10001319,23005466.0,32828864,admit,Labor & Delivery,2135-07-20 03:53:25,2135-07-20 11:26:28
4,10001319,23005466.0,33014199,transfer,Obstetrics (Postpartum & Antepartum),2135-07-20 11:26:28,2135-07-22 11:44:16


In [81]:
transfers_df.isna().sum()

subject_id         0
hadm_id        32865
transfer_id        0
eventtype          0
careunit       56822
intime             0
outtime        56821
dtype: int64

In [82]:
transfers_df.shape

(234354, 7)

### Some observations on the transfers table
- `icustays` table is derived from the transfers table
- the `stay_id` in `icustays` corresponds to the entire stay, and is **equal to the `transfer_id`** of the furst physical locaqtion
- note: a stay could have multiple `transfer_id`'s for each physical location
- `transfer_id` is unique to a patient physical location


In [83]:
transfers_df['careunit'].value_counts()

careunit
Emergency Department                                63638
Labor & Delivery                                    21649
Medicine                                            19392
Obstetrics (Postpartum & Antepartum)                15838
Emergency Department Observation                     5706
Med/Surg                                             5503
Obstetrics Postpartum                                4275
Hematology/Oncology                                  4018
Discharge Lounge                                     3719
Transplant                                           3665
Med/Surg/GYN                                         3300
Vascular                                             3030
Medicine/Cardiology                                  2639
Medical Intensive Care Unit (MICU)                   2222
Neurology                                            1907
Medical/Surgical Intensive Care Unit (MICU/SICU)     1860
Med/Surg/Trauma                                      1678
PACU 

In [84]:
transfers_df['careunit'].unique()

array([None, 'Labor & Delivery', 'Obstetrics (Postpartum & Antepartum)',
       'Emergency Department', 'Emergency Department Observation',
       'Medicine', 'Med/Surg', 'Medicine/Cardiology', 'Transplant',
       'Medical Intensive Care Unit (MICU)', 'Vascular',
       'Discharge Lounge', 'Obstetrics Postpartum',
       'Surgical Intensive Care Unit (SICU)', 'Med/Surg/GYN', 'Neurology',
       'Obstetrics Antepartum', 'PACU', 'Surgery/Trauma',
       'Cardiac Surgery',
       'Medical/Surgical Intensive Care Unit (MICU/SICU)',
       'Hematology/Oncology', 'Coronary Care Unit (CCU)',
       'Med/Surg/Trauma', 'Medical/Surgical (Gynecology)',
       'Hematology/Oncology Intermediate',
       'Neuro Surgical Intensive Care Unit (Neuro SICU)',
       'Neuro Stepdown', 'Surgery/Pancreatic/Biliary/Bariatric',
       'Cardiology Surgery Intermediate',
       'Medicine/Cardiology Intermediate', 'Trauma SICU (TSICU)',
       'Neuro Intermediate', 'Psychiatry', 'Surgery', 'Cardiology',
      

Will likely need to further narrow this down... to reduce dimensionality and improve model interpretability

** Narrow down into specific categories:
- Labor and Delivery
- Emergency
- General Medicine and Surgery
- Intensive Care Units
- Specialized Units (Neuro, Psychiatry, Transplant, etc.)

In [85]:
transfers_df['careunit'] = transfers_df['careunit'].fillna('Unknown')

In [86]:
care_unit_mappings = {
    'Labor & Delivery': ['Labor & Delivery', 'Obstetrics (Postpartum & Antepartum)', 'Obstetrics Postpartum', 'Obstetrics Antepartum'],
    'Emergency': ['Emergency Department', 'Emergency Department Observation', 'PACU', 'Observation'],
    'General Medical/Surgical Units': ['Medicine', 'Med/Surg', 'Med/Surg/GYN', 'Med/Surg/Trauma', 'Med/Surg/Gynecology', 'Surgery/Trauma', 'Surgery/Pancreatic/Biliary/Bariatric', 'Surgery'],
    'Intensive Care Units': ['Medical Intensive Care Unit (MICU)', 'Surgical Intensive Care Unit (SICU)', 'Neuro Surgical Intensive Care Unit (Neuro SICU)', 'Trauma SICU (TSICU)', 'Medical/Surgical Intensive Care Unit (MICU/SICU)', 'Cardiac Vascular Intensive Care Unit (CVICU)', 'Coronary Care Unit (CCU)', 'Neurology'],
    'Specialized Units': ['Transplant', 'Vascular', 'Cardiac Surgery', 'Cardiology Surgery Intermediate', 'Medicine/Cardiology', 'Medicine/Cardiology Intermediate', 'Hematology/Oncology', 'Hematology/Oncology Intermediate', 'Thoracic Surgery', 'Neuro Stepdown', 'Neuro Intermediate', 'Psychiatry'],
    'Discharged': ['Discharge Lounge'],
    'Unknown': ['Unknown']
}
transfers_df['care_unit_group'] = transfers_df['careunit'].apply(lambda x: next((k for k, v in care_unit_mappings.items() if x in v), 'Unknown'))
transfers_df = transfers_df.drop('careunit', axis=1)

In [87]:
transfers_df.head()

Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,intime,outtime,care_unit_group
0,10000719,24558333.0,31719052,discharge,2140-04-18 12:41:26,NaT,Unknown
1,10000719,24558333.0,32323060,admit,2140-04-15 00:15:12,2140-04-16 02:43:48,Labor & Delivery
2,10000719,24558333.0,35042205,transfer,2140-04-16 02:43:48,2140-04-18 12:41:26,Labor & Delivery
3,10001319,23005466.0,32828864,admit,2135-07-20 03:53:25,2135-07-20 11:26:28,Labor & Delivery
4,10001319,23005466.0,33014199,transfer,2135-07-20 11:26:28,2135-07-22 11:44:16,Labor & Delivery


In [88]:
transfers_df['eventtype'].value_counts()

eventtype
ED           63638
transfer     57070
admit        56824
discharge    56822
Name: count, dtype: int64

In [89]:
save_df_as_csv(transfers_df, 'hosp_transfers.csv')

DataFrame has been saved as dataframes/hosp_transfers.csv


## Admissions

In [90]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
admissions_df = load_table(connection, 'mimiciv_hosp.filtered_admissions')
admissions_df.head()

Connected to db: smcdougall


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000719,24558333,2140-04-15 00:14:00,2140-04-18 12:29:00,NaT,URGENT,P65T9Y,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,SINGLE,WHITE,NaT,NaT,0
1,10001319,23005466,2135-07-20 03:45:00,2135-07-22 11:38:00,NaT,URGENT,P09WS1,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,MARRIED,WHITE,NaT,NaT,0
2,10001319,24591241,2138-11-09 20:00:00,2138-11-12 10:40:00,NaT,URGENT,P09WS1,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,MARRIED,WHITE,NaT,NaT,0
3,10001319,29230609,2134-04-15 07:59:00,2134-04-17 13:23:00,NaT,URGENT,P09WS1,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,MARRIED,WHITE,NaT,NaT,0
4,10001472,23506139,2186-01-10 00:00:00,2186-01-13 15:02:00,NaT,URGENT,P36D2C,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,MARRIED,WHITE,NaT,NaT,0


Preprocessing decisions:
1. Group admission types based on urgency and similarity - (1) urgent/emergency, (2)observation/outpatient admissions, (3) elective admissions (since they are not urgent)
2. Create new categorical variables to reduce dimensionality - `is urgent`, `is observation`, `is elective` (or something similar)
3. Can encode as ordinal variables since there is a ranking of urgency within the admission types

Ultimately what we will do: do an ordinal mapping of a grouping of the admission types (for a simpler model)

In [91]:
admission_type_map = {
    'Urgent': ['URGENT', 'EW EMER.', 'DIRECT EMER.'],
    'Observation': ['EU OBSERVATION', 'OBSERVATION ADMIT', 'DIRECT OBSERVATION', 'SURGICAL SAME DAY ADMISSION', 'AMBULATORY OBSERVATION'],
    'Elective': ['ELECTIVE']
}
admissions_df['admission_type'] = admissions_df['admission_type'].apply(lambda x: next((k for k, v in admission_type_map.items() if x in v), 'Unknown'))


In [92]:
admissions_df['admission_type'].unique()

array(['Urgent', 'Observation', 'Elective'], dtype=object)

In [93]:
admissions_df['admission_type'].value_counts()

admission_type
Urgent         38026
Observation    18185
Elective         611
Name: count, dtype: int64

In [94]:
ordinal_mapping = {
    'Elective': 0,
    'Observation': 1,
    'Urgent': 2,
}

# apply ordinal encoding
admissions_df['admission_type_ordinal'] = admissions_df['admission_type'].map(ordinal_mapping)
admissions_df = admissions_df.drop('admission_type', axis=1)

In [95]:
admissions_df.isna().sum()

subject_id                    0
hadm_id                       0
admittime                     0
dischtime                     0
deathtime                 56369
admit_provider_id             0
admission_location            0
discharge_location        10100
insurance                     0
language                      0
marital_status              540
race                          0
edregtime                 25944
edouttime                 25944
hospital_expire_flag          0
admission_type_ordinal        0
dtype: int64

In [96]:
admissions_df['discharge_location'].value_counts()

discharge_location
HOME                            29572
HOME HEALTH CARE                 9816
SKILLED NURSING FACILITY         4636
CHRONIC/LONG TERM ACUTE CARE      715
REHAB                             522
DIED                              456
HOSPICE                           354
AGAINST ADVICE                    302
PSYCH FACILITY                    139
ACUTE HOSPITAL                    106
ASSISTED LIVING                    62
OTHER FACILITY                     42
Name: count, dtype: int64

In [97]:
admissions_df['discharge_location'].unique()

array(['HOME', None, 'HOME HEALTH CARE', 'SKILLED NURSING FACILITY',
       'DIED', 'CHRONIC/LONG TERM ACUTE CARE', 'REHAB', 'ASSISTED LIVING',
       'AGAINST ADVICE', 'HOSPICE', 'PSYCH FACILITY', 'ACUTE HOSPITAL',
       'OTHER FACILITY'], dtype=object)

Mappings for discharge location:
- Home or home health care
- Facility care
- Another Medical/Healthcare facility
- Died
- left against advice
- Unknown

In [98]:
admissions_df['discharge_location'] = admissions_df['discharge_location'].fillna('UNKNOWN')

In [99]:
admission_discharge_location_map = {
    "Home/Home Health Care": ["HOME", 'HOME HEALTH CARE' ],
    "Facility Care": ['SKILLED NURSING FACILITY', 'CHRONIC/LONG TERM ACUTE CARE', 'REHAB',
                     'ASSISTED LIVING', 'HOSPICE'],
    "Another Medical/Healthcare Facility": ['PSYCH FACILITY', 'ACUTE HOSPITAL', 'OTHER FACILITY'],
    "Died": ['DIED'],
    "Left against Medical Advice": ['AGAINST ADVICE'],
    "Unknown": ["UNKNOWN"]
}

In [100]:
admissions_df['discharge_location'] = admissions_df['discharge_location'].apply(lambda x: next((k for k,
                                                                                                v in admission_discharge_location_map.items()
                                                                                                if x in v), 'Unknown'))
admissions_df['discharge_location'].value_counts()

discharge_location
Home/Home Health Care                  39388
Unknown                                10100
Facility Care                           6289
Died                                     456
Left against Medical Advice              302
Another Medical/Healthcare Facility      287
Name: count, dtype: int64

Admission location mappings:
- Referrals
- Walk-in
- Emergency and urgent care
- Other/Unknown

In [101]:
admissions_df['admission_location'].unique()

array(['PHYSICIAN REFERRAL', 'EMERGENCY ROOM', 'WALK-IN/SELF REFERRAL',
       'TRANSFER FROM SKILLED NURSING FACILITY', 'TRANSFER FROM HOSPITAL',
       'CLINIC REFERRAL', 'PROCEDURE SITE',
       'INTERNAL TRANSFER TO OR FROM PSYCH', 'PACU',
       'INFORMATION NOT AVAILABLE', 'AMBULATORY SURGERY TRANSFER'],
      dtype=object)

In [102]:
admission_location_map = {
    'Referral': ['PHYSICIAN REFERRAL', 'CLINIC REFERRAL'],
    'Walk-in': ['WALK-IN/SELF REFERRAL'],
    'Emergency/Urgent Care': ['EMERGENCY ROOM', 'TRANSFER FROM SKILLED NURSING FACILITY',
                             'TRANSFER FROM HOSPITAL', 'INTERNAL TRANSFER TO OR FROM PSYCH',
                             'PACU', 'AMBULATORY SURGERY TRANSFER'],
    'Unknown': ['INFORMATION NOT AVAILABLE'],
    'Procedure Site': ['PROCEDURE SITE']
}

In [103]:
admissions_df['admission_location'] = admissions_df['admission_location'].apply(lambda x: next((k for k,
                                                                                                v in admission_location_map.items()
                                                                                                if x in v), 'Unknown'))
admissions_df['admission_location'].value_counts()

admission_location
Emergency/Urgent Care    27674
Referral                 26762
Walk-in                   1748
Procedure Site             605
Unknown                     33
Name: count, dtype: int64

In [104]:
admissions_df['edregtime'].isna().sum()

25944

### TODO: figure out how to manage missing ED in/out times

### Columns to remove:
Could run into multicollinearity if including both the `deathtime` and `hospital_expire_flag` fields since they capture very similar information.

Pros/cons of including `deathtime` over `hospital_expire_flag`:
- pro - more granular information - good for time-to-event analysis
- con - requires more preprocessing for handling missing values and capturing time since admission
- con - introduces more noise/complexity to the model

Pros/cons of including `hospital_expire_flag` over `deathtime`:
- pro - simple for interpretation
- con - removes information from the model

Since we are not specifically measuring mortality during hospitalization, the binary flag is likely preferred for this scenario.

### Columns to remove:
- deathtime
- admit_provider_id
- insurance, language, marital_status, race (captured in previous notebook on patient demographics)

In [105]:
columns_to_drop = ['deathtime', 'admit_provider_id', 'insurance', 'language', 'marital_status', 'race']
admissions_df.drop(columns=columns_to_drop, inplace=True)

In [106]:
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_location,discharge_location,edregtime,edouttime,hospital_expire_flag,admission_type_ordinal
0,10000719,24558333,2140-04-15 00:14:00,2140-04-18 12:29:00,Referral,Home/Home Health Care,NaT,NaT,0,2
1,10001319,23005466,2135-07-20 03:45:00,2135-07-22 11:38:00,Referral,Home/Home Health Care,NaT,NaT,0,2
2,10001319,24591241,2138-11-09 20:00:00,2138-11-12 10:40:00,Referral,Home/Home Health Care,NaT,NaT,0,2
3,10001319,29230609,2134-04-15 07:59:00,2134-04-17 13:23:00,Referral,Home/Home Health Care,NaT,NaT,0,2
4,10001472,23506139,2186-01-10 00:00:00,2186-01-13 15:02:00,Referral,Home/Home Health Care,NaT,NaT,0,2


In [107]:
save_df_as_csv(admissions_df, 'hosp_admissions.csv')

DataFrame has been saved as dataframes/hosp_admissions.csv


## Diagnoses_ICD

In [108]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
diagnoses_df = load_table(connection, 'mimiciv_hosp.filtered_diagnoses_icd')
diagnoses_df.head()

Connected to db: smcdougall


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000719,24558333,1,66401,9
1,10000719,24558333,2,65951,9
2,10000719,24558333,3,64891,9
3,10000719,24558333,4,V270,9
4,10000719,24558333,5,V0251,9


### Diagnosis Filtering Approach
- use the general process outlined in "An Extensive Data Processing Pipeline for MIMIC-IV" (the "feature extraction" step)
- look at `disease_cohort.py`
- approach used by the paper: use first 3 digits to group the dx codes
- need to also map ICD9 to ICD10 - they have a .txt file to do this

My approach:
- see how many ICD-9 codes I'm working with and whether I can use the .txt file for mapping
- apply mapping conversion logic to non-pregnancy codes -- find necessary prefixes for the exclusion. The idea is to not restrict the pregnancy codes because then we lose useful information
- For non-pregnancy codes, restrict to first three digits when reducing dimensionality

In [109]:
# number of unique diagnoses that we are starting with
diagnoses_df['icd_code'].nunique()

12966

In [110]:
diagnoses_df[diagnoses_df['icd_code'].str.startswith('O')]

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
277,10002266,24160398,1,O4202,10
278,10002266,24160398,2,O411230,10
280,10002266,24160398,4,O621,10
281,10002266,24160398,5,O631,10
285,10002266,24160398,9,O99284,10
...,...,...,...,...,...
608975,19999043,23037011,6,O09522,10
608977,19999043,23037011,8,O09812,10
608979,19999043,23037011,10,O359XX2,10
608981,19999043,24799384,1,O046,10


In [111]:
PREG_ICD9_PREFIXES = ['V22', 'V23', 'V24', 'V27', 'V28', '63', '64', '65', '66', '67', '7651', '7650']
PREG_ICD10_PREFIXES = ['Z33', 'Z34', 'Z3A', 'O0', 'O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8', 'O9']

def does_not_start_with_prefixes(string, prefixes):
    for prefix in prefixes:
        if string.startswith(prefix):
            return False
    return True


In [112]:
mapping_file = 'ICD9_to_ICD10_mapping.txt'

### NOTE: The following functions come from https://github.com/healthylaife/MIMIC-IV-Data-Pipeline ###
def read_icd_mapping(map_path: str) -> pd.DataFrame:
    """Reads in mapping table for converting ICD9 to ICD10 codes"""

    mapping = pd.read_csv(map_path, header=0, delimiter="\t")
    mapping.diagnosis_description = mapping.diagnosis_description.apply(str.lower)
    return mapping

def standardize_icd(
    mapping: pd.DataFrame, diag: pd.DataFrame, map_code_col="diagnosis_code", root=True
) -> str:
    """Takes an ICD9 -> ICD10 mapping table and a diagnosis dataframe;
    adds column with converted ICD10 column"""

    count = 0
    code_cols = mapping.columns
    errors = []

    def icd_9to10(icd):
        """Function use to apply over the diag DataFrame for ICD9->ICD10 conversion"""
        # If root is true, only map an ICD 9 -> 10 according to the
        # ICD9's root (first 3 digits)

        # NOTE - modified from the original code
        # if root AND not a pregnancy-related code
        if root and does_not_start_with_prefixes(icd, PREG_ICD9_PREFIXES):
            icd = icd[:3]

        if map_code_col not in code_cols:
            errors.append(f"ICD NOT FOUND: {icd}")
            return np.nan

        matches = mapping.loc[mapping[map_code_col] == icd]
        if matches.shape[0] == 0:
            errors.append(f"ICD NOT FOUND: {icd}")
            return np.nan

        return mapping.loc[mapping[map_code_col] == icd].icd10cm.iloc[0]

    # Create new column with original codes as default
    col_name = "root_icd10_convert"
    diag[col_name] = diag["icd_code"].values

    # Group identical ICD9 codes, then convert all ICD9 codes within
    # a group to ICD10
    for code, group in diag.loc[diag.icd_version == 9].groupby(by="icd_code"):
        new_code = icd_9to10(code)
        for idx in group.index.values:
            # Modify values of original df at the indexes in the groups
            diag.at[idx, col_name] = new_code

        count += group.shape[0]
        #print(f"{count}/{diag.shape[0]} rows processed")

    # Column for just the roots of the converted ICD10 column
    diag["root"] = diag[col_name].apply(lambda x: x[:3] if type(x) is str and does_not_start_with_prefixes(x, PREG_ICD10_PREFIXES)
                                        else x if type(x)
                                        else np.nan)

def preproc_icd_module(h_ids,
    module_path: str, ICD10_code: str, icd_map_path: str
) -> tuple:
    """Takes an module dataset with ICD codes and puts it in long_format,
    mapping ICD-codes by a mapping table path"""

    diag = get_diagnosis_icd(module_path)
    icd_map = read_icd_mapping(icd_map_path)

    standardize_icd(icd_map, diag, root=True)

    # patient ids that have at least 1 record of the given ICD10 code category
    diag.dropna(subset=["root"], inplace=True)
    pos_ids = pd.DataFrame(
        diag.loc[diag.root.str.contains(ICD10_code)].hadm_id.unique(),
        columns=["hadm_id"]
    )
    return pos_ids

In [113]:
icd_mapping = read_icd_mapping(mapping_file)

In [114]:
icd_mapping.head()

Unnamed: 0,diagnosis_type,diagnosis_code,diagnosis_description,icd9cm,icd10cm,flags
0,ICD9,996.76,other complications due to genitourinary devic...,99676,T8384XA,10000
1,ICD9,V54.12,aftercare for healing traumatic fracture of lo...,V5412,S52602D,10000
2,ICD9,730.06,acute osteomyelitis involving lower leg,73006,M86169,10000
3,ICD9,345.61,"infantile spasms, with intractable epilepsy",34561,G40824,10000
4,ICD9,989.5,toxic effect of venom,9895,T63421A,10000


The plan:
- first try filtering codes to the first 3 digits unless it is pregnancy-related (used in Data Filtering jupyter notebook)
- if too large or can't map appropriately, default instead to the first 3 digits
- the `icd_mapping` object is not restricted to the first three characters

ICD-9 Codes related to pregnancy:
['V22', 'V23', 'V24', 'V27', 'V28', '63', '64', '65', '66', '67', '7651', '7650']

- preeclampsia - 64
- preterm - 7651, 7650
- hemorrhage - 64, 66

In [116]:
standardize_icd(icd_mapping, diagnoses_df, root=True)

# patient ids that have at least 1 record of the given ICD10 code category
diagnoses_df.dropna(subset=["root"], inplace=True)

diagnoses_df[diagnoses_df['root'].str.startswith('O')]

In [117]:
diagnoses_df['root'].nunique()

2072

In [118]:
diagnoses_df = diagnoses_df.assign(icd_code=diagnoses_df['root']).drop(columns=['root_icd10_convert', 'root', 'icd_version'])
diagnoses_df.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code
14,10001472,23506139,4,E89
17,10001472,23506139,7,D25
19,10001472,23506139,9,M48
22,10001884,21192799,1,J44
23,10001884,21192799,2,R09


Reduced from 12966 to 2072!

In [119]:
diagnoses_df.isna().sum()

subject_id    0
hadm_id       0
seq_num       0
icd_code      0
dtype: int64

In [120]:
save_df_as_csv(diagnoses_df, 'hosp_diagnoses.csv')

DataFrame has been saved as dataframes/hosp_diagnoses.csv


## DRG Codes

Deciding whether it's worth including DRG Codes...
- could introduce redundancy if giving the same level of information as the diagnosis codes
- could use feature importance analysis

- my opinion: the table has columns for DRG Severity and DRG Mortality that could be useful... but otherwise we have categorized the ICD codes down to 2000 unique codes, which I feel should be enough information to convery diagnoses within the model
- **skip DRG Codes table**

## EMAR and EMAR Detail
Skip because according to https://mimic.mit.edu/docs/iv/modules/hosp/emar/, EMAR system was implemented during 2011-2013 and is not available for all patients, which makes it difficult to include in an analysis of this nature.

## Microbiology Events

Pros/cons of including microbiology events:
- good for conveying information about infections during pregnancy -- known risk factor for preterm birth
- could capture STIs...
- could enhance predictive power of the model because you are including more data to paint the picture
- should use feature engineering techniques
- do EDA to understand distribution and relevance
- may require some domain expertise

Example infections to include that are related to maternal health:
- UTIs
- Group B Streptococcus (GBS)
- STIs
- BV
- TORCH Infections - group of infections that could be transmitted from mother to fetus (noted in more detail below)
- Toxoplasmosis
- "Other" (e.g. syphilis)
- Rubella
- Cytomegalovirus (CMV)
- Herpes Simplex Virus (HSV)
- Flu, RSV
- Mastitis

Next steps:
- should look at the different values of `spec_type_dec`
- should look at `org_name`, which corresponds to each organism from the bacteria if the bacteria is found

In [121]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
micro_df = load_table(connection, 'mimiciv_hosp.filtered_microbiologyevents')
micro_df.head()

Connected to db: smcdougall


Unnamed: 0,microevent_id,subject_id,hadm_id,micro_specimen_id,order_provider_id,chartdate,charttime,spec_itemid,spec_type_desc,test_seq,...,org_name,isolate_num,quantity,ab_itemid,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,comments
0,94,10000719,,2137138,P65T9Y,2139-12-23,NaT,70079,URINE,1,...,,,,,,,,,,"MIXED BACTERIAL FLORA ( >= 3 COLONY TYPES), CO..."
1,95,10000719,,4691510,P65T9Y,2140-03-28,NaT,70006,ANORECTAL/VAGINAL,1,...,POSITIVE FOR GROUP B BETA STREPTOCOCCI,1.0,,,,,,,,
2,610,10001884,,802664,,2122-02-12,2122-02-12 12:30:00,70012,BLOOD CULTURE,1,...,,,,,,,,,,NO GROWTH.
3,611,10001884,,802664,,2122-02-12,2122-02-12 12:30:00,70012,BLOOD CULTURE,2,...,,,,,,,,,,NO GROWTH.
4,419,10001319,,1253248,P83A4L,2135-06-10,2135-06-10 11:15:00,70091,MRSA SCREEN,1,...,,,,,,,,,,No MRSA isolated.


In [122]:
micro_df.shape

(506163, 25)

In [123]:
micro_df['org_name'].nunique()

337

In [124]:
micro_df['spec_type_desc'].value_counts()

spec_type_desc
URINE                                   181736
SWAB                                     88577
BLOOD CULTURE                            73091
SEROLOGY/BLOOD                           38661
STOOL                                    23472
                                         ...  
POST-MORTEM VIRAL CULTURE                    2
Influenza A/B by DFA - Bronch Lavage         2
SWAB, R/O GC                                 2
                                             1
MICRO PROBLEM PATIENT                        1
Name: count, Length: 88, dtype: int64

The specimen type is likely less important than the actual `org_name`...

In [125]:
value_counts = micro_df['org_name'].value_counts()
value_counts[value_counts > 40].index.tolist()

['ESCHERICHIA COLI',
 'STAPH AUREUS COAG +',
 'KLEBSIELLA PNEUMONIAE',
 'PROTEUS MIRABILIS',
 'PSEUDOMONAS AERUGINOSA',
 'ENTEROCOCCUS SP.',
 'STAPHYLOCOCCUS, COAGULASE NEGATIVE',
 'GRAM POSITIVE BACTERIA',
 'YEAST',
 'CANCELLED',
 'POSITIVE FOR GROUP B BETA STREPTOCOCCI',
 'KLEBSIELLA OXYTOCA',
 'ENTEROBACTER CLOACAE COMPLEX',
 'STAPHYLOCOCCUS EPIDERMIDIS',
 'CITROBACTER FREUNDII COMPLEX',
 'CITROBACTER KOSERI',
 'BETA STREPTOCOCCUS GROUP B',
 'ENTEROBACTER AEROGENES',
 'SERRATIA MARCESCENS',
 'CLOSTRIDIUM DIFFICILE',
 'MIXED BACTERIAL FLORA',
 'ENTEROBACTER CLOACAE',
 'MORGANELLA MORGANII',
 'CORYNEBACTERIUM SPECIES (DIPHTHEROIDS)',
 'ENTEROCOCCUS FAECIUM',
 'CHLAMYDIA TRACHOMATIS',
 'POSITIVE FOR METHICILLIN RESISTANT STAPH AUREUS',
 'STREPTOCOCCUS ANGINOSUS (MILLERI) GROUP',
 'ENTEROCOCCUS FAECALIS',
 'VIRIDANS STREPTOCOCCI',
 'PRESUMPTIVE GARDNERELLA VAGINALIS',
 'BETA STREPTOCOCCUS GROUP A',
 'LACTOBACILLUS SPECIES',
 'GRAM NEGATIVE ROD(S)',
 'ACINETOBACTER BAUMANNII COMPLEX',
 '

- KLEBSIELLA PNEUMONIAE - can cause UTIs
- PROTEUS MIRABILIS - can cause UTIs
- GRAM POSITIVE BACTERIA - a broader category that includes GBS
- POSITIVE FOR GROUP B BETA STREPTOCOCCI - GBS
- ENTEROBACTER CLOACAE COMPLEX - can cause UTIs
- CITROBACTER FREUNDII COMPLEX - can cause UTIs
- BETA STREPTOCOCCUS GROUP B - (GBS)
- ENTEROBACTER AEROGENES - can cause UTIs
- SERRATIA MARCESCENS - can cause UTIs
- ENTEROBACTER CLOACAE - can cause UTIs
- MORGANELLA MORGANII - can cause UTIs
- CHLAMYDIA TRACHOMATIS - type of STI
- PRESUMPTIVE GARDNERELLA VAGINALIS - often associated with BV
- BETA STREPTOCOCCUS GROUP A - can lead to streptococcus toxic shock syndrome (a form of TSS)
- LACTOBACILLUS SPECIES - beneficial bacteria often found in urinary and vaginal tract
- PROTEUS VULGARIS - can cause UTIs
- HERPES SIMPLEX VIRUS TYPE 2 - type of STI
- CANDIDA ALBICANS - species of yeast that can cause vaginal infections
- NEISSERIA GONORRHOEAE - causes the STI Gonorrhea
- POSITIVE FOR INFLUENZA A VIRAL ANTIGEN - the flu

In [126]:
RELEVANT_MICROBIOLOGY_EVENTS = [
    'KLEBSIELLA PNEUMONIAE',
    'PROTEUS MIRABILIS',
    'GRAM POSITIVE BACTERIA',
    'POSITIVE FOR GROUP B BETA STREPTOCOCCI',
    'ENTEROBACTER CLOACAE COMPLEX',
    'CITROBACTER FREUNDII COMPLEX',
    'BETA STREPTOCOCCUS GROUP B',
    'ENTEROBACTER AEROGENES',
    'SERRATIA MARCESCENS',
    'ENTEROBACTER CLOACAE',
    'MORGANELLA MORGANII',
    'CHLAMYDIA TRACHOMATIS',
    'PRESUMPTIVE GARDNERELLA VAGINALIS',
    'BETA STREPTOCOCCUS GROUP A',
    'LACTOBACILLUS SPECIES',
    'PROTEUS VULGARIS',
    'HERPES SIMPLEX VIRUS TYPE 2',
    'CANDIDA ALBICANS',
    'NEISSERIA GONORRHOEAE',
    'POSITIVE FOR INFLUENZA A VIRAL ANTIGEN'
]

In [127]:
micro_df = micro_df[micro_df['org_name'].isin(RELEVANT_MICROBIOLOGY_EVENTS)]
print(micro_df.shape)
micro_df.head()

(47058, 25)


Unnamed: 0,microevent_id,subject_id,hadm_id,micro_specimen_id,order_provider_id,chartdate,charttime,spec_itemid,spec_type_desc,test_seq,...,org_name,isolate_num,quantity,ab_itemid,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,comments
1,95,10000719,,4691510,P65T9Y,2140-03-28,NaT,70006,ANORECTAL/VAGINAL,1,...,POSITIVE FOR GROUP B BETA STREPTOCOCCI,1.0,,,,,,,,
5,420,10001319,,2654897,P93OSG,2135-06-10,2135-06-10 11:19:00,70079,URINE,1,...,GRAM POSITIVE BACTERIA,1.0,,,,,,,,
23,614,10001884,,2893463,,2125-12-01,NaT,70042,Influenza A/B by DFA,1,...,POSITIVE FOR INFLUENZA A VIRAL ANTIGEN,1.0,,,,,,,,
64,871,10002266,,3980855,,2124-08-28,NaT,70079,URINE,1,...,GRAM POSITIVE BACTERIA,1.0,,,,,,,,
117,927,10002428,,1704201,,2156-05-11,NaT,70012,BLOOD CULTURE,1,...,LACTOBACILLUS SPECIES,1.0,,90003.0,PENICILLIN G,0.5,=,0.5,S,


In [128]:
micro_df.columns

Index(['microevent_id', 'subject_id', 'hadm_id', 'micro_specimen_id',
       'order_provider_id', 'chartdate', 'charttime', 'spec_itemid',
       'spec_type_desc', 'test_seq', 'storedate', 'storetime', 'test_itemid',
       'test_name', 'org_itemid', 'org_name', 'isolate_num', 'quantity',
       'ab_itemid', 'ab_name', 'dilution_text', 'dilution_comparison',
       'dilution_value', 'interpretation', 'comments'],
      dtype='object')

In [129]:
columns_to_drop = ['order_provider_id', 'charttime', 'spec_itemid', 'spec_type_desc', 'test_seq', 'isolate_num',
                  'quantity', 'ab_itemid', 'ab_name', 'dilution_text', 'dilution_comparison', 'dilution_value',
                  'interpretation', 'comments', 'storedate', 'storetime', 'test_itemid', 'test_name', 'org_itemid']
micro_df = micro_df.drop(columns=columns_to_drop)

In [130]:
micro_df.head()

Unnamed: 0,microevent_id,subject_id,hadm_id,micro_specimen_id,chartdate,org_name
1,95,10000719,,4691510,2140-03-28,POSITIVE FOR GROUP B BETA STREPTOCOCCI
5,420,10001319,,2654897,2135-06-10,GRAM POSITIVE BACTERIA
23,614,10001884,,2893463,2125-12-01,POSITIVE FOR INFLUENZA A VIRAL ANTIGEN
64,871,10002266,,3980855,2124-08-28,GRAM POSITIVE BACTERIA
117,927,10002428,,1704201,2156-05-11,LACTOBACILLUS SPECIES


In [131]:
save_df_as_csv(micro_df, 'hosp_microbiologyevents.csv')

DataFrame has been saved as dataframes/hosp_microbiologyevents.csv


## Pharmacy

## POE

## Prescriptions
- EMAR - administration of the medication
- POE - provider order of the medication
- Pharmacy - info on filled meds prescribed
- Prescriptions - similar to pharmacy (?)

  The plan - start with prescriptions, and then can try joining with pharmacy table and see if there's other information that could be of use

In [132]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
prescriptions_df = load_table(connection, 'mimiciv_hosp.filtered_prescriptions')
prescriptions_df.head()

Connected to db: smcdougall


Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route
0,10000719,24558333,17659077,10000719-58,58.0,P40TJE,2140-04-16 10:00:00,2140-04-18 17:00:00,MAIN,Ferrous Sulfate,...,1645,182402889,325 mg Tablet,,325,mg,1,TAB,1.0,PO
1,10000719,24558333,27930529,10000719-48,48.0,P459OQ,2140-04-16 03:00:00,2140-04-18 17:00:00,MAIN,Oxycodone-Acetaminophen,...,4222,406051262,5mg/325mg Tablet,,1-2,TAB,1-2,TAB,,PO
2,10000719,24558333,28367240,10000719-52,52.0,P459OQ,2140-04-16 03:00:00,2140-04-18 17:00:00,MAIN,Bisacodyl,...,2944,574705050,10mg Suppository,,10,mg,1,SUPP,,PR
3,10000719,24558333,30078745,10000719-49,49.0,P459OQ,2140-04-16 03:00:00,2140-04-18 17:00:00,MAIN,Ibuprofen,...,8349,182181089,600mg Tablet,,600,mg,1,TAB,,PO
4,10000719,24558333,31677531,10000719-36,36.0,P459OQ,2140-04-16 03:00:00,2140-04-18 17:00:00,MAIN,Oxytocin,...,3283,63323001201,10 Units/mL Vial,,20,UNIT,2,VIAL,,IV DRIP


In [133]:
ndc_mapping = pd.read_csv('ndc_product.txt', header=0, delimiter="\t", encoding='cp1252')

In [134]:
##### from MIMIC preprocessing
def read_ndc_mapping(map_path):
    ndc_map = pd.read_csv(map_path, header=0, delimiter='\t', encoding='latin1')
    ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.fillna("")
    ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.apply(str.lower)
    ndc_map.columns = list(map(str.lower, ndc_map.columns))
    return ndc_map

def read_prescriptions_table(mimic4_path):
    meds = dataframe_from_csv(os.path.join(mimic4_path, 'hosp/prescriptions.csv.gz'))
    meds = meds.reset_index()
    return meds[['subject_id', 'hadm_id', 'starttime', 'stoptime', 'ndc', 'gsn', 'drug', 'drug_type']]

### NOTE - not used anywhere...
def get_generic_drugs(mapping, df):
    """Takes NDC product table and prescriptions dataframe; adds column with NDC table's corresponding generic name"""

    def brand_to_generic(ndc):
        # We only want the first 2 sections of the NDC code: xxxx-xxxx-xx
        matches = list(re.finditer(r"-", ndc))
        if len(matches) > 1:
            ndc = ndc[:matches[1].start()]
        try:
            return mapping.loc[mapping.PRODUCTNDC == ndc].NONPROPRIETARYNAME.iloc[0]
        except:
            print("Error: ", ndc)
            return np.nan

    df['generic_drug_name'] = df['ndc'].apply(brand_to_generic)

def preproc_meds(module_path:str, adm_cohort_path:str, mapping:str) -> pd.DataFrame:
  
    adm = pd.read_csv(adm_cohort_path, usecols=['hadm_id', 'admittime'], parse_dates = ['admittime'])
    med = pd.read_csv(module_path, compression='gzip', usecols=['subject_id', 'hadm_id', 'drug', 'starttime', 'stoptime','ndc','dose_val_rx'], parse_dates = ['starttime', 'stoptime'])
    med = med.merge(adm, left_on = 'hadm_id', right_on = 'hadm_id', how = 'inner')
    med['start_hours_from_admit'] = med['starttime'] - med['admittime']
    med['stop_hours_from_admit'] = med['stoptime'] - med['admittime']
    
    # Normalize drug strings and remove potential duplicates

    med.drug = med.drug.fillna("").astype(str)
    med.drug = med.drug.apply(lambda x: x.lower().strip().replace(" ", "_") if not "" else "")
    med.drug=med.drug.dropna().apply(lambda x: x.lower().strip())
    
    #meds.to_csv(output_path, compression='gzip', index=False)
    med = ndc_meds(med,mapping)
    
    print("Number of unique type of drug: ", med.drug.nunique())
    print("Number of unique type of drug (after grouping to use Non propietary names): ", med.nonproprietaryname.nunique())
    print("Total number of rows: ", med.shape[0])
    print("# Admissions:  ", med.hadm_id.nunique())
    
    return med
    
    
def ndc_meds(med, mapping:str) -> pd.DataFrame:
    
    # Convert any nan values to a dummy value
    med.ndc = med.ndc.fillna(-1)

    # Ensures the decimal is removed from the ndc col
    med.ndc = med.ndc.astype("Int64")
    
    # The NDC codes in the prescription dataset is the 11-digit NDC code, although codes are missing
    # their leading 0's because the column was interpreted as a float then integer; this function restores
    # the leading 0's, then obtains only the PRODUCT and MANUFACTUERER parts of the NDC code (first 9 digits)
    def to_str(ndc):
        if ndc < 0:         # dummy values are < 0
            return np.nan
        ndc = str(ndc)
        return (("0"*(11 - len(ndc))) + ndc)[0:-2]

    # The mapping table is ALSO incorrectly formatted for 11 digit NDC codes. An 11 digit NDC is in the
    # form of xxxxx-xxxx-xx for manufacturer-product-dosage. The hyphens are in the correct spots, but
    # the number of digits within each section may not be 5-4-2, in which case we add leading 0's to each
    # to restore the 11 digit format. However, we only take the 5-4 sections, just like the to_str function
    def format_ndc_table(ndc):
        parts = ndc.split("-")
        return ("0"*(5 - len(parts[0])) + parts[0]) + ("0"*(4 - len(parts[1])) + parts[1])
    
    def read_ndc_mapping2(map_path):
        ndc_map = pd.read_csv(map_path, header=0, delimiter='\t', encoding = 'latin1')
        ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.fillna("")
        ndc_map.NONPROPRIETARYNAME = ndc_map.NONPROPRIETARYNAME.apply(str.lower)
        ndc_map.columns = list(map(str.lower, ndc_map.columns))
        return ndc_map
    
    # Read in NDC mapping table
    ndc_map = read_ndc_mapping2(mapping)[['productndc', 'nonproprietaryname', 'pharm_classes']]
    
    # Normalize the NDC codes in the mapping table so that they can be merged
    ndc_map['new_ndc'] = ndc_map.productndc.apply(format_ndc_table)
    ndc_map.drop_duplicates(subset=['new_ndc', 'nonproprietaryname'], inplace=True)
    med['new_ndc'] = med.ndc.apply(to_str)  
    
    # Left join the med dataset to the mapping information
    med = med.merge(ndc_map, how='inner', left_on='new_ndc', right_on='new_ndc')
    
    # In NDC mapping table, the pharm_class col is structured as a text string, separating different pharm classes from eachother
    # This can be [PE], [EPC], and others, but we're interested in EPC. Luckily, between each commas, it states if a phrase is [EPC]
    # So, we just string split by commas and keep phrases containing "[EPC]"
    def get_EPC(s):
        """Gets the Established Pharmacologic Class (EPC) from the mapping table"""
        if type(s) != str:
            return np.nan
        words = s.split(",")
        return [x for x in words if "[EPC]" in x]
    
    # Function generates a list of EPCs, as a drug can have multiple EPCs
    med['EPC'] = med.pharm_classes.apply(get_EPC)
    
    return med

In [135]:
ndc_mapping = read_ndc_mapping('ndc_product.txt')
med = prescriptions_df
# Normalize drug strings and remove potential duplicates

med.dropna(inplace=True)
# med.drug = med.drug.fillna("").astype(str)
# med.drug = med.drug.apply(lambda x: x.lower().strip().replace(" ", "_") if not "" else "")
# med.drug=med.drug.dropna().apply(lambda x: x.lower().strip())

#meds.to_csv(output_path, compression='gzip', index=False)
med = ndc_meds(med,'ndc_product.txt')

print("Number of unique type of drug: ", med.drug.nunique())
print("Number of unique type of drug (after grouping to use Non propietary names): ", med.nonproprietaryname.nunique())
print("Total number of rows: ", med.shape[0])
print("# Admissions:  ", med.hadm_id.nunique())

Number of unique type of drug:  368
Number of unique type of drug (after grouping to use Non propietary names):  289
Total number of rows:  92311
# Admissions:   24443


In [136]:
med.head()

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route,new_ndc,productndc,nonproprietaryname,pharm_classes,EPC
0,10001884,21268656,20911660,10001884-235,235.0,P71L3J,2125-10-18 23:00:00,2125-10-19 22:00:00,MAIN,Magnesium Sulfate,...,gm,1,BAG,1.0,IV,4096729,0409-6729,magnesium sulfate in water,"Calculi Dissolution Agent [EPC],Magnesium Ion ...","[Calculi Dissolution Agent [EPC], Osmotic Laxa..."
1,10001884,21577720,43978059,10001884-379,379.0,P66XA3,2125-12-27 10:00:00,2125-12-27 21:00:00,MAIN,Magnesium Sulfate,...,gm,1,BAG,1.0,IV,4096729,0409-6729,magnesium sulfate in water,"Calculi Dissolution Agent [EPC],Magnesium Ion ...","[Calculi Dissolution Agent [EPC], Osmotic Laxa..."
2,10001884,23594368,88623458,10001884-337,337.0,P23RXE,2125-12-03 10:00:00,2125-12-03 19:00:00,MAIN,Magnesium Sulfate,...,gm,1,BAG,1.0,IV,4096729,0409-6729,magnesium sulfate in water,"Calculi Dissolution Agent [EPC],Magnesium Ion ...","[Calculi Dissolution Agent [EPC], Osmotic Laxa..."
3,10001884,26170293,8650292,10001884-739,739.0,P063OH,2130-04-17 12:00:00,2130-04-18 11:00:00,MAIN,Magnesium Sulfate,...,gm,1,BAG,1.0,IV,4096729,0409-6729,magnesium sulfate in water,"Calculi Dissolution Agent [EPC],Magnesium Ion ...","[Calculi Dissolution Agent [EPC], Osmotic Laxa..."
4,10001884,26184834,3491803,10001884-1794,1794.0,P72T1M,2131-01-14 20:00:00,2131-01-16 07:00:00,MAIN,Vancomycin,...,mg,1,BAG,2.0,IV,3383580,0338-3580,vancomycin hydrochloride,"Glycopeptide Antibacterial [EPC],Glycopeptides...",[Glycopeptide Antibacterial [EPC]]


In [137]:
med_cols_to_drop = ['drug_type', 'formulary_drug_cd', 'gsn', 'form_rx', 'poe_id', 'poe_seq', 'order_provider_id',
                   'productndc', 'new_ndc', 'pharm_classes', 'EPC', 'form_unit_disp', 'form_val_disp', 'ndc',
                   'drug']
med = med.drop(columns=med_cols_to_drop)
med.head()
# columns used in MIMIC preprocessing paper:
#'subject_id', 'hadm_id', 'starttime','stoptime','drug','nonproprietaryname', 'start_hours_from_admit', 'stop_hours_from_admit','dose_val_rx

Unnamed: 0,subject_id,hadm_id,pharmacy_id,starttime,stoptime,prod_strength,dose_val_rx,dose_unit_rx,doses_per_24_hrs,route,nonproprietaryname
0,10001884,21268656,20911660,2125-10-18 23:00:00,2125-10-19 22:00:00,2 g / 50 mL Premix Bag,2,gm,1.0,IV,magnesium sulfate in water
1,10001884,21577720,43978059,2125-12-27 10:00:00,2125-12-27 21:00:00,2 g / 50 mL Premix Bag,2,gm,1.0,IV,magnesium sulfate in water
2,10001884,23594368,88623458,2125-12-03 10:00:00,2125-12-03 19:00:00,2 g / 50 mL Premix Bag,2,gm,1.0,IV,magnesium sulfate in water
3,10001884,26170293,8650292,2130-04-17 12:00:00,2130-04-18 11:00:00,2 g / 50 mL Premix Bag,2,gm,1.0,IV,magnesium sulfate in water
4,10001884,26184834,3491803,2131-01-14 20:00:00,2131-01-16 07:00:00,750 mg / 150 mL Premix Bag,750,mg,2.0,IV,vancomycin hydrochloride


In [138]:
save_df_as_csv(med, 'hosp_prescriptions.csv')

DataFrame has been saved as dataframes/hosp_prescriptions.csv


## Procedures_ICD

- Use similar approach that was used in "An Extensive Data Processing Pipeline for MIMIC-IV" for diagnoses
- Note that the paper filters out ICD9 codes because there is no access to a mapping of ICD-9 to ICD-10 procedures


My approach:
- read in procedure table
- drop unnecessary columns
- look at how many ICD-9 codes there are and how to deal with them
- convert each of the 3 first digits, then check new shape

In [139]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
proc_df = load_table(connection, 'mimiciv_hosp.filtered_procedures_icd')
print(proc_df.shape)
proc_df.head()

Connected to db: smcdougall
(82672, 6)


Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version
0,10000719,24558333,1,2140-04-16,7569,9
1,10001319,23005466,1,2135-07-20,7359,9
2,10001319,24591241,1,2138-11-10,7359,9
3,10001319,29230609,1,2134-04-15,7569,9
4,10001472,23506139,1,2186-01-11,7569,9


In [140]:
proc_df[proc_df['icd_version'] == 9].shape 

(53882, 6)

The majority of the procedures are icd version 9... so it doesn't really make sense in this case to drop them. Maybe start with converting to first 3 digits and then do a manual mapping somehow

See https://www.cms.gov/medicare/coding/icd10/downloads/icd-10_gem_fact_sheet.pdf - not sure if CMS actually has an ICD-9 to ICD-10 mapping that is publicly available

In [141]:
proc_df['icd_code'].nunique()

4227

In [143]:
proc_df[proc_df['icd_version'] == 9]['icd_code'].nunique()

1416

In [144]:
proc_df[proc_df['icd_version'] == 10]['icd_code']

6        5A1945Z
7        5A1955Z
8        0BH17EZ
9        5A1223Z
10       5A12012
          ...   
82664    04LE3DT
82665    0U7C7ZZ
82666    B41CYZZ
82667    0DBN8ZX
82668    0DBP8ZX
Name: icd_code, Length: 28790, dtype: object

- https://www.nber.org/research/data/icd-9-cm-and-icd-10-cm-and-icd-10-pcs-crosswalk-or-general-equivalence-mappings -- look into this and see if I can find a CSV file that can be used for mapping purposes


- PCS == Procedure Coding System
- use General Equivalence Mappings (GEMs) from CMS for mappings ICD-9 PCS to ICD-10 PCS
- then group by root operation -----> first character is broad category (e.g., Medical and Surgical, Obstetrics, Imaging, etc.), fourth character (Root Operation), and sometimes subsequent characters (Body Part, Approach, Device, etc.)
- Filter by section and root operation
- ICD-9-CM codes consist of up to five alphanumeric characters but ICD-10-PCS codes are much more detailed and specific than ICD-9-CM. They consist of seven alphanumeric characters ((which means we may need to use a different approach for consolidation rather than simply truncating to the first 3 digits...)

Source - https://www.cms.gov/medicare/coding/icd10/downloads/2014-pcs-procedure-coding-system.pdf

First character represents the "section":
- 0 - Medical and Surgical
- 1 - Obstetrics
- 2 - Placement
- 3 - Administration
- 4 - Measurement and Monitoring
- 5 - Extracorporeal Assistance and Performance
- 6 - Extracorporeal Therapies
- 7 - Osteopathic
- 8 - Other Procedures
- 9 - Chiropractic
- B - Imaging
- C - Nuclear Medicine
- D - Radiation Oncology
- F - Physical Rehabilitation and Diagnostic Audiology
- G - Mental Health
- H - Substance Abuse Treatment

- Character 1 = Section
- Character 2 = Body System
- Character 3 = Root Operation
- Character 4 = Body Part
- Character 5 = Approach
- Character 6 = Device
- Character 7 = Qualifier


**Decision**: Only use the first 4 characters. 5 represents approach to reach procedure site. 6 represents whether device was used during procedure. 7 may have specific meaning for a limited range of values. Not super necessary for the analysis.

In [146]:
# ICD-9-PCS to ICD-10-PCS CSV file from nber.org (link above)
procedure_mapping_df = pd.read_csv('icd9toicd10pcsgem.csv')

In [147]:
# captures ICD-9 codes from 0-9999
# note 'icd9cm' and 'icd10cm' columns are the same as those references in the diagnosis mapping that was imported
procedure_mapping_df.tail()

Unnamed: 0,icd9cm,icd10cm,flags,approximate,no_map,combination,scenario,choice_list
69361,9999,0WQLXZZ,10000,1,0,0,0,0
69362,9999,0XQ6XZZ,10000,1,0,0,0,0
69363,9999,0XQ7XZZ,10000,1,0,0,0,0
69364,9999,0YQ9XZZ,10000,1,0,0,0,0
69365,9999,0YQBXZZ,10000,1,0,0,0,0


In [148]:
def standardize_icd_procedures(
    mapping: pd.DataFrame, proc: pd.DataFrame, map_code_col="icd9cm", root=True
) -> str:
    """Takes an ICD9 -> ICD10 mapping table and a ICD procedures dataframe;
    adds column with converted ICD10 column"""

    count = 0
    code_cols = mapping.columns
    errors = []

    def icd_9to10(icd):
        """Function use to apply over the proc DataFrame for ICD9->ICD10 conversion"""

        # if root and does_not_start_with_prefixes(icd, PREG_ICD9_PREFIXES):
        #     icd = icd[:3]

        if map_code_col not in code_cols:
            errors.append(f"ICD NOT FOUND: {icd}")
            return np.nan

        print(int(icd))
        matches = mapping.loc[mapping[map_code_col] == int(icd)]
        if matches.shape[0] == 0:
            errors.append(f"ICD NOT FOUND: {icd}")
            return np.nan

        return mapping.loc[mapping[map_code_col] == int(icd)].icd10cm.iloc[0]

    # Create new column with original codes as default
    col_name = "root_icd10_convert"
    proc[col_name] = proc["icd_code"].values

    # Group identical ICD9 codes, then convert all ICD9 codes within
    # a group to ICD10
    for code, group in proc.loc[proc.icd_version == 9].groupby(by="icd_code"):
        new_code = icd_9to10(code)
        for idx in group.index.values:
            # Modify values of original df at the indexes in the groups
            proc.at[idx, col_name] = new_code

        count += group.shape[0]

    proc["root"] = proc[col_name].apply(lambda x: x[:4] if type(x) is str
                                        else np.nan)

In [150]:
proc_df['root_icd10_convert'].nunique()

3640

In [151]:
proc_df.dropna(inplace=True)

In [152]:
proc_df[proc_df['root'].str.startswith('0')]['root'].nunique()

1848

In [153]:
proc_df[proc_df['root'].str.startswith('1')]['root'].nunique()

14

In [154]:
# only keep Medical and Surgical, and Obstetrics procedures
filtered_proc_df = proc_df[proc_df['root'].str.startswith(('0', '1'))]

In [155]:
filtered_proc_df['root'].nunique()

1862

In [156]:
filtered_proc_df['root']

0        0TQD
1        10E0
2        10E0
3        0TQD
4        0TQD
         ... 
82665    0U7C
82667    0DBN
82668    0DBP
82670    1090
82671    0TQD
Name: root, Length: 61667, dtype: object

In [157]:
filtered_proc_df

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version,root_icd10_convert,root
0,10000719,24558333,1,2140-04-16,7569,9,0TQD7ZZ,0TQD
1,10001319,23005466,1,2135-07-20,7359,9,10E0XZZ,10E0
2,10001319,24591241,1,2138-11-10,7359,9,10E0XZZ,10E0
3,10001319,29230609,1,2134-04-15,7569,9,0TQD7ZZ,0TQD
4,10001472,23506139,1,2186-01-11,7569,9,0TQD7ZZ,0TQD
...,...,...,...,...,...,...,...,...
82665,19999043,24799384,4,2164-12-18,0U7C7ZZ,10,0U7C7ZZ,0U7C
82667,19999464,23033573,1,2171-08-02,0DBN8ZX,10,0DBN8ZX,0DBN
82668,19999464,23033573,2,2171-08-02,0DBP8ZX,10,0DBP8ZX,0DBP
82670,19999464,28135642,2,2164-01-19,7309,9,10907ZC,1090


In [158]:
filtered_proc_df = filtered_proc_df.assign(icd_code_root=filtered_proc_df['root']).drop(columns=['root_icd10_convert', 'root', 'icd_version',
                                                                                                'icd_code'])
filtered_proc_df.head()

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code_root
0,10000719,24558333,1,2140-04-16,0TQD
1,10001319,23005466,1,2135-07-20,10E0
2,10001319,24591241,1,2138-11-10,10E0
3,10001319,29230609,1,2134-04-15,0TQD
4,10001472,23506139,1,2186-01-11,0TQD


In [159]:
filtered_proc_df.shape

(61667, 5)

In [160]:
filtered_proc_df['icd_code_root'].nunique()

1862

In [161]:
save_df_as_csv(filtered_proc_df, 'hosp_procedures.csv')

DataFrame has been saved as dataframes/hosp_procedures.csv
