# Static Fields Integration
This Jupyter Notebook compiles the dataframes with "static" data fields into a single dataframe. "Static" in this context refers to data that is representative at either the "admission level"/"stay level" or at the "patient level." It does not include data that is time series data (those data frames are handled in `time_series_integration.ipynb`.

## Relevant data frames
- ed_diagnosis
- hosp_admissions
- hosp_diagnoses
- hosp_patients

In [34]:
import pandas as pd
import csv
import numpy as np
import seaborn as sns
import os
import psycopg2
from psycopg2 import OperationalError, DatabaseError, sql

def read_csv(csv_file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)
    print(csv_file_path)
    print('Shape:', df.shape)
    print(df.head())
    return df

def convert_to_int(value):
    try:
        # Check if the value is not NaN and is a valid number
        if pd.notna(value):
            return int(value)
        else:
            return np.nan
    except ValueError:
        return np.nan

def is_numeric(value):
    """Check if a value is numeric."""
    try:
        float(value)
        return True
    except ValueError:
        return False

def convert_outliers(df, vitalsigns_df, itemid_col='itemid', low_col='OUTLIER LOW', high_col='OUTLIER HIGH'):
    # Merge original DataFrame with mapping DataFrame on the itemid column
    merged_df = pd.merge(df, vitalsigns_df, how='left', left_on=itemid_col, right_on=itemid_col)
    
    # convert to nan if outside of the acceptable range
    for itemid in merged_df[itemid_col].unique()[:1]:
        # get thresholds for the item
        thresholds = merged_df[merged_df[itemid_col] == itemid]
        if thresholds.empty:
            continue

        low_threshold = thresholds[low_col].values[0]
        high_threshold = thresholds[high_col].values[0]

        df['value'] = df.apply(
                lambda row: np.nan if is_numeric(row['value']) and row['itemid'] == itemid and (
                    float(row['value']) > high_threshold or
                    float(row['value']) < low_threshold
                ) else row['value'],
                axis=1
            )

"""
Saves pandas DataFrame as a CSV file.
"""
def save_df_as_csv(df, csv_name, directory='dataframes'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    file_path = os.path.join(directory, csv_name)
    df.to_csv(file_path, index=False)

    print(f'DataFrame has been saved as {file_path}')

In [2]:
ed_diagnosis_df = read_csv('dataframes/ed_diagnosis.csv')
admissions_df = read_csv('dataframes/hosp_admissions.csv')
hosp_diagnosis_df = read_csv('dataframes/hosp_diagnoses.csv')
triage_df = read_csv('dataframes/ed_triage.csv')
patients_df = read_csv('dataframes/hosp_patients.csv')

dataframes/ed_diagnosis.csv
Shape: (84431, 5)
   subject_id   stay_id  seq_num icd_code  \
0    10001884  31306678        1      J45   
1    10001884  31742950        1      J44   
2    10001884  33281437        1      R06   
3    10001884  33281437        2      I10   
4    10001884  33281437        3      J45   

                                           icd_title  
0       Unspecified asthma with (acute) exacerbation  
1  Chronic obstructive pulmonary disease w (acute...  
2                                     CHEST PAIN NOS  
3                                   HYPERTENSION NOS  
4                                ASTHMA, UNSPECIFIED  
dataframes/hosp_admissions.csv
Shape: (56822, 10)
   subject_id   hadm_id            admittime            dischtime  \
0    10000719  24558333  2140-04-15 00:14:00  2140-04-18 12:29:00   
1    10001319  23005466  2135-07-20 03:45:00  2135-07-22 11:38:00   
2    10001319  24591241  2138-11-09 20:00:00  2138-11-12 10:40:00   
3    10001319  29230609  21

In [3]:
dfs = {
    'ed_diagnosis': ed_diagnosis_df,
    'admissions': admissions_df,
    'hosp_diagnosis': hosp_diagnosis_df,
    'patients': patients_df,
    'triage': triage_df
}

# Add 'source' column 
for name, df in dfs.items():
    df['source'] = name

# verify success
ed_diagnosis_df.head()

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_title,source
0,10001884,31306678,1,J45,Unspecified asthma with (acute) exacerbation,ed_diagnosis
1,10001884,31742950,1,J44,Chronic obstructive pulmonary disease w (acute...,ed_diagnosis
2,10001884,33281437,1,R06,CHEST PAIN NOS,ed_diagnosis
3,10001884,33281437,2,I10,HYPERTENSION NOS,ed_diagnosis
4,10001884,33281437,3,J45,"ASTHMA, UNSPECIFIED",ed_diagnosis


In [4]:
ed_diagnosis_df = ed_diagnosis_df.drop(columns=['seq_num', 'icd_title'])
static_df = ed_diagnosis_df

In [5]:
hosp_diagnosis_df.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,source
0,10001472,23506139,4,E89,hosp_diagnosis
1,10001472,23506139,7,D25,hosp_diagnosis
2,10001472,23506139,9,M48,hosp_diagnosis
3,10001884,21192799,1,J44,hosp_diagnosis
4,10001884,21192799,2,R09,hosp_diagnosis


In [6]:
hosp_diagnosis_df = hosp_diagnosis_df.drop(columns=['seq_num'])

In [7]:
static_df = pd.concat([static_df, hosp_diagnosis_df], ignore_index=True)

In [8]:
static_df = static_df.drop_duplicates()
static_df

Unnamed: 0,subject_id,stay_id,icd_code,source,hadm_id
0,10001884,31306678.0,J45,ed_diagnosis,
1,10001884,31742950.0,J44,ed_diagnosis,
2,10001884,33281437.0,R06,ed_diagnosis,
3,10001884,33281437.0,I10,ed_diagnosis,
4,10001884,33281437.0,J45,ed_diagnosis,
...,...,...,...,...,...
607059,19999464,,Q45,hosp_diagnosis,23033573.0
607060,19999464,,R51,hosp_diagnosis,23033573.0
607061,19999464,,R19,hosp_diagnosis,23033573.0
607062,19999464,,E55,hosp_diagnosis,23033573.0


In [9]:
triage_df.head()

Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint,source
0,10001884,31306678,,,,79.0,,,unable,1.0,Hypoxia,triage
1,10001884,31742950,97.6,67.0,22.0,97.0,132.0,82.0,0,2.0,Dyspnea,triage
2,10001884,33281437,98.2,72.0,20.0,97.0,157.0,66.0,2,3.0,CP/SOB,triage
3,10001884,33478776,98.4,74.0,16.0,100.0,142.0,69.0,0,3.0,"Dyspnea, Fatigue",triage
4,10001884,34226385,98.2,65.0,18.0,97.0,113.0,91.0,0,2.0,Dyspnea,triage


In [10]:
triage_df['pain'].value_counts()

pain
0                   11926
10                   5958
8                    5307
7                    3827
5                    3023
                    ...  
in legs to touch        1
"10"                    1
uta.                    1
9-10                    1
+                       1
Name: count, Length: 204, dtype: int64

In [11]:
import re

def clean_pain_value(value):
    if pd.isna(value):
        return np.nan
    value = value.lower()
    # if the value is 'unable', we do not know if it was set to 'unable' because the patient could
    # not provide an answer, or if there was a data collection issue, so set to N/A
    if 'unable' in value or 'refused' in value:
        return np.nan
    if 'bad' in value:
        return 8

    value = re.sub(r'[^\d/.]', '', value)  # remove all non-numeric and non-slash characters
    value = re.sub(r'/.*$', '', value)     # remove anything after and including a slash (ex. 8/10)
    value = re.sub(r'^(\d+)', r'\1', value)  # keep only the leading digits
    
    try:
        numeric_value = float(value)
        # round down if the value is a decimal
        numeric_value = np.floor(numeric_value)
        if 0 <= numeric_value <= 10:
            return int(numeric_value)
        else:
            return np.nan
    except ValueError:
        return np.nan
triage_df['pain'] = triage_df['pain'].apply(clean_pain_value)

In [12]:
def clean_temperature(value):
    if pd.isna(value):
        return np.nan
    # if temperature between 14 and 58, it likely needs to be converted from Celsius to Fahrenheit
    if 14 <= value <= 58:
        # C --> F conversion
        return value * 9/5 + 32
    # if temperature between 900 and 1000, it's likely that a decimal needs to be inserted
    elif 900 <= value <= 1000:
        # insert a decimal so that temp is between 90 - 100
        return value / 10
    # these are considered outliers
    # hard to distinguish whether there is a typo or the wrong vital sign recorded (ex. heart rate as temp)
    elif value < 58 or value > 117:
        return np.nan
triage_df['temperature'] = triage_df['temperature'].apply(clean_temperature)

In [13]:
triage_df['temperature'].describe()

count     42.000000
mean      97.592857
std        6.285377
min       62.600000
25%       97.160000
50%       97.790000
75%       98.690000
max      116.420000
Name: temperature, dtype: float64

In [14]:
# MIMIC Extract outliers are < 0 or > 390
triage_df.loc[triage_df['heartrate'] > 390, 'heartrate'] = np.nan

In [15]:
# everything is writhin range
triage_df['resprate'].describe()

count    41975.000000
mean        17.629089
std          2.364246
min          0.000000
25%         16.000000
50%         18.000000
75%         18.000000
max         85.000000
Name: resprate, dtype: float64

In [16]:
# MIMIC Extract outliers are < 0 or > 150
triage_df.loc[triage_df['o2sat'] > 150, 'o2sat'] = np.nan

In [17]:
# MIMIC Extract outliers are < 0 or > 375
triage_df.loc[triage_df['dbp'] > 375, 'dbp'] = np.nan
triage_df.loc[triage_df['sbp'] > 375, 'sbp'] = np.nan

In [18]:
triage_df['acuity'].value_counts()

acuity
3.0    25458
2.0    13187
4.0     2148
1.0     2067
5.0       65
Name: count, dtype: int64

In [19]:
triage_df['chiefcomplaint'].value_counts()

chiefcomplaint
Abd pain                                   1512
Chest pain                                  857
ABD PAIN                                    746
Dyspnea                                     700
Wound eval                                  515
                                           ... 
ST,RASH                                       1
FACIAL NUMBNESS/SWELLING                      1
RLQ PAIN/PREG                                 1
Chest pain, Epistaxis, Productive cough       1
Back pain, Headache, s/p Fall                 1
Name: count, Length: 10911, dtype: int64

In [20]:
triage_df['is_preg_chief_complaint'] = triage_df['chiefcomplaint'].astype(str).apply(lambda x: 1 if 'preg' in x.lower() else 0)
triage_df = triage_df.drop(columns=['chiefcomplaint'])

In [21]:
triage_df['is_preg_chief_complaint'].value_counts()

is_preg_chief_complaint
0    40645
1     2553
Name: count, dtype: int64

In [22]:
static_df = pd.concat([static_df, triage_df], ignore_index=True)
static_df = static_df.drop_duplicates()
static_df.head()

Unnamed: 0,subject_id,stay_id,icd_code,source,hadm_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,is_preg_chief_complaint
0,10001884,31306678.0,J45,ed_diagnosis,,,,,,,,,,
1,10001884,31742950.0,J44,ed_diagnosis,,,,,,,,,,
2,10001884,33281437.0,R06,ed_diagnosis,,,,,,,,,,
3,10001884,33281437.0,I10,ed_diagnosis,,,,,,,,,,
4,10001884,33281437.0,J45,ed_diagnosis,,,,,,,,,,


In [23]:
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,admission_location,discharge_location,edregtime,edouttime,hospital_expire_flag,admission_type_ordinal,source
0,10000719,24558333,2140-04-15 00:14:00,2140-04-18 12:29:00,Referral,Home/Home Health Care,,,0,2,admissions
1,10001319,23005466,2135-07-20 03:45:00,2135-07-22 11:38:00,Referral,Home/Home Health Care,,,0,2,admissions
2,10001319,24591241,2138-11-09 20:00:00,2138-11-12 10:40:00,Referral,Home/Home Health Care,,,0,2,admissions
3,10001319,29230609,2134-04-15 07:59:00,2134-04-17 13:23:00,Referral,Home/Home Health Care,,,0,2,admissions
4,10001472,23506139,2186-01-10 00:00:00,2186-01-13 15:02:00,Referral,Home/Home Health Care,,,0,2,admissions


In [31]:
# we can drop these fields because we are already using ED data in the time series df
admissions_df = admissions_df.drop(columns=['edregtime', 'edouttime', 'admission_type_ordinal'])

In [32]:
static_df = pd.concat([static_df, admissions_df], ignore_index=True)
static_df = static_df.drop_duplicates()
static_df.head()

Unnamed: 0,subject_id,stay_id,icd_code,source,hadm_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,is_preg_chief_complaint,admittime,dischtime,admission_location,discharge_location,hospital_expire_flag
0,10001884,31306678.0,J45,ed_diagnosis,,,,,,,,,,,,,,,
1,10001884,31742950.0,J44,ed_diagnosis,,,,,,,,,,,,,,,
2,10001884,33281437.0,R06,ed_diagnosis,,,,,,,,,,,,,,,
3,10001884,33281437.0,I10,ed_diagnosis,,,,,,,,,,,,,,,
4,10001884,33281437.0,J45,ed_diagnosis,,,,,,,,,,,,,,,


In [35]:
save_df_as_csv(static_df, 'static_data.csv', 'final_dfs')

DataFrame has been saved as final_dfs/static_data.csv


In [36]:
patients_df

Unnamed: 0,subject_id,hadm_id,insurance,language,marital_status,race,dod,age_at_admission,admit_date,source
0,10000719,24558333.0,Other,ENGLISH,SINGLE,White,,34,2140-04-15,patients
1,10001319,23005466.0,Other,ENGLISH,MARRIED,White,,30,2135-07-20,patients
2,10001319,24591241.0,Other,ENGLISH,MARRIED,White,,33,2138-11-09,patients
3,10001319,29230609.0,Other,ENGLISH,MARRIED,White,,29,2134-04-15,patients
4,10001472,23506139.0,Other,ENGLISH,MARRIED,White,,35,2186-01-10,patients
...,...,...,...,...,...,...,...,...,...,...
19101,19985387,,Other,ENGLISH,SINGLE,White,,34,2156-05-13,patients
19102,19986146,,Other,ENGLISH,SINGLE,White,,24,2151-01-25,patients
19103,19986146,,Other,ENGLISH,SINGLE,White,,24,2151-01-25,patients
19104,19991798,,Other,ENGLISH,SINGLE,Black or African American,,39,2127-08-04,patients


In [37]:
save_df_as_csv(patients_df, 'patient_data.csv', 'final_dfs')

DataFrame has been saved as final_dfs/patient_data.csv
