In [1]:
import psycopg2
from psycopg2 import OperationalError, DatabaseError, sql
import pandas as pd
import csv
import numpy as np

### Environment Variables for Connection ###
DB_NAME = 'smcdougall'
USERNAME = 'postgres'
PASSWORD = 'postgres'
HOST = 'localhost'
PORT = 5432 

def connect_to_postgres(db_name, username, password, host, port):
    connection = None
    try:
        connection = psycopg2.connect(
            dbname=db_name,
            user=username,
            password=password,
            host=host,
            port=port
        )
        print('Connected to db:', db_name)
        return connection
    except OperationalError as e:
        print('Received the following error:', e)
        return None

def verify_postgres_connection(connection):
    if connection is not None:
        try:
            cur = connection.cursor()
            cur.execute('SELECT version();')
            db_version = cur.fetchone()
            print('The Postgres database version is:', db_version)
            cur.close()
        except DatabaseError as e:
            print('Received the following error:', e)
    else:
        print('Connection to Postgres failed.')

def close_connection(connection):
    if connection is not None:
        connection.close()
        print('Postgres connection has been closed.')

connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
verify_postgres_connection(connection)
close_connection(connection)

Connected to db: smcdougall
The Postgres database version is: ('PostgreSQL 14.5 on aarch64-apple-darwin20.6.0, compiled by Apple clang version 12.0.5 (clang-1205.0.22.9), 64-bit',)
Postgres connection has been closed.


In [2]:
def load_csv_column_as_list(file_path):
    column_values = []
    with open(file_path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        for row in reader:
            if row:  # Check if row is not empty
                column_values.append(row[0])
    return column_values

file_path = "hosp_and_ed_pregnant_subjects.csv"
# remove column header from the list since we are only interested in the actual ids
hosp_and_ed_subjects = load_csv_column_as_list(file_path)[1:]

print(len(hosp_and_ed_subjects))

19076


# Pre-processing the tabular data

## The approach
We will be using a similar approach to that used of the MIMIC Extract paper (co-authored by Michael Hughes of Tufts), in which the many tables are categorized into different groups based on the data being represented. This approach will build up the pandas/python representation of the data by preprocessing the fields along the way.

## Pre-processing tabular data
We will start by pre-processing the tabular, static data. This primarily corresponds to patient demographics data.

In [3]:
def get_patient_table_data(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT subject_id, gender, anchor_age, anchor_year, anchor_year_group, dod
        FROM "mimiciv_hosp.filtered_patients"
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "gender", "anchor_age",
                                    "anchor_year", "anchor_year_group",
                                    "dod"])
    return df

In [4]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
patients_df = get_patient_table_data(connection)

Connected to db: smcdougall


In [5]:
patients_df

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000719,F,34,2140,2008 - 2010,
1,10001319,F,28,2133,2008 - 2010,
2,10001472,F,35,2186,2011 - 2013,
3,10001884,F,68,2122,2008 - 2010,2131-01-20
4,10002266,F,31,2124,2011 - 2013,
...,...,...,...,...,...,...
19071,19997367,F,63,2126,2011 - 2013,
19072,19997911,F,79,2188,2008 - 2010,
19073,19998198,F,24,2122,2011 - 2013,
19074,19999043,F,35,2164,2014 - 2016,


In [6]:
na_counts = patients_df.isna().sum()
print(na_counts)

subject_id               0
gender                   0
anchor_age               0
anchor_year              0
anchor_year_group        0
dod                  17285
dtype: int64


In [7]:
patients_df['gender'].value_counts()

gender
F    19076
Name: count, dtype: int64

### Handling Deceased Patients
There are several factors to consider when deciding how to handle deceased patients, such as:
- the patient may have passed many years after their pregnancy, but still within the timeframe in which the data was collected. If that is the case, keep the patient record
- maternal mortality is crucial to this analysis - conditions like hemorrhaging and preeclampsia may lead to death. if a patient has been diagnosed with one of the "adverse pregnancy" outcomes, they should be kept in the study, even if they ultimately died
- if a patient was pregnant and died during their pregnancy but did *not* receive one of the adverse outcomes, it is a gray area as to whether they should be kept or not. It may help to focus the analysis if these patients are removed

More investigation needs to be performed into the records to see whether their death occurred during, immediately after, or significantly after their pregnancy.

In [8]:
patients_df[patients_df['dod'].notna()]

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
3,10001884,F,68,2122,2008 - 2010,2131-01-20
9,10003400,F,72,2134,2011 - 2013,2137-09-02
24,10008924,F,46,2139,2008 - 2010,2139-09-09
37,10018052,F,45,2137,2011 - 2013,2137-04-27
61,10030753,F,47,2190,2008 - 2010,2201-06-10
...,...,...,...,...,...,...
18998,19954715,F,79,2129,2011 - 2013,2129-08-07
19001,19956777,F,83,2115,2008 - 2010,2118-11-22
19025,19972371,F,69,2154,2011 - 2013,2155-09-22
19030,19974576,F,75,2122,2014 - 2016,2123-05-07


For these patients, try to find the date for which the diagnosis was made -- requires looking at the `diagnoses_icd` table, joining that to on admissions on `hadm_id`, and looking at the difference between `admittime` and `dod`

In [9]:
def determine_death_during_preg(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT di.subject_id,
       ad.admittime AS admittime,
       p.dod AS date_of_death,
       di.icd_code AS diagnosis_code,
       ad.hadm_id
FROM mimiciv_hosp.diagnoses_icd di
INNER JOIN mimiciv_hosp.admissions ad ON di.hadm_id = ad.hadm_id
INNER JOIN mimiciv_hosp.patients p ON di.subject_id = p.subject_id
WHERE (
    (
        (di.icd_code LIKE 'Z33%' OR di.icd_code LIKE 'Z34%' OR di.icd_code LIKE 'Z3A%' 
         OR di.icd_code LIKE 'O0%' OR di.icd_code LIKE 'O1%' OR di.icd_code LIKE 'O2%' 
         OR di.icd_code LIKE 'O3%' OR di.icd_code LIKE 'O4%' OR di.icd_code LIKE 'O5%' 
         OR di.icd_code LIKE 'O6%' OR di.icd_code LIKE 'O7%' OR di.icd_code LIKE 'O8%' 
         OR di.icd_code LIKE 'O9%') AND di.icd_version = 10
    )
    OR
    (
        (di.icd_code LIKE 'V22%' OR di.icd_code LIKE 'V23%' OR di.icd_code LIKE 'V24%' 
         OR di.icd_code LIKE 'V27%' OR di.icd_code LIKE 'V28%' OR di.icd_code LIKE '63%' 
         OR di.icd_code LIKE '64%' OR di.icd_code LIKE '65%' OR di.icd_code LIKE '66%' 
         OR di.icd_code LIKE '67%') AND di.icd_version = 9
    )
)
AND p.dod IS NOT NULL  -- Include only rows where dod is not null
AND DATE_PART('month', age(p.dod, ad.admittime)) <= 10;  -- Ensure death date is within 10 months
    """)
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "admit_time", "dod", "icd_code", "hadm_id"])
    return df

In [10]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
death_df = determine_death_during_preg(connection)

Connected to db: smcdougall


Death after diagnosis made == admit time - dod < 0 

In [11]:
death_df['dod'] = pd.to_datetime(death_df['dod'])

# Calculate timedelta in days
death_df['days_diff'] = (death_df['dod'] - death_df['admit_time']).dt.days

filtered_df = death_df[death_df['days_diff'] <= 365]

died during the pregnancy == got the icd code within 10 months of dying, means that dod came *after* admission

ultimately we want to look for patients that were pregnant and died within 10 months of the pregnancy diagnosis (meaning they died during the pregnancy) -- use roughly 300 days as the buffer -- or better yet, start with a year and then we can narrow down from there

In [12]:
filtered_df['subject_id'].nunique()

18

We have 18 patients whom received pregnancy-related dx within a year of their death... let's narrow it down some more and see which ones had one of the identifiable adverse outcomes and which ones didn't

In [13]:
patterns = ['O20', 'O72', 'O46', 'O67', '641', '666', '64421', '6441', '7651', '7650', 'O14']
filtered_df = filtered_df[~(filtered_df['icd_code'].str.contains('|'.join(patterns)))]
filtered_df['subject_id'].nunique()

18

None of the patients are in the "adverse outcome" category -- do last checks before removing from the dataset going forward

Patients to remove: 10495653, 11611136, 10504589, 11101737, 15047583, 15014156, 15695321, 19017858, 18805396, 18892314, 17809756, 18186302


(a lot of these are unfortunately linked to mental health disorder icd codes)
some were not removed if the ICD-10 code was something like Z3A - so 38 weeks and the death happened 100 days later

Note - **14969719** and **17086592** has code 677 - "late effects of complications from pregnancy, childbirth, and the puerperium" - included or excluded?
**15500024** has peripartum cardiomyopathy - include or exclude?

Ultimately should include these patients because part of the experiment is to see if we can decipher patients with the pregnancy complications above from the ones that we are specifically studying

In [16]:
deceased_pats_to_exclude = [10495653, 11611136, 10504589, 11101737, 15047583, 15014156, 15695321, 19017858, 18805396, 18892314, 17809756, 18186302]

In [17]:
len(deceased_pats_to_exclude)

12

In [18]:
remaining_pats = [pat for pat in filtered_df['subject_id'] if pat not in deceased_pats_to_exclude]
print(set(remaining_pats))

{17086592, 16334309, 19104262, 14969719, 15500024, 11795258}


In [19]:
deceased_pats_to_exclude = [10495653, 11611136, 10504589, 11101737, 15047583, 15014156, 15695321, 19017858, 18805396, 18892314, 17809756, 18186302]
hosp_and_ed_subjects = [sub for sub in hosp_and_ed_subjects if sub not in deceased_pats_to_exclude]
print(len(hosp_and_ed_subjects))

19076


In [20]:
hosp_and_ed_subjects[0]

'10000719'

### Deciphering the Age Data

### How the logic works
- can subtract age from anchor year to determine birth year under the shifted system
- then find admit_time - birth year to determine their age during pregnancy
- need to join on diagnosis table - and find earliest record for their pregnancy dx?
- may need to determine how to handle outliers - approach could be to do the calculation first and see unique entries, and figure out how to capture... most recent pregnancy only? don't want to use patient multiple times if it will inflate their data

In [22]:
# temporarily (?) add column for birth year
patients_df['birth_year'] = patients_df['anchor_year'] - patients_df['anchor_age']

Next - join on diagnosis table (and admission table?) to get earliest record (?) of the pregnancy

In [24]:
def get_admissions_data(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT di.subject_id, 
        ad.admittime AS admittime,
       di.icd_code AS diagnosis_code,
       ad.hadm_id
                FROM mimiciv_hosp.diagnoses_icd di
                INNER JOIN mimiciv_hosp.patients p ON di.subject_id = p.subject_id
                INNER JOIN mimiciv_hosp.admissions ad ON di.hadm_id = ad.hadm_id
                WHERE (((di.icd_code LIKE 'Z33%' 
                    OR di.icd_code LIKE 'Z34%' 
                    OR di.icd_code LIKE 'Z3A%' 
                    OR di.icd_code LIKE 'O0%' 
                    OR di.icd_code LIKE 'O1%' 
                    OR di.icd_code LIKE 'O2%' 
                    OR di.icd_code LIKE 'O3%' 
                    OR di.icd_code LIKE 'O4%' 
                    OR di.icd_code LIKE 'O5%' 
                    OR di.icd_code LIKE 'O6%' 
                    OR di.icd_code LIKE 'O7%' 
                    OR di.icd_code LIKE 'O8%' 
                    OR di.icd_code LIKE 'O9%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE 'V22%'
                    OR di.icd_code LIKE 'V23%'
                    OR di.icd_code LIKE 'V24%'
                    OR di.icd_code LIKE 'V27%'
                    OR di.icd_code LIKE 'V28%'
                    OR di.icd_code LIKE '63%'
                    OR di.icd_code LIKE '64%'
                    OR di.icd_code LIKE '65%'
                    OR di.icd_code LIKE '66%'
                    OR di.icd_code LIKE '67%') AND di.icd_version = 9)
                    AND p.gender = 'F')
        UNION
        SELECT di.subject_id,
                ed.intime AS admittime, 
                di.icd_code AS diagnosis_code,
                ed.hadm_id
                FROM mimiciv_ed.diagnosis di
                INNER JOIN mimiciv_hosp.patients p ON di.subject_id = p.subject_id
                INNER JOIN mimiciv_ed.edstays ed ON ed.stay_id = di.stay_id
                WHERE (((di.icd_code LIKE 'Z33%' 
                    OR di.icd_code LIKE 'Z34%' 
                    OR di.icd_code LIKE 'Z3A%' 
                    OR di.icd_code LIKE 'O0%' 
                    OR di.icd_code LIKE 'O1%' 
                    OR di.icd_code LIKE 'O2%' 
                    OR di.icd_code LIKE 'O3%' 
                    OR di.icd_code LIKE 'O4%' 
                    OR di.icd_code LIKE 'O5%' 
                    OR di.icd_code LIKE 'O6%' 
                    OR di.icd_code LIKE 'O7%' 
                    OR di.icd_code LIKE 'O8%' 
                    OR di.icd_code LIKE 'O9%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE 'V22%'
                    OR di.icd_code LIKE 'V23%'
                    OR di.icd_code LIKE 'V24%'
                    OR di.icd_code LIKE 'V27%'
                    OR di.icd_code LIKE 'V28%'
                    OR di.icd_code LIKE '63%'
                    OR di.icd_code LIKE '64%'
                    OR di.icd_code LIKE '65%'
                    OR di.icd_code LIKE '66%'
                    OR di.icd_code LIKE '67%') AND di.icd_version = 9))
                    AND p.gender = 'F'
    """
               )
    rows = cur.fetchall()
    cur.close()
    # df = pd.DataFrame(rows, columns=["subject_id", "admit_time", "icd_code", "hadm_id"])
    df = pd.DataFrame(rows, columns=["subject_id", "admit_time", "icd_code", "hadm_id"])
    return df

connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
admissions_dx_df = get_admissions_data(connection)

Connected to db: smcdougall


In [25]:
ad_dx_pat_df = pd.merge(admissions_dx_df, patients_df, on='subject_id', how='inner')

In [26]:
ad_dx_pat_df['subject_id'].nunique()

13955

In [27]:
ad_dx_pat_df

Unnamed: 0,subject_id,admit_time,icd_code,hadm_id,gender,anchor_age,anchor_year,anchor_year_group,dod,birth_year
0,11162216,2157-12-10 02:53:00,O99214,28939128.0,F,20,2154,2014 - 2016,,2134
1,11629944,2128-02-20 14:59:00,64511,29417615.0,F,26,2125,2011 - 2013,,2099
2,10144712,2127-03-02 07:22:00,O770,26339743.0,F,26,2123,2014 - 2016,,2097
3,11787745,2119-07-27 18:16:00,V270,29640424.0,F,32,2119,2008 - 2010,,2087
4,11224954,2115-03-21 12:49:00,64511,26106646.0,F,31,2115,2011 - 2013,,2084
...,...,...,...,...,...,...,...,...,...,...
100289,18614452,2198-09-23 09:44:00,O24420,26836472.0,F,25,2188,2008 - 2010,,2163
100290,16812453,2167-06-10 00:00:00,O2686,22667011.0,F,29,2162,2014 - 2016,,2133
100291,15812409,2143-07-06 19:45:00,65961,24573027.0,F,36,2140,2008 - 2010,,2104
100292,19597429,2150-05-23 12:15:00,65971,23332417.0,F,35,2150,2014 - 2016,,2115


In [28]:
ad_dx_pat_df['admit_time'] = pd.to_datetime(ad_dx_pat_df['admit_time'])
ad_dx_pat_df['age_at_admission'] = ad_dx_pat_df['admit_time'].dt.year - ad_dx_pat_df['birth_year']

In [29]:
ad_dx_pat_df['age_at_admission']

0         23
1         29
2         30
3         32
4         31
          ..
100289    35
100290    34
100291    39
100292    35
100293    21
Name: age_at_admission, Length: 100294, dtype: int64

Next - group together all the diagnoses for each patient, and sort them by date -- group by subject_id, sort by admit_time

In [30]:
ad_dx_pat_df_sorted = ad_dx_pat_df.sort_values(by=['subject_id', 'admit_time']).groupby(['subject_id', 'admit_time']).first().reset_index()
ad_dx_pat_df_sorted['admit_date'] = ad_dx_pat_df_sorted['admit_time'].dt.date

In [31]:
ad_dx_pat_df_sorted

Unnamed: 0,subject_id,admit_time,icd_code,hadm_id,gender,anchor_age,anchor_year,anchor_year_group,dod,birth_year,age_at_admission,admit_date
0,10000719,2140-04-15 00:14:00,65951,24558333.0,F,34,2140,2008 - 2010,,2106,34,2140-04-15
1,10001319,2134-04-15 07:59:00,66401,29230609.0,F,28,2133,2008 - 2010,,2105,29,2134-04-15
2,10001319,2135-07-20 03:45:00,650,23005466.0,F,28,2133,2008 - 2010,,2105,30,2135-07-20
3,10001319,2138-11-09 20:00:00,V270,24591241.0,F,28,2133,2008 - 2010,,2105,33,2138-11-09
4,10001472,2186-01-10 00:00:00,65811,23506139.0,F,35,2186,2011 - 2013,,2151,35,2186-01-10
...,...,...,...,...,...,...,...,...,...,...,...,...
27142,19999043,2164-12-18 22:42:00,O021,24799384.0,F,35,2164,2014 - 2016,,2129,35,2164-12-18
27143,19999043,2165-01-01 10:22:00,O4692,,F,35,2164,2014 - 2016,,2129,36,2165-01-01
27144,19999043,2165-06-03 15:16:00,O031,21756272.0,F,35,2164,2014 - 2016,,2129,36,2165-06-03
27145,19999043,2165-06-21 19:30:00,O031,,F,35,2164,2014 - 2016,,2129,36,2165-06-21


From here:
- I have a single diagnosis for a given date
- Should do an investigation to see whether the patient has had multiple pregnancies...
- To do this -- find unique on the anchor age and see if there are entries for the same subject with different ages
- Also need to join on ED table to get the ED equivalent since there are some patients without records after we do the join...

In [32]:
ad_dx_pat_df_sorted = ad_dx_pat_df_sorted.drop_duplicates(subset=['subject_id', 'age_at_admission'])
ad_dx_pat_df_sorted

Unnamed: 0,subject_id,admit_time,icd_code,hadm_id,gender,anchor_age,anchor_year,anchor_year_group,dod,birth_year,age_at_admission,admit_date
0,10000719,2140-04-15 00:14:00,65951,24558333.0,F,34,2140,2008 - 2010,,2106,34,2140-04-15
1,10001319,2134-04-15 07:59:00,66401,29230609.0,F,28,2133,2008 - 2010,,2105,29,2134-04-15
2,10001319,2135-07-20 03:45:00,650,23005466.0,F,28,2133,2008 - 2010,,2105,30,2135-07-20
3,10001319,2138-11-09 20:00:00,V270,24591241.0,F,28,2133,2008 - 2010,,2105,33,2138-11-09
4,10001472,2186-01-10 00:00:00,65811,23506139.0,F,35,2186,2011 - 2013,,2151,35,2186-01-10
...,...,...,...,...,...,...,...,...,...,...,...,...
27138,19996869,2150-06-09 13:10:00,Z3A39,25367676.0,F,32,2148,2014 - 2016,,2116,34,2150-06-09
27139,19998198,2128-09-06 12:34:00,O1424,25917036.0,F,24,2122,2011 - 2013,,2098,30,2128-09-06
27140,19999043,2164-10-28 16:47:00,O09812,23037011.0,F,35,2164,2014 - 2016,,2129,35,2164-10-28
27143,19999043,2165-01-01 10:22:00,O4692,,F,35,2164,2014 - 2016,,2129,36,2165-01-01


Based on the number of rows (~30k), there are either patients who had a birthday during their pregnancy (hence the increase in age and addition column), *or* they had multiple pregnancies. Maybe keep the multiple pregnancies (since we are technically tracking the pregnancies and not necessarily the patient - so they can be represented by multiple rows). However, we only want one age represented for the pregnancy.

Let's make an assumption that the earliest a patient would become pregnant again is 12 weeks later (although it can be as early as three weeks, that is rare). So check the difference between the admission dates, if the difference is three months or less, drop the older age because it signifies that the patient became one year older during the pregnancy, but it's the same pregnancy

In [33]:
ad_dx_pat_df_sorted['age_at_admission']

0        34
1        29
2        30
3        33
4        35
         ..
27138    34
27139    30
27140    35
27143    36
27146    39
Name: age_at_admission, Length: 20227, dtype: int64

In [34]:
ad_dx_pat_df_sorted['admit_date'] = pd.to_datetime(ad_dx_pat_df_sorted['admit_date'])
ad_dx_pat_df_sorted['diff_admission_days'] = ad_dx_pat_df_sorted.groupby(['subject_id'])['admit_date'].diff().dt.days
mask_age_drop = (ad_dx_pat_df_sorted['diff_admission_days'] <= 90) & (ad_dx_pat_df_sorted['diff_admission_days'].notna())
df_filtered = ad_dx_pat_df_sorted[~mask_age_drop].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ad_dx_pat_df_sorted['admit_date'] = pd.to_datetime(ad_dx_pat_df_sorted['admit_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ad_dx_pat_df_sorted['diff_admission_days'] = ad_dx_pat_df_sorted.groupby(['subject_id'])['admit_date'].diff().dt.days


In [35]:
df_filtered.shape

(19952, 13)

In [36]:
df_filtered

Unnamed: 0,subject_id,admit_time,icd_code,hadm_id,gender,anchor_age,anchor_year,anchor_year_group,dod,birth_year,age_at_admission,admit_date,diff_admission_days
0,10000719,2140-04-15 00:14:00,65951,24558333.0,F,34,2140,2008 - 2010,,2106,34,2140-04-15,
1,10001319,2134-04-15 07:59:00,66401,29230609.0,F,28,2133,2008 - 2010,,2105,29,2134-04-15,
2,10001319,2135-07-20 03:45:00,650,23005466.0,F,28,2133,2008 - 2010,,2105,30,2135-07-20,461.0
3,10001319,2138-11-09 20:00:00,V270,24591241.0,F,28,2133,2008 - 2010,,2105,33,2138-11-09,1208.0
4,10001472,2186-01-10 00:00:00,65811,23506139.0,F,35,2186,2011 - 2013,,2151,35,2186-01-10,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27135,19996869,2148-01-07 12:29:00,Z3A01,,F,32,2148,2014 - 2016,,2116,32,2148-01-07,
27138,19996869,2150-06-09 13:10:00,Z3A39,25367676.0,F,32,2148,2014 - 2016,,2116,34,2150-06-09,884.0
27139,19998198,2128-09-06 12:34:00,O1424,25917036.0,F,24,2122,2011 - 2013,,2098,30,2128-09-06,
27140,19999043,2164-10-28 16:47:00,O09812,23037011.0,F,35,2164,2014 - 2016,,2129,35,2164-10-28,


In [37]:
df_filtered = df_filtered.drop(columns=['diff_admission_days'])

In [38]:
df_filtered['subject_id'].nunique()

13955

In [39]:
row_count = df_filtered.groupby('subject_id').count().reset_index()
filtered_row_count = row_count[row_count['age_at_admission'] > 1]
filtered_row_count[['subject_id', 'age_at_admission']].sort_values(by='age_at_admission')

Unnamed: 0,subject_id,age_at_admission
13951,19996869,2
7436,15310366,2
7437,15311832,2
7442,15313680,2
7444,15314421,2
...,...,...
13433,19608334,6
7943,15645017,6
9183,16543178,6
12672,19067086,7


Keep all the entries for now - they seem legitimate even though some are very high and need more investigation.

In [40]:
df_filtered['age_at_admission'].describe()

count    19952.000000
mean        31.930784
std          5.761032
min         18.000000
25%         28.000000
50%         32.000000
75%         36.000000
max         91.000000
Name: age_at_admission, dtype: float64

In [41]:
df_filtered[df_filtered['age_at_admission'] > 60]

Unnamed: 0,subject_id,admit_time,icd_code,hadm_id,gender,anchor_age,anchor_year,anchor_year_group,dod,birth_year,age_at_admission,admit_date
2949,11131318,2186-04-29 14:46:00,677,22805099.0,F,91,2186,2011 - 2013,2189-08-24,2095,91,2186-04-29
3552,11323384,2189-04-21 06:30:00,677,26184333.0,F,71,2183,2008 - 2010,,2112,77,2189-04-21
13422,14969719,2157-03-11 16:44:00,677,23822858.0,F,59,2155,2008 - 2010,2157-04-17,2096,61,2157-03-11
17733,16515239,2141-10-18 19:43:00,677,29019162.0,F,59,2138,2008 - 2010,,2079,62,2141-10-18


Seems that some of these patients refer to diagnosis codes that do not actually correspond to pregnancy... Most likely because of the use of the DRG Codes as filtering criteria.

First filter out records that are aged 60+ (pregnancy is virtually impossible at that point) - look to see all the diagnoses these patients had and whether any of them are actually pregnancy related or not -- all of their icd codes are 677 "late effects of complications of pregnancy, childbirth, and the puerperium"

In [42]:
df_filtered = df_filtered[df_filtered['age_at_admission'] < 60]

In [43]:
df_filtered['subject_id'].nunique()

13951

In [44]:
df_filtered['age_at_admission'].describe()

count    19947.000000
mean        31.921191
std          5.726583
min         18.000000
25%         28.000000
50%         32.000000
75%         36.000000
max         59.000000
Name: age_at_admission, dtype: float64

While we are here, drop the cols for gender, anchor age, anchor year, anchor year group since they are not needed for the analysis any longer

In [45]:
df_filtered = df_filtered.drop(columns=['anchor_age', 'anchor_year', 'anchor_year_group'])

In [46]:
df_filtered

Unnamed: 0,subject_id,admit_time,icd_code,hadm_id,gender,dod,birth_year,age_at_admission,admit_date
0,10000719,2140-04-15 00:14:00,65951,24558333.0,F,,2106,34,2140-04-15
1,10001319,2134-04-15 07:59:00,66401,29230609.0,F,,2105,29,2134-04-15
2,10001319,2135-07-20 03:45:00,650,23005466.0,F,,2105,30,2135-07-20
3,10001319,2138-11-09 20:00:00,V270,24591241.0,F,,2105,33,2138-11-09
4,10001472,2186-01-10 00:00:00,65811,23506139.0,F,,2151,35,2186-01-10
...,...,...,...,...,...,...,...,...,...
27135,19996869,2148-01-07 12:29:00,Z3A01,,F,,2116,32,2148-01-07
27138,19996869,2150-06-09 13:10:00,Z3A39,25367676.0,F,,2116,34,2150-06-09
27139,19998198,2128-09-06 12:34:00,O1424,25917036.0,F,,2098,30,2128-09-06
27140,19999043,2164-10-28 16:47:00,O09812,23037011.0,F,,2129,35,2164-10-28


## 
Filtering out multiple pregnancies...

Overall idea - we want to keep multiple pregnancies (?)

Pros/cons of keeping or filtering out multiple pregnancies:
- restricting to one pregnancy per patient might be preferred when dealing with high correlations between pregnancies
- including multiple pregnancies may be beneficial if observing factors that influence pregnancy outcomes over time (which I do care about)
- larger sample sizes would improve statistical power ((including multiple pregnancies means larger sample size))
- having multiple pregnancies may increase the complexity of the model, making sure that the multiple pregnancies are captured
- doing one pregnancy == loss of potentially useful information

**Bring this up when having meeting to determine common practice to use -- for now, continue to join on demographics-related data to move the analysis along**

## Observing Demographics Data from Admissions Table

Hosp.admissions table has: admission_type, admission_location, discharge_location, which are useful but maybe not worth adding just yet....

Ones to add now: insurance, language, marital_status, race

**Key Assumption: These factors will not change throughout the pregnancy**

To add: Load the admissions table, join on existing `df_filtered` dataframe on `hadm_id`

In [47]:
def get_admissions_dem_data(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT subject_id, hadm_id, insurance, language, marital_status, race
                FROM mimiciv_hosp.admissions
    """
               )
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "hadm_id", "insurance", "language", "marital_status", "race"])
    return df

connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
admissions_dem_df = get_admissions_dem_data(connection)

# merge first on hadm_id
merged_df = pd.merge(admissions_dem_df, df_filtered, on=['subject_id', 'hadm_id'], how='inner')

# filter rows where hadm_id did not match and attempt to merge on subject_id (to catch ED cases that did not result in
# hospital stay but may have demographcis data saved from another visit)
# represents subjects that do have an hadm_id
subject_id_matches = merged_df['subject_id'].unique()
# represents subjects that do not have a corresponding hadm_id related to pregnancy (but may have some sort of
# hadm_id in the records somewhere)
remaining_df_filtered = df_filtered[~df_filtered['subject_id'].isin(subject_id_matches)]

merged_df_fallback = pd.merge(admissions_dem_df, remaining_df_filtered, on='subject_id', how='right', suffixes=('_admissions', '_filtered'))

final_merged_df = pd.concat([merged_df, merged_df_fallback])
demographics_df = final_merged_df.drop_duplicates()

Connected to db: smcdougall


Logic here: try merging on hadm_id first... if there's no match, fallback onto subject_id
Also note that the `edstays` table has a "race" field that we can look into for any of the patients who have no admissions (that can be done below after observing which subjects remain as NA - join to edstays table on subject_id and find the race value)

In [49]:
demographics_df.isna().sum()

subject_id                0
hadm_id                3074
insurance              1273
language               1273
marital_status         1601
race                   1273
admit_time                0
icd_code                  0
gender                    0
dod                   19062
birth_year                0
age_at_admission          0
admit_date                0
hadm_id_admissions    17305
hadm_id_filtered      19106
dtype: int64

In [50]:
df_filtered['subject_id'].nunique() - demographics_df['subject_id'].nunique()

0

There are about 1200 patients for which an ED visit occurred but they were not ultimately admitted to the hospital and therefore we don't have the admissions data for them... need to isolate these patients and see if there's enough data to include in the analysis 

We have a bunch of duplicates... figure out how to deal with that ((later))

### Consensus on the pre-processing of demographics data
Of the patients that do have this information available, the only field that has NA values is marital status - can add a new enum for unknown?

For now we can also convert the missing insurance, language, and race columns to UNKNOWN as well but may need to filter out these subjects entirely if they don't actually have enough data present...

In [51]:
demographics_df['marital_status'] = demographics_df['marital_status'].fillna('UNKNOWN')
demographics_df['marital_status'].value_counts(dropna=False)

marital_status
MARRIED     10528
SINGLE       6788
UNKNOWN      1601
DIVORCED      175
WIDOWED        14
Name: count, dtype: int64

The ED `edstays` table has a "race" field that we can use as an attempt to fill in some of the empty race values...

In [52]:
def get_ed_dem_data(connection):
    cur = connection.cursor()
    cur.execute("""
        SELECT subject_id, race
                FROM mimiciv_ed.edstays
    """
               )
    rows = cur.fetchall()
    cur.close()
    df = pd.DataFrame(rows, columns=["subject_id", "race"])
    return df

connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
ed_dem_df = get_ed_dem_data(connection)

recovered_race_df = pd.merge(ed_dem_df, remaining_df_filtered, on='subject_id', how='inner')
recovered_race_df

Connected to db: smcdougall


Unnamed: 0,subject_id,race,admit_time,icd_code,hadm_id,gender,dod,birth_year,age_at_admission,admit_date
0,10002545,WHITE,2158-05-07 20:18:00,O26892,,F,,2121,37,2158-05-07
1,10007611,WHITE,2152-01-02 22:24:00,63492,,F,,2108,44,2152-01-02
2,10007949,WHITE,2158-07-07 09:35:00,O26891,,F,,2134,24,2158-07-07
3,10010053,HISPANIC/LATINO - GUATEMALAN,2125-05-21 12:11:00,64303,,F,,2103,22,2125-05-21
4,10010053,HISPANIC/LATINO - GUATEMALAN,2125-05-21 12:11:00,64303,,F,,2103,22,2125-05-21
...,...,...,...,...,...,...,...,...,...,...
5926,19986146,WHITE,2151-01-25 02:08:00,64893,,F,,2127,24,2151-01-25
5927,19991798,BLACK/AFRICAN AMERICAN,2127-08-04 08:08:00,O021,,F,,2088,39,2127-08-04
5928,19991798,BLACK/AFRICAN AMERICAN,2129-06-18 11:55:00,Z3A28,,F,,2088,41,2129-06-18
5929,19991798,BLACK/AFRICAN AMERICAN,2127-08-04 08:08:00,O021,,F,,2088,39,2127-08-04


We can see that we can actually retrieve the race values for these patients -- need to figure out how to replace the current "race" values with these ones...

In [53]:
recovered_race_map = dict(zip(recovered_race_df['subject_id'], recovered_race_df['race']))
print(demographics_df['race'].isna().sum())
# fill null values in demographics_df['race'] based on recovered_race_map
demographics_df['race'] = demographics_df.apply(lambda row: recovered_race_map[row['subject_id']] if pd.isna(row['race']) and row['subject_id'] in recovered_race_map else row['race'], axis=1)
print(demographics_df['race'].isna().sum())

1273
0


We were able to recover all of the races this way!

In [54]:
demographics_df['insurance'] = demographics_df['insurance'].fillna('UNKNOWN')
demographics_df['language'] = demographics_df['language'].fillna('UNKNOWN')
# replace '?' values with 'UNKNOWN' so we can combine all the unknown languages together
demographics_df['language'] = demographics_df['language'].replace('?', 'UNKNOWN')

# combine 'OTHER', 'UNKNOWN', 'UNABLE TO OBTAIN', 'PATIENT DECLINED TO ANSWER'
variations_to_replace = ['OTHER', 'UNKNOWN', 'UNABLE TO OBTAIN', 'PATIENT DECLINED TO ANSWER']

# Replace with 'UNKNOWN'
demographics_df['race'] = demographics_df['race'].fillna('UNKNOWN')
demographics_df['race'] = demographics_df['race'].replace(variations_to_replace, 'UNKNOWN')

In [55]:
race_list = demographics_df['race'].unique().tolist()
race_list

['WHITE',
 'UNKNOWN',
 'BLACK/AFRICAN AMERICAN',
 'BLACK/CAPE VERDEAN',
 'WHITE - RUSSIAN',
 'HISPANIC OR LATINO',
 'ASIAN',
 'BLACK/AFRICAN',
 'ASIAN - CHINESE',
 'ASIAN - SOUTH EAST ASIAN',
 'WHITE - OTHER EUROPEAN',
 'HISPANIC/LATINO - DOMINICAN',
 'HISPANIC/LATINO - SALVADORAN',
 'AMERICAN INDIAN/ALASKA NATIVE',
 'BLACK/CARIBBEAN ISLAND',
 'HISPANIC/LATINO - PUERTO RICAN',
 'WHITE - BRAZILIAN',
 'MULTIPLE RACE/ETHNICITY',
 'WHITE - EASTERN EUROPEAN',
 'PORTUGUESE',
 'ASIAN - ASIAN INDIAN',
 'HISPANIC/LATINO - HONDURAN',
 'HISPANIC/LATINO - CUBAN',
 'ASIAN - KOREAN',
 'HISPANIC/LATINO - CENTRAL AMERICAN',
 'HISPANIC/LATINO - MEXICAN',
 'HISPANIC/LATINO - GUATEMALAN',
 'SOUTH AMERICAN',
 'HISPANIC/LATINO - COLUMBIAN',
 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER']

Major race and ethnicity groups:

- White
- Black or African American
- Hispanic or Latino
- Asian
- Native American/Alaska Native
- Pacific Islander
- Mixed Race/Ethnicity

In [56]:
"""
Consolidate values for race/ethnicity into specific, common categories
"""
def consolidate_race(race):
    if 'WHITE' in race or 'PORTUGUESE' in race:
        return 'White'
    elif 'BLACK' in race:
        return 'Black or African American'
    elif 'HISPANIC' in race or 'LATINO' in race:
        return 'Hispanic or Latino'
    elif 'ASIAN' in race:
        return 'Asian'
    elif 'AMERICAN INDIAN' in race or 'ALASKA NATIVE' in race:
        return 'Native American/Alaska Native'
    elif 'PACIFIC ISLANDER' in race:
        return 'Pacific Islander'
    elif 'MULTIPLE RACE' in race or 'MULTIRACIAL' in race:
        return 'Mixed Race/Ethnicity'
    elif 'SOUTH AMERICAN' in race:
        return 'South American'
    else:
        return 'Unknown'

# Apply the mapping function to create a new column
demographics_df['race'] = demographics_df['race'].apply(consolidate_race)

In [57]:
demographics_df.isna().sum()

subject_id                0
hadm_id                3074
insurance                 0
language                  0
marital_status            0
race                      0
admit_time                0
icd_code                  0
gender                    0
dod                   19062
birth_year                0
age_at_admission          0
admit_date                0
hadm_id_admissions    17305
hadm_id_filtered      19106
dtype: int64

## End Result
Reduce as much as possible to see the different values for each of the pregnancies, save in a df, and do some basic EDA

Then save that df so it can be imported later

Columns to remove: `admit_time`, `gender`, `birth_year`, `hadm_id_admissions`, `hadm_id_filtered`, `icd_code`

In [58]:
demographics_df = demographics_df.drop(columns=['admit_time', 'gender', 'birth_year', 'hadm_id_admissions',
                                               'hadm_id_filtered', 'icd_code'])
demographics_df

Unnamed: 0,subject_id,hadm_id,insurance,language,marital_status,race,dod,age_at_admission,admit_date
0,10000719,24558333.0,Other,ENGLISH,SINGLE,White,,34,2140-04-15
1,10001319,23005466.0,Other,ENGLISH,MARRIED,White,,30,2135-07-20
2,10001319,24591241.0,Other,ENGLISH,MARRIED,White,,33,2138-11-09
3,10001319,29230609.0,Other,ENGLISH,MARRIED,White,,29,2134-04-15
4,10001472,23506139.0,Other,ENGLISH,MARRIED,White,,35,2186-01-10
...,...,...,...,...,...,...,...,...,...
3069,19985387,,Other,ENGLISH,SINGLE,White,,34,2156-05-13
3070,19986146,,Other,ENGLISH,SINGLE,White,,24,2151-01-25
3071,19986146,,Other,ENGLISH,SINGLE,White,,24,2151-01-25
3072,19991798,,Other,ENGLISH,SINGLE,Black or African American,,39,2127-08-04


In [59]:
demographics_df.to_csv('dataframes/hosp_patients.csv', index=False)  # Set index=False to exclude row indices from the CSV file