# Target Variable Construction

The purpose of this Jupyter Notebook is to construct the output dataframe for the adverse outcomes for each patient.

## Read in the List of Pregnant Patients we are starting with

In [30]:
import psycopg2
from psycopg2 import OperationalError, DatabaseError, sql
import pandas as pd
import csv
import os

In [2]:
### Environment Variables for Connection ###
DB_NAME = 'smcdougall'
USERNAME = 'postgres'
PASSWORD = 'postgres'
HOST = 'localhost'
PORT = 5432 

In [3]:
def connect_to_postgres(db_name, username, password, host, port):
    connection = None
    try:
        connection = psycopg2.connect(
            dbname=db_name,
            user=username,
            password=password,
            host=host,
            port=port
        )
        print('Connected to db:', db_name)
        return connection
    except OperationalError as e:
        print('Received the following error:', e)
        return None

In [4]:
def verify_postgres_connection(connection):
    if connection is not None:
        try:
            cur = connection.cursor()
            cur.execute('SELECT version();')
            db_version = cur.fetchone()
            print('The Postgres database version is:', db_version)
            cur.close()
        except DatabaseError as e:
            print('Received the following error:', e)
    else:
        print('Connection to Postgres failed.')

In [5]:
def close_connection(connection):
    if connection is not None:
        connection.close()
        print('Postgres connection has been closed.')

In [6]:
"""
Saves pandas DataFrame as a CSV file.
"""
def save_df_as_csv(df, csv_name, directory='dataframes'):
    if not os.path.exists(directory):
        os.makedirs(directory)

    file_path = os.path.join(directory, csv_name)
    df.to_csv(file_path, index=False)

    print(f'DataFrame has been saved as {file_path}')

In [7]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
verify_postgres_connection(connection)
close_connection(connection)

Connected to db: smcdougall
The Postgres database version is: ('PostgreSQL 14.5 on aarch64-apple-darwin20.6.0, compiled by Apple clang version 12.0.5 (clang-1205.0.22.9), 64-bit',)
Postgres connection has been closed.


In [8]:
def load_csv_column_as_list(file_path):
    column_values = []
    with open(file_path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        for row in reader:
            if row:  # Check if row is not empty
                column_values.append(row[0])
    return column_values

file_path = "final_dfs/final_subject_id_list.csv"
# remove column header from the list since we are only interested in the actual ids
hosp_and_ed_subjects = load_csv_column_as_list(file_path)[1:]

In [9]:
print(len(hosp_and_ed_subjects))

19076


We will be starting with 24895 pregnant patients in total.

## Relevant Diagnosis Codes

### Preeclampsia
- ICD-10: O14
- ICD-9: 642.4, 642.5

Note that the O14 diagnosis codes contain specific codes for the different trimesters (unspecified trimester, second trimester, and third trimester), so that can be leveraged in the future for that related analysis

Note that we exclude codes like 642.7 (preeclampsia or eclampsia superimposed on pre-existing hypertension) because it means the patient had hypertension as a pre-existing condition to pregnancy

### Preterm Delivery
- ICD-10: O60.1 (this is specifically preterm labor with delivery, as opposed to without delivery)
- ICD-9: 644.21, 644.1, 765.0, 765.1

### Obstetric Hemorrhage
- ICD-10: O20 (early pregnancy hemorrhage), O72 (postpartum hemorrhage), O46 - antepartum hemorrhage (during pregnancy but before onset of labor), O67 (labor and delivery complicated by intrapartum hemorrhage/during labor and delivery)
- maybe should not include O20 early pregnancy hemorrhage? not sure
- ICD-9: 641 (antepartum hemorrhage), 666 (postpartum hemorrhage)

## Preliminary Counts for Preeclampsia

In [10]:
def get_all_preeclampsia_subject_ids(connection, subject_id_list):
    cur = connection.cursor()
    cur.execute("""
        SELECT subject_id
        FROM (
        SELECT di.subject_id
                FROM mimiciv_hosp.diagnoses_icd di
                WHERE ((di.icd_code LIKE 'O14%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE '6424%'
                    OR di.icd_code LIKE '6425%') AND di.icd_version = 9)
        UNION
        SELECT di.subject_id
                FROM mimiciv_ed.diagnosis di
                WHERE ((di.icd_code LIKE 'O14%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE '6424%'
                    OR di.icd_code LIKE '6425%') AND di.icd_version = 9)
    ) AS combined_results;
    """)
    rows = cur.fetchall()
    subject_ids_from_query = [row[0] for row in rows]
    print(len(subject_ids_from_query))

    # Filter subject IDs to those present in subject_id_list
    filtered_subject_ids = [sid for sid in subject_ids_from_query if str(sid) in subject_id_list]
    cur.close()
    return filtered_subject_ids

In [11]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
preeclampsia_subject_ids = get_all_preeclampsia_subject_ids(connection, hosp_and_ed_subjects)
close_connection(connection)
print(len(preeclampsia_subject_ids))

Connected to db: smcdougall
1024
Postgres connection has been closed.
1022


## Preliminary Counts for Preterm Delivery

In [12]:
def get_all_preterm_subject_ids(connection, subject_id_list):
    cur = connection.cursor()
    cur.execute("""
        SELECT subject_id
        FROM (
        SELECT di.subject_id
                FROM mimiciv_hosp.diagnoses_icd di
                WHERE ((di.icd_code LIKE 'O601%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE '64421%'
                    OR di.icd_code LIKE '6441'
                    OR di.icd_code LIKE '7651%'
                    OR di.icd_code LIKE '7650%') AND di.icd_version = 9)
        UNION
        SELECT di.subject_id
                FROM mimiciv_ed.diagnosis di
                WHERE ((di.icd_code LIKE 'O601%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE '64421%'
                    OR di.icd_code LIKE '6441'
                    OR di.icd_code LIKE '7651%'
                    OR di.icd_code LIKE '7650%') AND di.icd_version = 9)
    ) AS combined_results;
    """)
    rows = cur.fetchall()
    subject_ids_from_query = [row[0] for row in rows]
    print(len(subject_ids_from_query))

    # Filter subject IDs to those present in subject_id_list
    filtered_subject_ids = [sid for sid in subject_ids_from_query if str(sid) in subject_id_list]
    cur.close()
    return filtered_subject_ids

In [13]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
preterm_subject_ids = get_all_preterm_subject_ids(connection, hosp_and_ed_subjects)
close_connection(connection)
print(len(preterm_subject_ids))

Connected to db: smcdougall
1074
Postgres connection has been closed.
1074


## Preliminary Counts for Obstetric Hemorrhage

In [14]:
def get_all_hemorrhage_subject_ids(connection, subject_id_list):
    cur = connection.cursor()
    cur.execute("""
        SELECT subject_id
        FROM (
        SELECT di.subject_id
                FROM mimiciv_hosp.diagnoses_icd di
                WHERE ((di.icd_code LIKE 'O20%'
                    OR di.icd_code LIKE 'O72%'
                    OR di.icd_code LIKE 'O46%'
                    OR di.icd_code LIKE 'O67%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE '641%'
                    OR di.icd_code LIKE '666%') AND di.icd_version = 9)
        UNION
        SELECT di.subject_id
                FROM mimiciv_ed.diagnosis di
                WHERE ((di.icd_code LIKE 'O20%'
                    OR di.icd_code LIKE 'O72%'
                    OR di.icd_code LIKE 'O46%'
                    OR di.icd_code LIKE 'O67%') AND di.icd_version = 10)
                    OR
                    ((di.icd_code LIKE '641%'
                    OR di.icd_code LIKE '666%') AND di.icd_version = 9)
    ) AS combined_results;
    """)
    rows = cur.fetchall()
    subject_ids_from_query = [row[0] for row in rows]
    print(len(subject_ids_from_query))

    # Filter subject IDs to those present in subject_id_list
    filtered_subject_ids = [sid for sid in subject_ids_from_query if str(sid) in subject_id_list]
    cur.close()
    return filtered_subject_ids

In [15]:
connection = connect_to_postgres(DB_NAME, USERNAME, PASSWORD, HOST, PORT)
hemorrhage_subject_ids = get_all_hemorrhage_subject_ids(connection, hosp_and_ed_subjects)
close_connection(connection)
print(len(hemorrhage_subject_ids))

Connected to db: smcdougall
2112
Postgres connection has been closed.
2105


### Preliminary Results

In [16]:
print(len(preeclampsia_subject_ids)/len(hosp_and_ed_subjects), 'percent preeclampsia')
print(len(preterm_subject_ids)/len(hosp_and_ed_subjects), 'percent preterm')
print(len(hemorrhage_subject_ids)/len(hosp_and_ed_subjects), 'hemorrhage')

0.05357517299224156 percent preeclampsia
0.05630111134409729 percent preterm
0.11034808135877543 hemorrhage


They are each less than 10 percent, so this is considered highly imbalanced

## Dataframe Construction

In [24]:
hosp_and_ed_subjects = [int(x) for x in hosp_and_ed_subjects]
df = pd.DataFrame({'subject_id': hosp_and_ed_subjects})
# add binary columns for each condition
df['preeclampsia_output'] = df['subject_id'].apply(lambda x: 1 if x in preeclampsia_subject_ids else 0)
df['preterm_output'] = df['subject_id'].apply(lambda x: 1 if x in preterm_subject_ids else 0)
df['hemorrhage_output'] = df['subject_id'].apply(lambda x: 1 if x in hemorrhage_subject_ids else 0)
df['has_adverse_outcome'] = df[['preeclampsia_output', 'preterm_output', 'hemorrhage_output']].max(axis=1)

In [28]:
df['has_adverse_outcome'].value_counts()

has_adverse_outcome
0    15527
1     3549
Name: count, dtype: int64

In [31]:
save_df_as_csv(df, 'target_outputs.csv', 'final_dfs')

DataFrame has been saved as final_dfs/target_outputs.csv
