# Imports

In [17]:
import sqlite3
import pandas as pd
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt

# Connect to the SQLite database in the file at the given path d:\data\mimic.db
conn = sqlite3.connect("D:\\Repos\\ut-health\\mimic-iii-gz\\mimic3.db")

# print version of sqlite
cursor = conn.cursor()
cursor.execute("SELECT sqlite_version()")
print("SQLite version: %s" % cursor.fetchone())

# close connection
conn.close()

# print pandas version
print("Pandas version: %s" % pd.__version__)

# print matplotlib version
print(f"Matplotlib version: {matplotlib.__version__}")

print(f"NetworkX version: {nx.__version__}")

# Set pandas display options to show all columns and rows without truncation
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
print("\nRemoved truncation of columns")


SQLite version: 3.49.1
Pandas version: 2.2.3
Matplotlib version: 3.10.1
NetworkX version: 3.4.2

Removed truncation of columns


# Load Data

In [18]:
# load patient data
conn = sqlite3.connect("D:\\Repos\\ut-health\\mimic-iii-gz\\mimic3.db")

query = "SELECT * FROM patients"
patients_df = pd.read_sql_query(query, conn)

query = "SELECT * FROM d_icd_diagnoses"
d_icd_diagnoses_df = pd.read_sql_query(query, conn)

query = "SELECT * FROM diagnoses_icd"
diagnoses_icd_df = pd.read_sql_query(query, conn)

query = "SELECT * FROM admissions"
admissions_df = pd.read_sql_query(query, conn)

conn.close()

# Subselect size of data


In [None]:
# print the number of patients and diagnoses
print(f"Number of patients: {len(patients_df)}")
print(f"Number of diagnoses: {len(diagnoses_icd_df)}")

pt_frac = 0.0001  # 0.01% of the patients gives 5 patients, 90 diagnoses
pt_frac = 0.0002  # 0.01% of the patients gives 5 patients, 90 diagnoses

# select 0.01% of the patients randomly
patients_df = patients_df.sample(frac=pt_frac, random_state=1)

# drop rows from diagnoses_icd_df where subject_id is not in patients_df
diagnoses_icd_df = diagnoses_icd_df[diagnoses_icd_df['SUBJECT_ID'].isin(patients_df['SUBJECT_ID'])]

# print the number of patients and diagnoses
print(f"Updated number of patients: {len(patients_df)}")
print(f"Updated number of diagnoses: {len(diagnoses_icd_df)}")


# TODO: fix the HADM_ID issue, only keep diagnosies that are from the latest admissions under subject id, and drop the rest, do same for admissions_df

Number of patients: 46520
Number of diagnoses: 651047
Updated number of patients: 9
Updated number of diagnoses: 138


# Feature Engineering

In [20]:
# log data
print(f"Number of patients: {len(patients_df)}")

# add DOB to admissions_df
admissions_age_df = admissions_df.merge(patients_df[['SUBJECT_ID', 'DOB']], left_on='SUBJECT_ID', right_on='SUBJECT_ID', how='inner', suffixes=('_adm', '_pt'))

# keep the newest admission for each patient, drop the rest
admissions_age_df = admissions_age_df.groupby('SUBJECT_ID').last().reset_index()

# convert to integer from datetime type to calculate age (without overflow)
admissions_age_df['DOB'] = admissions_age_df['DOB'].astype('datetime64[s]').astype(int)
admissions_age_df['ADMITTIME'] = admissions_age_df['ADMITTIME'].astype('datetime64[s]').astype(int)

# calculate age at admission
admissions_age_df['AGE'] = (admissions_age_df['ADMITTIME'] - admissions_age_df['DOB']) / 86400 / 365

# create age intervals in 5 year increments, of type int
admissions_age_df['AGE_BUCKET'] = (admissions_age_df['AGE'] // 5).astype(int) * 5

# set age bucket to 95 where age is greater than 90
admissions_age_df.loc[admissions_age_df['AGE_BUCKET'] > 90, 'AGE_BUCKET'] = 95

# add AGE_BUCKET to patients_df
patients_df = patients_df.merge(admissions_age_df[['SUBJECT_ID', 'HADM_ID', 'AGE_BUCKET']], left_on='SUBJECT_ID', right_on='SUBJECT_ID', how='inner', suffixes=('_pt', '_adm'))

print(f"Updated number of patients (with age bucket): {len(patients_df)}")

Number of patients: 9
Updated number of patients (with age bucket): 9


In [21]:
patients_df.head(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG,HADM_ID,AGE_BUCKET
0,3850,4074,M,2121-08-01 00:00:00,2204-02-06 00:00:00,2204-02-06 00:00:00,2204-02-06 00:00:00,1,137421,80
1,44143,90889,M,2095-04-29 00:00:00,,,,0,168745,55
2,39453,72753,M,2042-06-01 00:00:00,,,,0,144029,85
3,37377,64908,M,2108-04-22 00:00:00,,,,0,172851,75
4,38824,70273,F,2090-05-28 00:00:00,,,,0,102912,75


# Create Graph

## Create patient nodes

In [6]:
# Create a graph
G = nx.Graph()

i = 0

for _, row in patients_df.iterrows():

    # Add a node with the patient ID as the node identifier and gender as an attribute, also add dob as an attribute
    G.add_node(row['SUBJECT_ID'], gender=row['GENDER'], age=row['DOB'], age_bucket=row['AGE_BUCKET'])

    i+=1
    if i % 2000 == 0:
        print(f"Added {i} patient nodes, percent: {i / len(patients_df) * 100:.2f}%")

# number of nodes in the graph
print(f"Number of nodes in the graph: {G.number_of_nodes()}")

Number of nodes in the graph: 9


## Create diagnosis to patient mappings

In [None]:
# use the diagnoses_icd_df to get the icd9 codes, group by icd9 code, and create a dictionary with the icd9 code as the key and the subjects as the values
diagnoses_to_patients = diagnoses_icd_df.groupby('ICD9_CODE')['HADM_ID'].apply(set).to_dict()

# print 1 random key and value from the dictionary
print(f"Sample keys and values from the dictionary:")
print(f"\t\t{list(diagnoses_to_patients.items())[12]}")
print(f"\t\t{list(diagnoses_to_patients.items())[15]}")

# print size of the dictionary and counts of values
print(f"Count of the number of keys in the dictionary: {len(diagnoses_to_patients)}")
print(f"Count of the number of values in all keys: {sum(len(v) for v in diagnoses_to_patients.values())}")

# print number of patients with diagnosis 0389
print(f"Number of patients with diagnosis 0389: {len(diagnoses_to_patients.get('0389', []))}")


Sample keys and values from the dictionary:
		('2760', {160456})
		('2851', {180201, 172851, 144460})
Count of the number of keys in the dictionary: 98
Count of the number of values in all keys: 138
Number of patients with diagnosis 0389: 2


## Add diagnoses edges between patients

In [8]:
# print the number of edges in the graph
print(f"Number of edges: {len(G.edges())}")

Number of edges: 0


In [9]:
i=0
ONCE = True


# add diagnosis/icd9 edge between patients
for icd9_code, patients in diagnoses_to_patients.items():

    # get SHORT_TITLE from d_icd_diagnoses_df for the icd9 code
    short_title = d_icd_diagnoses_df[d_icd_diagnoses_df['ICD9_CODE'] == icd9_code]['SHORT_TITLE'].values

    if len(short_title) == 0:
        short_title = "Unknown"
    else:
        # convert to a string
        short_title = short_title[0]

    if ONCE:
        print(f"ICD9 code: {icd9_code}, short title: {short_title}")
        ONCE = False

    if len(patients) < 2:
        continue

    # if there are more than 1 patients with the same diagnosis, add edges between them
    patients = list(patients)

    combos = [(patients[i], patients[j]) for i in range(len(patients)) for j in range(i + 1, len(patients))]
    G.add_edges_from(combos, icd9=icd9_code, short_title=short_title)
    i += 1
    if i % 4000 == 0:
        print(f"Added {i} edges, percent: {i / len(diagnoses_to_patients) * 100:.2f}%")


ICD9 code: 00845, short title: Int inf clstrdium dfcile


In [10]:
# print the number of edges in the graph
print(f"Number of edges: {len(G.edges())}")

# print the number of nodes in the graph
print(f"Number of nodes: {len(G.nodes())}")

Number of edges: 23
Number of nodes: 9


# Constants

In [11]:
PICKLE_FILE_NAME = None
HTML_FILE_NAME = None

if G.number_of_nodes() < 10:
    PICKLE_FILE_NAME = "patients_graph_max_10_nodes.gpickle"
    HTML_FILE_NAME = "pyvis_graph_max_10_nodes.html"
elif G.number_of_nodes() < 50:
    PICKLE_FILE_NAME = "patients_graph_max_50_nodes.gpickle"
    HTML_FILE_NAME = "pyvis_graph_max_50_nodes.html"
elif G.number_of_nodes() < 100:
    PICKLE_FILE_NAME = "patients_graph_max_100_nodes.gpickle"
    HTML_FILE_NAME = "pyvis_graph_max_100_nodes.html"
elif G.number_of_nodes() < 1000:
    PICKLE_FILE_NAME = "patients_graph_max_1000_nodes.gpickle"
    HTML_FILE_NAME = "pyvis_graph_max_1000_nodes.html"
elif G.number_of_nodes() < 10000:
    PICKLE_FILE_NAME = "patients_graph_max_10000_nodes.gpickle"
    HTML_FILE_NAME = "pyvis_graph_max_10000_nodes.html"
else:
    PICKLE_FILE_NAME = "patients_graph_max_46000_nodes.gpickle"
    HTML_FILE_NAME = "pyvis_graph_max_46000_nodes.html"

# Pyvis Graph

In [12]:
# use pyvis to draw the graph
from pyvis.network import Network

nt = Network(height="1200px", width="100%", notebook=False, directed=False, cdn_resources="remote")

print("created pyvis network")
nt.from_nx(G)

# add node attributes to the pyvis network
for node in nt.nodes:
    node['title'] = f"Patient ID: {node['id']}\nGender: {node['gender']}\nAge Bucket: {node['age_bucket']}"
    node['label'] = f"{node['id']}"
    node['font'] = {'size': 5}

# add edge attributes to the pyvis network
for edge in nt.edges:
    edge['title'] = f"ICD9 Code: {edge['icd9']}"
    edge['label'] = f"{edge['short_title']}"
    # set font size to 5
    edge['font'] = {'size': 5}

print("added nx graph to pyvis network")
print(f"file name: {HTML_FILE_NAME}")
nt.toggle_physics(False)
nt.show_buttons()
# nt.generate_html(HTML_FILE_NAME)
nt.save_graph(HTML_FILE_NAME)
# nt.show(HTML_FILE_NAME)
print("saved pyvis graph to file")


created pyvis network
added nx graph to pyvis network
file name: pyvis_graph_max_10_nodes.html
saved pyvis graph to file


# Save Graph to Disk

## graph ml

In [None]:
# nx.write_graphml(G, "D:\\Repos\\ut-health\\assign-8-high-risk-proj\\mimic3_graph.graphml")
# print("Graph saved to file")


## pickle

In [None]:
import pickle

# Save the graph to a pickle file
with open(f"D:\\Repos\\ut-health\\assign-8-high-risk-proj\\{PICKLE_FILE_NAME}", "wb") as f:
    pickle.dump(G, f)
    print("Graph saved to pickle file")


In [14]:
admissions_df.head(1)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1


In [16]:
diagnoses_icd_df.head(1)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
58594,45593,4074,137421,1.0,389


In [15]:
# find all diagnosis codes for a patient with subject_id 72753

# look up diagnosis codes for a patient with subject_id 72753 in diagnoses_icd_df
diagnoses_icd_df[diagnoses_icd_df['SUBJECT_ID'] == 72753].head(10)



Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
517025,529245,72753,144029,1.0,4241
517026,529246,72753,144029,2.0,42833
517027,529247,72753,144029,3.0,4280
517028,529248,72753,144029,4.0,42731
517029,529249,72753,144029,5.0,25000
517030,529250,72753,144029,6.0,V1005
517031,529251,72753,144029,7.0,V1046
517032,529252,72753,144029,8.0,29690
517033,529253,72753,144029,9.0,52100
517034,529254,72753,144029,10.0,52340


# Old

In [None]:
pt_list = ["a", "b", "c"]

# create unique combos of patients
combos = [(pt_list[i], pt_list[j]) for i in range(len(pt_list)) for j in range(i + 1, len(pt_list))]
print(combos)


In [None]:
# load prescription data
conn = sqlite3.connect("D:\\Repos\\ut-health\\mimic-iii-gz\\mimic3.db")

query = "SELECT * FROM admissions"
admissions_df = pd.read_sql_query(query, conn)

query = "SELECT * FROM d_labitems"
d_labitems_df = pd.read_sql_query(query, conn)

query = "SELECT * FROM labevents"
labevents_df = pd.read_sql_query(query, conn)

query = "SELECT * FROM prescriptions"
prescriptions_df = pd.read_sql_query(query, conn)

query = "SELECT * FROM d_items"
d_items_df = pd.read_sql_query(query, conn)

conn.close()


In [None]:

# Create edges based on shared diagnoses
diagnoses_by_patient = diagnoses_icd_df.groupby('SUBJECT_ID')['ICD9_CODE'].apply(set).to_dict()
patient_ids = list(diagnoses_by_patient.keys())

print(f"Number of patients: {len(patient_ids)}")

for i in range(len(patient_ids)):

    if i % 1000 == 0:
        print(f"Processing {i} patients, at {i/len(patient_ids) * 100:.2f}%")

    # compare the current patient with all subsequent patients to avoid duplicates
    # and to ensure that each pair is only considered once. Order of magnitude is O(n^2)
    for j in range(i + 1, len(patient_ids)):

        p1, p2 = patient_ids[i], patient_ids[j]
        if diagnoses_by_patient[p1].intersection(diagnoses_by_patient[p2]):
            G.add_edge(p1, p2, edge_type='diagnosis')



In [None]:
# print some stats about the graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print(f"Number of connected components: {nx.number_connected_components(G)}")

In [None]:
# persist the graph to a file
# nx.readwrite.gpickle.write_gpickle(G, "D:\\Repos\\ut-health\\mimic-iii-gz\\mimic3_graph.gpickle")
# nx.readwrite.graphml.write_graphml(G, "D:\\Repos\\ut-health\\mimic-iii-gz\\mimic3_graph.graphml")
nx.write_graphml(G, "D:\\Repos\\ut-health\\mimic-iii-gz\\mimic3_graph.graphml")
print("Graph saved to file")


In [None]:



# (Example placeholder) Create edges based on shared procedures: 
# procedures_by_patient = procedures_icd_df.groupby('SUBJECT_ID')['ICD_CODE'].apply(set).to_dict()
# for i in range(len(patient_ids)):
#     for j in range(i + 1, len(patient_ids)):
#         p1, p2 = patient_ids[i], patient_ids[j]
#         if procedures_by_patient[p1].intersection(procedures_by_patient[p2]):
#             G.add_edge(p1, p2, edge_type='procedure')


In [None]:

# Visualize the graph
plt.figure(figsize=(8, 6))
nx.draw(G, with_labels=True, node_size=500)
plt.show()

In [None]:

# # capture time taken for the loop
# import time
# start_time = time.time()

# ii = 0

# # add diagnosis/icd9 edge between patients
# for icd9_code, patients in diagnoses_to_patients.items():

#     elapsed_time = time.time() - start_time

#     # convert patients set to list
#     patients = list(patients)

#     if len(patients) < 2:
#         continue

#     # Add an edge between all patients with the same ICD9 code
#     for i in range(len(patients)):
#         for j in range(i + 1, len(patients)):
#             G.add_edge(patients[i], patients[j], icd9_code=icd9_code)

#             ii += 1
#             # print("added edge between patients: ", patients[i], patients[j], " with icd9 code: ", icd9_code)
#             if ii % 4000 == 0:
#                 print(f"Added {ii} icd9 edges, elapsed time: {elapsed_time:.2f} seconds")
