# Initialization, Loading Modules

In [102]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# pd.set_option('display.max_columns', None)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import joblib

In [2]:
BASE_PATH = "C:\\Users\\iitsh\\OneDrive\\Documents\\Datasets and Models"
DATA_FOLDER = "mimic-iii-clinical-database-1.4"

In [3]:
def path(csv_name):
    return os.path.join(BASE_PATH, DATA_FOLDER, csv_name + ".csv")
path("ICUSTAYS")

'C:\\Users\\iitsh\\OneDrive\\Documents\\Datasets and Models\\mimic-iii-clinical-database-1.4\\ICUSTAYS.csv'

# Data Preprocessing

## General

In [40]:
admissions = pd.read_csv(path("ADMISSIONS"))
patients = pd.read_csv(path("PATIENTS"))
labevents2 = pd.read_csv("labevents2.csv")
if "Unnamed: 0" in labevents2.columns:
    labevents2 = labevents2.drop(["Unnamed: 0"], axis = 1)
diagnosis_embeddings = pd.read_csv("diagnosis_embeddings.csv")
if "Unnamed: 0" in diagnosis_embeddings.columns:
    diagnosis_embeddings = diagnosis_embeddings.drop(["Unnamed: 0"], axis = 1)

In [41]:
patients.shape

(46520, 8)

In [42]:
admissions.shape

(58976, 19)

In [43]:
labevents2.shape

(57227, 21)

In [44]:
diagnosis_embeddings.shape

(28350, 32)

In [45]:
df = patients.merge(admissions, on = "SUBJECT_ID", suffixes = (None, "_Y"))
df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG,ROW_ID_Y,HADM_ID,...,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,234,249,F,2075-03-13 00:00:00,,,,0,321,116935,...,Medicare,,CATHOLIC,DIVORCED,WHITE,2149-12-17 11:10:00,2149-12-17 21:35:00,UNSTABLE ANGINA;ASTHMA;BRONCHITIS,0,1
1,234,249,F,2075-03-13 00:00:00,,,,0,322,149546,...,Medicare,ENGL,CATHOLIC,DIVORCED,WHITE,2155-02-03 17:43:00,2155-02-03 21:26:00,GI BLEED/ CHEST PAIN,0,1
2,234,249,F,2075-03-13 00:00:00,,,,0,323,158975,...,Medicare,ENGL,CATHOLIC,DIVORCED,WHITE,,,GI BLEEDING\COLONOSCOPY,0,1
3,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1,324,124271,...,Self Pay,HAIT,NOT SPECIFIED,SINGLE,BLACK/AFRICAN AMERICAN,2188-11-12 06:56:00,2188-11-12 10:10:00,PNEUMONIA;R/O TB,1,1
4,236,251,M,2090-03-15 00:00:00,,,,0,325,117937,...,Private,,OTHER,,UNKNOWN/NOT SPECIFIED,2110-07-27 05:00:00,2110-07-27 07:15:00,INTRACRANIAL HEAD BLEED,0,1


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58976 entries, 0 to 58975
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ROW_ID                58976 non-null  int64 
 1   SUBJECT_ID            58976 non-null  int64 
 2   GENDER                58976 non-null  object
 3   DOB                   58976 non-null  object
 4   DOD                   22586 non-null  object
 5   DOD_HOSP              15071 non-null  object
 6   DOD_SSN               19069 non-null  object
 7   EXPIRE_FLAG           58976 non-null  int64 
 8   ROW_ID_Y              58976 non-null  int64 
 9   HADM_ID               58976 non-null  int64 
 10  ADMITTIME             58976 non-null  object
 11  DISCHTIME             58976 non-null  object
 12  DEATHTIME             5854 non-null   object
 13  ADMISSION_TYPE        58976 non-null  object
 14  ADMISSION_LOCATION    58976 non-null  object
 15  DISCHARGE_LOCATION    58976 non-null

In [47]:
df2 = df[["SUBJECT_ID", "GENDER", "EXPIRE_FLAG", "HADM_ID", "ADMISSION_TYPE", "ADMISSION_LOCATION", "INSURANCE", "LANGUAGE", "RELIGION", "MARITAL_STATUS", "ETHNICITY", "HOSPITAL_EXPIRE_FLAG"]]
df2.head()

Unnamed: 0,SUBJECT_ID,GENDER,EXPIRE_FLAG,HADM_ID,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,HOSPITAL_EXPIRE_FLAG
0,249,F,0,116935,EMERGENCY,EMERGENCY ROOM ADMIT,Medicare,,CATHOLIC,DIVORCED,WHITE,0
1,249,F,0,149546,EMERGENCY,EMERGENCY ROOM ADMIT,Medicare,ENGL,CATHOLIC,DIVORCED,WHITE,0
2,249,F,0,158975,EMERGENCY,PHYS REFERRAL/NORMAL DELI,Medicare,ENGL,CATHOLIC,DIVORCED,WHITE,0
3,250,F,1,124271,EMERGENCY,EMERGENCY ROOM ADMIT,Self Pay,HAIT,NOT SPECIFIED,SINGLE,BLACK/AFRICAN AMERICAN,1
4,251,M,0,117937,EMERGENCY,EMERGENCY ROOM ADMIT,Private,,OTHER,,UNKNOWN/NOT SPECIFIED,0


In [48]:
df3 = df2.merge(labevents2, on = "HADM_ID", how = "left", suffixes = (None, "_Y"))
df3.shape

(58976, 32)

In [49]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58976 entries, 0 to 58975
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   SUBJECT_ID            58976 non-null  int64  
 1   GENDER                58976 non-null  object 
 2   EXPIRE_FLAG           58976 non-null  int64  
 3   HADM_ID               58976 non-null  int64  
 4   ADMISSION_TYPE        58976 non-null  object 
 5   ADMISSION_LOCATION    58976 non-null  object 
 6   INSURANCE             58976 non-null  object 
 7   LANGUAGE              33644 non-null  object 
 8   RELIGION              58518 non-null  object 
 9   MARITAL_STATUS        48848 non-null  object 
 10  ETHNICITY             58976 non-null  object 
 11  HOSPITAL_EXPIRE_FLAG  58976 non-null  int64  
 12  Anion Gap             57227 non-null  float64
 13  Bicarbonate           57227 non-null  float64
 14  Calcium, Total        57227 non-null  float64
 15  Chloride           

In [50]:
df3[labevents2.columns[1:]] = df3[labevents2.columns[1:]].fillna(df3[labevents2.columns[1:]].mean(axis = 0))
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58976 entries, 0 to 58975
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   SUBJECT_ID            58976 non-null  int64  
 1   GENDER                58976 non-null  object 
 2   EXPIRE_FLAG           58976 non-null  int64  
 3   HADM_ID               58976 non-null  int64  
 4   ADMISSION_TYPE        58976 non-null  object 
 5   ADMISSION_LOCATION    58976 non-null  object 
 6   INSURANCE             58976 non-null  object 
 7   LANGUAGE              33644 non-null  object 
 8   RELIGION              58518 non-null  object 
 9   MARITAL_STATUS        48848 non-null  object 
 10  ETHNICITY             58976 non-null  object 
 11  HOSPITAL_EXPIRE_FLAG  58976 non-null  int64  
 12  Anion Gap             58976 non-null  float64
 13  Bicarbonate           58976 non-null  float64
 14  Calcium, Total        58976 non-null  float64
 15  Chloride           

In [75]:
df4 = df3.merge(diagnosis_embeddings, on = "HADM_ID", how = "left", suffixes = (None, "_Y"))
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60419 entries, 0 to 60418
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   SUBJECT_ID               60419 non-null  int64  
 1   GENDER                   60419 non-null  object 
 2   EXPIRE_FLAG              60419 non-null  int64  
 3   HADM_ID                  60419 non-null  int64  
 4   ADMISSION_TYPE           60419 non-null  object 
 5   ADMISSION_LOCATION       60419 non-null  object 
 6   INSURANCE                60419 non-null  object 
 7   LANGUAGE                 34798 non-null  object 
 8   RELIGION                 59961 non-null  object 
 9   MARITAL_STATUS           50291 non-null  object 
 10  ETHNICITY                60419 non-null  object 
 11  HOSPITAL_EXPIRE_FLAG     60419 non-null  int64  
 12  Anion Gap                60419 non-null  float64
 13  Bicarbonate              60419 non-null  float64
 14  Calcium, Total        

In [76]:
df4.shape

(60419, 63)

In [77]:
df4[["LANGUAGE", "RELIGION", "MARITAL_STATUS"]] = df4[["LANGUAGE", "RELIGION", "MARITAL_STATUS"]].fillna("UNKNOWN")

In [78]:
df4 = df4.drop(["DIAGNOSIS"], axis = 1)

## Embeddings

In [57]:
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = BertModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")



In [58]:
def get_clinicalbert_embedding(diagnosis):
    inputs = tokenizer(diagnosis, return_tensors = 'pt', truncation = True, padding = True, max_length = 128)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim = 1).detach().numpy()
    return embeddings[0]

get_clinicalbert_embedding("").shape

(768,)

In [59]:
np.array(get_clinicalbert_embedding("").tolist())

array([ 7.86792934e-01, -2.52326757e-01, -7.48165846e-02,  5.38168669e-01,
       -6.13306522e-01,  1.55442208e-01,  1.70231327e-01, -3.85478079e-01,
        4.51851487e-01, -2.17552766e-01, -3.64408195e-01, -4.89096582e-01,
       -8.47645700e-01,  8.49214643e-02, -7.49585748e-01,  8.46124709e-01,
        2.41761118e-01,  4.97108400e-01, -1.14376843e-02,  1.06724709e-01,
        1.57570705e-01, -4.11855519e-01, -3.22666705e-01, -4.56335336e-01,
       -1.07107915e-01,  1.82701908e-02,  8.12282443e-01,  2.09423587e-01,
       -3.64370614e-01,  2.56134063e-01,  3.88191819e-01,  4.28710848e-01,
       -2.11700916e-01,  3.05241883e-01, -6.79957390e-01,  1.65989757e-01,
        1.44901246e-01, -1.03520751e+00, -4.20886837e-02, -4.13491786e-01,
       -2.29299694e-01,  1.40357524e-01,  3.69807452e-01, -3.98489296e-01,
        2.65808344e-01, -7.11802602e-01,  2.33109426e-02, -4.35906857e-01,
        6.79388225e-01,  5.63531443e-02,  7.95930922e-02,  2.40634799e-01,
        1.18329845e-01,  

In [60]:
diagnosis_pca = joblib.load("diagnosis_pca.pkl")

In [81]:
no_diagnosis_embedding = diagnosis_pca.transform(np.array(get_clinicalbert_embedding("").tolist()).reshape((1, -1)))
no_diagnosis_embedding = no_diagnosis_embedding.reshape((30,))

In [82]:
no_diagnosis_embedding = pd.Series(no_diagnosis_embedding, index = diagnosis_embeddings.columns[2:])
no_diagnosis_embedding

Diagnosis Embedding: 1     6.322318
Diagnosis Embedding: 2    -1.317416
Diagnosis Embedding: 3     2.951266
Diagnosis Embedding: 4     0.334437
Diagnosis Embedding: 5    -0.180758
Diagnosis Embedding: 6    -0.621569
Diagnosis Embedding: 7    -0.697059
Diagnosis Embedding: 8    -0.718026
Diagnosis Embedding: 9     2.313809
Diagnosis Embedding: 10    1.174245
Diagnosis Embedding: 11    1.535792
Diagnosis Embedding: 12   -0.985281
Diagnosis Embedding: 13    0.019751
Diagnosis Embedding: 14   -1.242425
Diagnosis Embedding: 15    1.298467
Diagnosis Embedding: 16   -0.283092
Diagnosis Embedding: 17    0.087043
Diagnosis Embedding: 18   -0.975209
Diagnosis Embedding: 19   -1.798822
Diagnosis Embedding: 20    0.216820
Diagnosis Embedding: 21    0.714826
Diagnosis Embedding: 22   -1.423948
Diagnosis Embedding: 23    2.560760
Diagnosis Embedding: 24    1.136964
Diagnosis Embedding: 25   -0.259231
Diagnosis Embedding: 26    0.030026
Diagnosis Embedding: 27   -0.290483
Diagnosis Embedding: 28    0

In [83]:
df4[diagnosis_embeddings.columns[2:]] = df4[diagnosis_embeddings.columns[2:]].fillna(no_diagnosis_embedding)
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60419 entries, 0 to 60418
Data columns (total 62 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   SUBJECT_ID               60419 non-null  int64  
 1   GENDER                   60419 non-null  object 
 2   EXPIRE_FLAG              60419 non-null  int64  
 3   HADM_ID                  60419 non-null  int64  
 4   ADMISSION_TYPE           60419 non-null  object 
 5   ADMISSION_LOCATION       60419 non-null  object 
 6   INSURANCE                60419 non-null  object 
 7   LANGUAGE                 60419 non-null  object 
 8   RELIGION                 60419 non-null  object 
 9   MARITAL_STATUS           60419 non-null  object 
 10  ETHNICITY                60419 non-null  object 
 11  HOSPITAL_EXPIRE_FLAG     60419 non-null  int64  
 12  Anion Gap                60419 non-null  float64
 13  Bicarbonate              60419 non-null  float64
 14  Calcium, Total        

## Aggregation

In [86]:
agg_methods = {
    "GENDER": "first",
    "EXPIRE_FLAG": "max",
    "ADMISSION_TYPE": "first",
    "ADMISSION_LOCATION": "first",
    "INSURANCE": "first",
    "LANGUAGE": "first",
    "RELIGION": "first",
    "MARITAL_STATUS": "first",
    "ETHNICITY": "first",
    "HOSPITAL_EXPIRE_FLAG": "max",
}
for col in labevents2.columns[1:]:
    agg_methods[col] = "mean"
for col in diagnosis_embeddings.columns[2:]:
    agg_methods[col] = "sum"
len(agg_methods.keys())

60

In [88]:
final_df3 = df4.drop(["HADM_ID"], axis = 1).groupby('SUBJECT_ID').agg(agg_methods).reset_index()
final_df3.head()

Unnamed: 0,SUBJECT_ID,GENDER,EXPIRE_FLAG,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,...,Diagnosis Embedding: 21,Diagnosis Embedding: 22,Diagnosis Embedding: 23,Diagnosis Embedding: 24,Diagnosis Embedding: 25,Diagnosis Embedding: 26,Diagnosis Embedding: 27,Diagnosis Embedding: 28,Diagnosis Embedding: 29,Diagnosis Embedding: 30
0,2,M,0,NEWBORN,PHYS REFERRAL/NORMAL DELI,Private,UNKNOWN,NOT SPECIFIED,UNKNOWN,ASIAN,...,0.714826,-1.423948,2.56076,1.136964,-0.259231,0.030026,-0.290483,0.842531,-0.445274,0.146142
1,3,M,1,EMERGENCY,EMERGENCY ROOM ADMIT,Medicare,UNKNOWN,CATHOLIC,MARRIED,WHITE,...,0.714826,-1.423948,2.56076,1.136964,-0.259231,0.030026,-0.290483,0.842531,-0.445274,0.146142
2,4,F,0,EMERGENCY,EMERGENCY ROOM ADMIT,Private,UNKNOWN,PROTESTANT QUAKER,SINGLE,WHITE,...,0.153319,0.202005,-0.965132,0.564511,-0.023339,-0.18559,-0.268535,-0.217234,0.248394,-0.353038
3,5,M,0,NEWBORN,PHYS REFERRAL/NORMAL DELI,Private,UNKNOWN,BUDDHIST,UNKNOWN,ASIAN,...,0.714826,-1.423948,2.56076,1.136964,-0.259231,0.030026,-0.290483,0.842531,-0.445274,0.146142
4,6,F,0,ELECTIVE,PHYS REFERRAL/NORMAL DELI,Medicare,ENGL,NOT SPECIFIED,MARRIED,WHITE,...,0.28756,-1.120275,-0.077396,-0.225054,0.000836,-0.439155,0.168988,-0.69522,-0.482354,0.781746


In [89]:
final_df3.shape

(46520, 61)

In [91]:
cat_final_df3 = pd.get_dummies(final_df3.select_dtypes(["object"]), prefix_sep = ": ", drop_first = True, dtype = int)
cat_final_df3.head()

Unnamed: 0,GENDER: M,ADMISSION_TYPE: EMERGENCY,ADMISSION_TYPE: NEWBORN,ADMISSION_TYPE: URGENT,ADMISSION_LOCATION: CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION: EMERGENCY ROOM ADMIT,ADMISSION_LOCATION: HMO REFERRAL/SICK,ADMISSION_LOCATION: PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION: TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION: TRANSFER FROM OTHER HEALT,...,ETHNICITY: PATIENT DECLINED TO ANSWER,ETHNICITY: PORTUGUESE,ETHNICITY: SOUTH AMERICAN,ETHNICITY: UNABLE TO OBTAIN,ETHNICITY: UNKNOWN/NOT SPECIFIED,ETHNICITY: WHITE,ETHNICITY: WHITE - BRAZILIAN,ETHNICITY: WHITE - EASTERN EUROPEAN,ETHNICITY: WHITE - OTHER EUROPEAN,ETHNICITY: WHITE - RUSSIAN
0,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [92]:
num_final_df3 = final_df3.select_dtypes(["number"])
num_final_df3.shape

(46520, 53)

In [94]:
final_df3 = pd.concat([num_final_df3, cat_final_df3], axis = 1)
final_df3.shape

(46520, 210)

In [95]:
# final_df3.to_csv("final_df3.csv")

# Modelling

In [96]:
patient_ids = df['SUBJECT_ID']
final_df4 = final_df3.drop(columns=['SUBJECT_ID'], axis = 1)
ss2 = StandardScaler()
final_df4 = ss2.fit_transform(final_df4)
final_df4

array([[-0.71575453, -0.37789019,  0.00495394, ..., -0.02175175,
        -0.03854136, -0.04778905],
       [ 1.39712704, -0.37789019,  0.52815569, ..., -0.02175175,
        -0.03854136, -0.04778905],
       [-0.71575453, -0.37789019,  1.26680371, ..., -0.02175175,
        -0.03854136, -0.04778905],
       ...,
       [-0.71575453, -0.37789019, -0.67214734, ..., -0.02175175,
        -0.03854136, -0.04778905],
       [ 1.39712704, -0.37789019, -1.59545737, ..., -0.02175175,
        -0.03854136, -0.04778905],
       [-0.71575453, -0.37789019, -0.79525534, ..., -0.02175175,
        -0.03854136, -0.04778905]])

In [97]:
# joblib.dump(ss2, "ss2.pkl")

['ss2.pkl']

In [99]:
X_train, X_test = train_test_split(final_df4, test_size = 0.3, random_state = 0)

In [101]:
input_dim = X_train.shape[1]
encoding_dim = 30

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="relu")(input_layer)
decoder = Dense(input_dim, activation="sigmoid")(encoder)

In [103]:
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mse')

In [104]:
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_data=(X_test, X_test))

Epoch 1/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 1.0542 - val_loss: 0.8638
Epoch 2/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.8550 - val_loss: 0.8422
Epoch 3/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.8428 - val_loss: 0.8347
Epoch 4/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.8303 - val_loss: 0.8306
Epoch 5/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.8389 - val_loss: 0.8278
Epoch 6/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.8844 - val_loss: 0.8260
Epoch 7/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.8247 - val_loss: 0.8246
Epoch 8/50
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.8435 - val_loss: 0.8237
Epoch 9/50
[1m1018/1018

<keras.src.callbacks.history.History at 0x18893032420>

In [106]:
encoder_model = Model(inputs=input_layer, outputs=encoder)
embeddings = encoder_model.predict(final_df4)
embeddings.shape

[1m1454/1454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 659us/step


(46520, 30)

In [107]:
# joblib.dump(encoder_model, "patient_autoencoder.pkl")

['patient_autoencoder.pkl']