In [1]:
# CELL 1: Mount Drive and load structured cohort + ClinicalBERT embeddings

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import os

# Base path (same project folder)
base_path = "/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC"

# Paths
cohort_path = os.path.join(base_path, "outputs", "RA_COHORT_PHASE2_STRUCTURED.parquet")
cb_emb_path = os.path.join(base_path, "outputs", "RA_NOTES_EMB_ClinicalBERT_meanpool.parquet")

# Load data
cohort_df = pd.read_parquet(cohort_path)
cb_emb_df = pd.read_parquet(cb_emb_path)

print("Cohort shape:", cohort_df.shape)
print("ClinicalBERT embeddings shape:", cb_emb_df.shape)

cohort_df.head(2), cb_emb_df.head(2)


Mounted at /content/drive
Cohort shape: (15462, 63)
ClinicalBERT embeddings shape: (4979, 771)


(   subject_id   hadm_id            admittime            dischtime deathtime  \
 0    10002443  21329020  2183-10-17 23:20:00  2183-10-20 18:47:00      None   
 1    10003203  25146996  2153-04-26 02:05:00  2153-04-29 14:19:00      None   
 
    admission_type admit_provider_id      admission_location  \
 0        EW EMER.            P343TV  TRANSFER FROM HOSPITAL   
 1  EU OBSERVATION            P57BOT          EMERGENCY ROOM   
 
   discharge_location insurance  ... wbc_high_flag wbc_low_flag hb_low_flag  \
 0               HOME   Private  ...             1            0           0   
 1               None  Medicare  ...             0            0           1   
 
   platelet_low_flag hct_low_flag  cbc_abnormal_count wbc_range  hb_range  \
 0                 0            0                   1       7.7       2.1   
 1                 0            1                   2       0.2       0.4   
 
    platelet_range  crp_range  
 0            88.0        NaN  
 1            22.0        Na

**2 — Merge ClinicalBERT Embeddings Into RA Cohort**

In [2]:
# CELL 2: Merge structured cohort with ClinicalBERT embeddings (mean-pooled)

# 1) Merge on subject_id + hadm_id (unique admission)
merged_df = cohort_df.merge(
    cb_emb_df,
    on=["subject_id", "hadm_id"],
    how="left"          # keep all admissions; notes may be missing for some
)

print("Merged DF shape:", merged_df.shape)

# 2) Check how many admissions now have embeddings
num_with_embeddings = merged_df['cb_mean_0'].notna().sum()
print("Admissions with ClinicalBERT embeddings:", num_with_embeddings)

merged_df.head(2)


Merged DF shape: (15462, 832)
Admissions with ClinicalBERT embeddings: 5478


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,...,cb_mean_759,cb_mean_760,cb_mean_761,cb_mean_762,cb_mean_763,cb_mean_764,cb_mean_765,cb_mean_766,cb_mean_767,note_id
0,10002443,21329020,2183-10-17 23:20:00,2183-10-20 18:47:00,,EW EMER.,P343TV,TRANSFER FROM HOSPITAL,HOME,Private,...,,,,,,,,,,
1,10003203,25146996,2153-04-26 02:05:00,2153-04-29 14:19:00,,EU OBSERVATION,P57BOT,EMERGENCY ROOM,,Medicare,...,,,,,,,,,,


**3 — Handle Missing Embeddings (Fill with Zero Vectors)**

In [3]:
# CELL 3: Fill missing embeddings with 0 so models can use all admissions

embedding_cols = [col for col in merged_df.columns if col.startswith("cb_mean_")]

merged_df[embedding_cols] = merged_df[embedding_cols].fillna(0.0)

# Verify no NaNs remain in embeddings
print("Remaining NaNs in embeddings:", merged_df[embedding_cols].isna().sum().sum())


Remaining NaNs in embeddings: 0


**4 — Save the Multimodal Modeling Dataset**

In [4]:
# CELL 4: Save merged multimodal dataset for modeling

model_base_path = os.path.join(
    base_path,
    "outputs",
    "RA_MODELING_BASE_ClinicalBERT.parquet"
)

merged_df.to_parquet(model_base_path, index=False)

print("Saved multimodal modeling dataset to:")
print(model_base_path)
print("Final shape:", merged_df.shape)


Saved multimodal modeling dataset to:
/content/drive/MyDrive/Rahul_DTSC5082_Project/Scenario2_MIMIC/outputs/RA_MODELING_BASE_ClinicalBERT.parquet
Final shape: (15462, 832)
