In [1]:
import os
import glob
from pathlib import Path
from tqdm import tqdm
import pandas as pd
# Workspace bucket location read from the environment (None if WORKSPACE_BUCKET is unset);
# the bare expression on the next line displays it as the cell output.
bucket = os.getenv('WORKSPACE_BUCKET')
bucket

# MEDS ETL

In [2]:
# !pip install meds_etl

In [2]:
# Collect the visit_occurrence parquet shards (non-recursive glob).
folder = Path("allofus_omop_v8/allofus_omop_v8/visit_occurrence")
files = list(folder.glob("*.parquet"))  # direct children of the folder only
len(files)

In [3]:
# Rename the visit_occurrence column `discharge_to_concept_id` to
# `discharged_to_concept_id` in every shard, overwriting the file in place
# (presumably the name the downstream meds_etl step expects - TODO confirm).
# Shards that do not contain the old column are skipped, so re-running this
# cell does not rewrite files that were already converted.
OLD_COL, NEW_COL = "discharge_to_concept_id", "discharged_to_concept_id"
for src in tqdm(files):
    df = pd.read_parquet(src)
    if OLD_COL not in df.columns:
        continue
    df = df.rename(columns={OLD_COL: NEW_COL})
    df.to_parquet(src)

In [4]:
# Source OMOP parquet export and destination folder for the MEDS conversion.
export OMOP_DIR=/home/jupyter/workspaces/ehrcancermodelevaluation/allofus_omop_v8/allofus_omop_v8
export OMOP_MEDS=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS

# Expansions are quoted so the command still works if a path ever contains spaces.
meds_etl_omop "$OMOP_DIR" "$OMOP_MEDS" --num_proc 16

# Converting into meds_reader

In [3]:
# !pip install meds_reader==0.1.13
# !pip install femr-0.2.0-py3-none-any.whl
# !pip install meds_evaluation-0.1.dev95+g841c87f-py3-none-any.whl

In [None]:
# Input MEDS folder and output folder for the meds_reader database.
export OMOP_MEDS=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS
export OMOP_MEDS_READER=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER
# Expansions are quoted so the command still works if a path ever contains spaces.
meds_reader_convert "$OMOP_MEDS" "$OMOP_MEDS_READER" --num_threads 16

# Preparing for training

In [None]:
# Prepare the concept tables as CSV files

In [4]:
# Read every concept parquet shard and export the combined table as CONCEPT.csv.
files = glob.glob('allofus_omop_v8/concept/*.parquet')
dfs = [pd.read_parquet(shard) for shard in tqdm(files)]
concept = pd.concat(dfs)
concept.to_csv('allofus_omop_v8/motor8192/CONCEPT.csv', index = False)

In [5]:
# Read every concept_relationship parquet shard and export the combined table as CSV.
files = glob.glob('allofus_omop_v8/concept_relationship/*.parquet')
dfs = [pd.read_parquet(shard) for shard in tqdm(files)]
df = pd.concat(dfs)
df.to_csv('allofus_omop_v8/motor8192/CONCEPT_RELATIONSHIP.csv', index = False)

In [6]:
# Read every concept_ancestor parquet shard and export the combined table as CSV.
files = glob.glob('allofus_omop_v8/concept_ancestor/*.parquet')
dfs = [pd.read_parquet(shard) for shard in tqdm(files)]
df = pd.concat(dfs)
df.to_csv('allofus_omop_v8/motor8192/CONCEPT_ANCESTOR.csv', index = False)

In [23]:
# Load all patient-split shards and stack them into one frame with a fresh index.
files = glob.glob('allofus_omop_v8/allofus_omop_v8/patient_splits/*.parquet')
split_frames = []
for shard in files:
    split_frames.append(pd.read_parquet(shard))
patient_split = pd.concat(split_frames, ignore_index=True)

In [27]:
# Relabel the columns positionally; the assignment only succeeds when the frame
# has exactly two columns (assumed order: subject id, then split - TODO confirm).
patient_split.columns = ['subject_id','split']

In [28]:
# Write the splits as subject_splits.parquet into the OMOP_MEDS_READER metadata folder.
patient_split.to_parquet('/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER/metadata/subject_splits.parquet', index = False)

In [None]:
# git clone https://github.com/ChaoPang/femr
# cd femr && pip install .
# git checkout omop_meds_v3_tutorial

In [None]:
# PRETRAINING_DATA and ATHENA_DATA both point at the motor8192 folder.
export PRETRAINING_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/allofus_omop_v8/motor8192
export OMOP_MEDS_READER=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER
export ATHENA_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/allofus_omop_v8/motor8192

# Runs in the background; stdout and stderr both go to nohup.out.
# Expansions are quoted so the command still works if a path ever contains spaces.
nohup python -u -m femr.omop_meds_tutorial.prepare_motor \
  --pretraining_data "$PRETRAINING_DATA" \
  --athena_path "$ATHENA_DATA" \
  --meds_reader "$OMOP_MEDS_READER" \
  --tokens_per_batch 8192 > nohup.out 2>&1 &

# Motor pretraining

In [None]:
# One line of code has to be disabled because it prevents training on multiple GPUs:
# in processor.py, at line 465, remove the assertion,
# i.e. comment out "assert len(batches)==1".

In [None]:
# Environment for the MOTOR pretraining command in the next cell.
# PRETRAINING_DATA and ATHENA_DATA both point at the motor8192 folder;
# OUTPUT_DIR is passed to the trainer as --output_dir.
export PRETRAINING_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/allofus_omop_v8/motor8192
export OMOP_MEDS_READER=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER
export ATHENA_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/allofus_omop_v8/motor8192
export OUTPUT_DIR=/home/jupyter/workspaces/ehrcancermodelevaluation/aou_motor

In [None]:
# memory error with the following command
# nohup python -u -m femr.omop_meds_tutorial.pretrain_motor \
#   --pretraining_data $PRETRAINING_DATA \
#   --meds_reader $OMOP_MEDS_READER > motor_pretrain.out &

# Launch MOTOR pretraining on 8 GPUs through DeepSpeed, in the background;
# stdout and stderr both go to motor_train_log.txt.
# Expansions are quoted so the command still works if a path ever contains spaces.
nohup deepspeed --num_gpus=8 src/femr/omop_meds_tutorial/pretrain_motor.py \
    --pretraining_data "$PRETRAINING_DATA" \
    --meds_reader "$OMOP_MEDS_READER" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-5 \
    --output_dir "$OUTPUT_DIR" \
    --remove_unused_columns false \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --report_to tensorboard \
    --num_train_epochs 50 \
    --warmup_steps 500 \
    --logging_strategy epoch \
    --logging_steps 10 \
    --save_strategy epoch \
    --evaluation_strategy epoch \
    --dataloader_num_workers 12 \
    --save_total_limit 10 \
    --load_best_model_at_end \
    --metric_for_best_model eval_loss \
    --greater_is_better false \
    --deepspeed /home/jupyter/workspaces/ehrcancermodelevaluation/deepspeed.json \
    > motor_train_log.txt 2>&1 &

# Generate features

In [None]:
# Create a motor_model folder inside aou_motor and copy the necessary files into it from aou_motor and motor8192:
# for example, move trainer_state.json into motor_model,
# copy config.json and model.safetensors of the chosen checkpoint into motor_model,
# and copy dictionary.msgpack from the motor8192 tokenizer folder into motor_model.

In [None]:
# In src/femr/models/transformers.py, at line 489, femr autocasts the data and model to bfloat16 in compute_features, which is not supported by the V100. Changing it to torch.float32 should work.

In [None]:
# Here PRETRAINING_DATA points at the aou_motor folder (the pretraining output directory).
export PRETRAINING_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/aou_motor
export OMOP_MEDS_READER=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER
export COHORT_DIR=/home/jupyter/workspaces/ehrcancermodelevaluation/meds_data

# Runs in the background; stdout and stderr both go to motor_features.out.
# Expansions are quoted so the command still works if a path ever contains spaces.
nohup python -u -m femr.omop_meds_tutorial.generate_motor_features \
  --pretraining_data "$PRETRAINING_DATA" \
  --meds_reader "$OMOP_MEDS_READER" \
  --cohort_dir "$COHORT_DIR" > motor_features.out 2>&1 &

# Femr baseline

In [34]:
import glob
# Load all patient-split shards and stack them into one frame with a fresh index.
files = glob.glob('allofus_omop_v8/patient_splits/*.parquet')
split_frames = []
for shard in files:
    split_frames.append(pd.read_parquet(shard))
patient_split = pd.concat(split_frames, ignore_index=True)

In [6]:
# Relabel the columns positionally; the assignment only succeeds when the frame
# has exactly two columns. Note the split column is named 'split_name' here,
# whereas the subject_splits.parquet export earlier in the notebook uses 'split'.
patient_split.columns = ['subject_id', 'split_name']

In [36]:
# Save the split assignment as main_split.csv inside the OMOP_MEDS_READER folder.
patient_split.to_csv('OMOP_MEDS_READER/main_split.csv', index = False)

In [62]:
# Save the same split assignment as main_split.csv inside the aou_motor folder.
patient_split.to_csv('aou_motor/main_split.csv', index = False)

In [None]:
# Split the full label table into one parquet file per cancer type under
# OMOP_MEDS_READER/labels, keeping only the rows of that type's subject ids.
ids_by_cancer_type = load_dict('ids_by_cancer_type')
all_meds = pd.read_parquet('aou_motor/labels/meds_data.parquet')

for cancer_type, ids in tqdm(ids_by_cancer_type.items()):
    in_cohort = all_meds['subject_id'].isin(ids)
    sub_meds = all_meds.loc[in_cohort]
    sub_meds.to_parquet(f'OMOP_MEDS_READER/labels/meds_{cancer_type}.parquet', index = False)

In [None]:
# Go to step 7 (linear probing) and generate 'aou_motor/features/meds_{cancer_type}_motor.pkl'

In [None]:
# For the baseline, PRETRAINING_DATA points at the same folder as OMOP_MEDS_READER.
export PRETRAINING_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER
export OMOP_MEDS_READER=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER
export COHORT_DIR=/home/jupyter/workspaces/ehrcancermodelevaluation/aou_motor/labels/meds_pancreas.parquet

# featurizing
# (the step labels are comments so the shell does not try to run "-" as a command;
#  expansions are quoted so the commands still work if a path ever contains spaces)
python -u -m femr.omop_meds_tutorial.generate_tabular_features --pretraining_data "$PRETRAINING_DATA" --meds_reader "$OMOP_MEDS_READER" --cohort_dir "$COHORT_DIR"

# baseline model training
nohup python -u -m femr.omop_meds_tutorial.train_baseline --pretraining_data "$PRETRAINING_DATA" --meds_reader "$OMOP_MEDS_READER" --cohort_label meds_all &> meds_baseline2.out &

In [None]:
# results are saved at OMOP_MEDS_READER/results 

# Linear probing

In [1]:
import pandas as pd
from collections import Counter
import os
import numpy as np
from tqdm import tqdm
import pickle
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import gcsfs
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import mannwhitneyu
from pathlib import Path

# Workspace bucket location from the environment (None if WORKSPACE_BUCKET is unset);
# the helper functions below build their paths from this module-level name.
bucket = os.getenv("WORKSPACE_BUCKET")
print(bucket)

def write_df_to_bucket(df, filename: str):
    """Write `df` as ``{bucket}/data/{filename}.csv`` without the index column.

    Args:
        df: DataFrame to export.
        filename: Base name of the target file, without the ``.csv`` suffix.
    """
    # Pass an explicit boolean: `to_csv` documents `index` as a bool, and the
    # other exports in this notebook already use index=False.
    df.to_csv(f"{bucket}/data/{filename}.csv", index=False)
def read_df_from_bucket(filename: str):
    """Read ``{bucket}/data/{filename}.csv`` and return it as a DataFrame."""
    # Reads the CSV straight from the bucket path; nothing is written.
    return pd.read_csv(f"{bucket}/data/{filename}.csv")

# Module-level GCS filesystem handle; save_dict and load_dict open their files through it.
fs = gcsfs.GCSFileSystem()
def save_dict(data_dict, filename: str):
    """Pickle `data_dict` to ``{bucket}/data/{filename}.pickle`` via the module-level `fs`."""
    with fs.open(f"{bucket}/data/{filename}.pickle", "wb") as out_handle:
        pickle.dump(data_dict, out_handle)
def load_dict(filename: str):
    """Unpickle and return the object stored at ``{bucket}/data/{filename}.pickle``.

    The file is opened through the module-level `fs`.
    """
    with fs.open(f"{bucket}/data/{filename}.pickle", "rb") as in_handle:
        return pickle.load(in_handle)

gs://fc-secure-29afd1b1-16e7-4b67-a9f2-ddd80f884e0d


In [4]:
# Load the pickled object stored under the name 'ids_by_cancer_type' in the bucket.
# Later cells iterate it with .items() and index it by 'crc' / 'lung', so it is
# used as a mapping from cancer type to a collection of subject ids.
ids_by_cancer_type = load_dict('ids_by_cancer_type')

In [3]:
# all_meds = pd.read_parquet('aou_motor/labels/meds_data.parquet')
# Read the label table from the workspace bucket, then save a copy at the local
# path that the commented-out line above reads from.
all_meds = pd.read_parquet(f"{bucket}/aou_motor/labels/meds_data.parquet")
all_meds.to_parquet('aou_motor/labels/meds_data.parquet', index = False)

In [68]:
# Lookup table subject_id -> split_name; if an id occurs more than once, the last row wins.
patient_split_dict = {
    subject: split_name
    for subject, split_name in zip(patient_split['subject_id'], patient_split['split_name'])
}

In [69]:
# Load aou_motor/labels/meds_all.parquet (presumably the labels of the 'all' cohort - TODO confirm).
sub_meds = pd.read_parquet('aou_motor/labels/meds_all.parquet')

In [70]:
# Add a 'split' column by looking each subject_id up in patient_split_dict;
# ids that are missing from the dict map to NaN.
sub_meds['split'] = sub_meds['subject_id'].map(patient_split_dict)

In [7]:
# Write one label file per cancer type under aou_motor/labels, keeping only the
# rows of all_meds whose subject_id belongs to that type's id collection.
for cancer_type, ids in tqdm(ids_by_cancer_type.items()):
    in_cohort = all_meds['subject_id'].isin(ids)
    sub_meds = all_meds.loc[in_cohort]
    sub_meds.to_parquet(f'aou_motor/labels/meds_{cancer_type}.parquet', index = False)

In [8]:
# with open('aou_motor/features/meds_data_motor.pkl', 'rb') as h:
#     motor_fea = pickle.load(h)
import gcsfs
import pickle

# Load the feature pickle from the workspace bucket instead of from local disk.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
fs = gcsfs.GCSFileSystem()
with fs.open(f'{bucket}/aou_motor/features/hide/meds_data_motor.pkl', 'rb') as h:
    motor_fea = pickle.load(h)

In [10]:
def filter_dict(data, selected_ids):
    """Keep only the feature rows whose subject id is in `selected_ids`.

    Args:
        data: Mapping with the parallel sequences "subject_ids",
            "feature_times" and "features"; entry i of each belongs to the
            same row.
        selected_ids: Iterable of hashable subject ids to keep.

    Returns:
        dict with the same three keys, each a NumPy array holding the kept
        rows in their original order.
    """
    # Build the lookup once. Membership in a set is O(1), whereas testing
    # against a list rescans the whole list for every row.
    if isinstance(selected_ids, (set, frozenset)):
        wanted = selected_ids
    else:
        wanted = set(selected_ids)

    kept_ids, kept_times, kept_feats = [], [], []
    rows = zip(data["subject_ids"], data["feature_times"], data["features"])
    for sid, ftime, feat in tqdm(rows, total=len(data["subject_ids"])):
        if sid in wanted:
            kept_ids.append(sid)
            kept_times.append(ftime)
            kept_feats.append(feat)

    return {
        "subject_ids": np.array(kept_ids),
        "feature_times": np.array(kept_times),
        "features": np.array(kept_feats),
    }

In [9]:
# Subject ids that appear in BOTH the 'crc' and the 'lung' id collections
# (presumably the control subjects shared by every cancer type - TODO confirm).
ctrl_ids = list(set(ids_by_cancer_type['crc']) & set(ids_by_cancer_type['lung']))
# len(ctrl_ids)

In [12]:
# Put the features into a DataFrame so rows can be selected by subject id.
motor_fea_df = pd.DataFrame({
    'subject_ids': motor_fea['subject_ids'],
    'feature_times': motor_fea['feature_times'],
    'features': [np.array(vec) for vec in motor_fea['features']],
})
# Keep only the rows of the ids in ctrl_ids and turn them back into a dict of lists.
in_ctrl = motor_fea_df['subject_ids'].isin(ctrl_ids)
ctrl_fea_df = motor_fea_df[in_ctrl]
ctrl_fea = ctrl_fea_df.to_dict(orient='list')
# Stack the per-row feature arrays into a single array.
ctrl_fea['features'] = np.stack(ctrl_fea['features'])

100%|██████████| 223402/223402 [18:13<00:00, 204.21it/s]


In [10]:
# Build one feature pickle per cancer type: the rows of that type's ids that
# are not in ctrl_ids, followed by the rows already collected in ctrl_fea.
ctrl_id_set = set(ctrl_ids)  # hoisted: identical for every iteration
for cancer_type, ids in ids_by_cancer_type.items():
    # Keep the ids as a set so the per-row membership test inside filter_dict
    # is a hash lookup instead of a scan over a list.
    case_ids = set(ids) - ctrl_id_set
    sub_fea = filter_dict(motor_fea, case_ids)
    merged_fea = {
        name: np.concatenate([sub_fea[name], ctrl_fea[name]])
        for name in ('subject_ids', 'feature_times', 'features')
    }
    print(cancer_type, len(merged_fea['subject_ids']))
    with open(f'aou_motor/features/meds_{cancer_type}_motor.pkl', 'wb') as h:
        pickle.dump(merged_fea, h)

In [None]:
# Fine-tune on a single cohort (meds_pancreas).
export PRETRAINING_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/aou_motor
export OMOP_MEDS_READER=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER

# Expansions are quoted so the command still works if a path ever contains spaces.
python -u -m femr.omop_meds_tutorial.finetune_motor   --pretraining_data "$PRETRAINING_DATA"   --meds_reader "$OMOP_MEDS_READER" --cohort_label meds_pancreas

In [None]:
# saved in aou_motor/results

In [None]:
# Save the script below to a .txt file and rename it to run_motor_lp.sh

In [None]:
# Fine-tune on every cohort listed below. Each job is started in the
# background, so all of them run concurrently; the output of each job goes to
# its own finetune_<cohort>.out file.
export PRETRAINING_DATA=/home/jupyter/workspaces/ehrcancermodelevaluation/aou_motor
export OMOP_MEDS_READER=/home/jupyter/workspaces/ehrcancermodelevaluation/OMOP_MEDS_READER

cohorts=(
    leukemia 
    urinary_tract_bladder
    neuroendocrine
    myelodysplastic_syndromes
    cervical
    soft_tissue_sarcoma
    lymphoma_lymphoid
    all
    crc
    kidney
    lung
    brain
    skin
    stomach
    prostate
    uterine_endometrial
    liver
    esophagus
    ovarian
    multiple_myeloma
    breast
    pancreas
    head_and_neck
    bone
    testicular
    eye_ocular
    thyroid
)

for cohort in "${cohorts[@]}"; do
    # The label passed to finetune_motor is the cohort name prefixed with "meds_".
    cohort_label="meds_${cohort}"
    echo "Running fine-tuning for $cohort..."
    # Expansions are quoted so the command still works if a value ever contains spaces.
    python -u -m femr.omop_meds_tutorial.finetune_motor \
        --pretraining_data "$PRETRAINING_DATA" \
        --meds_reader "$OMOP_MEDS_READER" \
        --cohort_label "$cohort_label" \
        &> "finetune_${cohort}.out" &
done

In [None]:
#bash run_motor_lp.sh

In [None]:
# check if it is running
# ps aux | grep finetune_motor