# Using CLMBR to generate features and train models on those features

We can use a trained CLMBR model to generate features and then use those features in a logistic regression model.

In [1]:
import warnings

# Silence the "promote has been superseded by mode='default'." FutureWarning when it is
# raised from a module whose name matches `pyarrow` or `datasets`.
# (`message` and `module` are regular expressions matched against the warning text and
# the name of the module that triggers the warning.)
warnings.filterwarnings("ignore", message="promote has been superseded by mode='default'.", category=FutureWarning, module="pyarrow")
warnings.filterwarnings("ignore", message="promote has been superseded by mode='default'.", category=FutureWarning, module="datasets")
from sklearn.exceptions import ConvergenceWarning
# Silence scikit-learn convergence warnings, once by category and once by message text.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", message="The max_iter was reached which means the coef_ did not converge")

In [2]:
import shutil
import os

# Point the Hugging Face datasets cache at a persistent location so that data is not
# re-downloaded from huggingface on every run. Change this to your own cache directory.
# NOTE(review): this is set before `datasets` is imported in a later cell, presumably so
# that the library picks the value up at import time -- confirm.
os.environ["HF_DATASETS_CACHE"] = '/share/pi/nigam/projects/zphuo/.cache'

# Scratch directory for this tutorial; it is wiped and recreated on every run.
TARGET_DIR = 'trash/tutorial_5_INSPECT'

if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

# makedirs (rather than mkdir) also creates the parent `trash/` directory, so the cell
# works on a fresh checkout where that directory does not exist yet.
os.makedirs(TARGET_DIR)

# number of processes to use
num_proc = 20

In [3]:
import femr.models.transformer
import pyarrow.csv
import datasets

# Feature computation starts here.

# Column of the cohort CSV that is used as the prediction target.
label_columns = "12_month_PH"

# Read the cohort label file into an Arrow table.
label_csv_subset = "/share/pi/nigam/projects/zphuo/data/PE/inspect/timelines_smallfiles_meds/cohort_0.2.0_master_file_anon_subset.csv"
labels_table = pyarrow.csv.read_csv(label_csv_subset)

In [4]:
import femr.models.transformer
import pyarrow.csv
import datasets
import pyarrow as pa
import pyarrow.compute as pc

# Load some labels
# labels = pyarrow.csv.read_csv('input/labels.csv').to_pylist()
label_csv_subset = '/share/pi/nigam/projects/zphuo/data/PE/inspect/timelines_smallfiles_meds/cohort_0.2.0_master_file_anon_subset.csv'
labels_table = pyarrow.csv.read_csv(label_csv_subset)

import pandas as pd

# Export the cohort's own patient -> split assignment as a two-column CSV
# (patient_id, split_name). The frame is built with a method chain instead of an
# in-place rename on a column subset, so re-running the cell always starts from the
# CSV and never mutates a slice of another frame.
label_df = (
    pd.read_csv(label_csv_subset)
    .loc[:, ['patient_id', 'split']]
    .rename(columns={'split': 'split_name'})
)
inspect_split_csv = '/share/pi/nigam/projects/zphuo/repos/femr/tutorials/trash/tutorial_6_INSEPCT/motor_model/main_split.csv'
# to_csv does not create missing parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(inspect_split_csv), exist_ok=True)
label_df.to_csv(inspect_split_csv, index=False)

# Keep only the columns needed for labelling and drop the rows whose outcome is "Censored".
selected_table = labels_table.select(['patient_id', 'procedure_time', label_columns])
not_censored = pc.field(label_columns) != "Censored"
filtered_table = selected_table.filter(not_censored)

# Replace the outcome column, at its current position, with a boolean-typed copy of itself.
label_position = filtered_table.schema.get_field_index(label_columns)
casted_column = pc.cast(filtered_table.column(label_columns), target_type=pa.bool_())
filtered_table = filtered_table.set_column(label_position, pa.field(label_columns, pa.bool_()), casted_column)

# Rename the time and outcome columns to `prediction_time` / `boolean_value`.
# Column order stays patient_id, prediction_time, boolean_value.
renamed = {'procedure_time': 'prediction_time', label_columns: 'boolean_value'}
columns = {
    renamed.get(name, name): filtered_table.column(name)
    for name in filtered_table.column_names
}
filtered_table = pa.Table.from_arrays(list(columns.values()), names=list(columns.keys()))

# One dict per row: {'patient_id': ..., 'prediction_time': ..., 'boolean_value': ...}
labels = filtered_table.to_pylist()

# Load the patient timelines from the parquet shards.
# dataset = datasets.Dataset.from_parquet("input/meds/data/*")
parquet_folder = "/share/pi/nigam/projects/zphuo/data/PE/inspect/timelines_smallfiles_meds/data_subset/*"
dataset = datasets.Dataset.from_parquet(parquet_folder)

# Local checkout of the pretrained model (hub name kept below for reference).
# model_name = "StanfordShahLab/clmbr-t-base"
model_name = "/share/pi/nigam/projects/zphuo/repos/clmbr-t-base"

# Compute features for the dataset, model and labels prepared above.
features = femr.models.transformer.compute_features(
    dataset,
    model_name,
    labels,
    num_proc=num_proc,
    tokens_per_batch=128,
)

# Show the name and shape of every returned array.
for name, array in features.items():
    print(name, array.shape)

Map (num_proc=20):   0%|          | 0/1916 [00:00<?, ? examples/s]

Some weights of the model checkpoint at /share/pi/nigam/projects/zphuo/repos/clmbr-t-base were not used when initializing FEMRModel: ['task_model.final_layer.bias', 'task_model.final_layer.weight']
- This IS expected if you are initializing FEMRModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FEMRModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map (num_proc=20):   0%|          | 0/1574 [00:00<?, ? examples/s]

Creating batches 1867


Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

patient_ids (1881,)
feature_times (1881,)
features (1881, 768)


In [10]:
# Number of feature rows returned by compute_features.
features['patient_ids'].shape

(1881,)

In [8]:
# Number of labels left after dropping censored rows, for comparison with the number of feature rows.
len(labels)

1872

In [17]:
from typing import Any, List, Mapping
import numpy as np
import meds

def join_labels(features: Mapping[str, np.array], labels: List[meds.Label]) -> Mapping[str, np.array]:
    """Select, for every label, the feature row with the same patient id and timestamp.

    Labels are sorted by (patient_id, prediction_time) and feature rows by
    (patient_ids, feature_times); the two sequences are then walked in step.
    Feature rows that sort before the current label are skipped. A feature row
    that sorts after the current label trips an assertion, because it means the
    label has no matching feature row. Labels that are still unmatched when the
    feature rows run out are silently left out of the result.

    A debug trace is printed for every feature row that is visited.

    Args:
        features: mapping with index-aligned "patient_ids", "feature_times" and
            "features" arrays.
        labels: label records with "patient_id", "prediction_time" and
            "boolean_value" entries.

    Returns:
        Mapping with "boolean_values", "patient_ids", "times" and "features",
        holding one entry per matched label.
    """
    pending = sorted(labels, key=lambda a: (a["patient_id"], a["prediction_time"]))

    # np.lexsort treats the LAST key as the primary one: patient id first, then time.
    order = np.lexsort((features["feature_times"], features["patient_ids"]))
    ordered_ids = features["patient_ids"][order]
    ordered_times = features["feature_times"][order]

    cursor = 0  # position of the next label that still needs a feature row
    indices = []
    label_values = []

    for row, patient_id, feature_time in zip(order, ordered_ids, ordered_times):
        print(row, patient_id, feature_time, end='| ')
        if cursor == len(pending):
            break
        target = pending[cursor]

        # Feature row belongs to an earlier patient than the label: skip it.
        assert patient_id <= target["patient_id"], f"Missing features for label {target}"
        if patient_id < target["patient_id"]:
            continue

        print(patient_id, target["patient_id"], end='| ')

        # Same patient, but the feature row is earlier than the label time: skip it.
        assert (
            feature_time <= target["prediction_time"]
        ), f"Missing features for label {target}"
        if feature_time < target["prediction_time"]:
            continue

        print(feature_time, target["prediction_time"])

        # Exact (patient, time) match: record the row and move on to the next label.
        indices.append(row)
        label_values.append(target["boolean_value"])
        cursor += 1

    return {
        "boolean_values": np.array(label_values),
        "patient_ids": features["patient_ids"][indices],
        "times": features["feature_times"][indices],
        "features": features["features"][indices, :],
    }

In [18]:
# Align features to labels with the notebook-local join_labels, which also prints a
# debug trace for every feature row it visits.
features_and_labels = join_labels(features, labels)

1505 145065 2232-11-15 15:10:00| 145065 145065| 2232-11-15 15:10:00 2232-11-15 15:10:00
874 145065 2232-12-12 15:34:00| 145065 145065| 2232-12-12 15:34:00 2232-12-12 15:34:00
305 588132 2176-02-16 16:33:00| 588132 588132| 2176-02-16 16:33:00 2176-02-16 16:33:00
1150 1312085 2213-04-03 20:33:00| 1312085 1312085| 2213-04-03 20:33:00 2213-04-03 20:33:00
1216 2769482 2179-02-02 12:59:00| 2769482 2769482| 2179-02-02 12:59:00 2179-02-02 12:59:00
383 3466741 2198-03-23 22:33:00| 3466741 3466741| 2198-03-23 22:33:00 2198-03-23 22:33:00
1105 3639455 2223-05-19 19:15:00| 3639455 3639455| 2223-05-19 19:15:00 2223-05-19 19:15:00
579 4038854 2197-09-27 04:11:00| 4038854 4038854| 2197-09-27 04:11:00 2197-09-27 04:11:00
255 4667841 2237-02-08 18:55:00| 4667841 4667841| 2237-02-08 18:55:00 2237-02-08 18:55:00
982 5684969 2198-02-05 09:32:00| 5684969 5684969| 2198-02-05 09:32:00 2198-02-05 09:32:00
638 6270821 2211-06-20 15:18:00| 6270821 6270821| 2211-06-20 15:18:00 2211-06-20 15:18:00
1288 6728949 22

'947584322_2170_12_14_11:50:00'

In [32]:
# Preview the timestamp format that join_feature_label_frazier uses for the time part
# of its "<patient_id>_<time>" keys.
labels[0]['prediction_time'].strftime("%Y_%m_%d_%H:%M:%S")

'2170_12_14_11:50:00'

In [67]:
def join_feature_label_frazier(features, labels, split_file):
    """Pair every feature row with its label and divide the pairs into train and test sets.

    Feature rows and labels are matched on a "<patient_id>_<timestamp>" string key,
    where the timestamp is formatted as "%Y_%m_%d_%H:%M:%S". If several feature rows
    share a key, the last one wins.

    Args:
        features: mapping with index-aligned "patient_ids", "feature_times"
            (objects with a ``strftime`` method) and "features" entries.
        labels: iterable of mappings with "patient_id", "prediction_time" and
            "boolean_value" entries.
        split_file: path to a CSV whose first column holds the patient id and whose
            second column holds the split name, for example
            '/share/pi/nigam/projects/zphuo/repos/PE_3D_multimodal/training/trash/motor_model_022121/main_split.csv'.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of lists. Rows of patients whose
        split is neither 'train' nor 'test' are left out.

    Raises:
        KeyError: if a patient is missing from the split file, or a train/test feature
            row has no label with the same key.
    """

    def make_key(patient_id, when):
        # Unique combination of patient id and time, e.g. '947584322_2170_12_14_11:50:00'.
        return str(patient_id) + '_' + when.strftime("%Y_%m_%d_%H:%M:%S")

    # Bug fix: the original read from the undefined name `file`, so the `split_file`
    # argument was never used and the call raised NameError.
    split_df = pd.read_csv(split_file)
    split_dict = dict(zip(split_df.iloc[:, 0], split_df.iloc[:, 1]))

    #########################
    # labels: key -> boolean label
    labels_pid_time_dict = {
        make_key(item['patient_id'], item['prediction_time']): item['boolean_value']
        for item in labels
    }

    #########################
    # features: key -> feature representation
    features_pid_time_dict = {}
    for idx, patient_id in enumerate(features['patient_ids']):
        pid_time = make_key(patient_id, features['feature_times'][idx])
        features_pid_time_dict[pid_time] = features['features'][idx]

    #########################
    # X, y train/test
    X_train, y_train, X_test, y_test = [], [], [], []
    for pid_time, feature in features_pid_time_dict.items():
        # The patient id is everything before the first underscore of the key.
        split_name = split_dict[int(pid_time.split('_')[0])]

        if split_name == 'train':
            X_train.append(feature)
            y_train.append(labels_pid_time_dict[pid_time])
        elif split_name == 'test':
            X_test.append(feature)
            y_test.append(labels_pid_time_dict[pid_time])

    return X_train, y_train, X_test, y_test
        
    

In [69]:
# Build the train/test feature and label lists, using the patient split stored in main_split.csv.
# NOTE(review): this path is not the `inspect_split_csv` file written earlier in this notebook -- confirm it is the intended split.
X_train, y_train, X_test, y_test = join_feature_label_frazier(features, labels, '/share/pi/nigam/projects/zphuo/repos/PE_3D_multimodal/training/trash/motor_model_022121/main_split.csv')

# Joining features and labels

Given a feature set, it's important to be able to join a set of labels to those features.

This can be done with `femr.featurizers.join_labels`.

In [5]:
import femr.featurizers

# Match every label to its feature row using the library implementation.
features_and_labels = femr.featurizers.join_labels(features, labels)

# Show the name and shape of every joined array.
for name, array in features_and_labels.items():
    print(name, array.shape)

boolean_values (1872,)
patient_ids (1872,)
times (1872,)
features (1872, 768)


# Data Splitting

When using a pretrained CLMBR model, we have to be very careful to use the same splits that were used for the original model.

In [None]:
import femr.splits
import numpy as np

# Load the global patient-level train/test assignment.
split = femr.splits.PatientSplit.load_from_csv('input/clmbr_model/main_split.csv')

# Boolean row masks: True where the row's patient belongs to the respective split.
train_mask = np.isin(features_and_labels['patient_ids'], split.train_patient_ids)
test_mask = np.isin(features_and_labels['patient_ids'], split.test_patient_ids)

# Not referenced by the other cells visible in this notebook.
percent_train = 0.70

# Slice features and labels with the masks.
X_train = features_and_labels['features'][train_mask]
y_train = features_and_labels['boolean_values'][train_mask]

X_test = features_and_labels['features'][test_mask]
y_test = features_and_labels['boolean_values'][test_mask]

: 

# Building Models

The generated features can then be used to build your standard models. In this case we construct both logistic regression and XGBoost models and evaluate them.

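Performance is perfect when the task (predicting gender) is 100% determined by the features. Note that the label used in this notebook is `12_month_PH`, for which the results below are far from perfect (test AUROC ≈ 0.61).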

In [71]:
import xgboost as xgb
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing

def run_analysis(title: str, y_train, y_train_proba, y_test, y_test_proba):
    """Print a titled report with one model's metrics on the train and the test set.

    Args:
        title: name shown in the report header.
        y_train: true labels of the training set.
        y_train_proba: predicted scores for the training set.
        y_test: true labels of the test set.
        y_test_proba: predicted scores for the test set.
    """
    print(f"---- {title} ----")
    sections = (
        ("Train:", y_train, y_train_proba),
        ("Test:", y_test, y_test_proba),
    )
    for heading, y_true, y_proba in sections:
        print(heading)
        print_metrics(y_true, y_proba)

def print_metrics(y_true, y_proba, threshold=0.5):
    """Print AUROC, average precision, accuracy and F1 for one set of predictions.

    AUROC and average precision are computed from the scores directly; accuracy
    and F1 are computed from hard predictions obtained by thresholding the scores.

    Args:
        y_true: true binary labels.
        y_proba: predicted scores for the positive class; must support an
            element-wise ``>`` comparison (e.g. a numpy array).
        threshold: a sample is predicted positive when its score is strictly
            greater than this value. Defaults to 0.5, the value that was
            previously hard-coded.
    """
    # Hard predictions are only needed for the threshold-dependent metrics below.
    y_pred = y_proba > threshold
    auroc = sklearn.metrics.roc_auc_score(y_true, y_proba)
    aps = sklearn.metrics.average_precision_score(y_true, y_proba)
    accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
    f1 = sklearn.metrics.f1_score(y_true, y_pred)
    print("\tAUROC:", auroc)
    print("\tAPS:", aps)
    print("\tAccuracy:", accuracy)
    print("\tF1 Score:", f1)


# L2-penalised logistic regression; LogisticRegressionCV selects the regularisation
# strength by cross-validation on the training data.
classifier = sklearn.linear_model.LogisticRegressionCV(penalty="l2", solver="liblinear")
model = classifier.fit(X_train, y_train)

# Column 1 of predict_proba is the score of the second class in `model.classes_`.
y_train_proba = model.predict_proba(X_train)[:, 1]
y_test_proba = model.predict_proba(X_test)[:, 1]

run_analysis("Logistic Regression", y_train, y_train_proba, y_test, y_test_proba)

---- Logistic Regression ----
Train:
	AUROC: 0.6367891023029246
	APS: 0.19678378242267813
	Accuracy: 0.8639322916666666
	F1 Score: 0.0
Test:
	AUROC: 0.605453667953668
	APS: 0.2422693462911058
	Accuracy: 0.8582375478927203
	F1 Score: 0.0
