# Using CLMBR to generate features and training models on those features

We can use a trained CLMBR model to generate features and then use those features in a logistic regression model.

This is a two step process, involving generating batches for the task and then the representations.



In [1]:
import shutil
import os

TARGET_DIR = 'trash/tutorial_6'

if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

os.mkdir(TARGET_DIR)

In [2]:
"""
Generate batches for a given set of labels.
"""

import pickle

EXTRACT_LOCATION = "input/extract"

# From tutorial 3
LABELS = "input/labels.pkl"

# From tutorial 5
DICTIONARY_PATH = "input/dictionary"
MODEL_PATH = "input/clmbr_model"

TASK_BATCHES = os.path.join(TARGET_DIR, "task_batches")


os.system(
    f"clmbr_create_batches {TASK_BATCHES} --data_path {EXTRACT_LOCATION} --dictionary {DICTIONARY_PATH} --task labeled_patients --labeled_patients_path {LABELS} --transformer_vocab_size 2048"
)

REPRESENTATIONS = os.path.join(TARGET_DIR, "clmbr_reprs")

os.system(
    f"clmbr_compute_representations {REPRESENTATIONS} --data_path {EXTRACT_LOCATION} --batches_path {TASK_BATCHES} --model_dir {MODEL_PATH}"
)


"""
Open the resulting representations and take a look at the data matrix.
"""

with open(REPRESENTATIONS, "rb") as f:
    reprs = pickle.load(f)

    print(reprs.keys())

    print("Pulling the data for the first label")
    print("Patient id", reprs["patient_ids"][0])
    print("Label time", reprs["labeling_time"][0])
    print("Label value", reprs["label_values"][0])
    print("Representation", reprs["data_matrix"][0, :16])

2023-05-30 15:18:49,564 [MainThread  ] [INFO ]  Preparing batches with Namespace(directory='trash/tutorial_6/task_batches', data_path='input/extract', dictionary_path='input/dictionary', task='labeled_patients', transformer_vocab_size=2048, clmbr_survival_dictionary_path=None, labeled_patients_path='input/labels.pkl', is_hierarchical=False, seed=97, val_start=70, test_start=85, batch_size=16384, note_embedding_data=None, limit_to_patients_file=None, limit_before_date=None, num_clmbr_tasks=8192)
Traceback (most recent call last):
  File "/home/ethanid/miniconda3/envs/femr_develop2/bin/clmbr_create_batches", line 8, in <module>
    sys.exit(create_batches())
  File "/local-scratch/nigam/projects/ethanid/femr_develop2/src/femr/models/dataloader.py", line 242, in create_batches
    labeled_patients = femr.labelers.load_labeled_patients(args.labeled_patients_path)
  File "/local-scratch/nigam/projects/ethanid/femr_develop2/src/femr/labelers/core.py", line 100, in load_labeled_patients
    r

FileNotFoundError: [Errno 2] No such file or directory: 'trash/tutorial_6/clmbr_reprs'

In [None]:
"""
Train a logistic regression model on the resutling features.
"""

# Data splitting and modeling, see tutorial 4
import femr.datasets
import numpy as np
import xgboost as xgb
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing

database = femr.datasets.PatientDatabase("input/extract")

percent_train = .70
split_seed = 97

hashed_pids = np.array([database.compute_split(split_seed, pid) for pid in reprs["patient_ids"]])
train_pids_idx = np.where(hashed_pids < (percent_train * 100))[0]
test_pids_idx = np.where(hashed_pids >= (percent_train * 100))[0]

X_train, y_train = (
    reprs["data_matrix"][train_pids_idx],
    reprs["label_values"][train_pids_idx],
)
X_test, y_test = reprs["data_matrix"][test_pids_idx], reprs["label_values"][test_pids_idx]


def run_analysis(title: str, y_train, y_train_proba, y_test, y_test_proba):
    print(f"---- {title} ----")
    print("Train:")
    print_metrics(y_train, y_train_proba)
    print("Test:")
    print_metrics(y_test, y_test_proba)

def print_metrics(y_true, y_proba):
    y_pred = y_proba > 0.5
    auroc = sklearn.metrics.roc_auc_score(y_true, y_proba)
    aps = sklearn.metrics.average_precision_score(y_true, y_proba)
    accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
    f1 = sklearn.metrics.f1_score(y_true, y_pred)
    print("\tAUROC:", auroc)
    print("\tAPS:", aps)
    print("\tAccuracy:", accuracy)
    print("\tF1 Score:", f1)


model = sklearn.linear_model.LogisticRegressionCV(penalty="l2", solver="liblinear").fit(X_train, y_train)
y_train_proba = model.predict_proba(X_train)[::, 1]
y_test_proba = model.predict_proba(X_test)[::, 1]
run_analysis("Logistic Regression", y_train, y_train_proba, y_test, y_test_proba)