In [None]:
!pip3 install femr

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pickle
import femr
import datetime
import numpy as np
import pandas as pd
from typing import List, Dict

# Using APLUS ML with STARR OMOP

This tutorial shows how to load STARR OMOP data into APLUS ML.

It uses the [FEMR Python package](https://github.com/som-shahlab/femr) to load the STARR OMOP dataset (i.e., we assume you already have a FEMR extract prepared; if not, please refer to the FEMR repo for instructions).

Then, we create a dataframe of patient features and labels.

Finally, we save the dataframe to a CSV in the format expected by the corresponding APLUS ML tutorial, `pad.ipynb`.

## Load PAD cohort from STARR-OMOP using FEMR

In [None]:
# Open the pre-built FEMR extract as a patient database and fetch its ontology.
# Both `femr_database` and `ontology` are reused by later cells.
path_to_femr_extract: str = '/local-scratch/nigam/projects/ethanid/som-rit-phi-starr-prod.starr_omop_cdm5_deid_2023_02_08_extract_v8_no_observation'
femr_database = femr.PatientDatabase(path_to_femr_extract)
ontology = femr_database.get_ontology()

# Sanity check: report how many patients the extract contains
num_patients_in_database: int = len(femr_database)
print("# of patients in our database: ", num_patients_in_database)

In [None]:
# Create FEMR patient cohort using a custom Labeler
from femr.labelers.omop import OMOPConceptCodeLabeler, get_outpatient_visit_codes
from femr.labelers.core import TimeHorizon

class PADLabeler(OMOPConceptCodeLabeler):
    """Labeler for the PAD cohort.

    A thin subclass of `OMOPConceptCodeLabeler` that only supplies the list of
    ICD-9 / ICD-10 concept codes used to define the PAD outcome
    (PAD presumably stands for peripheral artery disease -- confirm).
    All labeling logic is inherited from the base class.
    """

    # ICD-9 / ICD-10 concept codes that define the PAD outcome.
    original_omop_concept_codes_pad: List[str] = [
        # ICD-9
        "ICD9/250.7",
        "ICD9/440.0",
        "ICD9/440.2",
        "ICD9/440.3",
        "ICD9/443.9",
        "ICD9/444.22",
        "ICD9/444.8",
        "ICD9/445.02",
        "ICD9/447.1",
        # ICD-10
        "ICD10/E08.51",
        "ICD10/E08.52",
        "ICD10/E10.5",
        "ICD10/E11.5",
        "ICD10/E13.5",
        "ICD10/I70.0",
        "ICD10/I70.2",
        "ICD10/I70.3",
        "ICD10/I70.4",
        "ICD10/I70.5",
        "ICD10/I70.6",
        "ICD10/I70.7",
        "ICD10/I70.9",
        "ICD10/I73.9",
        "ICD10/I74",
        "ICD10/I75.0",
        "ICD10/I77.1",
    ]

    # NOTE(review): FEMR's `OMOPConceptCodeLabeler` appears to read its outcome
    # codes from a class attribute named `original_omop_concept_codes` (verify
    # against the installed FEMR version). Without this alias the `_pad`-suffixed
    # list above would be ignored and the labeler would run with no outcome
    # codes. The original attribute is kept for backward compatibility.
    original_omop_concept_codes: List[str] = original_omop_concept_codes_pad

# Identify patients with PAD
# Codes returned by `get_outpatient_visit_codes()` are passed to the labeler as
# its prediction codes.
prediction_codes = get_outpatient_visit_codes()
# NOTE(review): `None` as the end of the horizon presumably means "unbounded";
# also confirm that `TimeHorizon` accepts the integer 0 here rather than
# requiring a `datetime.timedelta`.
time_horizon = TimeHorizon(0, None)
labeler = PADLabeler(ontology, time_horizon, prediction_codes)
# NOTE: This line takes a while to run on the full 100% STARR-OMOP extract
labeled_patients = labeler.apply(
    path_to_patient_database=path_to_femr_extract,
    num_threads=20,
)
# Cache the labels on disk so the slow labeling step does not have to be
# repeated. The `with` block ensures the file is flushed and closed.
with open('labeled_patients.pkl', 'wb') as labels_file:
    pickle.dump(labeled_patients, labels_file)

# Format dataset for APLUS ML

In [None]:
# Reload the cached labels written by the labeling cell.
# NOTE: `pickle.load` can execute arbitrary code; only load a
# `labeled_patients.pkl` that this notebook produced, never an untrusted file.
with open('labeled_patients.pkl', 'rb') as labels_file:
    labeled_patients = pickle.load(labels_file)

# Visits at which the patient is younger than this are dropped (50 * 365 days)
min_age_at_visit: datetime.timedelta = datetime.timedelta(days=365*50)

# One dict per (patient, label) pair; turned into a dataframe in one go below
patients: List[Dict] = []

for patient_id, labels in labeled_patients.items():
    if not labels:
        continue
    # Look the patient up once instead of once per field per label
    patient = femr_database[patient_id]
    # The start of the patient's first event is used as the birth datetime
    birth_datetime = patient.events[0].start
    num_events: int = len(patient.events)
    for label in labels:
        patient_age_at_visit: datetime.timedelta = label.time - birth_datetime
        # Filter out patients < 50 years old
        if patient_age_at_visit < min_age_at_visit:
            continue
        patients.append({
            'id' : patient_id,
            'y' : label.value,
            'visit_datetime' : label.time,
            'birth_datetime' : birth_datetime,
            # Feature 1: total number of events recorded for the patient
            'x_1' : num_events,
            # Feature 2: age at the visit, as a float number of days. A raw
            # timedelta would be written to the CSV as text (e.g. "18250 days")
            # and could not be used as a numeric model feature after reload.
            'x_2' : patient_age_at_visit / datetime.timedelta(days=1),
        })

# Create patients
df = pd.DataFrame(patients)

# Save to CSV
df.to_csv('./ignore/secure/femr_pad_inputs.csv')

## Run patients through model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load patients. The CSV was written with the dataframe index as its first
# column, so restore it as the index instead of picking up an extra
# "Unnamed: 0" column on every read/write cycle.
df = pd.read_csv('./ignore/secure/femr_pad_inputs.csv', index_col=0)

# `x_2` may have been saved as a timedelta string (e.g. "18250 days 00:00:00"),
# which scikit-learn cannot consume. Convert it to a float number of days;
# already-numeric values are left untouched.
if not pd.api.types.is_numeric_dtype(df['x_2']):
    df['x_2'] = pd.to_timedelta(df['x_2']).dt.total_seconds() / 86400

# Run simple logistic regression on features to get model predictions
X = df[['x_1', 'x_2']]
y = df['y']
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(random_state=0).fit(train_X, train_Y)
# `predict_proba` takes features (not labels) and returns one column per class.
# Score every row of `X` so the result lines up with `df`, and keep column 1,
# the probability of the second (positive) class in `clf.classes_`.
y_hat = clf.predict_proba(X)[:, 1]

# Model predictions
df['y_hat'] = y_hat

# Seed NumPy's global RNG so the simulated columns below are reproducible
np.random.seed(0)

# ABI test prediction: simulated value drawn from N(0.65, 0.15) when y is 1
# and from N(1.09, 0.11) when y is 0
abi_mean = 0.65 * df['y'] + (1 - df['y']) * 1.09
abi_std = 0.15 * df['y'] + (1 - df['y']) * 0.11
df['abi_test_pred'] = np.random.normal(abi_mean, abi_std)

# Random patient-level resource priority: a random permutation of 0..n-1
df['random_resource_priority'] = np.random.choice(range(df.shape[0]), replace=False, size=df.shape[0])

# Overwrite the input CSV with the added prediction columns
# (`to_csv` is a DataFrame method; `pd.to_csv` does not exist)
df.to_csv('./ignore/secure/femr_pad_inputs.csv')

## Run APLUS ML

See `pad.ipynb` notebook, feeding `./ignore/secure/femr_pad_inputs.csv` as the input CSV.