# Train MOTOR

This tutorial walks through the various steps to train a MOTOR model.

Training MOTOR is a four-step process:

- Training a tokenizer
- Prefitting MOTOR
- Preparing batches
- Training the model

In [1]:
import shutil
import os

# os.environ["HF_DATASETS_CACHE"] = '/share/pi/nigam/zphuo/cache_dir'


# Scratch directory that receives every artifact produced by this tutorial.
TARGET_DIR = 'trash/tutorial_6'

# Start from a clean slate so a re-run never mixes artifacts from a previous
# run with fresh ones.
if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

# makedirs (rather than mkdir) also creates the parent 'trash' directory when
# it does not exist yet; plain mkdir would raise FileNotFoundError in that case.
os.makedirs(TARGET_DIR)

# Number of worker processes handed to the num_proc arguments in this notebook.
num_proc = 8

In [2]:
import datasets
import femr.index
import femr.splits

# First, we want to split our dataset into train, valid, and test.
# We do this by calling our split functionality twice: once on all patients,
# and once more on the patients that landed in the first split's train portion.

dataset = datasets.Dataset.from_parquet('input/meds/data/*')

index = femr.index.PatientIndex(dataset, num_proc=num_proc)
# NOTE(review): 97 (and 87 below) look like hash seeds for the split -- confirm
# against the femr.splits.generate_hash_split documentation.
main_split = femr.splits.generate_hash_split(index.get_patient_ids(), 97, frac_test=0.15)

# Note that we want to save the split to the target directory since this is
# important information.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError the second time the cell is executed.
motor_model_dir = os.path.join(TARGET_DIR, 'motor_model')
os.makedirs(motor_model_dir, exist_ok=True)
main_split.save_to_csv(os.path.join(motor_model_dir, "main_split.csv"))

train_split = femr.splits.generate_hash_split(main_split.train_patient_ids, 87, frac_test=0.15)

print(train_split.train_patient_ids)
print(train_split.test_patient_ids)

main_dataset = main_split.split_dataset(dataset, index)

# The second split operates on the train portion only, so it needs an index
# built over that portion rather than over the full dataset.
train_index = femr.index.PatientIndex(main_dataset['train'], num_proc=num_proc)
train_dataset = train_split.split_dataset(main_dataset['train'], train_index)

print(train_dataset)

Map (num_proc=8):   0%|          | 0/200 [00:00<?, ? examples/s]

[0, 1, 2, 4, 6, 7, 10, 11, 12, 13, 14, 15, 18, 20, 21, 23, 24, 26, 27, 28, 29, 30, 31, 33, 36, 37, 38, 40, 42, 44, 45, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 73, 74, 75, 76, 77, 79, 80, 83, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 102, 103, 104, 105, 107, 109, 110, 112, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126, 127, 128, 133, 134, 135, 136, 137, 139, 141, 142, 143, 144, 149, 150, 151, 152, 153, 154, 156, 157, 158, 159, 160, 161, 162, 163, 165, 166, 168, 169, 171, 172, 173, 174, 178, 181, 182, 183, 184, 185, 186, 187, 189, 192, 193, 195, 196, 197, 198, 199]
[19, 22, 25, 39, 46, 71, 82, 84, 87, 92, 106, 108, 113, 131, 132, 138, 146, 147, 148, 155, 177, 179, 180, 188, 190, 191]


Map (num_proc=8):   0%|          | 0/170 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'events'],
        num_rows: 144
    })
    test: Dataset({
        features: ['patient_id', 'events'],
        num_rows: 26
    })
})


In [3]:
# Display the 'train' entry of main_dataset (produced by main_split.split_dataset).
main_dataset['train']

Dataset({
    features: ['patient_id', 'events'],
    num_rows: 170
})

In [4]:
import femr.models.tokenizer
import pickle

# Step one: train the tokenizer.
# MOTOR needs the hierarchical variant, hence is_hierarchical=True below.

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load an ontology pickle that comes from a trusted source.
ontology_path = 'input/meds/ontology.pkl'
with open(ontology_path, 'rb') as ontology_file:
    ontology = pickle.load(ontology_file)

tokenizer = femr.models.tokenizer.train_tokenizer(
    main_dataset['train'],
    vocab_size=128,
    is_hierarchical=True,
    num_proc=num_proc,
    ontology=ontology,
)

# The tokenizer lives in the same directory as the model.
tokenizer_dir = os.path.join(TARGET_DIR, "motor_model")
tokenizer.save_pretrained(tokenizer_dir)



Map (num_proc=8):   0%|          | 0/170 [00:00<?, ? examples/s]

In [5]:

import femr.models.tasks

# Step two: prefit the MOTOR model. Piecewise exponential models are unstable
# without an initial fit, so this step is required.
motor_task = femr.models.tasks.MOTORTask.fit_pretraining_task_info(
    main_dataset['train'],
    tokenizer,
    num_tasks=64,
    num_bins=4,
    final_layer_size=32,
    num_proc=num_proc,
)

# The fit is an expensive operation; saving motor_task with pickle is
# recommended so that it does not have to be recomputed.


Map (num_proc=8):   0%|          | 0/170 [00:00<?, ? examples/s]

In [6]:
import femr.models.processor
import femr.models.tasks

# Step three: build batches from the patients.
processor = femr.models.processor.FEMRBatchProcessor(tokenizer, motor_task)

# A single patient can be converted and collated by hand ...
print("Convert a single patient")
first_patient = train_dataset['train'][0]
converted_patient = processor.convert_patient(first_patient, tensor_type='pt')
example_batch = processor.collate([converted_patient])

# ... but usually the entire dataset is converted in one call.
print("Convert batches")
train_batches = processor.convert_dataset(
    train_dataset,
    tokens_per_batch=36,
    num_proc=num_proc,
)

# Have the batches returned as pytorch tensors.
print("Convert batches to pytorch")
train_batches.set_format("pt")
print("Done")

Convert a single patient
Convert batches


Map (num_proc=8):   0%|          | 0/144 [00:00<?, ? examples/s]

Creating batches 9


Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=8):   0%|          | 0/26 [00:00<?, ? examples/s]

Creating batches 2


Setting num_proc from 8 to 2 for the train split as it only contains 2 shards.


Generating train split: 0 examples [00:00, ? examples/s]

Convert batches to pytorch
Done


In [7]:
import transformers

# femr.models.transformer is used below, so import it explicitly instead of
# relying on another femr module having loaded it as a side effect.
import femr.models.transformer

# Finally, given the batches, we can train MOTOR.
# We can use huggingface's trainer to do this.

# A deliberately tiny transformer so the tutorial trains quickly.
transformer_config = femr.models.transformer.FEMRTransformerConfig(
    vocab_size=tokenizer.vocab_size,
    is_hierarchical=tokenizer.is_hierarchical,
    n_layers=2,
    hidden_size=64,
    intermediate_size=64*2,
    n_heads=8,
)

# Combine the transformer settings with the task settings from the prefit
# MOTOR task into a single model configuration.
config = femr.models.transformer.FEMRModelConfig.from_transformer_task_configs(
    transformer_config, motor_task.get_task_config())

model = femr.models.transformer.FEMRModel(config)

# The batch processor's collate method is used as the Trainer's data collator.
collator = processor.collate

trainer_config = transformers.TrainingArguments(
    # NOTE(review): presumably 1 because every dataset row is already a packed
    # batch created by processor.convert_dataset -- confirm.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    output_dir='tmp_trainer',
    # Keep every column of the batches; the Trainer would otherwise drop
    # columns it does not recognise from the model's forward signature.
    remove_unused_columns=False,
    num_train_epochs=100,

    # Evaluate and log every 20 optimisation steps.
    eval_steps=20,
    evaluation_strategy="steps",

    logging_steps=20,
    logging_strategy='steps',

    prediction_loss_only=True,
)

trainer = transformers.Trainer(
    model=model,
    data_collator=collator,
    train_dataset=train_batches['train'],
    eval_dataset=train_batches['test'],
    args=trainer_config,
)


2024-02-08 04:22:22.889080: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-08 04:22:29.158993: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /share/pi/nigam/projects/zphuo/cuda/lib64::/home/zphuo/packages/cuda/lib64:/home/zphuo/miniconda3/lib:/home/zphuo/miniconda3/lib:/home/zphuo/miniconda3/bin
2024-02-08 04:22:29.159200: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such fi

In [8]:
# Display the 'transformer' column of the train batches.
# NOTE(review): this renders every batch in the split; index a single entry
# (e.g. [0]) to keep the output short.
train_batches['train']['transformer']

[{'ages': tensor([  0.,   0.,   0.,   0.,   0.,   0., 270., 436.,   0.,   0.,   0.,   0.,
            0.,   0.,   0.,   0.,   0.,  21.,   0.,   0.,   0.,   0.,   0.,   0.,
           82., 188., 273.,   0.,   0.,   0.,  42.,   0.,   0.,   0.]),
  'hierarchical_tokens': tensor([  0,   1,   3,   0,   1,   4, 118,  42,  20,  11,  95,   5,  40,  82,
           28,   6,   0,   1,   4,   0,   1,   4,   0,   1,   4,  17,  12,   5,
           54,  41,  52,  89,  11,   8,  42,  20,  23,  95,  47,  40,  14, 103,
           45,  28, 108,   6, 113,  62, 101,  51,   7,  57,  93,  90,  48,  18,
           82,  91,  79,  58,  27,  84,   0,   1,   4,   0,   1,   4,   7,  19,
           44,  74,  16,  50,   9,   0,   2,   3, 121,   0,   1,   3]),
  'hierarchical_weights': tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.1111, 0.1111,
          0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 1.0000, 1.0000,
          1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.0270,

In [9]:
# Display the label indices stored in the first train batch.
train_batches['train']['transformer'][0]['label_indices']

tensor([ 2,  5,  6,  7, 10, 13, 16, 17, 20, 23, 24, 25, 26, 29, 30, 33])

In [10]:
# Number of entries in 'normalized_ages' for the first train batch.
len(train_batches['train']['transformer'][0]['normalized_ages'])

34

In [24]:
# Display the 'patient_ids' column of the train batches (one entry per batch).
train_batches['train']['patient_ids']

[tensor([189, 189, 189, 187, 187, 187, 187, 187,  63,  63,  63,  94,  94,  94,
          64,  64,  64,  64,  75,  75,  75, 144, 144, 144, 144, 144, 144, 162,
         162, 162, 162, 158, 158, 158]),
 tensor([125, 125, 125, 125, 125, 149, 149, 149, 114, 114, 114,  67,  67,  67,
         199, 199, 199, 199, 199,  45,  45,  45,  23,  23,  23,  23,  23,  23,
          23, 168, 168, 168,  59,  59,  59,  59]),
 tensor([185, 185, 185,   1,   1,   1,   1, 143, 143, 143, 117, 117, 117, 181,
         181, 181, 181,  89,  89,  89,  27,  27,  27,  49,  49,  49, 107, 107,
         107, 107, 150, 150, 150, 193, 193, 193]),
 tensor([128, 128, 128,  37,  37,  37,   7,   7,   7,   7,   7, 174, 174, 174,
         169, 169, 169, 124, 124, 124,  30,  30,  30,  88,  88,  88,  88,  88,
          40,  40,  40, 152, 152, 152]),
 tensor([122, 122, 122, 122, 122, 122,  86,  86,  86,  20,  20,  20,  57,  57,
          57, 153, 153, 153, 192, 192, 192, 192,  83,  83,  83,  83,  83,  83,
          83,  15,  15,  1

In [17]:
# Print the shape (or length) and the contents of every field of the first
# train batch.
# The batch is looked up once; re-indexing the dataset column for every field
# repeats the same lookup for no benefit.
first_batch = train_batches['train']['transformer'][0]

for key, value in first_batch.items():
    # Values with a .shape attribute (tensors) report it; anything else falls
    # back to len(). An explicit attribute check, unlike a bare `except:`,
    # keeps KeyboardInterrupt and genuine errors visible.
    if hasattr(value, 'shape'):
        print(key, value.shape)
    else:
        print(key, len(value))
    print(value)

ages torch.Size([34])
tensor([  0.,   0.,   0.,   0.,   0.,   0., 270., 436.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,  21.,   0.,   0.,   0.,   0.,   0.,   0.,
         82., 188., 273.,   0.,   0.,   0.,  42.,   0.,   0.,   0.])
hierarchical_tokens torch.Size([82])
tensor([  0,   1,   3,   0,   1,   4, 118,  42,  20,  11,  95,   5,  40,  82,
         28,   6,   0,   1,   4,   0,   1,   4,   0,   1,   4,  17,  12,   5,
         54,  41,  52,  89,  11,   8,  42,  20,  23,  95,  47,  40,  14, 103,
         45,  28, 108,   6, 113,  62, 101,  51,   7,  57,  93,  90,  48,  18,
         82,  91,  79,  58,  27,  84,   0,   1,   4,   0,   1,   4,   7,  19,
         44,  74,  16,  50,   9,   0,   2,   3, 121,   0,   1,   3])
hierarchical_weights torch.Size([82])
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.1111, 0.1111,
        0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 0.1111, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,

In [11]:
# Run the training loop, then save the model into the 'motor_model' directory
# under TARGET_DIR.
trainer.train()

trained_model_dir = os.path.join(TARGET_DIR, 'motor_model')
model.save_pretrained(trained_model_dir)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
Unable to read the token file at /var/run/secrets/kubernetes.io/serviceaccount/token due to permission error ([Errno 13] Permission denied: '/var/run/secrets/kubernetes.io/serviceaccount/token').The current user id is 1300074. Consider changing the securityContext to run the container as the current user.
[34m[1mwandb[0m: Currently logged in as: [33mzphuo[0m ([33mstanford_som[0m). Use [1m`wandb login --relogin`[0m to force relogin


label_indices tensor([ 2,  3,  4,  7, 10, 13, 16, 17, 18, 21, 24, 25, 26, 27, 28, 31, 34, 35])
normalized_ages tensor([-1.1204, -1.1204, -1.1204, -0.9399,  0.6878, -1.1204, -1.1204, -1.1204,
        -1.1204, -1.1204, -1.1204, -1.1204, -1.1204, -1.1204, -1.1204, -1.1204,
        -1.1204, -0.8878, -0.8219, -1.1204, -1.1204, -1.1204, -1.1204, -1.1204,
        -1.1204, -0.2284,  0.5941,  1.2569,  2.2460, -1.1204, -1.1204, -1.1204,
        -1.1204, -1.1204, -1.1204, -0.8913])
before in norm torch.Size([36, 64])
after in norm torch.Size([36, 64])
pos_embed torch.Size([36, 1, 8]) torch.Size([36, 1, 8])
x in each layer torch.Size([36, 64])
x in each layer torch.Size([36, 64])
final torch.Size([36, 64])
tensor([ 2,  3,  4,  7, 10, 13, 16, 17, 18, 21, 24, 25, 26, 27, 28, 31, 34, 35])
torch.Size([36, 64])
features torch.Size([18, 64])


NameError: name 'exit' is not defined