# Train MOTOR

This tutorial walks through the various steps to train a MOTOR model.

Training MOTOR is a four-step process:

- Training a tokenizer
- Prefitting MOTOR
- Preparing batches
- Training the model

In [1]:
import shutil
import os

# Optional: redirect the Hugging Face datasets cache to a location with more space.
# os.environ["HF_DATASETS_CACHE"] = '/path/to/cache_dir'

# Scratch directory that receives every artifact written by this tutorial.
TARGET_DIR = 'trash/tutorial_6'

# Start from a clean slate so artifacts from an earlier run cannot leak into this one.
if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

# makedirs (rather than mkdir) also creates the parent 'trash' directory,
# so the notebook runs on a fresh checkout where it does not exist yet.
os.makedirs(TARGET_DIR)

In [2]:
import datasets
import femr.index
import femr.splits

# Before anything else, carve the data into train, valid, and test partitions.
# The hash-based splitter is applied twice: first on all patients, then again
# on the training portion of that first split.

dataset = datasets.Dataset.from_parquet('input/meds/data/*')

index = femr.index.PatientIndex(dataset, num_proc=4)
all_patient_ids = index.get_patient_ids()
main_split = femr.splits.generate_hash_split(all_patient_ids, 97, frac_test=0.15)

# Store the split with the model artifacts in the target directory;
# which patients were held out is important information to keep.
os.mkdir(os.path.join(TARGET_DIR, 'motor_model'))
split_csv_path = os.path.join(TARGET_DIR, "motor_model", "main_split.csv")
main_split.save_to_csv(split_csv_path)

# Second pass: split the training patients of the main split once more.
train_split = femr.splits.generate_hash_split(main_split.train_patient_ids, 87, frac_test=0.15)

print(train_split.train_patient_ids)
print(train_split.test_patient_ids)

# Materialize both splits as datasets. The second split needs its own
# patient index, built over the training portion of the first split.
main_dataset = main_split.split_dataset(dataset, index)
main_train_index = femr.index.PatientIndex(main_dataset['train'], num_proc=4)
train_dataset = train_split.split_dataset(main_dataset['train'], main_train_index)

print(train_dataset)

  from .autonotebook import tqdm as notebook_tqdm
Map (num_proc=4): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1512.26 examples/s]

[0, 1, 2, 4, 6, 7, 10, 11, 12, 13, 14, 15, 18, 20, 21, 23, 24, 26, 27, 28, 29, 30, 31, 33, 36, 37, 38, 40, 42, 44, 45, 47, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 73, 74, 75, 76, 77, 79, 80, 83, 85, 86, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 100, 101, 102, 103, 104, 105, 107, 109, 110, 112, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126, 127, 128, 133, 134, 135, 136, 137, 139, 141, 142, 143, 144, 149, 150, 151, 152, 153, 154, 156, 157, 158, 159, 160, 161, 162, 163, 165, 166, 168, 169, 171, 172, 173, 174, 178, 181, 182, 183, 184, 185, 186, 187, 189, 192, 193, 195, 196, 197, 198, 199]
[19, 22, 25, 39, 46, 71, 82, 84, 87, 92, 106, 108, 113, 131, 132, 138, 146, 147, 148, 155, 177, 179, 180, 188, 190, 191]



Map (num_proc=4): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170/170 [00:00<00:00, 1514.12 examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'events'],
        num_rows: 144
    })
    test: Dataset({
        features: ['patient_id', 'events'],
        num_rows: 26
    })
})





In [3]:
import femr.models.tokenizer
import pickle

# Step one: train the tokenizer.
# MOTOR needs the hierarchical tokenizer, so the ontology is loaded and passed along.

# NOTE: pickle.load can execute arbitrary code; only load an ontology file you trust.
with open('input/meds/ontology.pkl', 'rb') as ontology_file:
    ontology = pickle.load(ontology_file)

tokenizer = femr.models.tokenizer.train_tokenizer(
    main_dataset['train'],
    vocab_size=128,
    is_hierarchical=True,
    num_proc=4,
    ontology=ontology,
)

# The tokenizer is written to the model directory so the two are stored together
tokenizer.save_pretrained(os.path.join(TARGET_DIR, "motor_model"))

Map (num_proc=4): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170/170 [00:00<00:00, 460.40 examples/s]


In [4]:

import femr.models.tasks

# Step two: prefit the MOTOR model.
# Piecewise exponential models are unstable without an initial fit,
# so the pretraining task information is fit before training starts.

motor_task = femr.models.tasks.MOTORTask.fit_pretraining_task_info(
    main_dataset['train'],
    tokenizer,
    num_tasks=64,
    num_bins=4,
    final_layer_size=32,
    num_proc=4,
)

# This is an expensive operation; saving motor_task with pickle is recommended to avoid recomputing it


Map (num_proc=4): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170/170 [00:00<00:00, 414.49 examples/s]


In [5]:
import femr.models.processor
import femr.models.tasks

# Step three: build the batches the model will consume.

processor = femr.models.processor.FEMRBatchProcessor(tokenizer, motor_task)

# A single patient can be converted and collated on its own ...
print("Convert a single patient")
first_patient = train_dataset['train'][0]
converted_patient = processor.convert_patient(first_patient, tensor_type='pt')
example_batch = processor.collate([converted_patient])

# ... but usually the whole dataset is converted in one call
print("Convert batches")
train_batches = processor.convert_dataset(train_dataset, tokens_per_batch=32, num_proc=4)

# Switch the batch dataset over to pytorch tensors
print("Convert batches to pytorch")
train_batches.set_format("pt")
print("Done")

Convert a single patient
Convert batches


Map (num_proc=4): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 144/144 [00:00<00:00, 348.22 examples/s]


Creating batches 7


Generating train split: 7 examples [00:00, 11.81 examples/s]
Map (num_proc=4): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 68.03 examples/s]


Creating batches 1


Setting num_proc from 4 back to 1 for the train split to disable multiprocessing as it only contains one shard.
Generating train split: 1 examples [00:00, 76.13 examples/s]

Convert batches to pytorch
Done





In [6]:
import transformers

import femr.models.transformer

# Finally, given the batches, we can train MOTOR.
# We can use huggingface's trainer to do this.

# femr.models.config is referenced below, so import it explicitly instead of
# relying on another femr module having loaded it as a side effect.
import femr.models.config

# A deliberately tiny transformer so the tutorial trains quickly.
transformer_config = femr.models.config.FEMRTransformerConfig(
    vocab_size=tokenizer.vocab_size,
    is_hierarchical=tokenizer.is_hierarchical,
    n_layers=2,
    hidden_size=64,
    intermediate_size=64*2,
    n_heads=8,
)

# Combine the transformer settings with the task settings from the prefit MOTOR task.
config = femr.models.config.FEMRModelConfig.from_transformer_task_configs(transformer_config, motor_task.get_task_config())

model = femr.models.transformer.FEMRModel(config)

collator = processor.collate

trainer_config = transformers.TrainingArguments(
    # Each dataset row was already assembled into a batch by processor.convert_dataset,
    # so the trainer is given one row per step.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # Keep checkpoints inside TARGET_DIR, which is wiped at the top of the notebook.
    # A fixed directory outside it keeps checkpoints from earlier runs and triggers
    # "Checkpoint destination directory ... already exists and is non-empty".
    output_dir=os.path.join(TARGET_DIR, 'tmp_trainer'),
    # The batch columns must reach the collator untouched.
    remove_unused_columns=False,
    num_train_epochs=100,

    eval_steps=20,
    # NOTE(review): newer transformers releases rename this to `eval_strategy` -
    # confirm against the installed version before changing it.
    evaluation_strategy="steps",

    logging_steps=20,
    logging_strategy='steps',

    prediction_loss_only=True,
)

trainer = transformers.Trainer(
    model=model,
    data_collator=collator,
    train_dataset=train_batches['train'],
    eval_dataset=train_batches['test'],
    args=trainer_config,
)

trainer.train()

# Store the trained weights in the model directory under TARGET_DIR.
model.save_pretrained(os.path.join(TARGET_DIR, 'motor_model'))

2024-02-15 01:17:39.969569: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-15 01:17:39.969629: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-15 01:17:39.971090: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-15 01:17:39.981133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Could not estimate the number of tokens of the input,

Step,Training Loss,Validation Loss
20,0.824,0.506734
40,0.7903,0.506806
60,0.8589,0.50689
80,0.7896,0.506971
100,0.8282,0.507057
120,0.7872,0.507141
140,0.8239,0.507234
160,0.8217,0.507326
180,0.8219,0.507422
200,0.7935,0.50751


Checkpoint destination directory tmp_trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
