# Train MOTOR

This tutorial walks through the various steps to train a MOTOR model.

Training MOTOR is a four-step process:

- Training a tokenizer
- Prefitting MOTOR
- Preparing batches
- Training the model

In [1]:
import shutil
import os

# Scratch directory that receives every artifact written by this tutorial.
TARGET_DIR = 'trash/tutorial_4'

# Start from a clean slate so a re-run never picks up files left over from a previous run.
if os.path.exists(TARGET_DIR):
    shutil.rmtree(TARGET_DIR)

# os.makedirs (instead of os.mkdir) also creates the parent 'trash' directory when it is
# missing, so this cell works on a fresh checkout where 'trash' does not exist yet.
os.makedirs(TARGET_DIR)

In [2]:
import meds_reader
import femr.splits

# Carve the dataset into train, valid, and test partitions.
# The hash split is applied twice: once to hold out the test subjects,
# and a second time on the remaining subjects to hold out a validation set.

database = meds_reader.SubjectDatabase('input/synthetic_meds')
all_subject_ids = list(database)

main_split = femr.splits.generate_hash_split(
    all_subject_ids,
    97,
    frac_test=0.15,
)

# The split is important information, so it is stored in the target directory next to the model
model_dir = os.path.join(TARGET_DIR, 'motor_model')
os.mkdir(model_dir)
main_split.save_to_csv(os.path.join(model_dir, "main_split.csv"))

train_split = femr.splits.generate_hash_split(
    main_split.train_subject_ids,
    87,
    frac_test=0.15,
)

# Subjects outside of the held-out test portion of the main split ...
main_database = database.filter(main_split.train_subject_ids)
# ... are divided once more into the training and validation databases
train_database = main_database.filter(train_split.train_subject_ids)
val_database = main_database.filter(train_split.test_subject_ids)


In [3]:
import femr.models.tokenizer
import pickle

# First, we need to train a tokenizer
# Note, we need to use a hierarchical tokenizer for MOTOR

# SECURITY NOTE: pickle.load can execute arbitrary code; only load an ontology file that comes from a trusted source
with open('input/ontology.pkl', 'rb') as f:
    ontology = pickle.load(f)

# NOTE: The tokenizer is trained with a vocab size of 1024 * 16 = 16384.
# NOTE(review): this comment previously described a vocab size of 128 (chosen to keep the tutorial quick),
#               which did not match the value passed below -- confirm that 16384 is the intended setting.
# NOTE: Normally you would train the tokenizer on only the train database, but for such a tiny dataset that's not enough
# NOTE: Normally min_fraction should be set higher, to 1e-4, but a small min fraction is needed here to get enough codes
tokenizer = femr.models.tokenizer.HierarchicalTokenizer.train(
    database,
    vocab_size=1024 * 16,
    ontology=ontology,
    min_fraction=1e-9,
)

# Save the tokenizer to the same directory as the model
tokenizer.save_pretrained(os.path.join(TARGET_DIR, "motor_model"))

  from .autonotebook import tqdm as notebook_tqdm


In [4]:

import femr.models.tasks

# Step two: prefit the MOTOR model.
# Piecewise exponential models are unstable without an initial fit, which is why this step is necessary.
# min_fraction would normally be set higher (1e-4); it is kept tiny here so that enough codes are available.

motor_task = femr.models.tasks.MOTORTask.fit_pretraining_task_info(
    train_database,
    tokenizer,
    num_tasks=2048,
    num_bins=4,
    final_layer_size=32,
    min_fraction=1e-9,
)

# This is an expensive operation, so saving the result with pickle is recommended to avoid recomputing it

In [5]:
import femr.models.processor
import femr.models.tasks

# Step three: create the batches.

processor = femr.models.processor.FEMRBatchProcessor(tokenizer, motor_task)

# Batches can be built by hand, one subject at a time ...
example_subject_id = list(train_database)[0]
example_subject = train_database[example_subject_id]

print("Convert a single subject")
example_item = processor.convert_subject(example_subject, tensor_type='pt')
example_batch = processor.collate([example_item])

# ... but generally an entire dataset is converted in one go.
# The same conversion settings are shared by the train and validation conversions below.
batching_options = dict(tokens_per_batch=32, num_proc=4)

print("Convert batches")
train_batches = processor.convert_dataset(train_database, **batching_options)

print("Convert batches to pytorch")
# Switch the batches over to pytorch tensors
train_batches.set_format("pt")
print("Done")

# Repeat the conversion for the validation subjects, again switching to pytorch tensors
val_batches = processor.convert_dataset(val_database, **batching_options)
val_batches.set_format("pt")

Convert a single subject
Convert batches
Got batches 46


Generating train split: 46 examples [00:00, 1850.48 examples/s]


Convert batches to pytorch
Done
Got batches 9


Generating train split: 9 examples [00:00, 1676.98 examples/s]


In [6]:
import transformers

import femr.models.transformer

# Finally, given the batches, we can train MOTOR.
# We can use huggingface's trainer to do this.

# Imported explicitly because this cell refers to femr.models.config by name;
# previously it was only reachable as a side effect of importing femr.models.transformer.
import femr.models.config

# NOTE: the sizes below (2 layers, hidden size 64) are very small values used by this tutorial
transformer_config = femr.models.config.FEMRTransformerConfig(
    vocab_size=tokenizer.vocab_size,
    is_hierarchical=True,  # matches the hierarchical tokenizer trained earlier in this notebook
    use_normed_ages=True,
    use_bias=False,
    hidden_act='swiglu',
    n_layers=2,
    hidden_size=64,
    intermediate_size=64*2,
    n_heads=8,
)

# Combine the transformer settings with the task settings derived from the prefit MOTOR task
config = femr.models.config.FEMRModelConfig.from_transformer_task_configs(transformer_config, motor_task.get_task_config())

model = femr.models.transformer.FEMRModel(config)

# The batch processor's collate function is handed to the trainer as its data collator
collator = processor.collate

trainer_config = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # NOTE(review): this directory is relative to the working directory and lies outside TARGET_DIR,
    # so the cleanup at the top of the notebook does not remove it -- confirm this is intended
    output_dir='tmp_trainer',
    remove_unused_columns=False,
    num_train_epochs=4,

    # Evaluate on the validation batches every 20 steps
    eval_steps=20,
    eval_strategy="steps",

    # Log the training loss every 20 steps
    logging_steps=20,
    logging_strategy='steps',

    prediction_loss_only=True,
)

trainer = transformers.Trainer(
    model=model,
    data_collator=collator,
    train_dataset=train_batches,
    eval_dataset=val_batches,
    args=trainer_config,
)

trainer.train()

# Store the trained weights in the same directory as the tokenizer and the split file
model.save_pretrained(os.path.join(TARGET_DIR, 'motor_model'))

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss
20,0.0391,0.031973
40,0.0445,0.031971
60,0.0475,0.031969
80,0.0376,0.031968
100,0.0364,0.031967
120,0.0343,0.031967
140,0.0475,0.031966
160,0.0496,0.031966
180,0.0337,0.031966
