In [1]:
# os.environ["WANDB_DISABLED"] = "true"

In [2]:
# import logging
# log = logging.getLogger()
# log.handlers.clear()
# log.addHandler(logging.StreamHandler())
# log.setLevel(logging.WARNING)

In [3]:
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# from medcat.cat import CAT
# from foresight.models.lucid_transformers import LucidLM2HF
from transformers import SchedulerType, Trainer, TrainingArguments

# from medcat.cdb import CDB
from foresight.datasets.data_collator import CollataAndPad
from foresight.datasets.data_collator_v2 import (
    DataCollatorForLanguageModelingMaskStaticVariables,
)
from foresight.metrics.next_concept_prediction import (
    ComputePrecisionHF,
    metrics_data2df,
    precision,
)
from foresight.models.custom_GPT2 import CustomGPT2Config, CustomGPT2LMHeadModel
from foresight.tokenizers import PreTrainedTokenizerFastWithPositionIDPadding
from foresight.tokenizers.simple_map_tokenizer import SimpleMapTokenizer
from foresight.utils import pickle
from foresight.metrics.timeline import TimelineMetrics


In [4]:
import time
import math
import datasets
from torch.utils.data import DataLoader

In [5]:
OUTPUT_DIR = Path.cwd() / "outputs"
SAVE_TOKENIZER_PATH = OUTPUT_DIR / "tokenizer"
SAVE_ENCODED_DATASET_PATH = OUTPUT_DIR / "encoded_dataset"
MODEL_LOGS_DIR = OUTPUT_DIR / "model_logs" / time.strftime("%Y_%m_%d_%H_%M_%S")
FINAL_MODEL_DIR = MODEL_LOGS_DIR / "final_model"


NUM_STATIC_VARIABLES = 4

In [6]:
encoded_dataset = datasets.load_from_disk(SAVE_ENCODED_DATASET_PATH)
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [7]:
tokenizer = PreTrainedTokenizerFastWithPositionIDPadding.from_pretrained(
    SAVE_TOKENIZER_PATH
)
training_data_collator = DataCollatorForLanguageModelingMaskStaticVariables(
    tokenizer=tokenizer, mlm=False, num_static_variables=NUM_STATIC_VARIABLES
)

# Create GPT2

In [8]:
max_sequence_length = math.ceil(max(len(sample["input_ids"]) for sample in encoded_dataset["train"]) * 1.2)

# Make a new model
config = CustomGPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=max_sequence_length,
    n_embd=16,
    n_layer=4,
    n_head=4,
    bos_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.pad_token_id,
    sep_token_id=tokenizer.sep_token_id,
)
model = CustomGPT2LMHeadModel(config)
model.generation_config.max_length = max_sequence_length
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 1,
  "max_length": 100,
  "pad_token_id": 1
}

In [9]:
trial_dataset = DataLoader(
    encoded_dataset["train"],
    batch_size=8,
    shuffle=False,
    collate_fn=training_data_collator,
)
batch = next(iter(trial_dataset))
# model(**{k:v for k, v in batch.items()}).logits.shape

# Trainer

In [10]:
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33msimon_ellershaw[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
import os
os.environ["WANDB_LOG_MODEL"] = "end"

wandb.init(
    project="nhs_foresight_dummy_experiments",
    config = config.to_dict(),
    name = MODEL_LOGS_DIR.stem,
)

In [12]:
MODEL_LOGS_DIR.mkdir(parents=True, exist_ok=True)

training_args = TrainingArguments(
    output_dir=MODEL_LOGS_DIR,  # output directory
    num_train_epochs=5,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,  # batch size for evaluation
    # weight_decay=1e-2,               # strength of weight decay
    # logging_dir='./logs',            # directory for storing logs
    # warmup_ratio=0.01,
    learning_rate=2e-03,
    # eval_accumulation_steps=1,
    # gradient_accumulation_steps=16,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_loss",
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    # lr_scheduler_type=SchedulerType.LINEAR,
    # use_cpu=True
    report_to="wandb",
)

In [13]:
timeline_metrics = TimelineMetrics(tokenizer)
compute_metrics = lambda eval_preds: timeline_metrics.batch_compute_precision_recall_f1(
    eval_preds, batch_size = 100
)

trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=encoded_dataset["train"],  # training dataset
    eval_dataset=encoded_dataset["test"],  # evaluation dataset
    compute_metrics=compute_metrics,
    data_collator=training_data_collator,
    # prediction_loss_only=True
    # tokenizer=None,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
trainer.train()
wandb.finish()
trainer.save_model(FINAL_MODEL_DIR)

Epoch,Training Loss,Validation Loss


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


VBox(children=(Label(value='0.071 MB of 0.071 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1,▁▇███
eval/loss,█▃▂▁▁
eval/num_samples,▁▁▁▁▁
eval/precision,▁▇███
eval/recall,▁▆███
eval/runtime,▁█▁█▁
eval/samples_per_second,█▁▇▁█
eval/steps_per_second,█▁▇▁█
train/epoch,▁▁▃▃▄▅▆▆▇██
train/global_step,▁▁▃▃▄▅▆▆▇██

0,1
eval/f1,0.9787
eval/loss,0.05213
eval/num_samples,8631.0
eval/precision,0.95829
eval/recall,1.0
eval/runtime,0.6378
eval/samples_per_second,1567.873
eval/steps_per_second,98.776
train/epoch,5.0
train/global_step,2815.0


In [15]:
model = CustomGPT2LMHeadModel.from_pretrained(FINAL_MODEL_DIR)
model.to("cuda")

CustomGPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(53, 16)
    (wpe): Embedding(100, 16)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-3): 4 x GPT2Block(
        (ln_1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=16, out_features=53, bias=False)
)

In [16]:
tokenizer.padding_side = "left"
inference_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = inference_data_collator(
    encoded_dataset["test"][:2],
)
batch = {k: v.to("cuda") for k, v in batch.items()}

In [17]:
output_ids = model.generate(**batch).cpu()
output_ids

tensor([[27, 28, 47,  2,  3, 25,  2,  3, 24,  2,  3, 23,  2,  3, 22,  2,  3, 21,
          2,  3, 18,  2,  3,  7,  2,  3, 15,  2,  3, 19,  2,  3, 13,  2,  3, 12,
          2,  3,  9,  2,  3, 17,  2,  3, 16,  2,  3, 10,  2,  3,  8,  2,  3, 20,
          2,  3, 14,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,
          6,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,  6,  2,
          3, 11,  6,  2,  3, 11,  6,  2,  3, 11],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  1,  1,  1,  1,  1,  1, 32, 28, 51,  2,  3, 21, 18,  2,  3,  7,
         15,  2,  3, 19, 13,  2,  3, 12,  9,  2,  3, 17, 16,  2,  3, 10,  8,  2,
          3, 20, 14,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,
          6,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,  6,  2,  3, 11,  6,  2,
          3, 11,  6,  2,  3, 11,  6,  2,  3, 11]])

In [18]:
output_tokens = [
    [
        token
        for token in tokenizer.convert_ids_to_tokens(ids)
        if token != tokenizer.pad_token
    ]
    for ids in output_ids
]
output_tokens

[['num_samples_1',
  'num_blanks_0',
  'start_idx_7',
  '<SEP>',
  'char_diff_0',
  'H',
  '<SEP>',
  'char_diff_0',
  'I',
  '<SEP>',
  'char_diff_0',
  'J',
  '<SEP>',
  'char_diff_0',
  'K',
  '<SEP>',
  'char_diff_0',
  'L',
  '<SEP>',
  'char_diff_0',
  'M',
  '<SEP>',
  'char_diff_0',
  'N',
  '<SEP>',
  'char_diff_0',
  'O',
  '<SEP>',
  'char_diff_0',
  'P',
  '<SEP>',
  'char_diff_0',
  'Q',
  '<SEP>',
  'char_diff_0',
  'R',
  '<SEP>',
  'char_diff_0',
  'S',
  '<SEP>',
  'char_diff_0',
  'T',
  '<SEP>',
  'char_diff_0',
  'U',
  '<SEP>',
  'char_diff_0',
  'V',
  '<SEP>',
  'char_diff_0',
  'W',
  '<SEP>',
  'char_diff_0',
  'X',
  '<SEP>',
  'char_diff_0',
  'Y',
  '<SEP>',
  'char_diff_0',
  'Z',
  '<EOS>',
  '<SEP>',
  'char_diff_0',
  'Z',
  '<EOS>',
  '<SEP>',
  'char_diff_0',
  'Z',
  '<EOS>',
  '<SEP>',
  'char_diff_0',
  'Z',
  '<EOS>',
  '<SEP>',
  'char_diff_0',
  'Z',
  '<EOS>',
  '<SEP>',
  'char_diff_0',
  'Z',
  '<EOS>',
  '<SEP>',
  'char_diff_0',
  'Z',
  '<E

# Hyperparameter search

In [19]:
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining

ModuleNotFoundError: No module named 'ray'

In [None]:
compute_metrics = ComputePrecisionHF(
    id2tkn, id2type, prediction_scope="age", topk=1, start=0, batch_size=2000
)

In [None]:
NUM_TRIALS = 20
N_GPU_PER_TRIAL = 1
METRIC_TO_OPTIMIZE = "eval_precision"

In [None]:
def get_model(params):
    torch.cuda.empty_cache()
    if params is None:
        params = {}

    config = GPT2Config(
        vocab_size=len(embeddings),
        n_positions=MAX_SEQ_LEN + 1,
        n_ctx=MAX_SEQ_LEN + 1,
        n_embd=params.get("n_embd", 300),
        n_layer=params.get("n_layer", 1),
        n_head=params.get("n_head", 1),
        bos_token_id=tkn2id["<PAD>"],
        eos_token_id=tkn2id["<PAD>"],
    )
    model = GPT2LMHeadModel(config)

    if params.get("load_weights", 0):
        model.transformer.wte.load_state_dict(
            {"weight": torch.tensor(embeddings, dtype=torch.float32)}
        )
        model.transformer.wte.weight.requires_grad = True

    return model

In [None]:
training_args = TrainingArguments(
    output_dir="./results",  # output directory
    num_train_epochs=5,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=128,  # batch size for evaluation
    weight_decay=0.01,  # strength of weight decay
    logging_dir="./logs",  # directory for storing logs
    logging_steps=200,
    eval_steps=200,
    learning_rate=5e-5,
    eval_accumulation_steps=1,
    do_eval=True,
    evaluation_strategy="steps",
    skip_memory_metrics=True,
)

In [None]:
training_args.n_head = 1
training_args.n_layer = 1
training_args.n_embd = 300
training_args.load_weights = 0

In [None]:
tune_dataset = encoded_dataset["train"].train_test_split(test_size=0.1)

In [None]:
tune_train_dataset = tune_dataset["train"]
tune_test_dataset = tune_dataset["test"]

In [None]:
trainer = Trainer(
    #    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=tune_train_dataset,  # training dataset
    eval_dataset=tune_test_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
    tokenizer=None,
    model_init=get_model,
)

In [None]:
tune_config = {
    "num_train_epochs": tune.choice([5]),
    "n_head": tune.choice([2, 4, 6]),
}
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric=METRIC_TO_OPTIMIZE,
    mode="max",
    perturbation_interval=1,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "per_device_train_batch_size": [16, 32, 64, 128],
        "n_layer": tune.choice([2, 4, 6, 8]),
        #       "n_embd": tune.choice([256, 512]),
        "load_weights": tune.choice([0, 1]),
        "warmup_steps": tune.choice([20, 40, 60, 100]),
    },
)

In [None]:
import copy


def compute_objective(metrics):
    metrics = copy.deepcopy(metrics)
    eval_precision = metrics.pop("eval_precision")

    return eval_precision

In [None]:
best_model = trainer.hyperparameter_search(
    hp_space=lambda _: tune_config,
    backend="ray",
    n_trials=NUM_TRIALS,
    direction="maximize",
    compute_objective=compute_objective,
    resources_per_trial={"cpu": 1, "gpu": N_GPU_PER_TRIAL},
    scheduler=scheduler,
    keep_checkpoints_num=1,
    checkpoint_score_attr=METRIC_TO_OPTIMIZE,
    stop=None,
    local_dir=RESULTS_HYPERPARAM,
    name="21_May_2021",
    log_to_file=False,
    loggers=None,  # (WandbLogger, ),
)

In [None]:
best_model

# Saliency 

In [None]:
import ecco

In [None]:
lm = ecco.LM(trainer.model, tokenizer, model_name="gpt2")

In [None]:
ind = 49
print(
    "~~".join(
        [tokenizer.id2tkn[id] for id in encoded_dataset["test"][ind]["input_ids"]]
    )
)
text = "~~".join(
    [tokenizer.id2tkn[id] for id in encoded_dataset["test"][ind]["input_ids"][1:-1]]
)

In [None]:
output = lm.generate(text, generate=10, do_sample=True, temperature=1)

In [None]:
output.saliency(style="detailed")

# Probability prediction

In [None]:
from foresight.sight import Sight

In [None]:
_ = model.eval()

In [None]:
sight = Sight(tokenizer=tokenizer, device="cuda", model=model)

In [None]:
cdb.name2cuis["muscle~pain"]

In [None]:
cdb.get_name("pain")

In [None]:
text = "<ETHNICITY>~~White~~<SEX>~~Male~~<AGE>~~23~~49727002~~386661006".split("~~")

In [None]:
# Small with WD
r = sight.next_concepts(
    text, type_ids=["T-11"], n=40, p_new=True, create_position_ids=False
)
print([cdb.get_name(x) for x in text])
for x in r:
    print(x[0], x[1], cdb.get_name(x[0]))