In [None]:
import logging

# Reconfigure the root logger: WARNING and above only, emitted through a
# single fresh StreamHandler.
log = logging.getLogger()
log.setLevel(logging.WARNING)
del log.handlers[:]  # drop whatever handlers were attached before this cell ran
log.addHandler(logging.StreamHandler())

In [None]:
import copy
from pathlib import Path
from typing import Any

import datasets
import numpy as np
import pandas as pd
import ray
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import Dataset
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining
from torch.utils.data import DataLoader

# from medcat.cat import CAT
# from foresight.models.lucid_transformers import LucidLM2HF
from transformers import (
    DataCollatorWithPadding,
    SchedulerType,
    Trainer,
    TrainingArguments,
)

# from medcat.cdb import CDB
from foresight.datasets.data_collator import CollataAndPad
from foresight.datasets.data_collator_v2 import (
    DataCollatorForLanguageModelingMaskStaticVariables,
)
from foresight.metrics.next_concept_prediction import (
    ComputePrecisionHF,
    metrics_data2df,
    precision,
)
from foresight.metrics.timeline import TimelineMetrics
from foresight.models.custom_GPT2 import CustomGPT2Config, CustomGPT2LMHeadModel
from foresight.tokenizers import PreTrainedTokenizerFastWithPositionIDPadding
from foresight.tokenizers.simple_map_tokenizer import SimpleMapTokenizer
from foresight.utils import pickle

In [None]:
import math
import time

import datasets
from torch.utils.data import DataLoader

In [None]:
# Root directory for the tokenizer, the encoded dataset and the model logs.
OUTPUT_DIR = Path.cwd().joinpath("outputs")
SAVE_TOKENIZER_PATH = OUTPUT_DIR.joinpath("tokenizer")
SAVE_ENCODED_DATASET_PATH = OUTPUT_DIR.joinpath("encoded_dataset")

# Every execution of this cell produces its own timestamped log directory,
# which is created immediately (parents included).
MODEL_LOGS_DIR = OUTPUT_DIR.joinpath("model_logs", time.strftime("%Y_%m_%d_%H_%M_%S"))
FINAL_MODEL_DIR = MODEL_LOGS_DIR.joinpath("final_model")
MODEL_LOGS_DIR.mkdir(parents=True, exist_ok=True)

# Passed to the training data collator as `num_static_variables`.
NUM_STATIC_VARIABLES = 4

In [None]:
# Load the dataset stored at SAVE_ENCODED_DATASET_PATH; the bare expression on
# the last line makes the notebook display it.
encoded_dataset = datasets.load_from_disk(dataset_path=SAVE_ENCODED_DATASET_PATH)
encoded_dataset

In [None]:
# Restore the tokenizer stored under SAVE_TOKENIZER_PATH.
tokenizer = PreTrainedTokenizerFastWithPositionIDPadding.from_pretrained(
    SAVE_TOKENIZER_PATH
)

# Collator used to assemble training/evaluation batches for the Trainer.
training_data_collator = DataCollatorForLanguageModelingMaskStaticVariables(
    tokenizer=tokenizer,
    num_static_variables=NUM_STATIC_VARIABLES,
    mlm=False,
)

# Create GPT2

In [None]:
# "n_layer_and_heads": tune.choice([2, 4, 8, 16, 32, 64]),
# "embed_dim": tune.choice([256, 512]),


def get_model(
    params: dict[str, Any],
    tokenizer: PreTrainedTokenizerFastWithPositionIDPadding,
    max_sequence_length: int,
):
    """Build a freshly initialised ``CustomGPT2LMHeadModel``.

    Args:
        params: Hyperparameter mapping. ``None`` is accepted and treated as an
            empty mapping. Only the key ``"n_layer_and_heads"`` is read
            (default 4); it sets both ``n_layer`` and ``n_head``.
        tokenizer: Supplies ``vocab_size``, ``sep_token_id`` and
            ``pad_token_id``; the pad id is also used as the BOS and EOS id.
        max_sequence_length: Used for both ``n_positions`` and ``n_ctx``.

    Returns:
        A ``CustomGPT2LMHeadModel`` built from the resulting
        ``CustomGPT2Config``.
    """
    print("get_model", params)
    hyperparams = {} if params is None else params

    # A single hyperparameter drives both the layer count and the head count.
    layers_and_heads = hyperparams.get("n_layer_and_heads", 4)
    pad_id = tokenizer.pad_token_id

    model_config = CustomGPT2Config(
        vocab_size=tokenizer.vocab_size,
        n_positions=max_sequence_length,
        n_ctx=max_sequence_length,
        # n_embd=params.get('n_embd', 512),
        n_layer=layers_and_heads,
        n_head=layers_and_heads,
        bos_token_id=pad_id,
        eos_token_id=pad_id,
        sep_token_id=tokenizer.sep_token_id,
    )
    return CustomGPT2LMHeadModel(model_config)


# Leave 20% headroom over the longest training sequence, rounded up.
max_sequence_length = math.ceil(
    1.2 * max(map(lambda sample: len(sample["input_ids"]), encoded_dataset["train"]))
)


def get_model_lambda(params):
    """Single-argument model factory bound to the notebook's tokenizer and max length."""
    return get_model(params, tokenizer, max_sequence_length)


# Default-hyperparameter model used for the sanity checks in the next cells.
trial_model = get_model_lambda(None)

In [None]:
# Total number of elements across all parameters that require gradients.
sum(
    weights.numel()
    for weights in filter(lambda w: w.requires_grad, trial_model.parameters())
)

In [None]:
# Smoke test: collate one small, unshuffled batch and run it through the
# untrained model; the cell displays the shape of the resulting logits.
trial_dataset = DataLoader(
    dataset=encoded_dataset["train"],
    collate_fn=training_data_collator,
    batch_size=8,
    shuffle=False,
)
batch = next(iter(trial_dataset))
trial_model(**batch).logits.shape

# Trainer

In [None]:
# GPUs reserved for each trial; a value <= 0 makes `no_cuda` True (CPU-only).
gpus_per_trial = 1
training_args = TrainingArguments(
    # `output_dir` is documented as a string, so convert the Path explicitly,
    # the same way `local_dir` is passed to the hyperparameter search.
    output_dir=str(MODEL_LOGS_DIR),
    no_cuda=gpus_per_trial <= 0,
    # Evaluate and checkpoint on the same schedule so that
    # `load_best_model_at_end` has a checkpoint for every evaluation.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=10,
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,  # config
    warmup_ratio=0.1,  # config
    weight_decay=0.1,  # config
    logging_dir="./logs",
    skip_memory_metrics=True,
    report_to="none",
)

In [None]:
def compute_objective(metrics):
    """Return the evaluation loss as the scalar objective for hyperparameter search.

    Args:
        metrics: Mapping of metric names to values; must contain ``"eval_loss"``.

    Returns:
        The value stored under ``"eval_loss"``.

    Raises:
        KeyError: If ``"eval_loss"`` is missing from ``metrics``.
    """
    # A plain lookup never mutates the caller's mapping, so no defensive
    # (deep) copy of the whole metrics dict is needed just to read one key.
    return metrics["eval_loss"]

In [None]:
# timeline_metrics = TimelineMetrics(tokenizer)
# compute_metrics = lambda eval_preds: timeline_metrics.batch_compute_precision_recall_f1(
#     eval_preds, batch_size = 100
# )


# `model_init` hands the trainer a factory instead of a fixed model instance,
# so a fresh model can be built for each run.
trainer = Trainer(
    args=training_args,
    model_init=get_model_lambda,
    data_collator=training_data_collator,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
)

In [None]:
# Initial search space handed to the tuner: only the training batch size.
tune_config = dict(
    per_device_train_batch_size=tune.choice([16, 32, 64, 128]),
)

# Population Based Training: minimise `eval_loss`, measuring time in
# `training_iteration` units with a perturbation interval of 1, and mutating
# the three hyperparameters listed below.
pbt_scheduler = PopulationBasedTraining(
    metric="eval_loss",
    mode="min",
    time_attr="training_iteration",
    perturbation_interval=1,
    hyperparam_mutations=dict(
        weight_decay=tune.uniform(0.0, 0.3),
        learning_rate=tune.loguniform(1e-5, 1e-1),
        warmup_ratio=tune.loguniform(1e-2, 1e-1),
    ),
)

In [None]:
from ray.tune import JupyterNotebookReporter

# Progress reporter passed to the hyperparameter search below.
# NOTE(review): "n_layer_and_heads" and "embed_dim" are listed as columns but
# are not part of `tune_config` or the PBT mutations in the cells shown here.
reporter = JupyterNotebookReporter(
    metric_columns=["eval_loss", "epoch", "training_iteration"],
    parameter_columns=[
        "weight_decay",
        "learning_rate",
        "warmup_ratio",
        "per_device_train_batch_size",
        "n_layer_and_heads",
        "embed_dim",
    ],
)

In [None]:
import math

# NOTE(review): scratch calculation — the constants (12, 96, 5.039, 5.55e-2)
# are not defined or explained anywhere in the visible notebook, and the result
# is not used by any later cell shown here; confirm intent or remove.
12 * 96 * math.exp(2 * 5.039) * math.exp(2 * (5.55e-2) * 96)

In [None]:
from ray.train import CheckpointConfig

# Run the Ray Tune search with the PBT scheduler defined above.
best_trial = trainer.hyperparameter_search(
    hp_space=lambda _: tune_config,
    # Rank trials explicitly by `eval_loss` (lower is better) instead of
    # relying on the library's default objective, so best-trial selection
    # uses the same metric as the PBT scheduler even if extra evaluation
    # metrics are reported.
    compute_objective=compute_objective,
    direction="minimize",
    backend="ray",
    n_trials=4,
    resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
    scheduler=pbt_scheduler,
    # checkpoint_config=CheckpointConfig(
    #     num_to_keep=1,
    #     checkpoint_score_attribute="training_iteration",
    # ),
    progress_reporter=reporter,
    local_dir=str(MODEL_LOGS_DIR),
    name="tune_transformer_pbt",
    log_to_file=True,
)

In [None]:
# Show the per-iteration metrics of the best trial. Trial ids are regenerated
# on every search, so look the id up from the result rather than hard-coding
# one from an earlier run (which raises KeyError after a fresh search).
best_trial.run_summary.trial_dataframes[best_trial.run_id]

In [None]:
# NOTE(review): this literal is not assigned to anything or used by later cells
# shown here; it looks like values pasted from an earlier tuning run — confirm
# and either move it to a markdown cell or remove it.
[
    "1_num_train_epochs=5",
    "2_num_train_epochs=4",
    4,
    4,
    {
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": 4,
        "weight_decay": 0.029992474745400864,
        "learning_rate": 2.836995567863469e-05,
    },
    {
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": 4,
        "weight_decay": 0.006175348288740734,
        "learning_rate": 2.2695964542907755e-05,
    },
]

In [None]:
# Load the final model from FINAL_MODEL_DIR.
# NOTE(review): no cell shown in this notebook writes a model to
# FINAL_MODEL_DIR (which sits under a per-run timestamped directory) — confirm
# where the final model is saved before relying on this cell.
model = CustomGPT2LMHeadModel.from_pretrained(FINAL_MODEL_DIR)
# Use the GPU when one is available; fall back to CPU instead of failing.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Pad on the left for generation.
# NOTE(review): this mutates the shared `tokenizer` that the training collator
# also holds, so re-running earlier training cells after this one will see
# left padding — confirm that is acceptable.
tokenizer.padding_side = "left"
inference_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = inference_data_collator(
    encoded_dataset["test"][:2],
)
# Place the inputs on whichever device the model lives on rather than assuming
# CUDA, so the cell also works for a CPU-resident model.
batch = {k: v.to(model.device) for k, v in batch.items()}

In [None]:
# Generate from the prepared batch, then copy the resulting ids to the CPU.
output_ids = model.generate(**batch)
output_ids = output_ids.cpu()
output_ids

In [None]:
# Convert each generated id sequence to tokens, dropping every padding token.
output_tokens = [
    list(
        filter(
            lambda token: token != tokenizer.pad_token,
            tokenizer.convert_ids_to_tokens(ids),
        )
    )
    for ids in output_ids
]
output_tokens