In [1]:
import os
os.chdir('../../')

In [2]:
import logging
import math
import os
from pathlib import Path

from hydra import initialize, compose
from rich.pretty import pprint
from hydra.core.global_hydra import GlobalHydra

from src.data.data_pipeline import data_pipeline
from src.factories import (
    get_callbacks,
    get_dataloaders,
    get_datasets,
    get_lookups,
    get_lr_scheduler,
    get_metric_collections,
    get_model,
    get_optimizer,
    get_text_encoder,
    get_transform,
)
from src.trainer.trainer import Trainer
from src.utils.seed import set_seed

LOGGER = logging.getLogger(name='test')
LOGGER.setLevel(logging.INFO)



def deterministic() -> None:
    """Run experiment deterministically. There will still be some randomness in the backward pass of the model."""
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

    import torch

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms(True)


# Clear the global Hydra instance
GlobalHydra.instance().clear()
#Load configuration

initialize(config_path="../../configs")


cfg = compose(config_name="config",
              overrides=["experiment=mimiciv_icd10/vanillaconv.yaml",
                         "trainer.validate_on_training_data=false"
                         #"callbacks=no_wandb",
                         #"load_model=./experiments/t1dbhfub",
                         #"trainer.epochs=1"
                         ])


if cfg.deterministic:
    deterministic()
else:
    import torch

    if torch.cuda.is_available():
        print("GPU is available")
        print(f"GPU name: {torch.cuda.get_device_name(0)}")
    else:
        print("GPU is not available")
set_seed(cfg.seed)

# Check if CUDA_VISIBLE_DEVICES is set
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    if cfg.gpu != -1 and cfg.gpu is not None and cfg.gpu != "":
     
        os.environ["CUDA_VISIBLE_DEVICES"] = (
            ",".join([str(gpu) for gpu in cfg.gpu])
            if isinstance(cfg.gpu, list)
            else str(cfg.gpu)
        )

    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")
pprint(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path="../../configs")


GPU is not available


In [3]:
from pprint import pprint

pprint(cfg)

{'seed': 1337, 'deterministic': False, 'gpu': [6], 'name': None, 'debug': False, 'load_model': None, 'data': {'dir': 'files/data/mimiciv_icd10', 'data_filename': 'mimiciv_icd10.feather', 'split_filename': 'mimiciv_icd10_split.feather', 'code_column_names': ['icd10_diag', 'icd10_proc'], 'max_length': 4000}, 'dataset': {'name': 'BaseDataset', 'configs': {}}, 'dataloader': {'max_batch_size': 128, 'batch_size': 8, 'num_workers': 0, 'drop_last': True, 'pin_memory': False, 'batch_sampler': {'name': 'BySequenceLengthSampler', 'configs': {'bucket_boundaries': [400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2200, 2600, 3000, 4000]}}}, 'model': {'name': 'VanillaConv', 'configs': {'embed_dropout': 0, 'num_filters': 500, 'kernel_size': 4}}, 'text_encoder': {'name': 'Word2Vec', 'file_name': 'word2vec_full.model', 'load_model': True, 'configs': {'min_document_count': 3, 'model_configs': {'vector_size': 100, 'min_count': 0, 'workers': -1, 'epochs': 5}}}, 'trainer': {'epochs': 20, 'validate_on_tra

พูดถึง vaex

'split_filename': 'mimiciv_icd10_split.feather'

In [4]:
data = data_pipeline(config=cfg.data)

set unsupervise train wordembed

In [5]:
text_encoder = get_text_encoder(
    config=cfg.text_encoder, data_dir=cfg.data.dir, texts=data.get_train_documents
)

embed already exist


ตรงนี้เปลี่ยน target เป็น int นะ 

In [6]:
label_transform = get_transform(
    config=cfg.label_transform,
    targets=data.all_targets,
    load_transform_path=cfg.load_model,
)


set TokenSequence, HuggingFaceTokenizer, BOW แต่งเติมส่วนของ padding unknow ให้เรียบร้อย

In [7]:
text_transform = get_transform(
    config=cfg.text_transform,
    texts=data.get_train_documents,
    text_encoder=text_encoder,
    load_transform_path=cfg.load_model,
)

Tranform text จริง โดยมองเป็น batch

In [8]:
data.truncate_text(cfg.data.max_length)

In [9]:
#transform token=>index ตัวเลข เด๋ว train จะถูกผลักออกเป็น vector 
data.transform_text(text_transform.batch_transform)

Transforming text...: 100%|██████████| 31/31 [00:01<00:00, 17.68it/s]
Collecting results...: 100%|██████████| 31/31 [00:00<00:00, 2755.20it/s]


Target มีทำ map กลับเป็น int ไว้เลย

In [10]:
lookups = get_lookups(
    config=cfg.lookup,
    data=data,
    label_transform=label_transform,
    text_transform=text_transform,
)

# print data info
pprint(lookups.data_info)
# pprint(lookups.data_info["num_classes"] == len(lookups.code_system2code_indices['icd10_diag'])+len(lookups.code_system2code_indices['icd10_proc']))


#สังเกตแค่่ train เท่านั้นหากเราพยายามเปลียน input เพื่อทดสอบ n น้อยๆ ให้ run model ผ่าน ตอน val / test ถูกล็อคไว้หมดแล้ว

{'average_classes_per_example': 15.65479481182224,
 'average_tokens_per_example': 1596.879217847855,
 'num_classes': 7942,
 'num_examples': 122278,
 'num_test_classes': 7937,
 'num_test_examples': 21265,
 'num_train_classes': 7938,
 'num_train_examples': 76933,
 'num_train_tokens': 122848593,
 'num_val_classes': 7932,
 'num_val_examples': 24080,
 'pad_index': 0,
 'vocab_size': 62792}


ระบุโมเดล

In [11]:
model = get_model(
    config=cfg.model, data_info=lookups.data_info, text_encoder=text_encoder
)
model.to(device)

loading pretrained embeddings...


VanillaConv(
  (embed_drop): Dropout(p=0, inplace=False)
  (embed): Embedding(62792, 100, padding_idx=0)
  (conv): Conv1d(100, 500, kernel_size=(4,), stride=(1,))
  (fc): Linear(in_features=500, out_features=7942, bias=True)
)

จัดเตรียม datasset

In [12]:
datasets = get_datasets(
    config=cfg.dataset,
    data=data,
    text_transform=text_transform,
    label_transform=label_transform,
    lookups=lookups,
)

Creating examples train: 100%|██████████| 76933/76933 [00:05<00:00, 13665.72it/s]
Creating examples val: 100%|██████████| 24080/24080 [00:01<00:00, 12908.44it/s]
Creating examples test: 100%|██████████| 21265/21265 [00:01<00:00, 13803.92it/s]


สามารถปรับ batch Optimize ได้ตรงนี้เลย ตรงนี้เป็นการบังคับเฉพาะให้เฉพาะ train เท่านั้น ถูกควบคุมโดย cfg.dataloader.name

set loader มีทั้ง train train_val test val

In [13]:
#ตรงนี้เตรียม batch ให้เรียบร้อยแล้วหละ เดิม #dataset ยังไม่ทำเป็น batch นะ 
dataloaders = get_dataloaders(config=cfg.dataloader, datasets_dict=datasets)



set optimiser

In [14]:
optimizer = get_optimizer(config=cfg.optimizer, model=model)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 1e-05
)

เรื่องของ batchsize ตำนวนหา gradient กรณี accumulate_grad_batches > 1 แสดงว่ายังไม่ปรับ grad ทันที & Learning rate

In [15]:
accumulate_grad_batches = int(
    max(cfg.dataloader.batch_size / cfg.dataloader.max_batch_size, 1)
)
num_training_steps = (
    math.ceil(len(dataloaders["train"]) / accumulate_grad_batches)
    * cfg.trainer.epochs
)
lr_scheduler = get_lr_scheduler(
    config=cfg.lr_scheduler,
    optimizer=optimizer,
    num_training_steps=num_training_steps,
)

split_names: list[str] = ["train", "train_val", "val", "test"],

splits_with_multiple_code_systems: set[str] = {"train_val", "val", "test"},

In [16]:
#code_system2code_indices => diag proc (ทั้งหมดเลยนะไม่แยก train test)
#split2code_indices =>  train train_val val test
metric_collections = get_metric_collections(
    config=cfg.metrics,
    number_of_classes=lookups.data_info["num_classes"],
    code_system2code_indices=lookups.code_system2code_indices, # รวมทั้งหมดที่เป็น label diag และ label proc
    split2code_indices=lookups.split2code_indices, # label classs แต่ละ กกลุ่มที่แบ่งไป ตาม 
)
metric_collections

#ข้างในมี แต่ index ล้วนๆเลยนะ

defaultdict(dict,
            {'train': {'all': <src.metrics.MetricCollection at 0x7f9ddc373520>},
             'train_val': {'all': <src.metrics.MetricCollection at 0x7f9ddc373730>,
              'icd10_diag': <src.metrics.MetricCollection at 0x7f9ddc3d4190>,
              'icd10_proc': <src.metrics.MetricCollection at 0x7f9ddc3d4910>},
             'val': {'all': <src.metrics.MetricCollection at 0x7f9ddc3d5090>,
              'icd10_diag': <src.metrics.MetricCollection at 0x7f9ddc3d5810>,
              'icd10_proc': <src.metrics.MetricCollection at 0x7f9ddc3d5f90>},
             'test': {'all': <src.metrics.MetricCollection at 0x7f9ddc3d67a0>,
              'icd10_diag': <src.metrics.MetricCollection at 0x7f9ddc3d6f20>,
              'icd10_proc': <src.metrics.MetricCollection at 0x7f9ddc3d76a0>}})

- name: WandbCallback

- name: SaveBestModelCallback

- name: EarlyStoppingCallback



In [17]:
callbacks = get_callbacks(config=cfg.callbacks)
callbacks

[<src.trainer.callbacks.WandbCallback at 0x7f9ddc3ec5e0>,
 <src.trainer.callbacks.SaveBestModelCallback at 0x7f9ddc3ec610>,
 <src.trainer.callbacks.EarlyStoppingCallback at 0x7f9ddc3ec700>]

Before Train

In [18]:
cfg

{'seed': 1337, 'deterministic': False, 'gpu': [6], 'name': None, 'debug': False, 'load_model': None, 'data': {'dir': 'files/data/mimiciv_icd10', 'data_filename': 'mimiciv_icd10.feather', 'split_filename': 'mimiciv_icd10_split.feather', 'code_column_names': ['icd10_diag', 'icd10_proc'], 'max_length': 4000}, 'dataset': {'name': 'BaseDataset', 'configs': {}}, 'dataloader': {'max_batch_size': 128, 'batch_size': 8, 'num_workers': 0, 'drop_last': True, 'pin_memory': False, 'batch_sampler': {'name': 'BySequenceLengthSampler', 'configs': {'bucket_boundaries': [400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2200, 2600, 3000, 4000]}}}, 'model': {'name': 'VanillaConv', 'configs': {'embed_dropout': 0, 'num_filters': 500, 'kernel_size': 4}}, 'text_encoder': {'name': 'Word2Vec', 'file_name': 'word2vec_full.model', 'load_model': True, 'configs': {'min_document_count': 3, 'model_configs': {'vector_size': 100, 'min_count': 0, 'workers': -1, 'epochs': 5}}}, 'trainer': {'epochs': 20, 'validate_on_tra

In [19]:
trainer = Trainer(
    config=cfg,
    data=datasets,
    model=model,
    optimizer=optimizer,
    dataloaders=dataloaders,
    metric_collections=metric_collections, # metric มี 4 กลุ่ม ตอน ใช้ metric จะแยกกล่มใครกล่มมั่นไปแล้ว
    callbacks=callbacks,
    lr_scheduler=lr_scheduler,
    lookups=lookups,
    accumulate_grad_batches=accumulate_grad_batches,
).to(device)

  #ไฟล์นี้บรรจุแค่ configfile เอง มันมีการถูกบันทึกเหมือนกลไก wandb


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrxsu08560194[0m ([33mICD-10[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


experiments/xela8cvk


Train  ตรงส่วนนี้จะระบุว่าไฟล์ข้อมูลทั้งหมดจะถูกวางไว้ที่ไหนเลย

ตอน validate จะสืบหา f1 ที่ดีที่สด 

In [None]:
if cfg.load_model:
    trainer.experiment_path = Path(cfg.load_model)
trainer.fit()

Epoch: 0 | Training:  97%|█████████▋| 9284/9616 [13:41<00:25, 12.93it/s]