In [1]:
import os
# Move the working directory two levels up (presumably to the project root) so
# that the relative paths used later in the notebook resolve from there.
# NOTE: the path is relative, so running this cell a second time in the same
# kernel moves up another two levels.
os.chdir('../../')

In [2]:
import os
import shutil

def remove_pycache(directory):
    """Recursively delete every ``__pycache__`` directory below ``directory``.

    Args:
        directory: Path of the directory tree to clean.

    Returns:
        None. Matching directories are removed from disk as a side effect.
    """
    for root, dirs, _files in os.walk(directory):
        # Names are unique within one directory, so at most one entry matches.
        if "__pycache__" in dirs:
            shutil.rmtree(os.path.join(root, "__pycache__"))
            # Prune the entry in place so the top-down walk does not try to
            # descend into the directory that was just deleted.
            dirs.remove("__pycache__")

# Remove the __pycache__ directories found anywhere under the current working
# directory.
remove_pycache(".")

In [3]:
import logging
import math
import os
from pathlib import Path

from hydra import initialize, compose
from rich.pretty import pprint
from hydra.core.global_hydra import GlobalHydra

from src.data.data_pipeline import data_pipeline
from src.factories import (
    get_callbacks,
    get_dataloaders,
    get_datasets,
    get_lookups,
    get_lr_scheduler,
    get_metric_collections,
    get_model,
    get_optimizer,
    get_text_encoder,
    get_transform,
)
from src.trainer.trainer import Trainer
from src.utils.seed import set_seed

# Notebook-level logger registered under the name "test"; its level is set to INFO.
LOGGER = logging.getLogger("test")
LOGGER.setLevel(level=logging.INFO)



def deterministic() -> None:
    """Switch this process into PyTorch's deterministic mode.

    Sets the ``CUBLAS_WORKSPACE_CONFIG`` environment variable to ``":16:8"``,
    then enables deterministic algorithms, enables cuDNN deterministic mode and
    disables cuDNN benchmarking. As noted by the original author, some
    randomness will still remain in the backward pass of the model.
    """
    os.environ.update(CUBLAS_WORKSPACE_CONFIG=":16:8")

    # torch is imported locally, after the environment variable has been set.
    import torch

    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Reset the global Hydra instance before initialising it again (presumably so
# that this cell can be re-run in the same kernel).
GlobalHydra.instance().clear()

# Register the configuration directory with Hydra.
initialize(config_path="../../configs")

# Overrides applied on top of the base "config": the experiment to use, no
# validation on training data, callbacks without wandb, the saved experiment
# to load and zero training epochs.
overrides = [
    "experiment=mimiciv_icd10/multi_res_conv.yaml",
    "trainer.validate_on_training_data=false",
    "callbacks=no_wandb",
    "load_model=./experiments/jc1u3c6s",
    "trainer.epochs=0",
]
cfg = compose(config_name="config", overrides=overrides)


# torch is needed on every path below (device selection), so import it
# unconditionally. It used to be imported only in the non-deterministic branch,
# which left the name `torch` undefined here whenever cfg.deterministic was true.
import torch

# Decide which GPUs are visible BEFORE anything queries CUDA: the variable is
# read when CUDA is first initialised, and an availability query made earlier
# can cause a later assignment to be ignored.
# A value already exported by the user always wins.
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    if cfg.gpu != -1 and cfg.gpu is not None and cfg.gpu != "":
        # cfg.gpu comes from Hydra/OmegaConf, where a YAML list is a ListConfig
        # and not a built-in list. Testing `isinstance(cfg.gpu, list)` therefore
        # failed and the value was rendered as "[6]" instead of "6".
        # Any non-string iterable is joined; scalars are rendered with str().
        if isinstance(cfg.gpu, str) or not hasattr(cfg.gpu, "__iter__"):
            visible_devices = str(cfg.gpu)
        else:
            visible_devices = ",".join(str(gpu) for gpu in cfg.gpu)
        os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
    else:
        # -1, None or "" in the config: expose no GPU at all.
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

if cfg.deterministic:
    deterministic()
set_seed(cfg.seed)

# Report what is actually usable now that device visibility is configured.
if torch.cuda.is_available():
    print("GPU is available")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")
pprint(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")

2025-03-14 17:16:18.743202: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-14 17:16:21.606022: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-14 17:16:22.426397: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-03-14 17:16:22.426419: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

GPU is not available


In [4]:
# NOTE: this rebinds `pprint`, which an earlier cell imported from rich.pretty,
# to the standard-library pprint.pprint for every cell that runs after this one.
from pprint import pprint

# Show the composed configuration.
pprint(cfg)

{'seed': 1337, 'deterministic': False, 'gpu': [6], 'name': None, 'debug': False, 'load_model': './experiments/jc1u3c6s', 'data': {'dir': 'files/data/mimiciv_icd10', 'data_filename': 'mimiciv_icd10.feather', 'split_filename': 'mimiciv_icd10_split.feather', 'code_column_names': ['icd10_diag', 'icd10_proc'], 'max_length': 4000}, 'dataset': {'name': 'BaseDataset', 'configs': {}}, 'dataloader': {'max_batch_size': 16, 'batch_size': 16, 'num_workers': 0, 'drop_last': True, 'pin_memory': False, 'batch_sampler': {'name': 'BySequenceLengthSampler', 'configs': {'bucket_boundaries': [400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2200, 2600, 3000, 4000]}}}, 'model': {'name': 'MultiResCNN', 'configs': {'kernel_sizes': [3, 5, 9, 15, 19, 25], 'num_filters': 50, 'embed_dropout': 0.2}}, 'text_encoder': {'name': 'Word2Vec', 'file_name': 'word2vec_full.model', 'load_model': True, 'configs': {'min_document_count': 3, 'model_configs': {'vector_size': 100, 'min_count': 0, 'workers': -1, 'epochs': 5}}}, 

พูดถึง vaex

'split_filename': 'mimiciv_icd10_split.feather'

In [5]:
# Run the project's data pipeline on the `data` section of the config; the
# result is the object that the following cells read documents and targets from.
data = data_pipeline(config=cfg.data)

Set up unsupervised training of the word embeddings

In [6]:
# Build the text encoder from its config section, the data directory and the
# training documents exposed by `data`.
# NOTE(review): `get_train_documents` is passed without being called —
# presumably a property; confirm against the data class.
text_encoder = get_text_encoder(
    config=cfg.text_encoder, data_dir=cfg.data.dir, texts=data.get_train_documents
)

embed already exist


ตรงนี้เปลี่ยน target เป็น int นะ 

In [7]:
# Create the label transform from all targets in `data`; `load_transform_path`
# points at the saved experiment directory given by cfg.load_model.
label_transform = get_transform(
    config=cfg.label_transform,
    targets=data.all_targets,
    load_transform_path=cfg.load_model,
)


loaded transform


set TokenSequence, HuggingFaceTokenizer, BOW แต่งเติมส่วนของ padding และ unknown ให้เรียบร้อย

In [8]:
# Create the text transform from the training documents and the text encoder;
# `load_transform_path` points at the saved experiment directory given by
# cfg.load_model.
text_transform = get_transform(
    config=cfg.text_transform,
    texts=data.get_train_documents,
    text_encoder=text_encoder,
    load_transform_path=cfg.load_model,
)

loaded transform


Transform text จริง โดยมองเป็น batch

In [9]:
# Truncate the texts held by `data`, using cfg.data.max_length as the limit
# (unit not visible here — presumably tokens; confirm).
data.truncate_text(cfg.data.max_length)

In [10]:
# Transform tokens => integer indices; according to the author, these indices
# are expanded into vectors later, during training.
data.transform_text(text_transform.batch_transform)

Transforming text...: 100%|██████████| 13/13 [01:25<00:00,  6.58s/it]


Target มีทำ map กลับเป็น int ไว้เลย

In [11]:
# Build the lookups object from the data and the two fitted transforms.
lookups = get_lookups(
    config=cfg.lookup,
    data=data,
    label_transform=label_transform,
    text_transform=text_transform,
)

# Print the data info held by the lookups object.
pprint(lookups.data_info)
# pprint(lookups.data_info["num_classes"] == len(lookups.code_system2code_indices['icd10_diag'])+len(lookups.code_system2code_indices['icd10_proc']))


# Author's note (translated): only the train split is affected if the input is
# changed to test with a small n so that the model runs through; val / test are
# already fixed at this point.

{'average_classes_per_example': 15.65479481182224,
 'average_tokens_per_example': 1596.879217847855,
 'num_classes': 7942,
 'num_examples': 122278,
 'num_test_classes': 7935,
 'num_test_examples': 19801,
 'num_train_classes': 7939,
 'num_train_examples': 89091,
 'num_train_tokens': 141971714,
 'num_val_classes': 7906,
 'num_val_examples': 13386,
 'pad_index': 0,
 'vocab_size': 62774}


ระบุโมเดล

In [12]:
# Instantiate the model from its config section, the data info and the text
# encoder, then move it to the selected device. Being the last expression of
# the cell, the `.to(device)` call also displays the model.
model = get_model(
    config=cfg.model, data_info=lookups.data_info, text_encoder=text_encoder
)
model.to(device)

loading pretrained embeddings...


MultiResCNN(
  (embed_drop): Dropout(p=0.2, inplace=False)
  (embed): Embedding(62774, 100, padding_idx=0)
  (convs): ModuleList(
    (0): Sequential(
      (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      (1): Tanh()
      (2): ResidualBlock(
        (left): Sequential(
          (0): Conv1d(100, 50, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
          (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): Tanh()
          (3): Conv1d(50, 50, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
          (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (shortcut): Sequential(
          (0): Conv1d(100, 50, kernel_size=(1,), stride=(1,), bias=False)
          (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (dropout): Dropout(p=0.2, inplace=False)
      )
    )
    (1): Sequential(


จัดเตรียม dataset

In [13]:
# Build the datasets from the data, both transforms and the lookups.
datasets = get_datasets(
    config=cfg.dataset,
    data=data,
    text_transform=text_transform,
    label_transform=label_transform,
    lookups=lookups,
)

Creating examples train: 100%|██████████| 89091/89091 [00:05<00:00, 16566.11it/s]
Creating examples val: 100%|██████████| 13386/13386 [00:01<00:00, 11797.51it/s]
Creating examples test: 100%|██████████| 19801/19801 [00:01<00:00, 14397.26it/s]


สามารถปรับ batch Optimize ได้ตรงนี้เลย ตรงนี้เป็นการบังคับเฉพาะให้เฉพาะ train เท่านั้น ถูกควบคุมโดย cfg.dataloader.name

set loader มีทั้ง train train_val test val

In [14]:
# Batches are prepared here; according to the author, the datasets were not yet
# batched before this point.
dataloaders = get_dataloaders(config=cfg.dataloader, datasets_dict=datasets)



In [15]:
# dataloaders["train"].dataset.text_transform.save('./')
# dataloaders["train"].dataset.label_transform.save('./')

set optimiser

In [16]:
# Create the optimizer for the model from its config section, then display it
# as the cell output.
optimizer = get_optimizer(config=cfg.optimizer, model=model)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0005
    maximize: False
    weight_decay: 0.0001
)

เรื่องของ batch size: คำนวณหา gradient กรณี accumulate_grad_batches > 1 แสดงว่ายังไม่ปรับ grad ทันที & Learning rate

In [17]:
# Ratio of the requested batch size to the maximum batch size, clamped to at
# least 1 and truncated to an integer.
batch_size_ratio = cfg.dataloader.batch_size / cfg.dataloader.max_batch_size
accumulate_grad_batches = int(max(batch_size_ratio, 1))

# Training batches per epoch divided by the accumulation factor (rounded up),
# multiplied by the configured number of epochs.
steps_per_epoch = math.ceil(len(dataloaders["train"]) / accumulate_grad_batches)
num_training_steps = steps_per_epoch * cfg.trainer.epochs

lr_scheduler = get_lr_scheduler(
    config=cfg.lr_scheduler,
    optimizer=optimizer,
    num_training_steps=num_training_steps,
)

split_names: list[str] = ["train", "train_val", "val", "test"],

splits_with_multiple_code_systems: set[str] = {"train_val", "val", "test"},

In [18]:
# code_system2code_indices => diag, proc (everything; not separated into train / test)
# split2code_indices => train, train_val, val, test
metric_collections = get_metric_collections(
    config=cfg.metrics,
    number_of_classes=lookups.data_info["num_classes"],
    code_system2code_indices=lookups.code_system2code_indices, # all labels combined: both diag labels and proc labels
    split2code_indices=lookups.split2code_indices, # label classes of each split group
)
metric_collections

# Author's note (translated): the contents are purely indices.

defaultdict(dict,
            {'train': {'all': <src.metrics.MetricCollection at 0x7f2082855f90>},
             'train_val': {'all': <src.metrics.MetricCollection at 0x7f20828574f0>,
              'icd10_diag': <src.metrics.MetricCollection at 0x7f2082693f10>,
              'icd10_proc': <src.metrics.MetricCollection at 0x7f2082691090>},
             'val': {'all': <src.metrics.MetricCollection at 0x7f2082691540>,
              'icd10_diag': <src.metrics.MetricCollection at 0x7f20826bc730>,
              'icd10_proc': <src.metrics.MetricCollection at 0x7f20826bceb0>},
             'test': {'all': <src.metrics.MetricCollection at 0x7f20826bd6c0>,
              'icd10_diag': <src.metrics.MetricCollection at 0x7f20826bde40>,
              'icd10_proc': <src.metrics.MetricCollection at 0x7f20826be5c0>}})

- name: WandbCallback

- name: SaveBestModelCallback

- name: EarlyStoppingCallback



In [19]:
# Create the training callbacks from the `callbacks` config section, then
# display them as the cell output.
callbacks = get_callbacks(config=cfg.callbacks)
callbacks

[<src.trainer.callbacks.SaveBestModelCallback at 0x7f2082857820>,
 <src.trainer.callbacks.EarlyStoppingCallback at 0x7f20826bf520>]

Before Train

In [20]:
# Display the composed configuration once more before the trainer is built.
cfg

{'seed': 1337, 'deterministic': False, 'gpu': [6], 'name': None, 'debug': False, 'load_model': './experiments/jc1u3c6s', 'data': {'dir': 'files/data/mimiciv_icd10', 'data_filename': 'mimiciv_icd10.feather', 'split_filename': 'mimiciv_icd10_split.feather', 'code_column_names': ['icd10_diag', 'icd10_proc'], 'max_length': 4000}, 'dataset': {'name': 'BaseDataset', 'configs': {}}, 'dataloader': {'max_batch_size': 16, 'batch_size': 16, 'num_workers': 0, 'drop_last': True, 'pin_memory': False, 'batch_sampler': {'name': 'BySequenceLengthSampler', 'configs': {'bucket_boundaries': [400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2200, 2600, 3000, 4000]}}}, 'model': {'name': 'MultiResCNN', 'configs': {'kernel_sizes': [3, 5, 9, 15, 19, 25], 'num_filters': 50, 'embed_dropout': 0.2}}, 'text_encoder': {'name': 'Word2Vec', 'file_name': 'word2vec_full.model', 'load_model': True, 'configs': {'min_document_count': 3, 'model_configs': {'vector_size': 100, 'min_count': 0, 'workers': -1, 'epochs': 5}}}, 

In [21]:
# Assemble the trainer from everything built in the previous cells and move it
# to the selected device.
trainer = Trainer(
    config=cfg,
    data=datasets,
    model=model,
    optimizer=optimizer,
    dataloaders=dataloaders,
    metric_collections=metric_collections, # there are 4 metric groups; when metrics are used each group is handled separately
    callbacks=callbacks,
    lr_scheduler=lr_scheduler,
    lookups=lookups,
    accumulate_grad_batches=accumulate_grad_batches,
).to(device)

  self.gradient_scaler = torch.cuda.amp.GradScaler(enabled=self.use_amp)


Train  ตรงส่วนนี้จะระบุว่าไฟล์ข้อมูลทั้งหมดจะถูกวางไว้ที่ไหนเลย

ตอน validate จะสืบหา f1 ที่ดีที่สุด

In [22]:
# When a saved experiment is configured, use its directory as the trainer's
# experiment path before fitting.
saved_experiment = cfg.load_model
if saved_experiment:
    trainer.experiment_path = Path(saved_experiment)

trainer.fit()

  checkpoint = torch.load(self.experiment_path / file_name,map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"))


tensor(0.4242)


Loaded best model


Epoch: 18 | Validating on val: 100%|██████████| 837/837 [08:12<00:00,  1.70it/s]


Best F1: 0.5668 at DB: 0.4242


Epoch: 18 | Validating on test: 100%|██████████| 1238/1238 [07:40<00:00,  2.69it/s]
