# Made with ML

- https://github.com/GokuMohandas/Made-With-ML/blob/main/notebooks/madewithml.ipynb
- https://madewithml.com/

In [1]:
import os
import ray
import random
import torch
import torch.nn as nn
from ray.data.preprocessor import Preprocessor
from transformers import BertTokenizer
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [2]:
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()
%load_ext autoreload
%autoreload 2

In [3]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Initialize Ray
# If a Ray runtime is already attached to this kernel, shut it down first so
# that re-running this cell always goes through a fresh ray.init().
if ray.is_initialized():
    ray.shutdown()
ray.init()

2023-12-27 15:25:49,764	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265


0,1
Python version:,3.10.11
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8265


In [5]:
# Display the resources Ray reports for this cluster (CPU count, memory, ...).
ray.cluster_resources()

{'memory': 50673364173.0,
 'object_store_memory': 2147483648.0,
 'node:127.0.0.1': 1.0,
 'node:__internal_head__': 1.0,
 'CPU': 10.0}

In [6]:
# Per-worker resource request (CPU only) and the number of training workers.
# prefer to do a few less than total available CPU (1 for head node + 1 for background tasks)
resources_per_worker = dict(CPU=1, GPU=0)
num_workers = 6

In [7]:
# Label to index: each tag maps to its position in `tags`
tags = ["mlops", "natural-language-processing", "computer-vision", "other"]
num_classes = len(tags)
class_to_index = dict(zip(tags, range(num_classes)))
class_to_index

{'mlops': 0,
 'natural-language-processing': 1,
 'computer-vision': 2,
 'other': 3}

In [8]:
# Fetch the NLTK stopword corpus and keep the English list; it is the default
# `stopwords` argument of clean_text.
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sguys99/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def clean_text(text, stopwords=None):
    """Clean a raw text string.

    The text is lower-cased, stripped of links and stopwords, and reduced to
    alphanumeric tokens separated by single spaces.

    Args:
        text (str): Raw input text.
        stopwords (Iterable[str], optional): Words to remove. Defaults to the
            module-level ``STOPWORDS`` list, looked up at call time.

    Returns:
        str: The cleaned text.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    stopwords = list(stopwords)

    # Lower
    text = text.lower()

    # Remove links BEFORE punctuation is stripped: once ":" and "/" have been
    # replaced by spaces a URL is split into separate tokens and only its
    # leading "http(s)" piece would still match.
    text = re.sub(r"http\S+", "", text)

    # Remove stopwords. Skipped for an empty list, because an empty alternation
    # would match at every word boundary and swallow the whitespace after it.
    if stopwords:
        pattern = re.compile(r"\b(" + r"|".join(re.escape(word) for word in stopwords) + r")\b\s*")
        text = pattern.sub("", text)

    # Spacing and filters
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)  # add spacing
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends

    return text

In [10]:
# Bert tokenizer
# NOTE(review): tokenize() (next cell) loads its own tokenizer on every call and
# does not read this module-level instance; whether any other cell uses it is
# not visible from here.
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)

In [11]:
def tokenize(batch):
    """Tokenize a batch of text with the SciBERT tokenizer.

    Args:
        batch: Mapping/frame with a "text" column (must support ``.tolist()``)
            and a "tag" column.

    Returns:
        dict: "ids" (input ids), "masks" (attention masks) and "targets"
        (the "tag" column as a NumPy array). Sequences are padded to the
        longest one in the batch.
    """
    # The tokenizer is loaded inside the function on every call.
    bert_tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    texts = batch["text"].tolist()
    encoded = bert_tokenizer(texts, return_tensors="np", padding="longest")
    return {
        "ids": encoded["input_ids"],
        "masks": encoded["attention_mask"],
        "targets": np.array(batch["tag"]),
    }

In [12]:
def preprocess(df, class_to_index):
    """Preprocess the data.

    Builds the model input text from the title and description, cleans it,
    label-encodes the tag and tokenizes the result. The input frame is left
    untouched: all derived columns are written to a new frame.

    Args:
        df (pd.DataFrame): Batch with "title", "description" and "tag" columns.
        class_to_index (dict): Mapping from tag name to integer class index.
            Tags missing from the mapping become NaN (``Series.map`` semantics).

    Returns:
        dict: Output of ``tokenize`` ("ids", "masks", "targets").
    """
    text = (df["title"] + " " + df["description"]).apply(clean_text)  # feature engineering + cleaning
    labels = df["tag"].map(class_to_index)  # label encoding
    # assign() returns a new frame, so the caller's batch is not mutated and no
    # value is written into a slice of another frame.
    features = df.assign(text=text, tag=labels)[["text", "tag"]]
    return tokenize(features)

In [13]:
from ray.data import Dataset
from typing import Dict, List, Tuple

def stratify_split(
    ds: Dataset,
    stratify: str,
    test_size: float,
    shuffle: bool = True,
    seed: int = 1234,
) -> Tuple[Dataset, Dataset]:
    """Split a dataset into train and test sets, applying the same
    train/test proportion within every class of the stratify column.

    Args:
        ds (Dataset): Input dataset to split.
        stratify (str): Name of the column whose values define the groups.
        test_size (float): Proportion of each group assigned to the test set.
        shuffle (bool, optional): whether to shuffle rows inside each group
            before splitting. Defaults to True.
        seed (int, optional): seed for the per-group split and for the final
            shuffles. Defaults to 1234.

    Returns:
        Tuple[Dataset, Dataset]: the stratified train and test datasets.
    """

    def _add_split(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Split one group's dataframe and tag every row with a
        `_split` column holding "train" or "test"."""
        train_part, test_part = train_test_split(df, test_size=test_size, shuffle=shuffle, random_state=seed)
        return pd.concat([train_part.assign(_split="train"), test_part.assign(_split="test")])

    def _filter_split(df: pd.DataFrame, split: str) -> pd.DataFrame:  # pragma: no cover, used in parent function
        """Keep only the rows tagged with `split` and drop the
        helper `_split` column."""
        selected = df.loc[df["_split"] == split]
        return selected.drop(columns="_split")

    # Tag rows group by group, one group per unique value of the stratify column
    tagged = ds.groupby(stratify).map_groups(_add_split, batch_format="pandas")

    # Pull each split back out of the tagged dataset
    train_ds = tagged.map_batches(_filter_split, fn_kwargs={"split": "train"}, batch_format="pandas")
    test_ds = tagged.map_batches(_filter_split, fn_kwargs={"split": "test"}, batch_format="pandas")

    # Shuffle each split (required)
    return train_ds.random_shuffle(seed=seed), test_ds.random_shuffle(seed=seed)

In [14]:
# Keep block order deterministic across runs. Uses `DataContext`, the name Ray's
# own log tips in this notebook refer to, instead of the older `DatasetContext`.
ray.data.DataContext.get_current().execution_options.preserve_order = True

In [15]:
# Location of the CSV dataset; read with ray.data.read_csv in the cells below.
DATASET_LOC = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/dataset.csv"
# df = pd.read_csv(DATASET_LOC)
# df.head()

In [16]:
# Data ingestion
# Read the CSV into a Ray Dataset, shuffle it with a fixed seed, and preview a
# single row.
ds = ray.data.read_csv(DATASET_LOC)
ds = ds.random_shuffle(seed=1234)
ds.take(1)

2023-12-27 15:25:52,249	INFO read_api.py:406 -- To satisfy the requested parallelism of 20, each read task output is split into 20 smaller blocks.
2023-12-27 15:25:52,261	INFO dataset.py:2380 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2023-12-27 15:25:52,263	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]
2023-12-27 15:25:52,264	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:52,265	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[{'id': 549,
  'created_on': datetime.datetime(2020, 4, 16, 3, 48, 35),
  'title': '15 Best Tools for Tracking Machine Learning Experiments',
  'description': 'A feature comparison of all the open-source and commercial options for experiment tracking.',
  'tag': 'mlops'}]

In [17]:
# Split dataset
# Hold out 20% of every tag's rows as the validation split.
test_size = 0.2
train_ds, val_ds = stratify_split(ds, stratify="tag", test_size=test_size)

2023-12-27 15:25:53,233	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]
2023-12-27 15:25:53,233	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:53,234	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
# Mapping
# NOTE(review): this rebinds `tags` and `class_to_index`, which an earlier cell
# built from a hard-coded list; `num_classes` is not recomputed here, so it still
# reflects that earlier list.
tags = train_ds.unique(column="tag")
class_to_index = {tag: i for i, tag in enumerate(tags)}

2023-12-27 15:25:53,696	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]
2023-12-27 15:25:53,697	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:53,697	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-27 15:25:54,675	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-12-27 15:25:54,676	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:54,677	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

- Aggregate 11:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 12:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 13:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

In [19]:
import os
import random
import torch
from ray.data.preprocessor import Preprocessor

In [20]:
def set_seeds(seed=42):
    """Set seeds for reproducibility.

    Seeds NumPy, Python's ``random`` and PyTorch (CPU and CUDA), switches cuDNN
    to deterministic mode with benchmarking disabled, and exports
    ``PYTHONHASHSEED`` in ``os.environ``.

    Args:
        seed (int, optional): Seed value applied everywhere. Defaults to 42.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Plain attribute assignment; there is no need to route this through eval().
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

In [21]:
def load_data(num_samples=None):
    """Read the dataset at DATASET_LOC and shuffle it with a fixed seed.

    Args:
        num_samples (int, optional): If truthy, only the first `num_samples`
            shuffled rows are kept (as a new dataset built from those items).
            Defaults to None, which returns the full shuffled dataset.

    Returns:
        The shuffled Ray dataset, optionally truncated.
    """
    shuffled = ray.data.read_csv(DATASET_LOC).random_shuffle(seed=1234)
    if not num_samples:
        return shuffled
    return ray.data.from_items(shuffled.take(num_samples))

In [22]:
class CustomPreprocessor():
    """Custom preprocessor class.

    Holds the tag <-> index mappings and applies `preprocess` to datasets.
    """
    def __init__(self, class_to_index=None):
        """Store an optional pre-computed mapping.

        Args:
            class_to_index (dict, optional): Mapping from tag to class index.
                Defaults to None, which starts with empty mappings.
        """
        # None sentinel instead of a mutable `{}` default argument.
        self._set_mapping(class_to_index or {})

    def _set_mapping(self, class_to_index):
        """Store the forward mapping together with its inverse."""
        self.class_to_index = class_to_index
        self.index_to_class = {v: k for k, v in class_to_index.items()}

    def fit(self, ds):
        """Derive the mappings from the unique values of the "tag" column.

        Indices follow the order in which `ds.unique` returns the tags.

        Returns:
            CustomPreprocessor: self, to allow chaining.
        """
        tags = ds.unique(column="tag")
        self._set_mapping({tag: i for i, tag in enumerate(tags)})
        return self

    def transform(self, ds):
        """Apply `preprocess` to every pandas batch of `ds` using the stored mapping."""
        return ds.map_batches(
            preprocess,
            fn_kwargs={"class_to_index": self.class_to_index},
            batch_format="pandas")

In [23]:
import torch.nn as nn
from transformers import BertModel

In [24]:

# Pretrained LLM
# NOTE(review): return_dict=False is presumably what lets FinetunedLLM.forward
# unpack the encoder output as a (seq, pool) tuple — confirm against transformers docs.
llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
embedding_dim = llm.config.hidden_size  # passed to FinetunedLLM as the classifier's input width

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
class FinetunedLLM(nn.Module):
    """Encoder followed by dropout and a single linear classification layer."""

    def __init__(self, llm, dropout_p, embedding_dim, num_classes):
        """Wrap `llm` and build the classification head.

        Args:
            llm: Encoder module; called with `input_ids` and `attention_mask`
                and expected to return a two-element tuple.
            dropout_p (float): Dropout probability applied to the pooled output.
            embedding_dim (int): Width of the pooled output.
            num_classes (int): Number of output classes.
        """
        super().__init__()
        self.llm = llm
        # Kept as plain attributes so save() can write them to args.json.
        self.dropout_p = dropout_p
        self.embedding_dim = embedding_dim
        self.num_classes = num_classes
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(embedding_dim, num_classes)

    def forward(self, batch):
        """Return the logits for a batch holding "ids" and "masks"."""
        _, pooled = self.llm(input_ids=batch["ids"], attention_mask=batch["masks"])
        return self.fc1(self.dropout(pooled))

    @torch.inference_mode()
    def predict(self, batch):
        """Return the argmax class index per row as a NumPy array (switches to eval mode)."""
        self.eval()
        logits = self(batch)
        return torch.argmax(logits, dim=1).cpu().numpy()

    @torch.inference_mode()
    def predict_proba(self, batch):
        """Return softmax probabilities per row as a NumPy array (switches to eval mode)."""
        self.eval()
        logits = self(batch)
        return F.softmax(logits, dim=1).cpu().numpy()

    def save(self, dp):
        """Write args.json (constructor arguments) and model.pt (state dict) into directory `dp`."""
        constructor_args = {
            "dropout_p": self.dropout_p,
            "embedding_dim": self.embedding_dim,
            "num_classes": self.num_classes,
        }
        with open(Path(dp, "args.json"), "w") as fp:
            json.dump(constructor_args, fp, indent=4, sort_keys=False)
        torch.save(self.state_dict(), os.path.join(dp, "model.pt"))

    @classmethod
    def load(cls, args_fp, state_dict_fp):
        """Rebuild a model from an args.json file and a saved state dict (loaded onto CPU)."""
        with open(args_fp, "r") as fp:
            constructor_args = json.load(fp)
        encoder = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
        instance = cls(llm=encoder, **constructor_args)
        state_dict = torch.load(state_dict_fp, map_location=torch.device("cpu"))
        instance.load_state_dict(state_dict)
        return instance


In [26]:
# Initialize model
model = FinetunedLLM(llm=llm, dropout_p=0.5, embedding_dim=embedding_dim, num_classes=num_classes)
# NOTE(review): named_parameters is not called here, so this prints the bound
# method's repr (which embeds the module repr, as the output below shows) rather
# than the parameters themselves.
print (model.named_parameters)

<bound method Module.named_parameters of FinetunedLLM(
  (llm): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [27]:
from ray.train.torch import get_device

In [28]:
def pad_array(arr, dtype=np.int32):
    """Right-pad variable-length rows with zeros into one 2-D array.

    Args:
        arr: Sequence of rows (e.g. an object array or a list of lists); each
            row must support ``len()``.
        dtype: NumPy dtype of the result. Defaults to np.int32.

    Returns:
        np.ndarray: Array of shape (number of rows, longest row length); an
        empty input yields an array of shape (0, 0).
    """
    num_rows = len(arr)  # len() rather than .shape so plain lists work too
    max_len = max((len(row) for row in arr), default=0)  # default avoids a crash on empty input
    padded_arr = np.zeros((num_rows, max_len), dtype=dtype)
    for i, row in enumerate(arr):
        padded_arr[i, :len(row)] = row
    return padded_arr

In [29]:
def collate_fn(batch):
    """Pad a batch and convert it to tensors on the worker's device.

    "ids" and "masks" are zero-padded in place on the incoming `batch`; every
    entry is then converted to a tensor ("ids"/"masks" as int32, "targets" as
    int64). A key outside those three raises KeyError.

    Returns:
        dict: Same keys as `batch`, with tensor values.
    """
    for padded_key in ("ids", "masks"):
        batch[padded_key] = pad_array(batch[padded_key])
    dtypes = {"ids": torch.int32, "masks": torch.int32, "targets": torch.int64}
    return {
        key: torch.as_tensor(array, dtype=dtypes[key], device=get_device())
        for key, array in batch.items()
    }

In [30]:
from pathlib import Path
import ray.train as train
from ray.train import Checkpoint, CheckpointConfig, DataConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchCheckpoint, TorchTrainer
import tempfile
import torch.nn.functional as F
from torch.nn.parallel.distributed import DistributedDataParallel

In [31]:
def train_step(ds, batch_size, model, num_classes, loss_fn, optimizer):
    """Train step.

    Runs one pass over `ds`, updating the model after every batch.

    Returns:
        float: Running mean of the per-batch losses (0.0 if there were no batches).
    """
    model.train()
    running_loss = 0.0
    batches = ds.iter_torch_batches(batch_size=batch_size, collate_fn=collate_fn)
    for step, batch in enumerate(batches, start=1):
        optimizer.zero_grad()  # reset gradients
        logits = model(batch)  # forward pass
        # loss_fn receives one-hot float targets, not class indices
        one_hot_targets = F.one_hot(batch["targets"], num_classes=num_classes).float()
        batch_loss = loss_fn(logits, one_hot_targets)
        batch_loss.backward()  # backward pass
        optimizer.step()  # update weights
        # incremental mean: move the average towards the new loss by 1/step
        running_loss += (batch_loss.detach().item() - running_loss) / step
    return running_loss

In [32]:
def eval_step(ds, batch_size, model, num_classes, loss_fn):
    """Eval step.

    Runs one pass over `ds` in inference mode without updating the model.

    Returns:
        tuple: (running mean of the per-batch losses, true labels, predicted
        labels); both label collections are stacked with np.vstack.
    """
    model.eval()
    running_loss = 0.0
    true_labels, predicted_labels = [], []
    batches = ds.iter_torch_batches(batch_size=batch_size, collate_fn=collate_fn)
    with torch.inference_mode():
        for step, batch in enumerate(batches, start=1):
            logits = model(batch)
            # loss_fn receives one-hot float targets, not class indices
            one_hot_targets = F.one_hot(batch["targets"], num_classes=num_classes).float()
            batch_loss = loss_fn(logits, one_hot_targets).item()
            running_loss += (batch_loss - running_loss) / step
            true_labels.extend(batch["targets"].cpu().numpy())
            predicted_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())
    return running_loss, np.vstack(true_labels), np.vstack(predicted_labels)

In [33]:
# Training loop
def train_loop_per_worker(config):
    """Training function passed to TorchTrainer as `train_loop_per_worker`.

    Each epoch runs one train_step and one eval_step, steps the LR scheduler on
    the validation loss, saves the model into a temporary directory and reports
    the epoch metrics together with that directory as a checkpoint.

    Args:
        config (dict): Must contain "dropout_p", "lr", "lr_factor",
            "lr_patience", "num_epochs", "batch_size" and "num_classes".
    """
    # Hyperparameters
    dropout_p = config["dropout_p"]
    lr = config["lr"]
    lr_factor = config["lr_factor"]
    lr_patience = config["lr_patience"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    num_classes = config["num_classes"]

    # Get datasets
    set_seeds()
    train_ds = train.get_dataset_shard("train")
    val_ds = train.get_dataset_shard("val")

    # Model
    llm = BertModel.from_pretrained("allenai/scibert_scivocab_uncased", return_dict=False)
    model = FinetunedLLM(llm=llm, dropout_p=dropout_p, embedding_dim=llm.config.hidden_size, num_classes=num_classes)
    model = train.torch.prepare_model(model)

    # Training components
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=lr_factor, patience=lr_patience)

    # Training
    # config["batch_size"] is treated as the global batch size and divided across workers.
    # NOTE(review): the integer division yields 0 when batch_size < number of workers.
    num_workers = train.get_context().get_world_size()
    batch_size_per_worker = batch_size // num_workers
    for epoch in range(num_epochs):
        # Step
        train_loss = train_step(train_ds, batch_size_per_worker, model, num_classes, loss_fn, optimizer)
        val_loss, _, _ = eval_step(val_ds, batch_size_per_worker, model, num_classes, loss_fn)
        scheduler.step(val_loss)

        # Checkpoint
        # report() is called inside the `with` block, while the temporary directory still exists.
        with tempfile.TemporaryDirectory() as dp:
            # save() is defined on FinetunedLLM, so go through `.module` when the model is DDP-wrapped.
            if isinstance(model, DistributedDataParallel):  # cpu
                model.module.save(dp=dp)
            else:
                model.save(dp=dp)
            metrics = dict(epoch=epoch, lr=optimizer.param_groups[0]["lr"], train_loss=train_loss, val_loss=val_loss)
            checkpoint = Checkpoint.from_directory(dp)
            train.report(metrics, checkpoint=checkpoint)

In [34]:
# Train loop config
# Hyperparameters read by train_loop_per_worker through its `config` argument.
train_loop_config = {
    "dropout_p": 0.5,
    "lr": 1e-4,
    "lr_factor": 0.8,  # passed as `factor` to ReduceLROnPlateau
    "lr_patience": 3,  # passed as `patience` to ReduceLROnPlateau
    "num_epochs": 10,
    "batch_size": 256,  # global batch size; divided by the number of workers inside the loop
    "num_classes": num_classes,
}

In [35]:
# Scaling config
scaling_config = ScalingConfig(
    num_workers=num_workers,
    use_gpu=bool(resources_per_worker["GPU"]),  # False while the "GPU" entry is 0
    resources_per_worker=resources_per_worker
)

In [36]:
# Run config
# Keep a single checkpoint, ranked by the lowest reported "val_loss".
checkpoint_config = CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="val_loss", checkpoint_score_order="min")
run_config = RunConfig(name="llm", checkpoint_config=checkpoint_config, storage_path="~/ray/ray_results")
# NOTE: the path has to be changed to "~/ray/ray_results" for this to work

In [37]:
# Dataset
# Reload the full dataset and redo the stratified split; this rebinds `ds`,
# `train_ds` and `val_ds` and reuses `test_size` from the earlier split cell.
ds = load_data()
train_ds, val_ds = stratify_split(ds, stratify="tag", test_size=test_size)

2023-12-27 15:25:57,440	INFO read_api.py:406 -- To satisfy the requested parallelism of 20, each read task output is split into 20 smaller blocks.
2023-12-27 15:25:57,443	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]
2023-12-27 15:25:57,443	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:57,444	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [38]:
# Preprocess
# Fit the tag mapping on the training split only, then transform and
# materialize each split.
preprocessor = CustomPreprocessor().fit(train_ds)
train_ds = preprocessor.transform(train_ds).materialize()
val_ds = preprocessor.transform(val_ds).materialize()

2023-12-27 15:25:57,791	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]
2023-12-27 15:25:57,791	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:57,791	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-27 15:25:58,299	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-12-27 15:25:58,300	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:58,301	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

- Aggregate 11:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 12:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 13:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-27 15:25:58,874	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]
2023-12-27 15:25:58,874	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:25:58,875	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-27 15:26:00,247	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]
2023-12-27 15:26:00,247	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:26:00,248	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

In [39]:
# Dataset config
# Only "train" is listed in datasets_to_split; "val" is not.
# NOTE(review): presumably each worker then receives the full "val" dataset —
# confirm against the Ray Train DataConfig documentation.
options = ray.data.ExecutionOptions(preserve_order=True)
dataset_config = DataConfig(
    datasets_to_split=["train"],
    execution_options=options)

In [40]:
# Trainer
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_loop_config,
    scaling_config=scaling_config,
    run_config=run_config,
    datasets={"train": train_ds, "val": val_ds},  # keys match the get_dataset_shard() names in the loop
    dataset_config=dataset_config,
    # TorchPredictor.from_checkpoint reads "class_to_index" back via checkpoint.get_metadata()
    metadata={"class_to_index": preprocessor.class_to_index}
)

In [41]:
%%time
# Train
#results = trainer.fit()

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 3.1 µs


In [42]:
from sklearn.metrics import precision_recall_fscore_support

In [43]:
class TorchPredictor:
    """Bundles a preprocessor with a model and exposes batch prediction."""

    def __init__(self, preprocessor, model):
        """Store both components and put the model in eval mode."""
        self.preprocessor, self.model = preprocessor, model
        self.model.eval()

    def __call__(self, batch):
        """Collate `batch` and return the model's predictions under "output"."""
        return {"output": self.model.predict(collate_fn(batch))}

    def predict_proba(self, batch):
        """Collate `batch` and return the model's class probabilities under "output"."""
        return {"output": self.model.predict_proba(collate_fn(batch))}

    def get_preprocessor(self):
        """Return the preprocessor given at construction."""
        return self.preprocessor

    @classmethod
    def from_checkpoint(cls, checkpoint):
        """Build a predictor from a checkpoint.

        The class mapping comes from the checkpoint metadata, and the model
        from the "args.json" and "model.pt" files under `checkpoint.path`.
        """
        class_to_index = checkpoint.get_metadata()["class_to_index"]
        preprocessor = CustomPreprocessor(class_to_index=class_to_index)
        checkpoint_dir = Path(checkpoint.path)
        model = FinetunedLLM.load(checkpoint_dir / "args.json", checkpoint_dir / "model.pt")
        return cls(preprocessor=preprocessor, model=model)

In [44]:
def evaluate(ds, predictor):
    """Compute weighted precision, recall and F1 of `predictor` on `ds`.

    Args:
        ds: Raw dataset; it is transformed with the predictor's preprocessor.
        predictor: Callable object exposing `get_preprocessor()`.

    Returns:
        dict: {"precision": ..., "recall": ..., "f1": ...}
    """
    preprocessed_ds = predictor.get_preprocessor().transform(ds)

    # y_true
    target_rows = preprocessed_ds.select_columns(cols=["targets"]).take_all()
    y_true = np.stack([row["targets"] for row in target_rows])

    # y_pred
    prediction_rows = preprocessed_ds.map_batches(predictor).take_all()
    y_pred = np.array([row["output"] for row in prediction_rows])

    # Evaluate (the fourth returned value is not used)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    return {"precision": precision, "recall": recall, "f1": f1}

In [45]:
def format_prob(prob, index_to_class):
    """Label each entry of ``prob`` with its class name.

    Args:
        prob: iterable of values, where position ``i`` belongs to class index ``i``.
        index_to_class: mapping from class index to class name.

    Returns:
        dict mapping class name to the value found at that class's index.
    """
    return {index_to_class[idx]: value for idx, value in enumerate(prob)}

In [46]:
def predict_proba(ds, predictor):
    """Return the predicted tag and per-class probabilities for every row of ``ds``.

    Args:
        ds: dataset to run inference on; it is transformed with the predictor's
            own preprocessor first.
        predictor: object exposing ``get_preprocessor()`` and a batch-level
            ``predict_proba`` method that yields an "output" entry per row.

    Returns:
        list of dicts, one per row, with the keys "prediction" (class name at the
        arg-max index) and "probabilities" (class name -> value, via ``format_prob``).
    """
    preprocessor = predictor.get_preprocessor()
    prob_rows = preprocessor.transform(ds).map_batches(predictor.predict_proba).take_all()
    y_prob = np.array([row["output"] for row in prob_rows])
    return [
        {
            "prediction": preprocessor.index_to_class[prob.argmax()],
            "probabilities": format_prob(prob, preprocessor.index_to_class),
        }
        for prob in y_prob
    ]

In [47]:
import mlflow
from pathlib import Path
from ray.tune.logger.mlflow import MLflowLoggerCallback
import time

In [48]:
# Config MLflow
# Local, file-based tracking store, created relative to the current working directory.
MODEL_REGISTRY = Path("tmp/mlflow")
MODEL_REGISTRY.mkdir(parents=True, exist_ok=True)
# Path.as_uri() yields a well-formed file:// URI on every platform (drive letters,
# percent-escaping), unlike manual "file://" + str(path) concatenation.
MLFLOW_TRACKING_URI = MODEL_REGISTRY.absolute().as_uri()
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(mlflow.get_tracking_uri())

file:///Users/sguys99/Desktop/project/self-study/made_with_ml/notebooks/tmp/mlflow


In [49]:
# MLflow callback
# The experiment name carries the current Unix timestamp (whole seconds).
experiment_name = "llm-" + str(int(time.time()))
mlflow_callback = MLflowLoggerCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    experiment_name=experiment_name,
    save_artifact=True,
)

In [50]:
# Run configuration with MLflow callback
# Combines the checkpoint_config from an earlier cell with the MLflow logger
# callback created above.
run_config = RunConfig(
    callbacks=[mlflow_callback],
    checkpoint_config=checkpoint_config,
)

# 3. Model
## 3.3 Hyperparameter tuning

### Intuition

하이퍼파라미터 튜닝은 모델에 적합한 파라미터 값 세트를 발견하는 프로세스입니다.   
매개변수의 수, 검색 공간 및 모델 아키텍처에 따라 계산 비용이 많이 드는 프로세스가 될 수 있습니다.  
하이퍼파라미터에는 모델의 파라미터뿐만 아니라 전처리, 분할 등과 관련된 파라미터도 포함될 수 있습니다.  
조정할 수 있는 다양한 파라미터를 모두 고려하면 금세 매우 큰 검색 공간이 됩니다.  
하지만 어떤 것이 하이퍼파라미터라고 해서 반드시 조정해야 하는 것은 아닙니다.

- 일부 하이퍼파라미터는 값을 고정해 두고 튜닝 대상에서 제외해도 전혀 문제가 없습니다(예: 전처리 중에 소문자 변환 [lower=True] 사용).
- 처음에는 좋은 결과를 얻을 수 있다고 생각되는 작지만 영향력 있는 하이퍼파라미터의 하위 집합만 조정하면 됩니다.
  
하이퍼파라미터를 최적화하여 각 하이퍼파라미터가 목표에 어떤 영향을 미치는지 이해하고자 합니다.  
합리적인 검색 공간에서 많은 실험을 실행하면 다양한 매개변수에 대해 거의 이상적인 값을 결정할 수 있습니다.

### Frameworks

하이퍼파라미터 튜닝을 위한 많은 옵션이 있습니다(Ray Tune, Optuna, Hyperopt 등).   
여기서는 간편하고 일반적으로 널리 사용되는 HyperOpt 통합 기능이 있는 Ray Tune을 사용하겠습니다.  
Ray Tune은 다른 많은 튜닝 검색 알고리즘(Optuna, 베이지안 최적화 등)도 폭넓게 지원합니다.

### Setup

하이퍼파라미터 튜닝을 수행할 때 고려해야 할 요소는 많습니다.   
여기서는 몇 가지 주요 하이퍼파라미터를 몇 차례에 걸쳐 튜닝하는 소규모 연구를 진행할 예정입니다.   
튜닝 실험에 추가 매개변수를 포함하거나 실험 횟수를 늘리는 것은 자유입니다.

In [51]:
# Number of trials (small sample)
# Passed to TuneConfig as num_samples further down; increase it for a broader search.
num_runs = 2

In [52]:
from ray import tune
from ray.tune import Tuner
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.search import ConcurrencyLimiter
from ray.tune.search.hyperopt import HyperOptSearch

In [53]:
# Call the notebook's set_seeds helper before reloading and splitting the data
# (presumably re-seeds the RNGs so the split is reproducible — confirm against its definition).
set_seeds()

In [54]:
# Reload the dataset and split it into train/validation sets, stratified on the "tag" column.
ds = load_data()
train_ds, val_ds = stratify_split(
    ds,
    stratify="tag",
    test_size=test_size,
)

2023-12-27 15:26:03,005	INFO read_api.py:406 -- To satisfy the requested parallelism of 20, each read task output is split into 20 smaller blocks.
2023-12-27 15:26:03,008	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]
2023-12-27 15:26:03,008	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:26:03,009	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

In [55]:
# Preprocess
# Fit the preprocessor on the training split only, then apply it to both splits.
preprocessor = CustomPreprocessor().fit(train_ds)
# Materialize each transformed split here, before it is handed to the trainer.
train_ds = preprocessor.transform(train_ds).materialize()
val_ds = preprocessor.transform(val_ds).materialize()

2023-12-27 15:26:03,333	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]
2023-12-27 15:26:03,334	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:26:03,334	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-27 15:26:03,872	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches(<lambda>)]
2023-12-27 15:26:03,873	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:26:03,874	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

- Aggregate 11:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 12:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 13:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-27 15:26:04,512	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]
2023-12-27 15:26:04,512	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:26:04,513	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-27 15:26:05,694	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(20)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]
2023-12-27 15:26:05,695	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
2023-12-27 15:26:05,695	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


- RandomShuffle 1:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/400 [00:00<?, ?it/s]

- Sort 4:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 5:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 6:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 7:   0%|          | 0/400 [00:00<?, ?it/s]

- MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle 8:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Map 9:   0%|          | 0/400 [00:00<?, ?it/s]

Shuffle Reduce 10:   0%|          | 0/400 [00:00<?, ?it/s]

Running 0:   0%|          | 0/400 [00:00<?, ?it/s]

Sort Sample 0:   0%|          | 0/20 [00:00<?, ?it/s]

In [56]:
# Trainer
# No run_config is passed here: for the tuning run, the run configuration
# (callbacks, checkpointing) is given to the Tuner further down instead.
trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config=train_loop_config,
    scaling_config=scaling_config,
    datasets={"train": train_ds, "val": val_ds},
    dataset_config=dataset_config,
    metadata={"class_to_index": preprocessor.class_to_index}
)

In [57]:
# MLflow callback
# New callback instance for the tuning run; it reuses MLFLOW_TRACKING_URI and the
# experiment_name defined in earlier cells.
mlflow_callback = MLflowLoggerCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    experiment_name=experiment_name,
    save_artifact=True)

### Tune configuration

튜닝은 다양한 파라미터 조합에 대한 트레이닝이라고 생각할 수 있습니다.   
이를 위해서는 튜닝을 중단할 시점(중단 기준), 다음 학습할 파라미터 집합을 정의하는 방법(검색 알고리즘), 심지어 파라미터가 취할 수 있는 다양한 값(검색 공간)에 대한 몇 가지 구성을 정의해야 합니다.

먼저 훈련할 때와 마찬가지로 CheckpointConfig와 RunConfig를 정의하는 것으로 시작하겠습니다:

In [58]:
# Run configuration
# Keep a single checkpoint (num_to_keep=1), scored by val_loss where lower is better.
checkpoint_config = CheckpointConfig(num_to_keep=1, checkpoint_score_attribute="val_loss", checkpoint_score_order="min")
# Attach the MLflow logger callback and the checkpoint policy to the run.
run_config = RunConfig(
    callbacks=[mlflow_callback],
    checkpoint_config=checkpoint_config
)

#### Search algorithm

다음으로 튜닝 실험을 위한 초기 매개변수 값과 검색 알고리즘(HyperOptSearch)을 설정하겠습니다. 또한 보유한 컴퓨팅 리소스에 따라 동시에 실행할 수 있는 최대 트라이얼 수(ConcurrencyLimiter)를 설정할 것입니다.

In [59]:
# Hyperparameters to start with
# One hand-picked configuration is handed to HyperOpt as the first point to evaluate.
initial_params = [
    {
        "train_loop_config": {
            "dropout_p": 0.5,
            "lr": 1e-4,
            "lr_factor": 0.8,
            "lr_patience": 3,
        }
    }
]
# Wrap the HyperOpt searcher so that at most two trials run concurrently.
search_alg = ConcurrencyLimiter(
    HyperOptSearch(points_to_evaluate=initial_params),
    max_concurrent=2,
)

#### Search space

다음으로 매개변수, 분포 및 값의 범위를 선택하여 매개변수 검색 공간을 정의하겠습니다.  
매개변수 유형에 따라 다양한 분포 중에서 선택할 수 있습니다.

In [60]:
# Parameter space
# Each entry is a sampling distribution; the keys match those used in initial_params.
# NOTE(review): lr_patience is drawn from a continuous uniform, so trials receive
# non-integer values (e.g. 2.81732 in the saved trial table). If the training loop
# treats it as an integer epoch count, consider tune.randint instead — confirm
# against train_loop_per_worker.
param_space = {
    "train_loop_config": {
        "dropout_p": tune.uniform(0.3, 0.9),
        "lr": tune.loguniform(1e-5, 5e-4),
        "lr_factor": tune.uniform(0.1, 0.9),
        "lr_patience": tune.uniform(1, 10),
    }
}


#### Scheduler

다음으로, 유망하지 않은 트라이얼을 정리하는 스케줄러를 정의하겠습니다.  
여기서는 매우 인기 있고 공격적인 조기 종료 알고리즘인 ASHA(AsyncHyperBandScheduler)를 사용할 것입니다.  
공격적인 스케줄러이기 때문에, 트라이얼이 프루닝되기 전에 최소 몇 에포크 동안은 실행될 수 있도록 grace_period를 설정하고, 최대 실행 에포크 수는 max_t로 제한할 것입니다.

In [61]:
# Scheduler
# NOTE(review): grace_period is hard-coded to 5 while max_t follows
# train_loop_config["num_epochs"]; confirm num_epochs is at least 5.
scheduler = AsyncHyperBandScheduler(
    max_t=train_loop_config["num_epochs"],  # max epoch (<time_attr>) per trial
    grace_period=5,  # min epoch (<time_attr>) per trial
)

### Tuner

In [62]:
# Tune config
# Trials are compared on val_loss with mode="min" (lower is better); the search
# algorithm, scheduler and number of samples come from the cells above.
tune_config = tune.TuneConfig(
    metric="val_loss",
    mode="min",
    search_alg=search_alg,
    scheduler=scheduler,
    num_samples=num_runs,
)

In [63]:
# Tuner
# Combines the trainer (as the trainable) with the run configuration, the
# parameter search space and the tune configuration defined above.
tuner = Tuner(
    trainable=trainer,
    run_config=run_config,
    param_space=param_space,
    tune_config=tune_config,
)

In [64]:
# Launch the tuning run.
# NOTE(review): in the saved output one trial (TorchTrainer_654ebb6c) ended in ERROR
# and the cell raised a TypeError; check that trial's error.txt for the underlying cause.
results = tuner.fit()

0,1
Current time:,2023-12-27 15:32:07
Running for:,00:05:59.47
Memory:,41.2/64.0 GiB

Trial name,# failures,error file
TorchTrainer_654ebb6c,1,"/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_654ebb6c_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-12-27_15-26-07/error.txt"

Trial name,status,loc,train_loop_config/dr opout_p,train_loop_config/lr,train_loop_config/lr _factor,train_loop_config/lr _patience,iter,total time (s),epoch,lr,train_loss
TorchTrainer_df269731,TERMINATED,127.0.0.1:70383,0.810944,0.000219244,0.852474,2.81732,10,249.465,9,0.000219244,0.373202
TorchTrainer_654ebb6c,ERROR,127.0.0.1:70272,0.5,0.0001,0.8,3.0,3,82.3893,2,0.0001,0.4689


[2m[36m(TorchTrainer pid=70272)[0m Starting distributed worker processes: ['70276 (127.0.0.1)', '70277 (127.0.0.1)', '70278 (127.0.0.1)', '70279 (127.0.0.1)', '70280 (127.0.0.1)', '70281 (127.0.0.1)']
[2m[36m(RayTrainWorker pid=70276)[0m Setting up process group for: env:// [rank=0, world_size=6]
[2m[36m(SplitCoordinator pid=70300)[0m Auto configuring locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851']
[2m[36m(RayTrainWorker pid=70278)[0m Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weig

(pid=70300) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(SplitCoordinator pid=70300)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70300)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(SplitCoordinator pid=70300)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(RayTrainWorker pid=70278)[0m Checkpoint successfully created at: Checkpoint(filesystem=loca

(pid=70300) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(SplitCoordinator pid=70300)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70300)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(SplitCoordinator pid=70300)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(RayTrainWorker pid=70278)[0m Checkpoint successfully created at: Checkpoint(filesystem=loca

(pid=70300) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(SplitCoordinator pid=70300)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70300)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(SplitCoordinator pid=70300)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(RayTrainWorker pid=70280)[0m Checkpoint successfully created at: Checkpoint(filesystem=loca

(pid=70300) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(SplitCoordinator pid=70300)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70300)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], preserve_order=True, actor_locality_enabled=True, verbose_progress=False)
[2m[36m(SplitCoordinator pid=70300)[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`
[2m[36m(RayTrainWorker pid=70280)[0m Checkpoint successfully created at: Checkpoint(filesystem=loca

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70394)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000000)
[2m[36m(RayTrainWorker pid=70396)[0m - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).[32m [repeated 5x across cluster][0m
[2m[36m(RayTrainWorker pid=70396)[0m - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).[32m [repeated 5x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitte

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70395)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000001)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70398)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000002)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70397)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000003)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70398)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000004)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70399)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000005)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70398)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000006)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70398)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000007)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70394)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000008)[32m [repeated 6x across cluster][0m
[2m[36m(SplitCoordinator pid=70402)[0m Executing DAG InputDataBuffer[Input] -> OutputSplitter[split(6, equal=True)]
[2m[36m(SplitCoordinator pid=70402)[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=['534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851', '534cd63b3eede5543582a293ad243bb921a70c40a42dbef669a25851'], prese

(pid=70402) Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(RayTrainWorker pid=70398)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/Users/sguys99/ray_results/TorchTrainer_2023-12-27_15-26-07/TorchTrainer_df269731_2_dropout_p=0.8109,lr=0.0002,lr_factor=0.8525,lr_patience=2.8173_2023-12-27_15-26-09/checkpoint_000009)[32m [repeated 6x across cluster][0m
2023-12-27 15:32:07,224	ERROR tune.py:1139 -- Trials did not complete: [TorchTrainer_654ebb6c]
2023-12-27 15:32:07,225	INFO tune.py:1143 -- Total run time: 359.54 seconds (359.46 seconds for the tuning loop).


TypeError: RayTaskError.as_instanceof_cause.<locals>.cls.__init__() takes 2 positional arguments but 4 were given