In [1]:
!pip install pytorch-lightning ray[tune]



In [2]:
!pip install -U ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.4/214.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets

In [3]:
import os
FILE_NAMES = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        FILE_NAMES.append(os.path.join(dirname, filename))

In [4]:
# print(FILE_NAMES)

In [5]:
# import torch
# torch.cuda.is_available()

In [6]:
# !nvidia-smi

In [7]:
import os
from pathlib import Path
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor, EarlyStopping
from ray.tune.integration.pytorch_lightning import TuneReportCheckpointCallback
from ray import train, tune
from ray.tune import Tuner, with_resources
# from ray_lightning.tune import TuneReportCallback
from ray.train import RunConfig

In [8]:
# Set global parameters
scale_factor = 11.888623072966611
input_dim = 3072
# sample_rate = 0.1 # 10% of data
sample_size = 8192 # file_size * sample_rate
num_epochs = 3  # Only one epoch over all data

# Dataset with Subsampling
class ActivationDataset(Dataset):
    def __init__(self, f_type, test_fraction=0.01, scale_factor=1.0, batch_size=2048, seed=42):
        self.test_fraction = test_fraction
        self.scale_factor = scale_factor
        self.batch_size = batch_size
        self.multi = sample_size // batch_size
        self.seed = seed
        self.file_names = FILE_NAMES
        if f_type not in ["train", "test"]:
            raise ValueError("f_type must be 'train' or 'test'")
        if f_type == "train":
            self.file_names = self.file_names[:int(len(self.file_names)*(1 - self.test_fraction))]
        else:
            self.file_names = self.file_names[int(len(self.file_names)*(1 - self.test_fraction)):]
        self.f_type = f_type

    def __len__(self):
        return len(self.file_names)*self.multi

    def __getitem__(self, idx):
        # Load a single file
        f_ix = idx // self.multi # sample the file 4 times
        file_path = self.file_names[f_ix]
        activations = np.load(file_path)[:, :-3]  # Remove metadata columns

        # Normalize
        activations = activations / self.scale_factor * np.sqrt(activations.shape[1])

        # Random subsampling to sample_size
        np.random.seed(self.seed + idx)  # Change seed per file for reproducibility
        subsample_indices = np.random.choice(activations.shape[0], sample_size, replace=False)
        activations = activations[subsample_indices]

        # Get batch 
        batch_i = idx % self.multi
        start = batch_i*self.batch_size
        end = (batch_i+1)*self.batch_size
        activations = activations[start:end]

        # Convert to tensor
        return torch.tensor(activations, dtype=torch.float32)

# Model Definition
class SparseAutoencoder(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, l1_lambda, lr):
        super().__init__()
        self.save_hyperparameters()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
        self.criterion = nn.MSELoss()
        self.l1_lambda = l1_lambda
        self.lr = lr

    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        return decoded, encoded

    def compute_loss(self, batch, decoded, encoded):
        mse_loss = self.criterion(decoded, batch)
        decoder_weight_norms = torch.norm(self.decoder.weight, p=2, dim=0)
        l1_terms = encoded * decoder_weight_norms.unsqueeze(0)
        l1_loss = torch.mean(torch.sum(l1_terms, dim=1))
        return mse_loss, l1_loss

    def training_step(self, batch, batch_idx):
        batch = batch.to("cuda")
        decoded, encoded = self(batch)
        mse_loss, l1_loss = self.compute_loss(batch, decoded, encoded)
        total_loss = mse_loss + self.l1_lambda * l1_loss

        # Compute active features
        active_features = (encoded > 0).any(dim=0).float().mean().item() * 100

        # Log metrics
        self.log("train_loss", total_loss, on_step=True, on_epoch=True)
        self.log("train_mse_loss", mse_loss, on_step=True, on_epoch=True)
        self.log("train_l1_loss", l1_loss, on_step=True, on_epoch=True)
        self.log("active_features", active_features, on_step=True, on_epoch=True)
        self.log("val_loss", 0, on_step=True, on_epoch=True)
        # train.report({"loss": total_loss, "active_features": active_features})
        return total_loss

    def validation_step(self, batch, batch_idx):
        batch = batch.to("cuda")
        decoded, encoded = self(batch)
        mse_loss, l1_loss = self.compute_loss(batch, decoded, encoded)
        total_loss = mse_loss + self.l1_lambda * l1_loss

        # Compute active features
        active_features = (encoded > 0).any(dim=0).float().mean().item() * 100

        # Log metrics
        self.log("val_loss", total_loss, on_step=False, on_epoch=True)
        self.log("val_mse_loss", mse_loss, on_step=False, on_epoch=True)
        self.log("val_l1_loss", l1_loss, on_step=False, on_epoch=True)
        self.log("val_active_features", active_features, on_step=False, on_epoch=True)
        print("RESULTS: ", total_loss, mse_loss, l1_loss, active_features)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}

# DataLoader Creation
def create_data_loaders(batch_size):
    train_dataset = ActivationDataset("train", 0.01, scale_factor, batch_size, 42)
    val_dataset = ActivationDataset("test", 0.01, scale_factor, batch_size, 42)
    # train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)#, num_workers=3, pin_memory=True, persistent_workers=True)
    # val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    train_loader = DataLoader(
        train_dataset,
        batch_size=1,  # Keep 1 as outer batch size for per-file sampling
        shuffle=False,
        num_workers=3,  # Adjust based on CPU availability
        pin_memory=False,
        persistent_workers=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,  # Smaller for validation
        pin_memory=False,
    )
    return train_loader, val_loader

# Training Function with Ray Tune
def train_model(config):
    train_loader, val_loader = create_data_loaders(config["HB"]["batch_size"])
    model = SparseAutoencoder(input_dim, hidden_dim=config["HB"]["hidden_dim"], l1_lambda=config["l1_lambda"], lr=config["lr"])

    logger = TensorBoardLogger("tb_logs", name="SparseAutoencoder")

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        logger=logger,
        profiler="advanced",
        # val_check_interval=0.25,  # Check validation 4 times per epoch
        # max_time="00:30:00",  # Stop after 30 minutes
        enable_progress_bar=True, # Show progress bar
        callbacks=[
            LearningRateMonitor(logging_interval="step"),
            EarlyStopping(monitor="val_loss", patience=3, mode="min"),
            TuneReportCheckpointCallback(
                {
                    "train_loss": "train_loss",
                    "train_mse_loss": "train_mse_loss",
                    "train_l1_loss": "train_l1_loss",
                    "active_features": "active_features",
                    "val_loss": "val_loss",
                },
                filename="none",  # Do not save checkpoints
                save_checkpoints = False,
                on="train_batch_end",
            ),
            TuneReportCheckpointCallback(
                {
                    "val_loss": "val_loss",
                    "val_mse_loss": "val_mse_loss",
                    "val_l1_loss": "val_l1_loss",
                    "val_active_features": "val_active_features",
                },
                filename="none1",
                save_checkpoints = False,
                on="validation_end",
            ),
            # RayTrainReportCallback(),
        ],
        # strategy=RayDDPStrategy(), # Use Ray for distributed training, DDP stands for Distributed Data Parallel
        # callbacks=[RayTrainReportCallback()], # Report metrics to Ray
        # plugins=[RayLightningEnvironment()], # Use Ray for distributed training
    )
    trainer.fit(model, train_loader, val_loader)

# Ray Tune Hyperparameter Search
def tune_hyperparameters():

    # possible_hidden_dims = [4096, 8192, 16384, 20000, 32768]
    possible_batch_sizes = [2048, 4096, 8192]
    possible_hidden_dims = [20000, 32768, 65536]
    # possible_batch_sizes = [1024, 2048, 4096, 8192]
    # possible_hidden_dims = [20000]
    # possible_batch_sizes = [8192]

    valid_hb_pairs = []
    for hidden_dim in possible_hidden_dims:
        for batch_size in possible_batch_sizes:
            if hidden_dim * batch_size <= 441_000_000: # VRAM limit
                valid_hb_pairs.append({"hidden_dim": hidden_dim, "batch_size": batch_size})


    # search_space = {
    #     "hidden_dim": tune.choice([4096, 8192, 16384, 20000, 32768]),
    #     "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]),
    #     "l1_lambda": tune.loguniform(1e-4, 1e-2),
    #     "lr": tune.loguniform(1e-4, 1e-2),
    # }

    search_space = {
        "HB": tune.choice(valid_hb_pairs),
        "lr": tune.loguniform(1e-5, 1e-1),
        "l1_lambda": tune.loguniform(1e-3, 1e-1),
    }

    # search_space = {
    #     "HB": tune.choice(valid_hb_pairs),
    #     "l1_lambda": tune.choice([0.01]),
    #     "lr": tune.choice([0.0001]),
    # }

    scheduler_asha = tune.schedulers.ASHAScheduler(
        time_attr="training_iteration",
        metric="train_loss",
        mode="min",
        max_t=80,
        grace_period=40, # at least 25 batches
        reduction_factor=2,
    )

    # os.environ["RAY_CHDIR_TO_TRIAL_DIR"] = "0" # Allows relative paths, but trials are not isolated

    trainable_with_resources = with_resources(
        train_model,
        {"cpu": 4, "gpu": 1}  # Adjust based on your available resources
    )

    tuner = Tuner(
        trainable=trainable_with_resources,
        param_space=search_space,
        tune_config=tune.TuneConfig(
            num_samples=20, # Number of hyperparameter sets to try
            max_concurrent_trials=3, # Number of trials to run concurrently
            scheduler=scheduler_asha,
        ),
        run_config=RunConfig(
            name="hyperparameter_search",
            storage_path=str(Path("./results").resolve()),
        ),
    )
    results = tuner.fit()
    best_result = results.get_best_result(metric="val_loss", mode="min")
    print("Best Hyperparameters Found:")
    print(best_result.config)
    return results

# # Run Hyperparameter Search
# if __name__ == "__main__":
#     tune_hyperparameters()

In [9]:
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)


In [10]:
results = tune_hyperparameters()

0,1
Current time:,2024-12-04 14:04:05
Running for:,01:10:02.29
Memory:,4.2/31.4 GiB

Trial name,status,loc,HB,l1_lambda,lr,iter,total time (s),train_loss,train_mse_loss,train_l1_loss
train_model_d367c_00000,TERMINATED,172.19.2.2:407,{'hidden_dim': _9ac0,0.00786079,0.000437945,80,310.756,0.99097,0.990472,0.0634333
train_model_d367c_00001,TERMINATED,172.19.2.2:553,{'hidden_dim': _1c80,0.0446133,0.00272249,80,222.09,0.9814,0.9814,0.0
train_model_d367c_00002,TERMINATED,172.19.2.2:684,{'hidden_dim': _2f80,0.0028269,0.0936196,40,129.275,33.1922,33.1774,5.24938
train_model_d367c_00003,TERMINATED,172.19.2.2:804,{'hidden_dim': _4500,0.00597965,2.5011e-05,80,229.677,0.733222,0.560459,28.8919
train_model_d367c_00004,TERMINATED,172.19.2.2:935,{'hidden_dim': _8740,0.00462191,0.000219001,80,226.358,0.780682,0.706084,16.14
train_model_d367c_00005,TERMINATED,172.19.2.2:1065,{'hidden_dim': _1600,0.00501011,6.88847e-05,40,126.053,1.06317,0.920452,28.4863
train_model_d367c_00006,TERMINATED,172.19.2.2:1185,{'hidden_dim': _bd40,0.00785182,0.0713815,80,275.318,0.866764,0.866764,0.0
train_model_d367c_00007,TERMINATED,172.19.2.2:1321,{'hidden_dim': _b040,0.010878,0.00227305,40,136.34,0.994433,0.994433,0.0
train_model_d367c_00008,TERMINATED,172.19.2.2:1442,{'hidden_dim': _aac0,0.0640658,0.0350018,80,225.41,0.899889,0.899889,0.0
train_model_d367c_00009,TERMINATED,172.19.2.2:1572,{'hidden_dim': _a080,0.0325109,0.000724451,40,126.368,1.00699,1.00698,0.000323438


[36m(train_model pid=407)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=407)[0m   self.pid = os.fork()


Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=407)[0m RESULTS:  
[36m(train_model pid=407)[0m tensor(1.9815, device='cuda:0') tensor(1.0586, device='cuda:0') tensor(117.4091, device='cuda:0') 49.922382831573486
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  0.71it/s]
[36m(train_model pid=407)[0m RESULTS:  tensor(1.9734, device='cuda:0') tensor(1.0532, device='cuda:0') tensor(117.0725, device='cuda:0') 49.92486834526062
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:16<1:11:25,  0.06it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:18<38:58,  0.11it/s, v_num=0]  
Epoch 0:   1%|          | 3/254 [00:20<28:09,  0.15it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:25<26:09,  0.16it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:27<22:58,  0.18it/s, v_num=0

[36m(train_model pid=407)[0m [2024-12-04 12:59:21,270 E 407 449] logging.cc:115: Stack trace: 
[36m(train_model pid=407)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7e6aad77df2a] ray::operator<<()
[36m(train_model pid=407)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7e6aad780f72] ray::TerminateHandler()
[36m(train_model pid=407)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7e6aac61d502] __cxxabiv1::__terminate()
[36m(train_model pid=407)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7e6aac617303] std::unexpected()
[36m(train_model pid=407)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7e6aac61d1d1] __gxx_personality_v0
[36m(train_model pid=407)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7e6aac55df48] _Unwind_ForcedUnwind_Phase2
[36m

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=553)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=553)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=553)[0m RESULTS:  
[36m(train_model pid=553)[0m tensor(5.8294, device='cuda:0') tensor(1.0611, device='cuda:0') tensor(106.8794, device='cuda:0') 50.20785331726074
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.70it/s]
[36m(train_model pid=553)[0m RESULTS:  tensor(5.8123, device='cuda:0') tensor(1.0569, device='cuda:0') tensor(106.5934, device='cuda:0') 50.1967191696167
                                                                           
Epoch 0:   0%|          | 0/508 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/508 [00:11<1:36:18,  0.09it/s, v_num=0]
Epoch 0:   0%|          | 2/508 [00:11<49:14,  0.17it/s, v_num=0]  
Epoch 0:   1%|          | 3/508 [00:11<33:32,  0.25it/s, v_num=0]
Epoch 0:   1%|          | 4/508 [00:20<42:10,  0.20it/s, v_num=0]
Epoch 0:   1%|          | 5/508 [00:20<34:05,  0.25it/s, v_num=0]
Epoch 0:   1%|          | 6/508 [00:20<28:42,  0.29

[36m(train_model pid=553)[0m [2024-12-04 13:03:10,183 E 553 595] logging.cc:115: Stack trace: 
[36m(train_model pid=553)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7dfbd12a1f2a] ray::operator<<()
[36m(train_model pid=553)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7dfbd12a4f72] ray::TerminateHandler()
[36m(train_model pid=553)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7dfbd0141502] __cxxabiv1::__terminate()
[36m(train_model pid=553)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7dfbd013b303] std::unexpected()
[36m(train_model pid=553)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7dfbd01411d1] __gxx_personality_v0
[36m(train_model pid=553)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7dfbd0081f48] _Unwind_ForcedUnwind_Phase2
[36m

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=684)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=684)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=684)[0m RESULTS:  tensor(1.2923, device='cuda:0') tensor(1.0576, device='cuda:0') 
[36m(train_model pid=684)[0m tensor(83.0388, device='cuda:0') 49.97713565826416
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.18it/s]
[36m(train_model pid=684)[0m RESULTS:  tensor(1.2868, device='cuda:0') tensor(1.0526, device='cuda:0') tensor(82.8440, device='cuda:0') 49.971458315849304
                                                                           
Epoch 0:   0%|          | 0/508 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/508 [00:10<1:26:45,  0.10it/s]
Epoch 0:   0%|          | 1/508 [00:10<1:26:46,  0.10it/s, v_num=0]
Epoch 0:   0%|          | 2/508 [00:10<45:07,  0.19it/s, v_num=0]  
Epoch 0:   1%|          | 3/508 [00:11<31:15,  0.27it/s, v_num=0]
Epoch 0:   1%|          | 4/508 [00:19<40:10,  0.21it/s, v_num=0]
Epoch 0:   1%|          | 5/508 [00:19<32:48,  0.26it/s, v

[36m(train_model pid=684)[0m [2024-12-04 13:05:26,843 E 684 726] logging.cc:115: Stack trace: 
[36m(train_model pid=684)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7dc9b9a19f2a] ray::operator<<()
[36m(train_model pid=684)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7dc9b9a1cf72] ray::TerminateHandler()
[36m(train_model pid=684)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7dc9b88b9502] __cxxabiv1::__terminate()
[36m(train_model pid=684)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7dc9b88b3303] std::unexpected()
[36m(train_model pid=684)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7dc9b88b91d1] __gxx_personality_v0
[36m(train_model pid=684)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7dc9b87f9f48] _Unwind_ForcedUnwind_Phase2
[36m

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=804)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=804)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=804)[0m RESULTS:  
[36m(train_model pid=804)[0m tensor(1.4045, device='cuda:0') tensor(1.0530, device='cuda:0') tensor(58.7711, device='cuda:0') 49.997785687446594
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.37it/s]
[36m(train_model pid=804)[0m RESULTS:  tensor(1.3995, device='cuda:0') tensor(1.0489, device='cuda:0') tensor(58.6342, device='cuda:0') 49.98928904533386
                                                                           
Epoch 0:   0%|          | 0/508 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/508 [00:10<1:27:06,  0.10it/s, v_num=0]
Epoch 0:   0%|          | 2/508 [00:11<47:04,  0.18it/s, v_num=0]  
Epoch 0:   1%|          | 3/508 [00:12<33:40,  0.25it/s, v_num=0]
Epoch 0:   1%|          | 4/508 [00:19<40:59,  0.20it/s, v_num=0]
Epoch 0:   1%|          | 5/508 [00:20<34:05,  0.25it/s, v_num=0]
Epoch 0:   1%|          | 6/508 [00:21<29:30,  0.28

[36m(train_model pid=804)[0m [2024-12-04 13:09:23,777 E 804 846] logging.cc:115: Stack trace: 
[36m(train_model pid=804)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x782647b8bf2a] ray::operator<<()
[36m(train_model pid=804)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x782647b8ef72] ray::TerminateHandler()
[36m(train_model pid=804)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x782646a2b502] __cxxabiv1::__terminate()
[36m(train_model pid=804)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x782646a25303] std::unexpected()
[36m(train_model pid=804)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x782646a2b1d1] __gxx_personality_v0
[36m(train_model pid=804)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x78264696bf48] _Unwind_ForcedUnwind_Phase2
[36m

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=935)[0m RESULTS:  
[36m(train_model pid=935)[0m tensor(2.0467, device='cuda:0') tensor(1.0633, device='cuda:0') tensor(212.7667, device='cuda:0') 49.94256794452667
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.01it/s]
[36m(train_model pid=935)[0m RESULTS:  tensor(2.0383, device='cuda:0') tensor(1.0579, device='cuda:0') tensor(212.1068, device='cuda:0') 49.93990957736969
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:09<41:54,  0.10it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:10<22:00,  0.19it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:11<15:20,  0.27it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:19<20:11,  0.21it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:19<16:29,  0.25it/s, v_num=0]
Epo

[36m(train_model pid=935)[0m [2024-12-04 13:13:17,190 E 935 977] logging.cc:115: Stack trace: 
[36m(train_model pid=935)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7cb7ea3adf2a] ray::operator<<()
[36m(train_model pid=935)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7cb7ea3b0f72] ray::TerminateHandler()
[36m(train_model pid=935)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7cb7e924d502] __cxxabiv1::__terminate()
[36m(train_model pid=935)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7cb7e9247303] std::unexpected()
[36m(train_model pid=935)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7cb7e924d1d1] __gxx_personality_v0
[36m(train_model pid=935)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7cb7e918df48] _Unwind_ForcedUnwind_Phase2
[36m

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=1065)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=1065)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1065)[0m RESULTS:  
[36m(train_model pid=1065)[0m tensor(3.1946, device='cuda:0') tensor(1.0621, device='cuda:0') tensor(425.6529, device='cuda:0') 49.989330768585205
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.34it/s]
[36m(train_model pid=1065)[0m RESULTS:  tensor(3.1953, device='cuda:0') tensor(1.0620, device='cuda:0') tensor(425.8088, device='cuda:0') 49.99890625476837
                                                                           
Epoch 0:   0%|          | 0/127 [00:00<?, ?it/s] 
Epoch 0:   1%|          | 1/127 [00:09<19:17,  0.11it/s, v_num=0]
Epoch 0:   2%|▏         | 2/127 [00:10<10:33,  0.20it/s, v_num=0]
Epoch 0:   2%|▏         | 3/127 [00:11<07:39,  0.27it/s, v_num=0]
Epoch 0:   3%|▎         | 4/127 [00:17<08:48,  0.23it/s, v_num=0]
Epoch 0:   4%|▍         | 5/127 [00:18<07:23,  0.28it/s, v_num=0]
Epoch 0:   5%|▍         | 6/127 [00:19<06:26,  0.3

[36m(train_model pid=1065)[0m [2024-12-04 13:15:29,985 E 1065 1107] logging.cc:115: Stack trace: 
[36m(train_model pid=1065)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7d20d37c8f2a] ray::operator<<()
[36m(train_model pid=1065)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7d20d37cbf72] ray::TerminateHandler()
[36m(train_model pid=1065)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7d20d2668502] __cxxabiv1::__terminate()
[36m(train_model pid=1065)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7d20d2662303] std::unexpected()
[36m(train_model pid=1065)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7d20d26681d1] __gxx_personality_v0
[36m(train_model pid=1065)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7d20d25a8f48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=1185)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=1185)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1185)[0m RESULTS:  
[36m(train_model pid=1185)[0m tensor(4.3915, device='cuda:0') tensor(1.0572, device='cuda:0') tensor(424.6540, device='cuda:0') 49.84802007675171
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.32it/s]
[36m(train_model pid=1185)[0m RESULTS:  tensor(4.3922, device='cuda:0') tensor(1.0574, device='cuda:0') tensor(424.7071, device='cuda:0') 49.85003173351288
                                                                           
Epoch 0:   0%|          | 0/127 [00:00<?, ?it/s] 
Epoch 0:   1%|          | 1/127 [00:10<21:22,  0.10it/s, v_num=0]
Epoch 0:   2%|▏         | 2/127 [00:11<11:36,  0.18it/s, v_num=0]
Epoch 0:   2%|▏         | 3/127 [00:12<08:20,  0.25it/s, v_num=0]
Epoch 0:   3%|▎         | 4/127 [00:17<08:49,  0.23it/s, v_num=0]
Epoch 0:   4%|▍         | 5/127 [00:18<07:23,  0.28it/s, v_num=0]
Epoch 0:   5%|▍         | 6/127 [00:19<06:27,  0.31

[36m(train_model pid=1185)[0m [2024-12-04 13:20:12,414 E 1185 1227] logging.cc:115: Stack trace: 
[36m(train_model pid=1185)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7d04a68b8f2a] ray::operator<<()
[36m(train_model pid=1185)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7d04a68bbf72] ray::TerminateHandler()
[36m(train_model pid=1185)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7d04a5758502] __cxxabiv1::__terminate()
[36m(train_model pid=1185)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7d04a5752303] std::unexpected()
[36m(train_model pid=1185)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7d04a57581d1] __gxx_personality_v0
[36m(train_model pid=1185)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7d04a5698f48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1321)[0m RESULTS:  
[36m(train_model pid=1321)[0m tensor(3.3700, device='cuda:0') tensor(1.0572, device='cuda:0') tensor(212.6158, device='cuda:0') 49.99050796031952
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.01it/s]
[36m(train_model pid=1321)[0m RESULTS:  tensor(3.3584, device='cuda:0') tensor(1.0520, device='cuda:0') tensor(212.0208, device='cuda:0') 49.99132752418518
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:05<00:00,  0.36it/s]
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:09<38:01,  0.11it/s]
Epoch 0:   0%|          | 1/254 [00:09<38:01,  0.11it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:09<20:02,  0.21it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:14<20:50,  0.20it/s, v_num=0]

[36m(train_model pid=1321)[0m [2024-12-04 13:22:36,287 E 1321 1362] logging.cc:115: Stack trace: 
[36m(train_model pid=1321)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7bbc1dbf0f2a] ray::operator<<()
[36m(train_model pid=1321)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7bbc1dbf3f72] ray::TerminateHandler()
[36m(train_model pid=1321)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7bbc1ca90502] __cxxabiv1::__terminate()
[36m(train_model pid=1321)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7bbc1ca8a303] std::unexpected()
[36m(train_model pid=1321)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7bbc1ca901d1] __gxx_personality_v0
[36m(train_model pid=1321)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7bbc1c9d0f48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=1442)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=1442)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1442)[0m RESULTS:  
[36m(train_model pid=1442)[0m tensor(7.8713, device='cuda:0') tensor(1.0595, device='cuda:0') tensor(106.3250, device='cuda:0') 50.03999471664429
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.61it/s]
[36m(train_model pid=1442)[0m RESULTS:  tensor(7.8514, device='cuda:0') tensor(1.0543, device='cuda:0') tensor(106.0954, device='cuda:0') 50.057220458984375
                                                                           
Epoch 0:   0%|          | 0/508 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/508 [00:10<1:28:43,  0.10it/s, v_num=0]
Epoch 0:   0%|          | 2/508 [00:10<45:31,  0.19it/s, v_num=0]  
Epoch 0:   1%|          | 3/508 [00:11<31:03,  0.27it/s, v_num=0]
Epoch 0:   1%|          | 4/508 [00:19<41:13,  0.20it/s, v_num=0]
Epoch 0:   1%|          | 5/508 [00:19<33:20,  0.25it/s, v_num=0]
Epoch 0:   1%|          | 6/508 [00:20<28:04, 

[36m(train_model pid=1442)[0m [2024-12-04 13:26:28,613 E 1442 1483] logging.cc:115: Stack trace: 
[36m(train_model pid=1442)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7a18c8683f2a] ray::operator<<()
[36m(train_model pid=1442)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7a18c8686f72] ray::TerminateHandler()
[36m(train_model pid=1442)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7a18c7523502] __cxxabiv1::__terminate()
[36m(train_model pid=1442)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7a18c751d303] std::unexpected()
[36m(train_model pid=1442)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7a18c75231d1] __gxx_personality_v0
[36m(train_model pid=1442)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7a18c7463f48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=1572)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=1572)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1572)[0m RESULTS:  
[36m(train_model pid=1572)[0m tensor(6.4703, device='cuda:0') tensor(1.0588, device='cuda:0') tensor(166.4521, device='cuda:0') 49.978968501091
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.42it/s]
[36m(train_model pid=1572)[0m RESULTS:  tensor(6.4497, device='cuda:0') tensor(1.0533, device='cuda:0') tensor(165.9860, device='cuda:0') 49.981629848480225
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:10<44:29,  0.09it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:11<23:53,  0.18it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:12<17:02,  0.25it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:20<21:23,  0.19it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:21<17:43,  0.23it/s, v_num=0]
Epoch 0:   2%|▏         | 6/254 [00:22<15:15,  0.27i

[36m(train_model pid=1692)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=1692)[0m   self.pid = os.fork()


Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1692)[0m RESULTS:  tensor(1.2125, device='cuda:0') tensor(1.0546, device='cuda:0') tensor(105.9387, device='cuda:0') 49.85639154911041
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.67it/s]
[36m(train_model pid=1692)[0m RESULTS:  tensor(1.2077, device='cuda:0') tensor(1.0502, device='cuda:0') tensor(105.6952, device='cuda:0') 49.87742900848389
                                                                           
Epoch 0:   0%|          | 0/508 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/508 [00:09<1:24:20,  0.10it/s, v_num=0]
Epoch 0:   0%|          | 2/508 [00:10<43:17,  0.19it/s, v_num=0]  
Epoch 0:   1%|          | 3/508 [00:10<29:32,  0.28it/s, v_num=0]
Epoch 0:   1%|          | 4/508 [00:19<40:53,  0.21it/s, v_num=0]
Epoch 0:   1%|          | 5/508 [00:19<33:04,  0.25it/s, v_num=0]
Epoch 0:   1%|          | 6/5

[36m(train_model pid=1692)[0m [2024-12-04 13:30:55,726 E 1692 1735] logging.cc:115: Stack trace: 
[36m(train_model pid=1692)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x78c1220daf2a] ray::operator<<()
[36m(train_model pid=1692)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x78c1220ddf72] ray::TerminateHandler()
[36m(train_model pid=1692)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x78c120f7a502] __cxxabiv1::__terminate()
[36m(train_model pid=1692)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x78c120f74303] std::unexpected()
[36m(train_model pid=1692)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x78c120f7a1d1] __gxx_personality_v0
[36m(train_model pid=1692)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x78c120ebaf48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1812)[0m RESULTS:  tensor(9.4897, device='cuda:0') 
[36m(train_model pid=1812)[0m tensor(1.0643, device='cuda:0') tensor(166.2260, device='cuda:0') 49.973079562187195
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.45it/s]
[36m(train_model pid=1812)[0m RESULTS:  tensor(9.4598, device='cuda:0') tensor(1.0587, device='cuda:0') tensor(165.7461, device='cuda:0') 49.964839220047
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:11<47:46,  0.09it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:12<25:35,  0.16it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:13<18:09,  0.23it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:20<21:08,  0.20it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:21<17:30,  0.24it/s, v_num=0]
E

[36m(train_model pid=1812)[0m [2024-12-04 13:34:54,839 E 1812 1854] logging.cc:115: Stack trace: 
[36m(train_model pid=1812)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7c48e6290f2a] ray::operator<<()
[36m(train_model pid=1812)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7c48e6293f72] ray::TerminateHandler()
[36m(train_model pid=1812)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7c48e5130502] __cxxabiv1::__terminate()
[36m(train_model pid=1812)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7c48e512a303] std::unexpected()
[36m(train_model pid=1812)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7c48e51301d1] __gxx_personality_v0
[36m(train_model pid=1812)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7c48e5070f48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=1943)[0m RESULTS:  
[36m(train_model pid=1943)[0m tensor(4.3205, device='cuda:0') tensor(1.0590, device='cuda:0') tensor(214.4815, device='cuda:0') 50.23040175437927
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.90it/s]
[36m(train_model pid=1943)[0m RESULTS:  tensor(4.3066, device='cuda:0') tensor(1.0535, device='cuda:0') tensor(213.9299, device='cuda:0') 50.244200229644775
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:10<43:06,  0.10it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:10<22:37,  0.19it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:11<15:43,  0.27it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:19<20:18,  0.21it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:19<16:34,  0.25it/s, v_num=0]

[36m(train_model pid=1943)[0m [2024-12-04 13:38:52,769 E 1943 1985] logging.cc:115: Stack trace: 
[36m(train_model pid=1943)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7e701bde0f2a] ray::operator<<()
[36m(train_model pid=1943)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7e701bde3f72] ray::TerminateHandler()
[36m(train_model pid=1943)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7e701ac80502] __cxxabiv1::__terminate()
[36m(train_model pid=1943)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7e701ac7a303] std::unexpected()
[36m(train_model pid=1943)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7e701ac801d1] __gxx_personality_v0
[36m(train_model pid=1943)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7e701abc0f48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=2074)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=2074)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=2074)[0m RESULTS:  
[36m(train_model pid=2074)[0m tensor(1.6939, device='cuda:0') tensor(1.0552, device='cuda:0') tensor(82.7843, device='cuda:0') 49.881792068481445
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.11it/s]
[36m(train_model pid=2074)[0m RESULTS:  tensor(1.6884, device='cuda:0') tensor(1.0511, device='cuda:0') tensor(82.6067, device='cuda:0') 49.88405406475067
                                                                           
Epoch 0:   0%|          | 0/508 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/508 [00:10<1:25:04,  0.10it/s, v_num=0]
Epoch 0:   0%|          | 2/508 [00:10<44:17,  0.19it/s, v_num=0]  
Epoch 0:   1%|          | 3/508 [00:10<30:37,  0.27it/s, v_num=0]
Epoch 0:   1%|          | 4/508 [00:19<40:38,  0.21it/s, v_num=0]
Epoch 0:   1%|          | 5/508 [00:19<33:21,  0.25it/s, v_num=0]
Epoch 0:   1%|          | 6/508 [00:20<28:19,  0

[36m(train_model pid=2074)[0m [2024-12-04 13:42:41,821 E 2074 2116] logging.cc:115: Stack trace: 
[36m(train_model pid=2074)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7acba3e9af2a] ray::operator<<()
[36m(train_model pid=2074)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7acba3e9df72] ray::TerminateHandler()
[36m(train_model pid=2074)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7acba2d3a502] __cxxabiv1::__terminate()
[36m(train_model pid=2074)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7acba2d34303] std::unexpected()
[36m(train_model pid=2074)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7acba2d3a1d1] __gxx_personality_v0
[36m(train_model pid=2074)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7acba2c7af48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=2204)[0m RESULTS:  
[36m(train_model pid=2204)[0m tensor(2.8032, device='cuda:0') tensor(1.0593, device='cuda:0') tensor(212.9523, device='cuda:0') 49.96995031833649
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.89it/s]
[36m(train_model pid=2204)[0m RESULTS:  tensor(2.7938, device='cuda:0') tensor(1.0540, device='cuda:0') tensor(212.4526, device='cuda:0') 49.98873770236969
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:09<41:06,  0.10it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:10<21:35,  0.19it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:10<15:02,  0.28it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:19<20:00,  0.21it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:19<16:21,  0.25it/s, v_num=0]


[36m(train_model pid=2204)[0m [2024-12-04 13:46:34,145 E 2204 2245] logging.cc:115: Stack trace: 
[36m(train_model pid=2204)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x7d90e8fbbf2a] ray::operator<<()
[36m(train_model pid=2204)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x7d90e8fbef72] ray::TerminateHandler()
[36m(train_model pid=2204)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x7d90e7e5b502] __cxxabiv1::__terminate()
[36m(train_model pid=2204)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x7d90e7e55303] std::unexpected()
[36m(train_model pid=2204)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x7d90e7e5b1d1] __gxx_personality_v0
[36m(train_model pid=2204)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x7d90e7d9bf48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=2334)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=2334)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=2334)[0m RESULTS:  
[36m(train_model pid=2334)[0m tensor(2.8996, device='cuda:0') tensor(1.0569, device='cuda:0') tensor(166.6448, device='cuda:0') 50.01333951950073
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.41it/s]
[36m(train_model pid=2334)[0m RESULTS:  tensor(2.8891, device='cuda:0') tensor(1.0516, device='cuda:0') tensor(166.1818, device='cuda:0') 50.01157522201538
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:10<45:54,  0.09it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:11<24:33,  0.17it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:12<17:29,  0.24it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:19<20:32,  0.20it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:20<17:03,  0.24it/s, v_num=0]
Epoch 0:   2%|▏         | 6/254 [00:21<14:42,  0.28

[36m(train_model pid=2454)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=2454)[0m   self.pid = os.fork()


Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=2454)[0m RESULTS:  
[36m(train_model pid=2454)[0m tensor(5.2061, device='cuda:0') tensor(1.0593, device='cuda:0') tensor(213.5154, device='cuda:0') 50.049614906311035
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.95it/s]
[36m(train_model pid=2454)[0m RESULTS:  tensor(5.1877, device='cuda:0') tensor(1.0536, device='cuda:0') tensor(212.8573, device='cuda:0') 50.03841519355774
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:10<42:46,  0.10it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:10<22:25,  0.19it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:11<15:38,  0.27it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:18<19:38,  0.21it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:19<16:03,  0.26it/s, v_num=0]

[36m(train_model pid=2454)[0m [2024-12-04 13:50:59,017 E 2454 2496] logging.cc:115: Stack trace: 
[36m(train_model pid=2454)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x788b8f52ef2a] ray::operator<<()
[36m(train_model pid=2454)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x788b8f531f72] ray::TerminateHandler()
[36m(train_model pid=2454)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x788b8e3ce502] __cxxabiv1::__terminate()
[36m(train_model pid=2454)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x788b8e3c8303] std::unexpected()
[36m(train_model pid=2454)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x788b8e3ce1d1] __gxx_personality_v0
[36m(train_model pid=2454)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x788b8e30ef48] _Unwind_ForcedUnwind_Pha

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=2574)[0m RESULTS:  
[36m(train_model pid=2574)[0m tensor(4.6580, device='cuda:0') tensor(1.0557, device='cuda:0') tensor(117.9084, device='cuda:0') 50.06750822067261
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  0.85it/s]
[36m(train_model pid=2574)[0m RESULTS:  tensor(4.6423, device='cuda:0') tensor(1.0503, device='cuda:0') tensor(117.5725, device='cuda:0') 50.068867206573486
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:10<46:19,  0.09it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:12<26:35,  0.16it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:14<19:57,  0.21it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:20<21:21,  0.20it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:22<18:21,  0.23it/s, v_num=0]

[36m(train_model pid=2713)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=2713)[0m   self.pid = os.fork()


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=2713)[0m RESULTS:  
[36m(train_model pid=2713)[0m tensor(1.3551, device='cuda:0') tensor(1.0628, device='cuda:0') tensor(214.9903, device='cuda:0') 50.30680298805237
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.01it/s]
[36m(train_model pid=2713)[0m RESULTS:  tensor(1.3488, device='cuda:0') tensor(1.0573, device='cuda:0') tensor(214.4004, device='cuda:0') 50.3132700920105
                                                                           
Epoch 0:   0%|          | 0/254 [00:00<?, ?it/s] 
Epoch 0:   0%|          | 1/254 [00:09<41:03,  0.10it/s, v_num=0]
Epoch 0:   1%|          | 2/254 [00:10<21:32,  0.19it/s, v_num=0]
Epoch 0:   1%|          | 3/254 [00:10<15:02,  0.28it/s, v_num=0]
Epoch 0:   2%|▏         | 4/254 [00:18<19:30,  0.21it/s, v_num=0]
Epoch 0:   2%|▏         | 5/254 [00:19<15:57,  0.26it/s, v_num=0]
Epoch 0:   2%|▏         | 6/254 [00:19<13:35,  0.30i

[36m(train_model pid=2843)[0m /opt/conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
[36m(train_model pid=2843)[0m   self.pid = os.fork()


Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
[36m(train_model pid=2843)[0m RESULTS:  
[36m(train_model pid=2843)[0m tensor(16.3652, device='cuda:0') tensor(1.0591, device='cuda:0') tensor(429.2677, device='cuda:0') 50.253844261169434
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.32it/s]
[36m(train_model pid=2843)[0m RESULTS:  tensor(16.3694, device='cuda:0') tensor(1.0594, device='cuda:0') tensor(429.3726, device='cuda:0') 50.25439262390137
                                                                           
Epoch 0:   0%|          | 0/127 [00:00<?, ?it/s] 
Epoch 0:   1%|          | 1/127 [00:08<18:29,  0.11it/s, v_num=0]
Epoch 0:   2%|▏         | 2/127 [00:09<10:10,  0.20it/s, v_num=0]
Epoch 0:   2%|▏         | 3/127 [00:10<07:23,  0.28it/s, v_num=0]
Epoch 0:   3%|▎         | 4/127 [00:15<07:49,  0.26it/s, v_num=0]
Epoch 0:   4%|▍         | 5/127 [00:16<06:53,  0.29it/s, v_num=

2024-12-04 14:04:05,861	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/kaggle/working/results/hyperparameter_search' in 0.0314s.
2024-12-04 14:04:05,888	INFO tune.py:1041 -- Total run time: 4208.28 seconds (4202.26 seconds for the tuning loop).


Best Hyperparameters Found:
{'HB': {'hidden_dim': 65536, 'batch_size': 4096}, 'lr': 0.00043794538106065907, 'l1_lambda': 0.00786079101749009}


[36m(train_model pid=2843)[0m [2024-12-04 14:04:06,432 E 2843 2885] logging.cc:115: Stack trace: 
[36m(train_model pid=2843)[0m  /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1033f2a) [0x79a27933ff2a] ray::operator<<()
[36m(train_model pid=2843)[0m /opt/conda/lib/python3.10/site-packages/ray/_raylet.so(+0x1036f72) [0x79a279342f72] ray::TerminateHandler()
[36m(train_model pid=2843)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(+0xb6502) [0x79a2781df502] __cxxabiv1::__terminate()
[36m(train_model pid=2843)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(_ZSt10unexpectedv+0) [0x79a2781d9303] std::unexpected()
[36m(train_model pid=2843)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libstdc++.so.6(__gxx_personality_v0+0x414) [0x79a2781df1d1] __gxx_personality_v0
[36m(train_model pid=2843)[0m /opt/conda/lib/python3.10/site-packages/ray/../../../libgcc_s.so.1(+0x15f48) [0x79a27811ff48] _Unwind_ForcedUnwind_Pha