In [2]:
!pip install pytorch-lightning ray[tune]

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 KB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting ray[tune]
  Downloading ray-2.39.0-cp310-cp310-manylinux2014_x86_64.whl (66.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting lightning-utilities>=0.10.0
  Downloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 KB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torch>=2.1.0
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl (906.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.4/906.4 MB[0m [31m3.4 MB/s

In [1]:
import os
from pathlib import Path
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor, EarlyStopping
from ray.tune.integration.pytorch_lightning import TuneReportCheckpointCallback
from ray import train, tune
from ray.tune import Tuner, with_resources
# from ray_lightning.tune import TuneReportCallback
from ray.train import RunConfig

# Set global parameters
scale_factor = 11.888623072966611
input_dim = 3072
data_dir = os.path.join(os.getcwd(), "activations_data")
# sample_rate = 0.1 # 10% of data
sample_size = 8192 # file_size * sample_rate
num_epochs = 1  # Only one epoch over all data

# Dataset with Subsampling
class ActivationDataset(Dataset):
    def __init__(self, data_dir, f_type, test_fraction=0.01, scale_factor=1.0, batch_size=2048, seed=42):
        self.data_dir = data_dir
        self.test_fraction = test_fraction
        self.scale_factor = scale_factor
        self.batch_size = batch_size
        self.multi = sample_size // batch_size
        self.seed = seed
        self.file_names = sorted([f for f in os.listdir(data_dir) if f.endswith('.npy') and f.startswith('activations_batch')])
        if f_type not in ["train", "test"]:
            raise ValueError("f_type must be 'train' or 'test'")
        if f_type == "train":
            self.file_names = self.file_names[:int(len(self.file_names)*(1 - self.test_fraction))]
        else:
            self.file_names = self.file_names[int(len(self.file_names)*(1 - self.test_fraction)):]
        self.f_type = f_type

    def __len__(self):
        return len(self.file_names)*self.multi

    def __getitem__(self, idx):
        # Load a single file
        f_ix = idx // self.multi # sample the file 4 times
        file_path = os.path.join(self.data_dir, self.file_names[f_ix])
        activations = np.load(file_path)[:, :-3]  # Remove metadata columns

        # Normalize
        activations = activations / self.scale_factor

        # Random subsampling to sample_size
        np.random.seed(self.seed + idx)  # Change seed per file for reproducibility
        subsample_indices = np.random.choice(activations.shape[0], sample_size, replace=False)
        activations = activations[subsample_indices]

        # Get batch 
        batch_i = idx % self.multi
        start = batch_i*self.batch_size
        end = (batch_i+1)*self.batch_size
        activations = activations[start:end]

        # Convert to tensor
        return torch.tensor(activations, dtype=torch.float32, device="cuda")

# Model Definition
class SparseAutoencoder(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, l1_lambda, lr):
        super().__init__()
        self.save_hyperparameters()
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
        self.criterion = nn.MSELoss()
        self.l1_lambda = l1_lambda
        self.lr = lr

    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        return decoded, encoded

    def compute_loss(self, batch, decoded, encoded):
        mse_loss = self.criterion(decoded, batch)
        decoder_weight_norms = torch.norm(self.decoder.weight, p=2, dim=0)
        l1_terms = encoded * decoder_weight_norms.unsqueeze(0)
        l1_loss = torch.mean(torch.sum(l1_terms, dim=1))
        return mse_loss, l1_loss

    def training_step(self, batch, batch_idx):
        decoded, encoded = self(batch)
        mse_loss, l1_loss = self.compute_loss(batch, decoded, encoded)
        total_loss = mse_loss + self.l1_lambda * l1_loss

        # Compute active features
        active_features = (encoded > 0).any(dim=0).float().mean().item() * 100

        # Log metrics
        self.log("train_loss", total_loss, on_step=True, on_epoch=True)
        self.log("train_mse_loss", mse_loss, on_step=True, on_epoch=True)
        self.log("train_l1_loss", l1_loss, on_step=True, on_epoch=True)
        self.log("active_features", active_features, on_step=True, on_epoch=True)
        self.log("val_loss", 0, on_step=True, on_epoch=True)
        # train.report({"loss": total_loss, "active_features": active_features})
        return total_loss

    def validation_step(self, batch, batch_idx):
        decoded, encoded = self(batch)
        mse_loss, l1_loss = self.compute_loss(batch, decoded, encoded)
        total_loss = mse_loss + self.l1_lambda * l1_loss

        # Compute active features
        active_features = (encoded > 0).any(dim=0).float().mean().item() * 100

        # Log metrics
        self.log("val_loss", total_loss, on_step=False, on_epoch=True)
        self.log("val_mse_loss", mse_loss, on_step=False, on_epoch=True)
        self.log("val_l1_loss", l1_loss, on_step=False, on_epoch=True)
        self.log("val_active_features", active_features, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}

# DataLoader Creation
def create_data_loaders(batch_size):
    train_dataset = ActivationDataset(data_dir, "train", 0.01, scale_factor, batch_size, 42)
    val_dataset = ActivationDataset(data_dir, "test", 0.01, scale_factor, batch_size, 42)
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)#, num_workers=3, pin_memory=True, persistent_workers=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    return train_loader, val_loader

# Training Function with Ray Tune
def train_model(config):
    train_loader, val_loader = create_data_loaders(config["HB"]["batch_size"])
    model = SparseAutoencoder(input_dim, hidden_dim=config["HB"]["hidden_dim"], l1_lambda=config["l1_lambda"], lr=config["lr"])

    logger = TensorBoardLogger("tb_logs", name="SparseAutoencoder")

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        logger=logger,
        # val_check_interval=0.25,  # Check validation 4 times per epoch
        # max_time="00:30:00",  # Stop after 30 minutes
        enable_progress_bar=True, # Show progress bar
        callbacks=[
            LearningRateMonitor(logging_interval="step"),
            EarlyStopping(monitor="val_loss", patience=3, mode="min"),
            TuneReportCheckpointCallback(
                {
                    "train_loss": "train_loss",
                    "train_mse_loss": "train_mse_loss",
                    "train_l1_loss": "train_l1_loss",
                    "active_features": "active_features",
                    "val_loss": "val_loss",
                },
                filename="none",  # Do not save checkpoints
                save_checkpoints = False,
                on="train_batch_end",
            ),
            TuneReportCheckpointCallback(
                {
                    "val_loss": "val_loss",
                    "val_mse_loss": "val_mse_loss",
                    "val_l1_loss": "val_l1_loss",
                    "val_active_features": "val_active_features",
                },
                on="validation_end",
            ),
            # RayTrainReportCallback(),
        ],
        # strategy=RayDDPStrategy(), # Use Ray for distributed training, DDP stands for Distributed Data Parallel
        # callbacks=[RayTrainReportCallback()], # Report metrics to Ray
        # plugins=[RayLightningEnvironment()], # Use Ray for distributed training
    )
    trainer.fit(model, train_loader, val_loader)

# Ray Tune Hyperparameter Search
def tune_hyperparameters():

    # possible_hidden_dims = [4096, 8192, 16384, 20000, 32768]
    # possible_batch_sizes = [512, 1024, 2048, 4096, 8192]
    possible_hidden_dims = [8192, 16384, 20000, 32768]
    possible_batch_sizes = [1024, 2048, 4096]

    valid_hb_pairs = []
    for hidden_dim in possible_hidden_dims:
        for batch_size in possible_batch_sizes:
            if hidden_dim * batch_size <= 41_000_000: # VRAM limit
                valid_hb_pairs.append({"hidden_dim": hidden_dim, "batch_size": batch_size})


    # search_space = {
    #     "hidden_dim": tune.choice([4096, 8192, 16384, 20000, 32768]),
    #     "batch_size": tune.choice([512, 1024, 2048, 4096, 8192]),
    #     "l1_lambda": tune.loguniform(1e-4, 1e-2),
    #     "lr": tune.loguniform(1e-4, 1e-2),
    # }

    search_space = {
        "HB": tune.choice(valid_hb_pairs),
        "l1_lambda": tune.loguniform(1e-4, 1e-2),
        "lr": tune.loguniform(1e-4, 1e-2),
    }

    # scheduler_asha = tune.schedulers.ASHAScheduler(
    #     max_t=num_epochs,
    #     grace_period=1,
    #     reduction_factor=2,
    # )

    # os.environ["RAY_CHDIR_TO_TRIAL_DIR"] = "0" # Allows relative paths, but trials are not isolated

    trainable_with_resources = with_resources(
        train_model,
        {"cpu":4, "gpu": 1}  # Adjust based on your available resources
    )

    tuner = Tuner(
        trainable=trainable_with_resources,
        param_space=search_space,
        tune_config=tune.TuneConfig(
            metric="val_loss",
            mode="min",
            num_samples=1, # Number of hyperparameter sets to try
            max_concurrent_trials=10, # Number of trials to run concurrently
            # scheduler=scheduler_asha,
        ),
        run_config=RunConfig(
            name="hyperparameter_search",
            storage_path=str(Path("./results").resolve()),
        ),
    )
    results = tuner.fit()
    best_result = results.get_best_result(metric="val_loss", mode="min")
    print("Best Hyperparameters Found:")
    print(best_result.config)
    return results

# # Run Hyperparameter Search
# if __name__ == "__main__":
#     tune_hyperparameters()


In [2]:
results = tune_hyperparameters()

0,1
Current time:,2024-11-29 01:43:46
Running for:,00:00:13.72
Memory:,6.4/15.0 GiB

Trial name,# failures,error file
train_model_f5826_00000,1,"/tmp/ray/session_2024-11-29_01-43-30_676248_535939/artifacts/2024-11-29_01-43-32/hyperparameter_search/driver_artifacts/train_model_f5826_00000_0_HB=hidden_dim_20000_batch_size_2048,l1_lambda=0.0001,lr=0.0001_2024-11-29_01-43-32/error.txt"

Trial name,status,loc,HB,l1_lambda,lr
train_model_f5826_00000,ERROR,192.168.178.90:537105,{'hidden_dim': _8980,0.000126523,0.000103278


[36m(train_model pid=537105)[0m GPU available: True (cuda), used: True
[36m(train_model pid=537105)[0m TPU available: False, using: 0 TPU cores
[36m(train_model pid=537105)[0m HPU available: False, using: 0 HPUs
[36m(train_model pid=537105)[0m You are using a CUDA device ('NVIDIA GeForce RTX 3070 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(train_model pid=537105)[0m 2024-11-29 01:43:37.310373: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(train_model pid=537105)[0m E0000 00:00:1732841017.327900  537222 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attem

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(train_model pid=537105)[0m 
[36m(train_model pid=537105)[0m   | Name      | Type    | Params | Mode 
[36m(train_model pid=537105)[0m ----------------------------------------------
[36m(train_model pid=537105)[0m 0 | encoder   | Linear  | 61.5 M | train
[36m(train_model pid=537105)[0m 1 | decoder   | Linear  | 61.4 M | train
[36m(train_model pid=537105)[0m 2 | criterion | MSELoss | 0      | train
[36m(train_model pid=537105)[0m ----------------------------------------------
[36m(train_model pid=537105)[0m 122 M     Trainable params
[36m(train_model pid=537105)[0m 0         Non-trainable params
[36m(train_model pid=537105)[0m 122 M     Total params
[36m(train_model pid=537105)[0m 491.612   Total estimated model params size (MB)
[36m(train_model pid=537105)[0m 3         Modules in train mode
[36m(train_model pid=537105)[0m 0         Modules in eval mode
[36m(train_model pid=537105)[0m /home/drew99/IJS/LLMinfluence/venvllm/lib/python3.10/site-packages/pytor

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  4.69it/s]
Epoch 0:   0%|          | 0/128 [00:00<?, ?it/s]                           


2024-11-29 01:43:46,194	ERROR tune_controller.py:1331 -- Trial task failed for trial train_model_f5826_00000
Traceback (most recent call last):
  File "/home/drew99/IJS/LLMinfluence/venvllm/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/drew99/IJS/LLMinfluence/venvllm/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/drew99/IJS/LLMinfluence/venvllm/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/drew99/IJS/LLMinfluence/venvllm/lib/python3.10/site-packages/ray/_private/worker.py", line 2753, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/drew99/IJS/LLMinfluence/venvllm/lib/python3.10/site-packages/ray/_private/worker.py", line 904, in get_objects
    raise value.as

Best Hyperparameters Found:
{'HB': {'hidden_dim': 20000, 'batch_size': 2048}, 'l1_lambda': 0.0001265234404620061, 'lr': 0.00010327792598627444}


In [None]:
# List all Ray Tune experiments
# ls ./ray_results

# Show the best trial from an experiment
# cat ./ray_results/<experiment_name>/best_result.json


# tensorboard --logdir tb_logs
# tensorboard --logdir=~/ray_results/my_experiment


# ray dashboard


In [5]:
%reload_ext tensorboard
%tensorboard --logdir=./results/hyperparameter_search