# Ray Tune Hyperparameter Optimization for Chest X-Ray Classification

This notebook demonstrates how to use Ray Tune to perform hyperparameter optimization for a ResNet18 model on the Chest X-Ray dataset for pneumonia classification.

In [None]:
# Install required packages
!pip install torch torchvision ray[data,train,tune,serve] gdown optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting ray[data,serve,train,tune]
  Downloading ray-2.37.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (16 kB)
Collecting tensorboardX>=1.9 (from ray[data,serve,train,tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting colorful (from ray[data,serve,train,tune])
  Downloading colorful-0.5.6-py2.py3-none-any.whl.metadata (16 kB)
Collecting aiohttp-cors (from ray[data,serve,train,tune])
  Downloading aiohttp_cors-0.7.0-py3-none-any.whl.metadata (20 kB)
Collecting watchfiles (from ray[data,serve,train,tune])
  Downloading watchfiles-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting starlette (from ray[data,serve,train,tune])
  Downloading starlette-0.39.2-py3-none-any.whl.metadata (6.0 kB)
Collecting uvicorn[standard] (from ray[data,serve,train,tune])
  Downloading uvicorn-0.31.0-py3-none-any.whl.metadata (6.6 kB)
Collecti

In [None]:
import ray

# Start (or reuse) a local Ray instance. `ignore_reinit_error=True` keeps this cell
# idempotent: re-running it in a live kernel would otherwise raise because Ray is
# already initialized.
ray.init(ignore_reinit_error=True)

# Show the resources (CPUs, GPUs, memory) that Ray can schedule trials on.
print(ray.cluster_resources())

2024-10-02 15:50:31,900	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


{'object_store_memory': 26818505932.0, 'accelerator_type:A100': 1.0, 'node:172.28.0.12': 1.0, 'memory': 53637011867.0, 'CPU': 12.0, 'node:__internal_head__': 1.0, 'GPU': 1.0}


In [None]:
import os
import gdown

def download_dataset():
    """Download and extract the Chest X-Ray archive, then list what was extracted.

    The download is skipped when a ``chest_xray`` directory already exists in the
    current working directory. After the dataset is in place, the contents of
    ``chest_xray`` and of its ``train`` and ``test`` sub-directories are printed.

    Raises:
        RuntimeError: if the download did not produce the archive file.
        zipfile.BadZipFile: if the downloaded archive cannot be read.
    """
    # Local import: pure-Python extraction replaces the IPython-only `!unzip`
    # shell escape, so a failed extraction raises instead of passing silently.
    import zipfile

    if not os.path.exists('chest_xray'):
        print("Downloading dataset...")
        url = 'https://drive.google.com/uc?id=1jf1XvAeXPD4XAerknz5inxM0StuCNbyX'
        output = 'ChestXRay2017.zip'
        downloaded = gdown.download(url, output, quiet=False)
        # Fail loudly here; otherwise the first symptom would be an unrelated
        # FileNotFoundError from the directory listing below.
        if downloaded is None or not os.path.exists(output):
            raise RuntimeError(f"Download failed: {output} was not created from {url}")
        print("Extracting dataset...")
        with zipfile.ZipFile(output) as archive:
            archive.extractall()
        print("Dataset downloaded and extracted.")
    else:
        print("Dataset already exists.")

    # Print the contents of the dataset directory
    print("Contents of chest_xray directory:")
    print(os.listdir('chest_xray'))
    for subset in ['train', 'test']:
        print(f"Contents of chest_xray/{subset} directory:")
        print(os.listdir(f'chest_xray/{subset}'))

In [None]:
# Fetch and unpack the dataset (skipped when a chest_xray directory already exists),
# then print the contents of chest_xray and its train/test sub-directories.
download_dataset()

Downloading dataset...


Downloading...
From (original): https://drive.google.com/uc?id=1jf1XvAeXPD4XAerknz5inxM0StuCNbyX
From (redirected): https://drive.google.com/uc?id=1jf1XvAeXPD4XAerknz5inxM0StuCNbyX&confirm=t&uuid=f15a3945-3bdd-48da-852e-a31d154a277f
To: /content/ChestXRay2017.zip
100%|██████████| 1.24G/1.24G [00:53<00:00, 23.2MB/s]


Extracting dataset...
Dataset downloaded and extracted.
Contents of chest_xray directory:
['test', 'train', '.DS_Store']
Contents of chest_xray/train directory:
['NORMAL', '.DS_Store', 'PNEUMONIA']
Contents of chest_xray/test directory:
['NORMAL', '.DS_Store', 'PNEUMONIA']


In [None]:
# Import necessary libraries
import os
import torch
import numpy as np
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from functools import partial

# Define constants
# Absolute path to the dataset, resolved against the working directory at the time
# this cell runs; load_data() is called with this path from train_model().
DATA_DIR = os.path.join(os.getcwd(), 'chest_xray')
# Size of the classifier head built in create_model(); the dataset listing above
# shows two class folders (NORMAL, PNEUMONIA).
NUM_CLASSES = 2
# Number of DataLoader worker processes used by load_data().
NUM_WORKERS = 2  # Adjusted for Colab environment
# Number of epochs train_model() runs; main() also passes it to ASHA as max_t.
MAX_EPOCHS = 25

## Dataset Download and Preparation

In [None]:
def download_dataset():
    """Download and extract the Chest X-Ray dataset if it doesn't exist.

    The archive is saved next to ``DATA_DIR`` and extracted into the parent
    directory of ``DATA_DIR``, so a successful run leaves the dataset exactly
    where the rest of the notebook looks for it.

    Raises:
        RuntimeError: if the archive was not downloaded, or if extraction did not
            produce ``DATA_DIR``.
        Exception: any error raised by ``gdown`` or ``zipfile`` is printed and
            re-raised unchanged.
    """
    # Local imports keep this cell runnable on its own. The previous version used
    # `!gdown` / `!unzip` shell escapes, which never raise Python exceptions, so
    # its `except` branch could not fire and failures still printed success.
    import zipfile

    import gdown

    if os.path.exists(DATA_DIR):
        print("Dataset already exists.")
        return

    print("Downloading dataset...")
    target_dir = os.path.dirname(DATA_DIR)
    archive_path = os.path.join(target_dir, 'ChestXRay2017.zip')
    try:
        # fuzzy=True mirrors the `--fuzzy` CLI flag: gdown extracts the file id
        # from a Google Drive "view" link.
        downloaded = gdown.download(
            'https://drive.google.com/file/d/1jf1XvAeXPD4XAerknz5inxM0StuCNbyX/view?usp=sharing',
            archive_path,
            quiet=False,
            fuzzy=True,
        )
        if downloaded is None or not os.path.exists(archive_path):
            raise RuntimeError(f"gdown did not produce {archive_path}")

        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(target_dir)

        if not os.path.isdir(DATA_DIR):
            raise RuntimeError(f"Archive extracted but {DATA_DIR} was not created")
        print("Dataset downloaded and extracted.")
    except Exception as e:
        print(f"Error downloading dataset: {e}")
        raise

def get_data_transforms():
    """Build the torchvision preprocessing pipelines for each dataset split.

    Returns:
        dict: ``{'train': Compose, 'test': Compose}``. The train pipeline applies a
        random resized crop to 224 and a random horizontal flip; the test pipeline
        resizes to 256 and center-crops to 224. Both then convert to a tensor and
        normalize each channel with the same mean / std.
    """
    # Per-channel statistics shared by both pipelines (presumably the ImageNet
    # values expected by torchvision's pretrained models - confirm if changed).
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    # Training: random augmentation before tensor conversion.
    train_pipeline = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    # Evaluation: deterministic resize + center crop.
    test_pipeline = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    return {'train': train_pipeline, 'test': test_pipeline}

def load_data(data_dir, batch_size):
    """Create the train/test DataLoaders for an ImageFolder-style dataset.

    Args:
        data_dir: directory containing ``train`` and ``test`` sub-directories, each
            laid out as one folder per class.
        batch_size: batch size used for both loaders.

    Returns:
        tuple: ``(dataloaders, dataset_sizes)``, two dicts keyed by ``'train'`` and
        ``'test'`` holding the DataLoader and the number of images per split.
    """
    # NOTE(review): the 'test' split is also what the tuner optimizes against, so
    # the reported accuracy is not an unbiased hold-out estimate.
    splits = ('train', 'test')
    data_transforms = get_data_transforms()

    image_datasets = {
        split: datasets.ImageFolder(os.path.join(data_dir, split), data_transforms[split])
        for split in splits
    }
    dataloaders = {
        split: torch.utils.data.DataLoader(
            image_datasets[split],
            batch_size=batch_size,
            # Only the training split needs shuffling; evaluating in a fixed
            # order keeps evaluation deterministic.
            shuffle=(split == 'train'),
            num_workers=NUM_WORKERS,
        )
        for split in splits
    }
    dataset_sizes = {split: len(image_datasets[split]) for split in splits}
    return dataloaders, dataset_sizes

## Model Definition

In [None]:
def create_model():
    """Create and prepare the ResNet18 model for transfer learning.

    Loads ResNet18 with pretrained weights and replaces the final fully connected
    layer with a new ``nn.Linear`` that outputs ``NUM_CLASSES`` logits.

    Returns:
        The ResNet18 module with the replaced classifier head.
    """
    # `pretrained=True` is deprecated in recent torchvision in favour of the
    # `weights=` enum. IMAGENET1K_V1 is the checkpoint `pretrained=True` selects
    # (resnet18-f37072fd.pth in the recorded run). Fall back to the legacy
    # argument on torchvision versions that predate the enum.
    weights_enum = getattr(models, "ResNet18_Weights", None)
    if weights_enum is not None:
        model = models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        model = models.resnet18(pretrained=True)

    # Swap the classifier head for one sized to this task.
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, NUM_CLASSES)
    return model

## Training Function

In [None]:
from ray import train

def train_model(config):
    """Fine-tune ResNet18 for MAX_EPOCHS epochs and report metrics to Ray Tune.

    Each epoch runs a training pass followed by an evaluation pass on the 'test'
    split, then reports ONE result. Reporting once per epoch matters because every
    report counts as one Tune training iteration. The previous per-phase reporting
    made the scheduler stop trials after roughly half of MAX_EPOCHS and compare
    train-phase accuracy against test-phase accuracy under the same key.

    Args:
        config: dict with the keys ``"lr"`` and ``"momentum"`` (SGD settings) and
            ``"batch_size"`` (passed to ``load_data``).

    Returns:
        The trained model (used by ``main`` when it calls this function directly).

    Reported metrics (per epoch):
        ``loss`` / ``accuracy``: test-split values, the keys the tuner optimizes.
        ``train_loss`` / ``train_accuracy``: training-split values.
        ``step``: number of optimizer steps taken so far.
        ``epoch``: zero-based epoch index.
        ``phase``: always ``"test"``, kept so existing result columns still exist.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = create_model().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=config["momentum"])
    # Decay the learning rate by 10x every 7 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    dataloaders, dataset_sizes = load_data(DATA_DIR, config["batch_size"])

    global_step = 0
    for epoch in range(MAX_EPOCHS):
        epoch_metrics = {}

        for phase in ['train', 'test']:
            is_training = phase == 'train'
            # train(True) enables dropout/batch-norm updates; train(False) == eval().
            model.train(is_training)

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Gradients are only tracked during the training pass.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    preds = outputs.argmax(dim=1)
                    loss = criterion(outputs, labels)

                    if is_training:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                        global_step += 1

                # criterion returns the batch mean, so weight it by batch size to
                # obtain a correct per-sample average over the whole split.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            if is_training:
                scheduler.step()

            epoch_metrics[phase] = {
                "loss": running_loss / dataset_sizes[phase],
                "accuracy": running_corrects / dataset_sizes[phase],
            }

        # One report per epoch; "loss"/"accuracy" always refer to the test split.
        train.report({
            "step": global_step,
            "loss": epoch_metrics["test"]["loss"],
            "accuracy": epoch_metrics["test"]["accuracy"],
            "train_loss": epoch_metrics["train"]["loss"],
            "train_accuracy": epoch_metrics["train"]["accuracy"],
            "epoch": epoch,
            "phase": "test",
        })

    return model

## Main Function for Hyperparameter Optimization

In [None]:
def main(num_samples=10, gpus_per_trial=1):
    """Run the Ray Tune hyperparameter search and save the best model.

    Searches over learning rate, momentum and batch size with Optuna, using ASHA
    for early stopping. It then retrains a model with the best configuration by
    calling ``train_model`` directly and writes its weights to ``best_model.pth``.

    Args:
        num_samples: number of trials (hyperparameter configurations) to run.
        gpus_per_trial: GPUs reserved for each trial. Previously this argument was
            ignored and every trial was hard-coded to one GPU.
    """
    print("Current working directory:", os.getcwd())
    print("Contents of current directory:", os.listdir())

    # Check the same path the trials read from (DATA_DIR), not a relative path
    # that depends on the current working directory.
    if os.path.exists(DATA_DIR):
        print("Contents of chest_xray directory:", os.listdir(DATA_DIR))
    else:
        print("chest_xray directory not found")

    # Search space sampled by the search algorithm.
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.5, 0.9),
        "batch_size": tune.choice([16, 32, 64])
    }

    # ASHA stops under-performing trials early; max_t caps the number of reported
    # iterations a trial may run.
    scheduler = ASHAScheduler(
        metric="accuracy",
        mode="max",
        max_t=MAX_EPOCHS,
        grace_period=1,
        reduction_factor=2
    )

    search_alg = OptunaSearch(
        metric="accuracy",
        mode="max"
    )

    resources_per_trial = {"cpu": 2, "gpu": gpus_per_trial}

    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_model),
            resources=resources_per_trial
        ),
        tune_config=tune.TuneConfig(
            scheduler=scheduler,
            search_alg=search_alg,
            num_samples=num_samples,
        ),
        param_space=config,
    )

    results = tuner.fit()

    best_result = results.get_best_result("accuracy", "max")

    print("Best trial config:", best_result.config)
    print("Best trial final validation loss:", best_result.metrics["loss"])
    print("Best trial final validation accuracy:", best_result.metrics["accuracy"])

    # Retrain from scratch with the winning configuration (in this process) and
    # persist the resulting weights.
    best_trained_model = train_model(best_result.config)
    torch.save(best_trained_model.state_dict(), "best_model.pth")
    print("Best model saved to best_model.pth")

In [None]:
# Run the main function with its defaults (num_samples=10, gpus_per_trial=1).
# This launches the full search and then retrains the best configuration, so the
# cell is long-running (the recorded run below took about 31 minutes of tuning).
if __name__ == "__main__":
    main()

[I 2024-10-02 15:51:49,794] A new study created in memory with name: optuna


Current working directory: /content
Contents of current directory: ['.config', 'chest_xray', 'ChestXRay2017.zip', '__MACOSX', 'sample_data']
Contents of chest_xray directory: ['test', 'train', '.DS_Store']
+--------------------------------------------------------------------+
| Configuration for experiment     train_model_2024-10-02_15-51-49   |
+--------------------------------------------------------------------+
| Search algorithm                 SearchGenerator                   |
| Scheduler                        AsyncHyperBandScheduler           |
| Number of trials                 10                                |
+--------------------------------------------------------------------+

View detailed results here: /root/ray_results/train_model_2024-10-02_15-51-49
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-10-02_15-50-30_166550_272/artifacts/2024-10-02_15-51-49/train_model_2024-10-02_15-51-49/driver_artifacts`

Trial status: 1 PE

[36m(train_model pid=1691)[0m Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
  0%|          | 0.00/44.7M [00:00<?, ?B/s]
 47%|████▋     | 21.0M/44.7M [00:00<00:00, 219MB/s]
100%|██████████| 44.7M/44.7M [00:00<00:00, 217MB/s]



Trial status: 1 RUNNING | 1 PENDING
Current time: 2024-10-02 15:52:20. Total running time: 30s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+---------------------------------------------------------------------------+
| Trial name             status              lr     momentum     batch_size |
+---------------------------------------------------------------------------+
| train_model_ace34fef   RUNNING    0.000121147     0.524969             32 |
| train_model_709f9d24   PENDING    0.000112429     0.771644             32 |
+---------------------------------------------------------------------------+
Trial status: 1 RUNNING | 1 PENDING
Current time: 2024-10-02 15:52:50. Total running time: 1min 0s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+--------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             sta




Trial status: 1 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:00:50. Total running time: 9min 1s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_709f9d24   RUNNING      0.000112429     0.771644             32                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25            516.306     2132   0.257978     0.895833        12 |
| train_model_521bda6d   PENDING      0.00480564  




Trial status: 2 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:01:20. Total running time: 9min 31s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_521bda6d   RUNNING      0.00480564      0.763265             16                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.000112429




Trial status: 3 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:10:21. Total running time: 18min 31s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_84a5d8d8   RUNNING      0.00101123      0.653886             16                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.00011242




Trial status: 4 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:19:22. Total running time: 27min 32s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_6dd79148   RUNNING      0.000499305     0.572462             32                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.00011242




Trial status: 5 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:19:52. Total running time: 28min 2s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_d35c65a8   RUNNING      0.00199513      0.798236             64                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.000112429




Trial status: 6 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:20:53. Total running time: 29min 3s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_fc00dce4   RUNNING      0.0692302       0.84238              32                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.000112429




Trial status: 7 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:21:23. Total running time: 29min 33s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_eb973a08   RUNNING      0.00128688      0.572295             32                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.00011242




Trial status: 8 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2024-10-02 16:21:53. Total running time: 30min 3s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_5b71b689   RUNNING      0.00493331      0.61664              64                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.000112429




Trial status: 9 TERMINATED | 1 RUNNING
Current time: 2024-10-02 16:22:53. Total running time: 31min 3s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status                lr     momentum     batch_size     iter     total time (s)     step       loss     accuracy     epoch |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_b4f3df2a   RUNNING      0.0712347       0.855546             32                                                                        |
| train_model_ace34fef   TERMINATED   0.000121147     0.524969             32       25           516.306      2132   0.257978     0.895833        12 |
| train_model_709f9d24   TERMINATED   0.000112429     0.77164

2024-10-02 16:23:10,396	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_model_2024-10-02_15-51-49' in 0.0063s.



Trial train_model_b4f3df2a completed after 1 iterations at 2024-10-02 16:23:10. Total running time: 31min 20s
+------------------------------------------------+
| Trial train_model_b4f3df2a result              |
+------------------------------------------------+
| checkpoint_dir_name                            |
| time_this_iter_s                      35.02983 |
| time_total_s                          35.02983 |
| training_iteration                           1 |
| accuracy                               0.69878 |
| epoch                                        0 |
| loss                                    1.5261 |
| phase                                    train |
| step                                       164 |
+------------------------------------------------+

Trial status: 10 TERMINATED
Current time: 2024-10-02 16:23:10. Total running time: 31min 20s
Logical resource usage: 2.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:A100)
+---------------------------------------------------



Best model saved to best_model.pth
