## Imports

In [1]:
import os

from mmaction.datasets import build_dataset, build_dataloader
from mmaction.models import build_model
from mmcv import Config

  from .autonotebook import tqdm as notebook_tqdm


## Loading batches

In [2]:
cfg = Config.fromfile('./baseline.py')

In [3]:
os.chdir('../../..')

In [4]:
cfg.data.train

{'type': 'RawframeDataset',
 'ann_file': 'data/hmdb51/annotation_train.txt',
 'data_prefix': 'data/hmdb51/rawframes',
 'pipeline': [{'type': 'SampleFrames',
   'clip_len': 32,
   'frame_interval': 2,
   'num_clips': 1},
  {'type': 'RawFrameDecode'},
  {'type': 'Resize', 'scale': (-1, 256)},
  {'type': 'RandomResizedCrop'},
  {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': False},
  {'type': 'Flip', 'flip_ratio': 0.5},
  {'type': 'Normalize',
   'mean': [123.675, 116.28, 103.53],
   'std': [58.395, 57.12, 57.375],
   'to_bgr': False},
  {'type': 'FormatShape', 'input_format': 'NCTHW'},
  {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
  {'type': 'ToTensor', 'keys': ['imgs', 'label']}]}

In [5]:
train_dataset = build_dataset(cfg=cfg.data.train)
train_loader = build_dataloader(
        train_dataset,
        videos_per_gpu=10,
        workers_per_gpu=4,
        persistent_workers=False,
        num_gpus=1,
        dist=False)

val_dataset = build_dataset(cfg=cfg.data.val)
val_loader = build_dataloader(
        val_dataset,
        videos_per_gpu=1,
        workers_per_gpu=4,
        persistent_workers=False,
        num_gpus=1,
        dist=False)

## Learning Hyperparameters

In [6]:
import optuna
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from mmcv import Config
import torch
import os
import logging

# Configure logging
logging.basicConfig(filename='optuna_training.log', 
                    filemode='w', 
                    format='%(asctime)s - %(levelname)s - %(message)s', 
                    level=logging.INFO)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Database file path for saving study
db_file = "sqlite:///optuna_study.db"

# Set up study with the option to resume if it already exists
study = optuna.create_study(
    direction="maximize", 
    study_name="my_study", 
    storage=db_file,
    load_if_exists=True
)

def objective(trial):
    # Hyperparameters to tune
    dropout_ratio = trial.suggest_float("dropout_ratio", 0.3, 0.7)
    lr = trial.suggest_loguniform("lr", 1e-5, 1e-3)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.05, 0.2)
    max_norm = trial.suggest_int("max_norm", 10, 50)
    
    # Backbone parameters
    cfg.model.backbone.with_pool2 = trial.suggest_categorical("with_pool2", [True, False])
    cfg.model.backbone.bottleneck_mode = trial.suggest_categorical("bottleneck_mode", ["ir", "ip"])
    cfg.model.backbone.norm_eval = trial.suggest_categorical("norm_eval", [True, False])
    cfg.model.backbone.bn_frozen = trial.suggest_categorical("bn_frozen", [True, False])
    
    # Fixed pretrained URL
    cfg.model.backbone.pretrained = 'https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth'

    # Adjust config parameters
    cfg.model.cls_head.dropout_ratio = dropout_ratio
    warmup_start_lr = lr * warmup_ratio
    initial_lr = lr
    
    # Initialize model, criterion, optimizer, scheduler
    model = build_model(cfg.model, train_cfg=None, test_cfg=cfg.get('test_cfg')).to(device)
    optimizer = optim.SGD(
        model.parameters(),
        lr=initial_lr,
        momentum=0.9,
        weight_decay=0.0001
    )
    scheduler = MultiStepLR(optimizer, milestones=[70, 140], gamma=0.1)
    
    # Warmup settings
    warmup_epochs = 16

    # Training and validation
    total_epochs = 35
    eval_interval = 5
    
    for epoch in range(total_epochs):
        if epoch < warmup_epochs:
            warmup_lr = warmup_start_lr + (initial_lr - warmup_start_lr) * (epoch / warmup_epochs)
            for param_group in optimizer.param_groups:
                param_group['lr'] = warmup_lr
        else:
            scheduler.step()
        
        # Training loop
        model.train()
        running_loss, correct, total = 0.0, 0, 0
        for i, data in enumerate(train_loader):
            inputs, labels = data['imgs'].to(device), data['label'].to(device)
            
            optimizer.zero_grad()
            results = model(inputs, labels, return_loss=True)
            loss = results['loss_cls']
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            optimizer.step()

            running_loss += loss.item()
            correct += (results['top1_acc'] * inputs.size(0))
            total += inputs.size(0)

        train_accuracy = correct / total
        train_loss = running_loss / len(train_loader)

        logging.info(f"Epoch [{epoch + 1}/{total_epochs}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

        # Validation loop (every `eval_interval` epochs)
        if (epoch + 1) % eval_interval == 0:
            model.eval()
            val_running_loss, val_correct, val_total = 0.0, 0, 0
            with torch.no_grad():
                for val_data in val_loader:
                    val_inputs, val_labels = val_data['imgs'].to(device), val_data['label'].to(device)
                    
                    val_results = model(val_inputs, val_labels, return_loss=True)
                    val_loss = val_results['loss_cls']
                    val_running_loss += val_loss.item()
                    val_correct += (val_results['top1_acc'] * val_inputs.size(0))
                    val_total += val_inputs.size(0)

            val_accuracy = val_correct / val_total
            val_loss = val_running_loss / len(val_loader)

            # Report validation accuracy to Optuna
            trial.report(val_accuracy, epoch)

            # Prune unpromising trials
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()
    
    return val_accuracy 

# Run Optuna Study
study.optimize(objective, n_trials=50)

logging.info("Best hyperparameters: %s", study.best_params)
logging.info("Best validation accuracy: %f", study.best_value)

[I 2024-11-05 07:47:03,482] Using an existing study with name 'my_study' instead of creating a new one.
  lr = trial.suggest_loguniform("lr", 1e-5, 1e-3)
2024-11-05 07:47:03,924 - mmaction - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2024-11-05 07:47:03,925 - mmaction - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth

size mismatch for layer1.0.conv2.0.conv.weight: copying a param with shape torch.Size([64, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([64, 64, 1, 1, 1]).
size mismatch for layer1.1.conv2.0.conv.weight: copying a param with shape torch.Size([64, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([64, 64, 1, 1, 1]).
size mismatch for layer1.2.conv2.0.conv.weight: copying a param with shape torch.Size([64, 1, 3, 3, 3]) from checkpoint, the shape

KeyboardInterrupt: 

[I 2024-11-05 02:02:27,080] Using an existing study with name 'my_study' instead of creating a new one.
/tmp/ipykernel_930960/1812890224.py:33: FutureWarning: suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.
  lr = trial.suggest_loguniform("lr", 1e-5, 1e-3)
2024-11-05 02:02:27,513 - mmaction - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2024-11-05 02:02:27,514 - mmaction - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
[I 2024-11-05 03:03:39,859] Trial 2 finished with value: 0.6947712418300653 and parameters: {'dropout_ratio': 0.37629465362426134, 'lr': 7.80756602206136e-05, 'warmup_ratio': 0.11381606982740221, 'max_norm': 39, 'with_pool2': True, 'bottleneck_mode': 'ir', 'norm_eval': True, 'bn_frozen': True}. Best is trial 0 with value: 0.7588235294117647.
2024-11-05 03:03:40,235 - mmaction - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2024-11-05 03:03:40,236 - mmaction - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2024-11-05 03:03:40,268 - mmaction - WARNING - The model and loaded state dict do not match exactly

size mismatch for layer1.0.conv2.0.conv.weight: copying a param with shape torch.Size([64, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([64, 64, 1, 1, 1]).
size mismatch for layer1.1.conv2.0.conv.weight: copying a param with shape torch.Size([64, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([64, 64, 1, 1, 1]).
size mismatch for layer1.2.conv2.0.conv.weight: copying a param with shape torch.Size([64, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([64, 64, 1, 1, 1]).
size mismatch for layer2.0.conv2.0.conv.weight: copying a param with shape torch.Size([128, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 128, 1, 1, 1]).
size mismatch for layer2.1.conv2.0.conv.weight: copying a param with shape torch.Size([128, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 128, 1, 1, 1]).
size mismatch for layer2.2.conv2.0.conv.weight: copying a param with shape torch.Size([128, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 128, 1, 1, 1]).
size mismatch for layer2.3.conv2.0.conv.weight: copying a param with shape torch.Size([128, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([128, 128, 1, 1, 1]).
size mismatch for layer3.0.conv2.0.conv.weight: copying a param with shape torch.Size([256, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([256, 256, 1, 1, 1]).
size mismatch for layer3.1.conv2.0.conv.weight: copying a param with shape torch.Size([256, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([256, 256, 1, 1, 1]).
size mismatch for layer3.2.conv2.0.conv.weight: copying a param with shape torch.Size([256, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([256, 256, 1, 1, 1]).
size mismatch for layer3.3.conv2.0.conv.weight: copying a param with shape torch.Size([256, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([256, 256, 1, 1, 1]).
size mismatch for layer3.4.conv2.0.conv.weight: copying a param with shape torch.Size([256, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([256, 256, 1, 1, 1]).
size mismatch for layer3.5.conv2.0.conv.weight: copying a param with shape torch.Size([256, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([256, 256, 1, 1, 1]).
size mismatch for layer4.0.conv2.0.conv.weight: copying a param with shape torch.Size([512, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([512, 512, 1, 1, 1]).
size mismatch for layer4.1.conv2.0.conv.weight: copying a param with shape torch.Size([512, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([512, 512, 1, 1, 1]).
size mismatch for layer4.2.conv2.0.conv.weight: copying a param with shape torch.Size([512, 1, 3, 3, 3]) from checkpoint, the shape in current model is torch.Size([512, 512, 1, 1, 1]).
missing keys in source state_dict: layer1.0.conv2.1.conv.weight, layer1.0.conv2.1.bn.weight, layer1.0.conv2.1.bn.bias, layer1.0.conv2.1.bn.running_mean, layer1.0.conv2.1.bn.running_var, layer1.1.conv2.1.conv.weight, layer1.1.conv2.1.bn.weight, layer1.1.conv2.1.bn.bias, layer1.1.conv2.1.bn.running_mean, layer1.1.conv2.1.bn.running_var, layer1.2.conv2.1.conv.weight, layer1.2.conv2.1.bn.weight, layer1.2.conv2.1.bn.bias, layer1.2.conv2.1.bn.running_mean, layer1.2.conv2.1.bn.running_var, layer2.0.conv2.1.conv.weight, layer2.0.conv2.1.bn.weight, layer2.0.conv2.1.bn.bias, layer2.0.conv2.1.bn.running_mean, layer2.0.conv2.1.bn.running_var, layer2.1.conv2.1.conv.weight, layer2.1.conv2.1.bn.weight, layer2.1.conv2.1.bn.bias, layer2.1.conv2.1.bn.running_mean, layer2.1.conv2.1.bn.running_var, layer2.2.conv2.1.conv.weight, layer2.2.conv2.1.bn.weight, layer2.2.conv2.1.bn.bias, layer2.2.conv2.1.bn.running_mean, layer2.2.conv2.1.bn.running_var, layer2.3.conv2.1.conv.weight, layer2.3.conv2.1.bn.weight, layer2.3.conv2.1.bn.bias, layer2.3.conv2.1.bn.running_mean, layer2.3.conv2.1.bn.running_var, layer3.0.conv2.1.conv.weight, layer3.0.conv2.1.bn.weight, layer3.0.conv2.1.bn.bias, layer3.0.conv2.1.bn.running_mean, layer3.0.conv2.1.bn.running_var, layer3.1.conv2.1.conv.weight, layer3.1.conv2.1.bn.weight, layer3.1.conv2.1.bn.bias, layer3.1.conv2.1.bn.running_mean, layer3.1.conv2.1.bn.running_var, layer3.2.conv2.1.conv.weight, layer3.2.conv2.1.bn.weight, layer3.2.conv2.1.bn.bias, layer3.2.conv2.1.bn.running_mean, layer3.2.conv2.1.bn.running_var, layer3.3.conv2.1.conv.weight, layer3.3.conv2.1.bn.weight, layer3.3.conv2.1.bn.bias, layer3.3.conv2.1.bn.running_mean, layer3.3.conv2.1.bn.running_var, layer3.4.conv2.1.conv.weight, layer3.4.conv2.1.bn.weight, layer3.4.conv2.1.bn.bias, layer3.4.conv2.1.bn.running_mean, layer3.4.conv2.1.bn.running_var, layer3.5.conv2.1.conv.weight, layer3.5.conv2.1.bn.weight, layer3.5.conv2.1.bn.bias, layer3.5.conv2.1.bn.running_mean, layer3.5.conv2.1.bn.running_var, layer4.0.conv2.1.conv.weight, layer4.0.conv2.1.bn.weight, layer4.0.conv2.1.bn.bias, layer4.0.conv2.1.bn.running_mean, layer4.0.conv2.1.bn.running_var, layer4.1.conv2.1.conv.weight, layer4.1.conv2.1.bn.weight, layer4.1.conv2.1.bn.bias, layer4.1.conv2.1.bn.running_mean, layer4.1.conv2.1.bn.running_var, layer4.2.conv2.1.conv.weight, layer4.2.conv2.1.bn.weight, layer4.2.conv2.1.bn.bias, layer4.2.conv2.1.bn.running_mean, layer4.2.conv2.1.bn.running_var

[I 2024-11-05 04:17:34,351] Trial 3 finished with value: 0.19607843137254902 and parameters: {'dropout_ratio': 0.6300026158652796, 'lr': 0.0007329990590709254, 'warmup_ratio': 0.07932626660238631, 'max_norm': 23, 'with_pool2': True, 'bottleneck_mode': 'ip', 'norm_eval': False, 'bn_frozen': False}. Best is trial 0 with value: 0.7588235294117647.
2024-11-05 04:17:34,722 - mmaction - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2024-11-05 04:17:34,723 - mmaction - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
[I 2024-11-05 05:25:47,516] Trial 4 finished with value: 0.7039215686274509 and parameters: {'dropout_ratio': 0.6152719310623556, 'lr': 0.00041679938242399035, 'warmup_ratio': 0.17738656176381717, 'max_norm': 29, 'with_pool2': True, 'bottleneck_mode': 'ir', 'norm_eval': False, 'bn_frozen': True}. Best is trial 0 with value: 0.7588235294117647.
2024-11-05 05:25:47,878 - mmaction - INFO - load model from: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
2024-11-05 05:25:47,879 - mmaction - INFO - load checkpoint from http path: https://download.openmmlab.com/mmaction/recognition/csn/ircsn_from_scratch_r50_ig65m_20210617-ce545a37.pth
[W 2024-11-05 05:35:54,072] Trial 5 failed with parameters: {'dropout_ratio': 0.5687442299426126, 'lr': 0.0001458748799450692, 'warmup_ratio': 0.1773074887200759, 'max_norm': 34, 'with_pool2': False, 'bottleneck_mode': 'ir', 'norm_eval': False, 'bn_frozen': False} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):