In [None]:
# Replace with your token and repo URL
token = "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"
!git clone https://{token}@github.com/puru-samal/thesis.git


In [1]:
import torch
import yaml
from data import DrumMIDIDataset
from torch.utils.data import DataLoader
import numpy as np
from torchinfo import summary
from models.IQAE import IQAE
from trainers import IQAE_Trainer
from utils import create_optimizer, create_scheduler
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


# Config

In [2]:
%%writefile _configs/config_test.yaml

# Experiment metadata. The notebook passes `expt` to the trainer as its run name.
name : "Puru"
expt : "Test"

###### Dataset -----------------------------------------------------------------
# Dataset locations and loading options, read by the notebook when it builds
# the DrumMIDIDataset objects and their DataLoaders.
data:
    # Serialized train / validation / test splits.
    train_path: "dataset/serialized/merged_ts=4-4_tr0.80-va0.10-te0.10_train.pkl"
    val_path: "dataset/serialized/merged_ts=4-4_tr0.80-va0.10-te0.10_val.pkl"
    test_path: "dataset/serialized/merged_ts=4-4_tr0.80-va0.10-te0.10_test.pkl"
    # num_bars and steps_per_quarter also determine the sequence length the
    # notebook computes: num_bars * 4 * steps_per_quarter + 1.
    num_bars: 2
    feature_type: "fixed"  # [fixed, flexible]
    steps_per_quarter: 4
    subset: 0.05            # Fraction of the dataset to load
    num_workers: 0         # Number of workers for data loading
    batch_size: 16         # Batch size

###### Network Specs -------------------------------------------------------------
# Keyword arguments for the IQAE model. The notebook adds T, E and M to this
# mapping at runtime (from the computed max length and the grid shape) before
# unpacking it into the constructor, so they are intentionally absent here.
model:
    embed_dim: 32
    encoder_depth: 1
    encoder_heads: 1
    decoder_depth: 1
    decoder_heads: 1
    # NOTE(review): meaning of num_buttons is not visible from this notebook — confirm against models/IQAE.
    num_buttons: 3

###### Common Training Parameters ------------------------------------------------
# Settings shared by the training run. `config_file` is handed to the trainer
# and `gradient_accumulation_steps` to the scheduler factory by the notebook.
# NOTE(review): `resume` is True while `wandb_run_id` is "none"; per the comment
# on `resume`, resuming needs a real run id — confirm the trainer ignores
# `resume` in this case.
training:
  config_file                 : "_configs/config_test.yaml"
  use_wandb                   : False   # Toggle wandb logging
  wandb_run_id                : "none" # "none" or "run_id"
  resume                      : True   # Resume an existing run (run_id != 'none')
  gradient_accumulation_steps : 1
  wandb_project               : "Set-Project-Name-Here" # wandb project to log to

###### Loss ----------------------------------------------------------------------
# NOTE(review): this section was previously described as plain cross-entropy,
# but the training log reports hit_bce, velocity_mse, offset_mse, margin_loss
# and temporal_loss terms — confirm which of these keys the trainer still reads.
loss: # Loss hyperparameters
  label_smoothing: 0.0
  ctc_weight: 0.2

###### Optimizer -----------------------------------------------------------------
# Optimizer settings, passed as a whole to create_optimizer by the notebook.
optimizer:
  name: "adamw" # Options: sgd, adam, adamw
  lr: 0.0004    # Base learning rate

  # Common parameters
  weight_decay: 0.000001

  # Parameter groups
  # Add as many groups as needed; each sets its own learning rate and name patterns.
  param_groups:
    - name: self_attn
      # Empty list: no name patterns are configured for this group.
      # NOTE(review): presumably nothing matches, so every parameter lands in the
      # default group at the base LR — verify against create_optimizer.
      patterns: []
      lr: 0.0002    # LR for self_attn
      layer_decay:
        enabled: False
        decay_rate: 0.8

    - name: ffn
      # Empty list: no name patterns are configured for this group (see review note on self_attn).
      patterns: []
      lr: 0.0002  # LR for ffn
      layer_decay:
        enabled: False
        decay_rate: 0.8

  # Layer-wise learning rates
  layer_decay:
    enabled: False
    decay_rate: 0.75

  # SGD specific parameters
  sgd:
    momentum: 0.9
    nesterov: True
    dampening: 0

  # Adam specific parameters
  adam:
    betas: [0.9, 0.999]
    eps: 1.0e-8
    amsgrad: False

  # AdamW specific parameters
  adamw:
    betas: [0.9, 0.999]
    eps: 1.0e-8
    amsgrad: False

###### Scheduler -----------------------------------------------------------------
# Learning-rate scheduler settings, passed as a whole to create_scheduler by
# the notebook. Only the sub-section named by `name` is expected to be active;
# `warmup` can be combined with any of them.
scheduler:
  name: "cosine"  # Options: reduce_lr, cosine, cosine_warm

  # ReduceLROnPlateau specific parameters
  reduce_lr:
    mode: "min"  # Options: min, max
    factor: 0.1  # Factor to reduce learning rate by
    patience: 10  # Number of epochs with no improvement after which LR will be reduced
    threshold: 0.0001  # Threshold for measuring the new optimum
    threshold_mode: "rel"  # Options: rel, abs
    cooldown: 0  # Number of epochs to wait before resuming normal operation
    min_lr: 0.0000001  # Minimum learning rate
    # Keep the decimal point: PyYAML (YAML 1.1) loads a bare `1e-8` as the
    # string "1e-8" rather than a float. Matches the optimizer sections.
    eps: 1.0e-8  # Minimal decay applied to lr

  # CosineAnnealingLR specific parameters
  cosine:
    # The scheduler setup log reports this value in epochs alongside the
    # equivalent number of optimizer steps.
    T_max: 15  # Annealing period, in epochs
    eta_min: 0.0000001  # Minimum learning rate
    last_epoch: -1

  # CosineAnnealingWarmRestarts specific parameters
  # NOTE(review): T_0 may also be interpreted in epochs like T_max — confirm in create_scheduler.
  cosine_warm:
    T_0: 10    # Number of iterations for the first restart
    T_mult: 10 # Factor increasing T_i after each restart
    eta_min: 0.0000001  # Minimum learning rate
    last_epoch: -1

  # Warmup parameters (can be used with any scheduler)
  warmup:
    enabled: True
    type: "exponential"  # Options: linear, exponential
    epochs: 5  # Warmup duration in epochs
    start_factor: 0.1
    end_factor: 1.0


Overwriting _configs/config_test.yaml


In [3]:
# Parse the YAML written by the %%writefile cell into a plain dict.
# %%writefile always writes UTF-8, so read it back as UTF-8 explicitly instead
# of relying on the platform's default encoding.
with open('_configs/config_test.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Datasets / Dataloader

In [4]:
## Load Datasets
def _load_split(path_key):
    """Build the DrumMIDIDataset whose file path is stored under config["data"][path_key].

    All other constructor arguments are shared by every split and are read
    from the same config["data"] section.
    """
    data_cfg = config["data"]
    return DrumMIDIDataset(
        path              = data_cfg[path_key],
        num_bars          = data_cfg["num_bars"],
        feature_type      = data_cfg["feature_type"],
        steps_per_quarter = data_cfg["steps_per_quarter"],
        subset            = data_cfg["subset"],
    )

# Built in train -> val -> test order.
train_dataset = _load_split("train_path")
val_dataset   = _load_split("val_path")
test_dataset  = _load_split("test_path")

## Create DataLoaders
def _make_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader configured from config["data"].

    Args:
        dataset: dataset exposing `collate_fn(batch, dataset)`.
        shuffle: whether to reshuffle the samples every epoch.

    Returns:
        A DataLoader using the configured batch size and worker count.
    """
    return DataLoader(
        dataset,
        batch_size  = config["data"]["batch_size"],
        num_workers = config["data"]["num_workers"],
        shuffle     = shuffle,
        # The dataset's collate function takes the dataset itself as second argument.
        collate_fn  = lambda batch: dataset.collate_fn(batch, dataset),
        # Pinned host memory only helps host-to-GPU copies; requesting it on a
        # CPU-only machine just triggers a warning.
        pin_memory  = torch.cuda.is_available(),
    )

# Only the training loader is shuffled; evaluation loaders keep a fixed order
# so validation and test passes are repeatable.
train_loader = _make_loader(train_dataset, shuffle=True)
val_loader   = _make_loader(val_dataset,   shuffle=False)
test_loader  = _make_loader(test_dataset,  shuffle=False)

## Test a sample
# Pull exactly one batch from the training loader. `grid` is reused by the
# Model cell (for the model dimensions and as the example input to `summary`),
# so fail loudly here instead of leaving it undefined or stale.
batch = next(iter(train_loader), None)
if batch is None:
    raise RuntimeError(
        "train_loader yielded no batches; check data.train_path and data.subset in the config"
    )

grid = batch['grid']
print(f"Grid shape: {grid.shape}")
print(f"Samples shape: {len(batch['samples'])}")

# Audition one randomly chosen sample from the batch.
random_idx = np.random.randint(len(batch['samples']))
sample = batch['samples'][random_idx]
sample.feature.play()

Loading dataset from: dataset/serialized/merged_ts=4-4_tr0.80-va0.10-te0.10_train.pkl...
Processing 6578 samples...


Accumulating:: 100%|██████████| 6578/6578 [00:01<00:00, 5399.37sample/s]


Skipped 0 samples due to errors.
Loaded and processed 6578 samples.

Number of negative samples in fixed grid: 6778010
Number of positive samples in fixed grid: 398096
pos_weight: 17.02606964111328
Loading dataset from: dataset/serialized/merged_ts=4-4_tr0.80-va0.10-te0.10_val.pkl...
Processing 813 samples...


Accumulating:: 100%|██████████| 813/813 [00:00<00:00, 5500.94sample/s]

Skipped 0 samples due to errors.
Loaded and processed 813 samples.






Number of negative samples in fixed grid: 806458
Number of positive samples in fixed grid: 45316
pos_weight: 17.79631996154785
Loading dataset from: dataset/serialized/merged_ts=4-4_tr0.80-va0.10-te0.10_test.pkl...
Processing 812 samples...


Accumulating:: 100%|██████████| 812/812 [00:00<00:00, 5198.33sample/s]

Skipped 0 samples due to errors.
Loaded and processed 812 samples.






Number of negative samples in fixed grid: 871512
Number of positive samples in fixed grid: 50314
pos_weight: 17.321460723876953
Grid shape: torch.Size([16, 33, 9, 3])
Samples shape: 16


In [5]:
# Sequence length: grid steps across all bars, plus one extra position.
# NOTE(review): the purpose of the extra +1 step is not visible in this
# notebook — confirm against the dataset code.
NUM_QUARTERS_PER_BAR = 4 # 4/4 time signature
steps_per_bar = NUM_QUARTERS_PER_BAR * config["data"]["steps_per_quarter"]
MAX_LENGTH = config["data"]["num_bars"] * steps_per_bar + 1
print(f"Max length: {MAX_LENGTH}")

Max length: 33


# Model

In [6]:
# `model_config` aliases config["model"], so the keys added below are also
# visible to anything that later receives `config`.
model_config = config["model"]
model_config["T"] = MAX_LENGTH       # sequence length computed in the previous cell
model_config["E"] = grid.shape[2]    # third dimension of the sample grid
model_config["M"] = grid.shape[3]    # fourth dimension of the sample grid

model = IQAE(**model_config)
# Layer-by-layer summary, using the sample batch grid as example input.
summary(model, input_data = grid, device = "cpu")

Layer (type:depth-idx)                                  Output Shape              Param #
IQAE                                                    [16, 33, 9]               27
├─DrumAxialTransformer: 1-1                             [16, 33, 9, 32]           --
│    └─Conv2d: 2-1                                      [16, 32, 33, 9]           128
│    └─AxialPositionalEmbedding: 2-2                    [16, 32, 33, 9]           1,344
│    └─Sequential: 2-3                                  [16, 32, 33, 9]           --
│    │    └─ModuleList: 3-1                             --                        156,288
├─AdaptiveAvgPool2d: 1-2                                [16, 33, 3, 3]            --
├─Linear: 1-3                                           [16, 33, 2]               20
├─IntegerQuantizer: 1-4                                 [16, 33]                  --
├─Linear: 1-5                                           [16, 33, 32]              896
├─PositionalEncoding: 1-6                         

# Training

In [7]:
# Build the trainer first, then attach the optimizer and the LR scheduler.
trainer = IQAE_Trainer(
    model       = model,
    config      = config,
    run_name    = config["expt"],
    config_file = config['training']['config_file'],
    device      = device,
)

# Optimizer built from the `optimizer` section of the config.
optimizer = create_optimizer(model=model, opt_config=config['optimizer'])
trainer.set_optimizer(optimizer)

# The scheduler wraps the optimizer held by the trainer, and receives the
# training loader and the gradient accumulation setting.
lr_scheduler = create_scheduler(
    optimizer=trainer.optimizer,
    scheduler_config=config['scheduler'],
    train_loader=train_loader,
    gradient_accumulation_steps=config['training']['gradient_accumulation_steps'],
)
trainer.set_scheduler(lr_scheduler)


Using device: cpu

🔧 Configuring Optimizer:
├── Type: ADAMW
├── Base LR: 0.0004
├── Weight Decay: 1e-06
├── Parameter Groups:
│   ├── Group: self_attn
│   │   ├── LR: 0.0002
│   │   └── Patterns: []
│   ├── Group: ffn
│   │   ├── LR: 0.0002
│   │   └── Patterns: []
│   └── Default Group (unmatched parameters)
└── AdamW Specific:
    ├── Betas: [0.9, 0.999]
    ├── Epsilon: 1e-08
    └── AMSGrad: False

📈 Configuring Learning Rate Scheduler:
├── Type: COSINE
├── Cosine Annealing Settings:
│   ├── T_max: 15 epochs (6180 steps)
│   └── Min LR: 1e-07
├── Warmup Settings:
│   ├── Duration: 5 epochs (2060 steps)
│   ├── Start Factor: 0.1
│   └── End Factor: 1.0


In [8]:
# Run a single training epoch as a quick end-to-end check.
# NOTE(review): the config sets 5 warmup epochs and T_max of 15 epochs, so a
# 1-epoch run stops during warmup — raise `epochs` for a real training run.
trainer.train(train_loader, val_loader, epochs=1)

                                                                                                                                                                                                                                                    

max_length: 33


                                                            


📊 Metrics (Epoch 0):
├── TRAIN:
│   ├── hit_bce: 1.3998
│   ├── joint_loss: 1.6342
│   ├── margin_loss: 0.0000
│   ├── offset_mse: 0.0388
│   ├── temporal_loss: 0.0000
│   └── velocity_mse: 0.1955
└── VAL:
    ├── hit_acc: 0.7382
    ├── hit_f1: 0.3996
    ├── hit_ppv: 0.2708
    ├── hit_tpr: 0.8475
    ├── offset_mse: 0.0043
    └── velocity_mse: 0.1315
└── TRAINING:
    └── learning_rate: 0.000112
