!pip install optuna pytorch-lightning==2.2.4 torchaudio --quiet

In [1]:
import pathlib, torch, numpy as np, librosa, optuna, torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, RichProgressBar
from sklearn.metrics import accuracy_score, f1_score

pl.seed_everything(42, workers=True)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
optuna.logging.set_verbosity(optuna.logging.INFO)
print("Device:", DEVICE)


Seed set to 42


Device: cpu


In [2]:
GTZAN_DIR = pathlib.Path("../datasets/GTZAN")
TIME_FRAMES = 128          # target width after crop / pad
N_MELS      = 128          # height

class GTZANMel(Dataset):
    def __init__(self, files, sr=22_050):
        self.files, self.sr = files, sr
        genres = sorted({p.parent.name for p in files})
        self.genre2idx = {g: i for i, g in enumerate(genres)}

    def __len__(self): return len(self.files)

    def _fix_length(self, mel):
        """Center‑crop or zero‑pad to (N_MELS, TIME_FRAMES)."""
        n_mels, t = mel.shape
        if t > TIME_FRAMES:                           # crop
            start = (t - TIME_FRAMES) // 2
            mel = mel[:, start : start + TIME_FRAMES]
        elif t < TIME_FRAMES:                         # pad
            pad_width = TIME_FRAMES - t
            mel = np.pad(mel, ((0, 0), (0, pad_width)), mode="constant")
        return mel

    def __getitem__(self, idx):
        path = self.files[idx]
        y, _   = librosa.load(path, sr=self.sr, mono=True)
        mel    = librosa.feature.melspectrogram(y, sr=self.sr,
                                                n_mels=N_MELS)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        mel_db = self._fix_length(mel_db)
        x      = torch.tensor(mel_db).unsqueeze(0).float()  # (1, 128, 128)
        y_lbl  = torch.tensor(self.genre2idx[path.parent.name])
        return x, y_lbl


In [3]:
class MusicCNN(pl.LightningModule):
    def __init__(self, lr=1e-3, dropout=.3, n_filters=32):
        super().__init__()
        self.save_hyperparameters()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, n_filters, 3, padding=1),  # 128×128
            torch.nn.BatchNorm2d(n_filters), torch.nn.ReLU(), torch.nn.MaxPool2d(2),
            torch.nn.Conv2d(n_filters, n_filters*2, 3, padding=1),  # 64×64
            torch.nn.BatchNorm2d(n_filters*2), torch.nn.ReLU(), torch.nn.MaxPool2d(2)
        )
        self.drop = torch.nn.Dropout(dropout)
        self.fc   = torch.nn.Linear((n_filters*2)*32*32, 10)        # 32×32 flat

    def forward(self, x):
        return self.fc(self.drop(torch.flatten(self.conv(x), 1)))

    def _step(self, batch):
        x, y = batch; logits = self(x)
        loss = F.cross_entropy(logits, y)
        acc  = (logits.argmax(1) == y).float().mean()
        return loss, acc

    def training_step(self, batch, _):  l, a = self._step(batch); self.log_dict({"train_loss": l, "train_acc": a}); return l
    def validation_step(self, batch, _): l, a = self._step(batch); self.log_dict({"val_loss": l, "val_acc": a}, prog_bar=True)
    def configure_optimizers(self):      return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


In [4]:
def build_loaders(val_frac=.1, test_frac=.1, batch=16, sample_frac=1.0):
    all_files = np.array(sorted(GTZAN_DIR.rglob("*.au")))
    rng = np.random.default_rng(42); rng.shuffle(all_files)

    n_tot = len(all_files)
    n_test, n_val = int(n_tot*test_frac), int(n_tot*val_frac)
    test, val, train = (all_files[:n_test],
                        all_files[n_test:n_test+n_val],
                        all_files[n_test+n_val:])

    if sample_frac < 1.0:
        train = train[: int(len(train)*sample_frac)]

    def loader(files, shuffle):
        return DataLoader(GTZANMel(list(files)), batch_size=batch,
                          shuffle=shuffle, num_workers=0)

    return loader(train, True), loader(val, False), loader(test, False)


In [None]:
from pytorch_lightning.callbacks import TQDMProgressBar
def objective(trial):
    # search space
    lr        = trial.suggest_float("lr", 1e-4, 3e-3, log=True)
    dropout   = trial.suggest_float("dropout", .1, .5)
    n_filters = trial.suggest_categorical("n_filters", [16, 32, 48])
    batch     = trial.suggest_categorical("batch", [8, 16, 32])

    train_ld, val_ld, _ = build_loaders(batch=batch, sample_frac=1.0)  
    model  = MusicCNN(lr=lr, dropout=dropout, n_filters=n_filters)

    ckpt  = ModelCheckpoint(monitor="val_acc", mode="max")
    early = EarlyStopping(monitor="val_acc", mode="max", patience=3)
    trainer = Trainer(max_epochs=20, accelerator=DEVICE,
                      callbacks=[ckpt, early, TQDMProgressBar(refresh_rate=10)],
                      logger=False)
            
    trainer.fit(model, train_ld, val_ld)
    trial.set_user_attr("best_model_path", ckpt.best_model_path)
    return ckpt.best_model_score.item()


In [None]:
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.RandomSampler(),
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=2, n_warmup_steps=1))
study.optimize(objective, n_trials=5, timeout=20*60)

print("Best trial:", study.best_trial.number,
      "| val_acc:", study.best_value,
      "| params:", study.best_params)


[I 2025-04-17 14:28:30,458] A new study created in memory with name: no-name-aa5de83f-21dc-4c7e-9b28-546bd617719e
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\sachi\OneDrive\Documents\music-genre-classification-and-recommendation\venv\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:654: Checkpoint directory c:\Users\sachi\OneDrive\Documents\music-genre-classification-and-recommendation\notebooks\checkpoints exists and is not empty.

  | Name | Type       | Params | Mode 
--------------------------------------------
0 | conv | Sequential | 4.9 K  | train
1 | drop | Dropout    | 0      | train
2 | fc   | Linear     | 327 K  | train
--------------------------------------------
332 K     Trainable params
0         Non-trainable params
332 K     Total params
1.330     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\sachi\OneDrive\Documents\music-genre-classification-and-recommendation\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.00958252] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.00109863] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.02426147] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.04885864] as keyword args. From version 0.10 passing these as positional

Training: |          | 0/? [00:00<?, ?it/s]

  0.11502075] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.40567017] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.16949463] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.18206787] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.02993774] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.09066772] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=s

Validation: |          | 0/? [00:00<?, ?it/s]

  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.04608154] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.67718506] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.3480835 ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.01708984] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.08422852] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.6029663 ] as keyword args. From version 0.10 passing these as positional arguments will result in a

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-04-17 14:34:05,880] Trial 0 finished with value: 0.6000000238418579 and parameters: {'lr': 0.00042294727289485874, 'dropout': 0.35145183962082294, 'n_filters': 16, 'batch': 8}. Best is trial 0 with value: 0.6000000238418579.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type       | Params | Mode 
--------------------------------------------
0 | conv | Sequential | 4.9 K  | train
1 | drop | Dropout    | 0      | train
2 | fc   | Linear     | 327 K  | train
--------------------------------------------
332 K     Trainable params
0         Non-trainable params
332 K     Total params
1.330     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-04-17 14:39:43,929] Trial 1 finished with value: 0.6499999761581421 and parameters: {'lr': 0.00029156488619735445, 'dropout': 0.10866984675135237, 'n_filters': 16, 'batch': 16}. Best is trial 1 with value: 0.6499999761581421.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type       | Params | Mode 
--------------------------------------------
0 | conv | Sequential | 19.0 K | train
1 | drop | Dropout    | 0      | train
2 | fc   | Linear     | 655 K  | train
--------------------------------------------
674 K     Trainable params
0         Non-trainable params
674 K     Total params
2.698     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-04-17 14:43:57,580] Trial 2 finished with value: 0.5699999928474426 and parameters: {'lr': 0.00014215518819342169, 'dropout': 0.4715447220276513, 'n_filters': 32, 'batch': 32}. Best is trial 1 with value: 0.6499999761581421.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type       | Params | Mode 
--------------------------------------------
0 | conv | Sequential | 42.3 K | train
1 | drop | Dropout    | 0      | train
2 | fc   | Linear     | 983 K  | train
--------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.102     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-04-17 14:52:24,705] Trial 3 finished with value: 0.5600000023841858 and parameters: {'lr': 0.00022920757264827886, 'dropout': 0.17664993903088747, 'n_filters': 48, 'batch': 8}. Best is trial 1 with value: 0.6499999761581421.


Best trial: 1 | val_acc: 0.6499999761581421 | params: {'lr': 0.00029156488619735445, 'dropout': 0.10866984675135237, 'n_filters': 16, 'batch': 16}


In [7]:
import pathlib, shutil
import warnings, librosa
warnings.filterwarnings("ignore", category=FutureWarning, module="librosa")

best_ckpt = study.best_trial.user_attrs["best_model_path"]
best_model = MusicCNN.load_from_checkpoint(best_ckpt).to(DEVICE).eval()

_, _, test_ld = build_loaders(batch=study.best_params["batch"], sample_frac=1.0)

preds, targets = [], []
with torch.no_grad():
    for x, y in test_ld:
        logits = best_model(x.to(DEVICE))
        preds.extend(logits.argmax(1).cpu().numpy())
        targets.extend(y.numpy())

print("TEST  -  Acc: {:.4f}  |  Macro‑F1: {:.4f}".format(
      accuracy_score(targets, preds),
      f1_score(targets, preds, average="macro")))


final_ckpt_src = best_ckpt                         
final_ckpt_dst = pathlib.Path(
    "../notebooks/checkpoints/best_cnn.ckpt")      
final_ckpt_dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(final_ckpt_src, final_ckpt_dst)

print("Checkpoint saved to:", final_ckpt_dst)



  0.01364136] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.15994263] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
  0.04144287] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.07458496] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
 -0.05703735] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, s

TEST  -  Acc: 0.6200  |  Macro‑F1: 0.5909
Checkpoint saved to: ..\notebooks\checkpoints\best_cnn.ckpt


  0.21017456] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel    = librosa.feature.melspectrogram(y, sr=self.sr,
