In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os
from torch.utils.data import Dataset, DataLoader, random_split

In [8]:
class MelSpectrogramDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.samples = []
        self.transform = transform
        self.label_map = {}

        for label_idx, label in enumerate(sorted(os.listdir(root_dir))):
            label_path = os.path.join(root_dir, label, "mel_spec_tensor")
            if not os.path.isdir(label_path):
                continue

            self.label_map[label] = label_idx
            for file in os.listdir(label_path):
                if file.endswith(".pt"):
                    self.samples.append({
                        "path": os.path.join(label_path, file),
                        "label": label_idx
                    })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        mel_tensor = torch.load(sample["path"])  # shape: (1, 128, 256)
        label = sample["label"]

        if self.transform:
            mel_tensor = self.transform(mel_tensor)

        return mel_tensor, label


In [9]:
def get_dataloaders(root_dir, batch_size=32, val_frac=0.1, test_frac=0.1):
    dataset = MelSpectrogramDataset(root_dir)
    total_size = len(dataset)
    test_size = int(total_size * test_frac)
    val_size = int(total_size * val_frac)
    train_size = total_size - val_size - test_size

    train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])
    
    return {
        "train": DataLoader(train_set, batch_size=batch_size, shuffle=True),
        "val": DataLoader(val_set, batch_size=batch_size),
        "test": DataLoader(test_set, batch_size=batch_size),
    }


In [13]:
class ConvTransformerClassifier(nn.Module):
    def __init__(self, n_classes, d_model=128, nhead=4, num_layers=2, input_shape=(1, 128, 256)):
        super(ConvTransformerClassifier, self).__init__()

        # --- CNN Feature Extractor ---
        conv_layers = []
        in_channels = input_shape[0]
        for _ in range(4):
            conv_layers += [
                nn.Conv2d(in_channels, d_model, kernel_size=3, padding=1),
                nn.BatchNorm2d(d_model),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2)
            ]
            in_channels = d_model

        self.cnn = nn.Sequential(*conv_layers)

        # --- Flatten + Positional Encoding ---
        self.flatten = nn.Flatten(2)  # flatten spatial dims into a sequence
        self.positional_encoding = PositionalEncoding(d_model)

        # --- Transformer Encoder ---
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # --- Classification Head ---
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.bn = nn.BatchNorm1d(d_model)
        self.fc = nn.Linear(d_model, n_classes)

    def forward(self, x):
        # x: (B, 1, 128, 256)
        x = self.cnn(x)  # (B, C, H', W') => e.g. (B, 128, 8, 16)
        B, C, H, W = x.shape

        x = self.flatten(x)            # (B, C, H*W)  → sequence
        x = x.permute(0, 2, 1)         # (B, seq_len, C)
        x = self.positional_encoding(x)

        x = self.transformer(x)        # (B, seq_len, C)
        x = x.permute(0, 2, 1)         # (B, C, seq_len)
        x = self.avgpool(x).squeeze(-1)  # (B, C)

        x = self.bn(x)
        x = self.fc(x)                 # (B, n_classes)
        return x

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1024):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (B, seq_len, d_model)
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]


In [14]:
def train_model(model, dataloaders, device="cuda", epochs=10, lr=1e-3):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        # --- Training ---
        model.train()
        train_loss, train_correct = 0.0, 0

        for X, y in dataloaders["train"]:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * X.size(0)
            train_correct += (outputs.argmax(1) == y).sum().item()

        train_acc = train_correct / len(dataloaders["train"].dataset)
        train_loss /= len(dataloaders["train"].dataset)
        print(f"Train Loss: {train_loss:.4f} | Accuracy: {train_acc:.4f}")

        # --- Validation ---
        model.eval()
        val_loss, val_correct = 0.0, 0
        with torch.no_grad():
            for X, y in dataloaders["val"]:
                X, y = X.to(device), y.to(device)
                outputs = model(X)
                loss = criterion(outputs, y)
                val_loss += loss.item() * X.size(0)
                val_correct += (outputs.argmax(1) == y).sum().item()

        val_acc = val_correct / len(dataloaders["val"].dataset)
        val_loss /= len(dataloaders["val"].dataset)
        print(f"Val   Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f}")


In [15]:
# Suppose `model` is your CNN+Transformer hybrid
model = ConvTransformerClassifier(n_classes=2, d_model=128, nhead=4, num_layers=2)
dataloaders = get_dataloaders("/vol/bitbucket/sg2121/fypdataset/dataset/tensors", batch_size=32)
train_model(model, dataloaders, device="cuda", epochs=20)


Epoch 1/20
Train Loss: 0.6920 | Accuracy: 0.6267
Val   Loss: 0.7940 | Accuracy: 0.6071

Epoch 2/20
Train Loss: 0.5577 | Accuracy: 0.7185
Val   Loss: 0.7830 | Accuracy: 0.6786

Epoch 3/20
Train Loss: 0.5206 | Accuracy: 0.7644
Val   Loss: 1.2348 | Accuracy: 0.6190

Epoch 4/20
Train Loss: 0.4584 | Accuracy: 0.7733
Val   Loss: 0.7591 | Accuracy: 0.5476

Epoch 5/20
Train Loss: 0.4407 | Accuracy: 0.7896
Val   Loss: 0.7626 | Accuracy: 0.6905

Epoch 6/20
Train Loss: 0.3958 | Accuracy: 0.8133
Val   Loss: 1.4092 | Accuracy: 0.6310

Epoch 7/20
Train Loss: 0.4833 | Accuracy: 0.7704
Val   Loss: 0.9739 | Accuracy: 0.5952

Epoch 8/20
Train Loss: 0.4185 | Accuracy: 0.8000
Val   Loss: 1.2364 | Accuracy: 0.6190

Epoch 9/20
Train Loss: 0.3673 | Accuracy: 0.8504
Val   Loss: 0.9022 | Accuracy: 0.6548

Epoch 10/20
Train Loss: 0.3175 | Accuracy: 0.8681
Val   Loss: 0.4565 | Accuracy: 0.8095

Epoch 11/20
Train Loss: 0.2984 | Accuracy: 0.8830
Val   Loss: 0.3867 | Accuracy: 0.9048

Epoch 12/20
Train Loss: 0.316

In [16]:
def evaluate_model(model, dataloader, device="cuda"):
    model.eval()
    criterion = nn.CrossEntropyLoss()
    test_loss, test_correct = 0.0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)
            test_loss += loss.item() * X.size(0)
            test_correct += (outputs.argmax(1) == y).sum().item()

    test_acc = test_correct / len(dataloader.dataset)
    test_loss /= len(dataloader.dataset)
    print(f"Test  Loss: {test_loss:.4f} | Accuracy: {test_acc:.4f}")


In [18]:
evaluate_model(model, dataloaders["test"], device="cuda")

Test  Loss: 1.9323 | Accuracy: 0.5833
