Input: Mel Spectrogram (1, 128, 256)
↓
Conv Block ×4: (Conv → BN + ReLU → MaxPool)
↓
Flatten + Positional Encoding
↓
Transformer Encoder Layer ×2
↓
Avg Pool → BN → FC → Softmax

In [17]:
print("HELLO")

HELLO


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os
from torch.utils.data import Dataset, DataLoader, random_split

import numpy as np
import random
import copy

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import time

from itertools import product

In [19]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

In [20]:
class MelSpectrogramDataset(Dataset):
    def __init__(self, samples, transform=None):
        """
        Args:
            samples (List[Tuple[str, int]]): List of (tensor_path, label) pairs.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.samples = samples
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        tensor_path, label = self.samples[idx]
        mel_tensor = torch.load(tensor_path)  # Expected shape: (1, 128, 256)

        if self.transform:
            mel_tensor = self.transform(mel_tensor)

        return mel_tensor, label


In [21]:
def get_dataloaders(root_dir, batch_size=32, val_frac=0.1, test_frac=0.1):
    # Helper function to read file paths from a text file
    def read_file_paths(file_name):
        with open(file_name, 'r') as f:
            return [line.strip() for line in f.readlines()]
        
    ai_segments_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/normal_data/ai_segments"
    human_segments_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/normal_data/human"
    ai_tensor_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/tensors/ai_segments/mel_spec_tensor"
    human_tensor_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/tensors/human/mel_spec_tensor"

    train_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/train_files_large.txt')
    val_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/val_files_large.txt')
    test_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/test_files_large.txt')

    # Function to convert segment file path to lyric file path
    def convert_to_tensor_path(file_path, is_ai):
        if is_ai:
            if file_path.startswith(ai_segments_path):
                base_tensor_path = ai_tensor_path
            else:
                return
        else:
            if file_path.startswith(human_segments_path):
                base_tensor_path = human_tensor_path
            else:
                return

        # Convert filename to mfcc filename
        file_name = os.path.basename(file_path).replace('.mp3', '.pt')
        return os.path.join(base_tensor_path, file_name)

    # Process the file lists and create tuples of (lyric_path, label)
    def process_file_paths(file_paths, is_ai):
        return [(convert_to_tensor_path(file_path, is_ai), 0 if is_ai else 1) for file_path in file_paths]
    
    # Convert all file paths from the train, validation, and test sets
    ai_train_files = process_file_paths(train_files, is_ai=True)
    human_train_files = process_file_paths(train_files, is_ai=False)
    
    ai_val_files = process_file_paths(val_files, is_ai=True)
    human_val_files = process_file_paths(val_files, is_ai=False)
    
    ai_test_files = process_file_paths(test_files, is_ai=True)
    human_test_files = process_file_paths(test_files, is_ai=False)
    
    def clean(paths):
        return [(p, l) for p, l in paths if p is not None]
    
    train_files_combined = clean(ai_train_files) + clean(human_train_files)
    val_files_combined = clean(ai_val_files) + clean(human_val_files)
    test_files_combined = clean(ai_test_files) + clean(human_test_files)
    
    # Shuffle the data if needed
    random.shuffle(train_files_combined)
    random.shuffle(val_files_combined)
    random.shuffle(test_files_combined)

    # Example of how you might check the splits
    print(f"Training set size: {len(train_files_combined)}")
    print(f"Validation set size: {len(val_files_combined)}")
    print(f"Test set size: {len(test_files_combined)}")
    
    # Create datasets
    train_dataset = MelSpectrogramDataset(train_files_combined)
    val_dataset = MelSpectrogramDataset(val_files_combined)
    test_dataset = MelSpectrogramDataset(test_files_combined)
    
    return {
        "train": DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        "val": DataLoader(val_dataset, batch_size=batch_size),
        "test": DataLoader(test_dataset, batch_size=batch_size),
    }


In [22]:
class ConvTransformerClassifier(nn.Module):
    def __init__(self, n_classes, d_model=128, nhead=4, num_layers=2, input_shape=(1, 128, 256)):
        super(ConvTransformerClassifier, self).__init__()

        # --- CNN Feature Extractor ---
        conv_layers = []
        in_channels = input_shape[0]
        for _ in range(4):
            conv_layers += [
                nn.Conv2d(in_channels, d_model, kernel_size=3, padding=1),
                nn.BatchNorm2d(d_model),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2)
            ]
            in_channels = d_model

        self.cnn = nn.Sequential(*conv_layers)

        # --- Flatten + Positional Encoding ---
        self.flatten = nn.Flatten(2)  # flatten spatial dims into a sequence
        self.positional_encoding = PositionalEncoding(d_model)

        # --- Transformer Encoder ---
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # --- Classification Head ---
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.bn = nn.BatchNorm1d(d_model)
        self.fc = nn.Linear(d_model, n_classes)

    def forward(self, x):
        # x: (B, 1, 128, 256)
        x = self.cnn(x)  # (B, C, H', W') => e.g. (B, 128, 8, 16)
        B, C, H, W = x.shape

        x = self.flatten(x)            # (B, C, H*W)  → sequence
        x = x.permute(0, 2, 1)         # (B, seq_len, C)
        x = self.positional_encoding(x)

        x = self.transformer(x)        # (B, seq_len, C)
        x = x.permute(0, 2, 1)         # (B, C, seq_len)
        x = self.avgpool(x).squeeze(-1)  # (B, C)

        x = self.bn(x)
        x = self.fc(x)                 # (B, n_classes)
        return x

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1024):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (B, seq_len, d_model)
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]


In [23]:
def train_model(model, dataloaders, device="cuda", epochs=10, lr=1e-3, weight_decay=0.0, clip_grad=True, patience=5):
    set_seed()
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    best_model = None
    best_val_acc = 0.0
    best_epoch = 0
    patience_counter = 0

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        model.train()
        train_loss, train_correct = 0.0, 0

        for X, y in dataloaders["train"]:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            if clip_grad:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_loss += loss.item() * X.size(0)
            train_correct += (outputs.argmax(1) == y).sum().item()

        train_acc = train_correct / len(dataloaders["train"].dataset)
        train_loss /= len(dataloaders["train"].dataset)

        model.eval()
        val_loss, val_correct = 0.0, 0
        with torch.no_grad():
            for X, y in dataloaders["val"]:
                X, y = X.to(device), y.to(device)
                outputs = model(X)
                loss = criterion(outputs, y)
                val_loss += loss.item() * X.size(0)
                val_correct += (outputs.argmax(1) == y).sum().item()

        val_acc = val_correct / len(dataloaders["val"].dataset)
        val_loss /= len(dataloaders["val"].dataset)
        scheduler.step(val_loss)

        print(f"Train Loss: {train_loss:.4f} | Accuracy: {train_acc:.4f}")
        print(f"Val   Loss: {val_loss:.4f} | Accuracy: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = copy.deepcopy(model.state_dict())
            best_epoch = epoch
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

    model.load_state_dict(best_model)
    print(f"Best validation accuracy: {best_val_acc:.4f} at epoch {best_epoch+1}")
    return model, best_val_acc


In [24]:
# Suppose `model` is your CNN+Transformer hybrid
model = ConvTransformerClassifier(n_classes=2, d_model=128, nhead=4, num_layers=2)
dataloaders = get_dataloaders("/vol/bitbucket/sg2121/fypdataset/dataset_large2/tensors", batch_size=32)

Training set size: 22736
Validation set size: 4871
Test set size: 4875


In [None]:
train_model(model, dataloaders, device="cuda", epochs=10)

In [25]:
def evaluate_model(model, dataloader, device="cuda"):
    model.eval()
    criterion = nn.CrossEntropyLoss()
    test_loss, test_correct = 0.0, 0
    all_preds, all_labels = [], []
    total_inference_time = 0.0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)

            start_time = time.time()
            outputs = model(X)
            end_time = time.time()

            inference_time = end_time - start_time
            total_inference_time += inference_time

            loss = criterion(outputs, y)
            preds = outputs.argmax(1)

            test_loss += loss.item() * X.size(0)
            test_correct += (preds == y).sum().item()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    test_acc = test_correct / len(dataloader.dataset)
    test_loss /= len(dataloader.dataset)
    avg_inference_time = total_inference_time / len(dataloader.dataset)

    # Metrics
    precision = precision_score(all_labels, all_preds, average='binary')
    recall = recall_score(all_labels, all_preds, average='binary')
    f1 = f1_score(all_labels, all_preds, average='binary')

    # False Positive Rate (FPR = FP / (FP + TN))
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    fpr = fp / (fp + tn + 1e-10)  # avoid division by zero

    print(f"Test  Loss: {test_loss:.4f} | Accuracy: {test_acc:.4f}")
    print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1 Score: {f1:.4f}")
    print(f"False Positive Rate: {fpr:.4f}")
    print(f"Avg Inference Time per Sample: {avg_inference_time * 1000:.2f} ms")


In [36]:
evaluate_model(model, dataloaders["test"], device="cuda")

Test  Loss: 0.0836 | Accuracy: 0.9705
Precision: 0.9727 | Recall: 0.9893 | F1 Score: 0.9810
False Positive Rate: 0.0924
Avg Inference Time per Sample: 0.13 ms


In [37]:
# Save model weights
torch.save(model.state_dict(), 'model_weights.pt')

# Load model weights (later)
model = ConvTransformerClassifier(n_classes=2, d_model=128, nhead=4, num_layers=2)
model.load_state_dict(torch.load('model_weights.pt'))
model.eval()  # Set to evaluation mode


ConvTransformerClassifier(
  (cnn): Sequential(
    (0): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_ru

In [26]:
# Define your search space
param_grid = {
    "lr": [1e-4, 5e-4, 1e-3],
    "weight_decay": [0.0, 1e-5],
    "clip_grad": [True, False],
    "patience": [3, 5],
    "epochs": [10, 15, 20]  # Can be tuned or fixed
}

search_space = list(product(*param_grid.values()))
random.shuffle(search_space)

# Tracking best result
best_val_acc = float("-inf")
best_params = None
best_model_path = "best_model.pt"

In [27]:
# Start search
max_trials = min(10, len(search_space))
for i, values in enumerate(search_space[:max_trials]):
    params = dict(zip(param_grid.keys(), values))
    print(f"\n🔍 Trial {i + 1}/{max_trials} with params: {params}")

    model = ConvTransformerClassifier(n_classes=2, d_model=128, nhead=4, num_layers=2) 
    trained_model, val_acc = train_model(model, dataloaders, device="cuda", **params)

    print(f"Validation Accuracy: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_params = params

        # Save the best model
        torch.save(trained_model.state_dict(), best_model_path)
        print("✅ New best model saved!")

print("\n🏆 Best Hyperparameters:")
print(best_params)
print(f"Best Validation Accuracy: {best_val_acc:.4f}")
print(f"Model saved to: {best_model_path}")


🔍 Trial 1/10 with params: {'lr': 0.0005, 'weight_decay': 1e-05, 'clip_grad': True, 'patience': 5, 'epochs': 10}

Epoch 1/10
Train Loss: 0.3016 | Accuracy: 0.8922
Val   Loss: 0.2605 | Accuracy: 0.9060

Epoch 2/10
Train Loss: 0.1828 | Accuracy: 0.9363
Val   Loss: 0.2012 | Accuracy: 0.9306

Epoch 3/10
Train Loss: 0.1339 | Accuracy: 0.9507
Val   Loss: 0.1246 | Accuracy: 0.9577

Epoch 4/10
Train Loss: 0.1102 | Accuracy: 0.9605
Val   Loss: 0.2039 | Accuracy: 0.9396

Epoch 5/10
Train Loss: 0.0957 | Accuracy: 0.9681
Val   Loss: 0.3255 | Accuracy: 0.9076

Epoch 6/10
Train Loss: 0.0815 | Accuracy: 0.9718
Val   Loss: 0.0948 | Accuracy: 0.9684

Epoch 7/10
Train Loss: 0.0693 | Accuracy: 0.9770
Val   Loss: 1.1867 | Accuracy: 0.8241

Epoch 8/10
Train Loss: 0.0612 | Accuracy: 0.9794
Val   Loss: 0.1040 | Accuracy: 0.9684

Epoch 9/10
Train Loss: 0.0560 | Accuracy: 0.9814
Val   Loss: 0.1274 | Accuracy: 0.9661

Epoch 10/10
Train Loss: 0.0307 | Accuracy: 0.9908
Val   Loss: 0.2202 | Accuracy: 0.9509
Best v

In [16]:
model = ConvTransformerClassifier(n_classes=2, d_model=128, nhead=4, num_layers=2)
model.load_state_dict(torch.load('best_model.pt'))
model.eval()  # Set to evaluation mode
evaluate_model(model, dataloaders["test"], device="cuda")

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same