In [1]:
# https://medium.com/@imabhi1216/fine-tuning-a-pre-trained-resnet-18-model-for-image-classification-on-custom-dataset-with-pytorch-02df12e83c2c

In [14]:
# NOTE(review): looks like a leftover kernel smoke test — confirm it can be removed.
print("HELLO")

HELLO


In [15]:
import os
import shutil
from sklearn.model_selection import train_test_split
import random

import torch
import torchvision.models as models

from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader

import time
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

import pandas as pd
import torch.nn.functional as F

from PIL import Image

from tqdm import tqdm

from itertools import product

In [16]:
def get_model(num_classes=2):
    """Build a pre-trained ResNet-34 with a freshly initialised classification head.

    Args:
        num_classes: Number of outputs of the replacement ``fc`` layer.
            Defaults to 2, the number of classes in this dataset.

    Returns:
        The torchvision ResNet-34 (default pre-trained weights) whose ``fc``
        layer has been swapped for a new ``torch.nn.Linear``.
    """
    # Load the pre-trained ResNet-34 model (the source tutorial used ResNet-18)
    model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT)

    # Replace the last layer so the output size matches the dataset
    model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
    return model

In [17]:
# Directory paths for the audio segments and for their MFCC feature images
ai_segments_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/normal_data/ai_segments"
human_segments_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/normal_data/human"
ai_mfcc_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/features/ai/MFCC"
human_mfcc_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/features/human/MFCC"

# Helper function to read file paths from a text file
def read_file_paths(file_name):
    """Return every line of ``file_name`` with surrounding whitespace stripped.

    Blank lines are kept and come back as empty strings.
    """
    stripped_lines = []
    with open(file_name, 'r') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Read all file paths from the text files (one audio-segment path per line).
# Each list is later filtered twice by directory prefix, once for the AI class and
# once for the human class, so a list may hold paths of both classes.
train_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/train_files_large.txt')
val_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/val_files_large.txt')
test_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/test_files_large.txt')


# Function to convert a segment file path to its MFCC image path
def convert_to_mfcc_path(file_path, is_ai):
    """Map an audio segment path to the path of its MFCC image.

    Args:
        file_path: Path of an audio segment.
        is_ai: True to resolve against the AI directories, False for the
            human directories.

    Returns:
        The MFCC image path (``<mfcc dir>/<name>-MFCC.png``) when ``file_path``
        starts with the segment directory of the requested class, otherwise
        ``None``.
    """
    if is_ai:
        segments_root, base_mfcc_path = ai_segments_path, ai_mfcc_path
    else:
        segments_root, base_mfcc_path = human_segments_path, human_mfcc_path

    # Paths belonging to the other class (or to neither) are reported as None
    if not file_path.startswith(segments_root):
        return None

    # Swap only a trailing ".mp3": str.replace would also rewrite a ".mp3"
    # that happens to occur in the middle of the file name.
    file_name = os.path.basename(file_path)
    if file_name.endswith('.mp3'):
        file_name = file_name[:-len('.mp3')] + '-MFCC.png'
    return os.path.join(base_mfcc_path, file_name)


# Process a file list into tuples of (mfcc_path, label)
def process_file_paths(file_paths, is_ai):
    """Pair the MFCC path of every entry with its class label.

    The label is 0 when ``is_ai`` is true and 1 otherwise. Entries that
    ``convert_to_mfcc_path`` rejects are kept as ``(None, label)``.
    """
    label = 0 if is_ai else 1
    pairs = []
    for segment_path in file_paths:
        pairs.append((convert_to_mfcc_path(segment_path, is_ai), label))
    return pairs

# Convert all file paths from the train, validation, and test sets.
# Every split list is processed once per class: entries that do not sit under that
# class's segment directory come back as (None, label) and are dropped by clean() below.
ai_train_files = process_file_paths(train_files, is_ai=True)
human_train_files = process_file_paths(train_files, is_ai=False)

ai_val_files = process_file_paths(val_files, is_ai=True)
human_val_files = process_file_paths(val_files, is_ai=False)

ai_test_files = process_file_paths(test_files, is_ai=True)
human_test_files = process_file_paths(test_files, is_ai=False)

def clean(paths):
    """Return the ``(path, label)`` pairs whose path is not ``None``, in order."""
    kept = []
    for mfcc_path, label in paths:
        if mfcc_path is None:
            continue
        kept.append((mfcc_path, label))
    return kept

# Merge the AI and human samples of each split, dropping unmatched (None) entries
train_files_combined = clean(ai_train_files) + clean(human_train_files)
val_files_combined = clean(ai_val_files) + clean(human_val_files)
test_files_combined = clean(ai_test_files) + clean(human_test_files)

# Shuffle with a dedicated, seeded generator so the sample order (and therefore the
# row order of any per-sample output) is identical on every run, without touching
# the global `random` state used elsewhere in the notebook.
split_rng = random.Random(42)
split_rng.shuffle(train_files_combined)
split_rng.shuffle(val_files_combined)
split_rng.shuffle(test_files_combined)

# Report the size of each split
print(f"Training set size: {len(train_files_combined)}")
print(f"Validation set size: {len(val_files_combined)}")
print(f"Test set size: {len(test_files_combined)}")

Training set size: 22736
Validation set size: 4871
Test set size: 4875


In [18]:
# Custom dataset class
class MFCCDataset(Dataset):
    """Dataset of MFCC images described by ``(path, label)`` pairs.

    Indexing returns ``(image, label, filename)`` where ``image`` is the RGB
    image (after ``transform`` when one is given) and ``filename`` is the
    base name of the image path.
    """

    def __init__(self, data, transform=None):
        """Store the sample list and the optional per-image transform.

        Args:
            data: Indexable collection of ``(path, label)`` pairs.
            transform: Optional callable applied to each loaded RGB image.
        """
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label, filename)``."""
        path, label = self.data[idx]
        # Image.open keeps the file open lazily; the context manager releases the
        # handle right away, while convert() hands back an independent loaded copy.
        with Image.open(path) as source:
            image = source.convert('RGB')
        # Compare against None so a transform object that is falsy is still applied
        if self.transform is not None:
            image = self.transform(image)
        filename = os.path.basename(path)
        return image, label, filename
        
# Preprocessing applied to every MFCC image: resize, centre-crop to 224x224,
# convert to a tensor, then normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics that go
# with torchvision's pre-trained weights — confirm.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [19]:
# Create datasets (all three splits share the same preprocessing transform)
train_dataset = MFCCDataset(train_files_combined, transform=transform)
val_dataset = MFCCDataset(val_files_combined, transform=transform)
test_dataset = MFCCDataset(test_files_combined, transform=transform)

# DataLoaders: only the training loader reshuffles; validation and test keep the list order.
# NOTE(review): num_workers is left at the DataLoader default, so images are loaded in the
# main process — the test pass below took ~3 s per batch; consider worker processes.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Confirm sizes
print(f"Train set: {len(train_dataset)} samples")
print(f"Val set: {len(val_dataset)} samples")
print(f"Test set: {len(test_dataset)} samples")

Train set: 22736 samples
Val set: 4871 samples
Test set: 4875 samples


In [20]:
# Pick the compute device: Apple MPS first, then CUDA, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

def train(model, train_loader, val_loader, criterion, optimizer, num_epochs, device=None):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: Network to optimise; it should already live on the target device.
        train_loader: Loader yielding ``(inputs, labels, filenames)`` batches.
        val_loader: Loader with the same batch layout, used for validation.
        criterion: Loss called as ``criterion(outputs, labels)``. The running
            average weights each batch loss by the batch size, i.e. it assumes
            the criterion returns a per-batch mean.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU when the model has no parameters).

    Returns:
        A list with one dict per epoch holding ``train_loss``, ``train_acc``,
        ``val_loss`` and ``val_acc`` as Python floats (NaN for a loader that
        produced no samples).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    history = []
    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        history.append({
            "train_loss": train_loss,
            "train_acc": train_acc,
            "val_loss": val_loss,
            "val_acc": val_acc,
        })

        # Print the results for the current epoch
        print(f'Epoch [{epoch+1}/{num_epochs}], train loss: {train_loss:.4f}, train acc: {train_acc:.4f}, val loss: {val_loss:.4f}, val acc: {val_acc:.4f}')

    return history


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: a training pass when ``optimizer`` is given, else evaluation.

    Returns ``(mean_loss, accuracy)`` averaged over the samples actually seen.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    # Gradients are only tracked during the training pass
    with torch.set_grad_enabled(is_training):
        for inputs, labels, _filenames in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            batch_count = inputs.size(0)
            loss_sum += loss.item() * batch_count
            num_correct += (outputs.argmax(dim=1) == labels).sum().item()
            num_seen += batch_count

    # An empty loader has no meaningful average; report NaN instead of dividing by zero
    if num_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / num_seen, num_correct / num_seen

cuda


In [21]:
model = get_model()

# Only the new classification head is optimised below, so freeze the backbone:
# autograd then skips gradient computation for layers the optimizer never updates.
model.requires_grad_(False)
model.fc.requires_grad_(True)

model = model.to(device)

# Define the loss function and optimizer (the optimizer only sees the fc parameters)
lr = 0.001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.fc.parameters(), lr, momentum=0.9)
num_epochs = 5

In [8]:
# Fit the model with the loss, optimizer and epoch count configured in the previous cell
train(model, train_loader, val_loader, criterion, optimizer, num_epochs)

Epoch [1/5], train loss: 0.3849, train acc: 0.8350, val loss: 0.3693, val acc: 0.8421
Epoch [2/5], train loss: 0.3517, train acc: 0.8519, val loss: 0.3285, val acc: 0.8680
Epoch [3/5], train loss: 0.3423, train acc: 0.8578, val loss: 0.3362, val acc: 0.8666
Epoch [4/5], train loss: 0.3371, train acc: 0.8634, val loss: 0.3232, val acc: 0.8684
Epoch [5/5], train loss: 0.3379, train acc: 0.8594, val loss: 0.3303, val acc: 0.8680


In [22]:
def evaluate_model(model, test_loader, device, hyperparams=None, log_file=None):
    """Evaluate a two-class model (0 = ai, 1 = human), print the report and append it to a log.

    Args:
        model: Network to evaluate; it is switched to eval mode here.
        test_loader: Loader yielding ``(inputs, labels, filenames)`` batches.
        device: Device the inputs are moved to before the forward pass.
        hyperparams: Optional mapping written to the report as ``key: value`` lines.
        log_file: File the report is appended to. Defaults to the training log
            path this function previously hard-coded.

    Returns:
        The overall accuracy over every sample produced by ``test_loader``.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    if log_file is None:
        log_file = "/vol/bitbucket/sg2121/fyp/aimusicdetector/music_cnn/large/mfcc/training_large_logfile.txt"

    class_names = ['ai', 'human']
    class_ids = [0, 1]

    model.eval()
    all_labels = []
    all_preds = []

    start_time = time.time()
    with torch.no_grad():
        for inputs, labels, _filenames in test_loader:
            outputs = model(inputs.to(device))
            # One host transfer per batch instead of one .item() call per sample
            all_preds.extend(outputs.argmax(dim=1).cpu().tolist())
            all_labels.extend(labels.tolist())
    elapsed = time.time() - start_time

    if not all_labels:
        raise ValueError("test_loader produced no samples to evaluate")
    avg_inference_time = elapsed / len(all_labels)

    overall_accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average=None, labels=class_ids
    )
    # Fix the label set so the matrix is always 2x2, even when one class never
    # appears among the labels or the predictions.
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)

    accuracy_per_class = {}
    fpr = {}
    for i, classname in enumerate(class_names):
        support = cm[i, :].sum()
        accuracy_per_class[classname] = float(cm[i, i] / support) if support > 0 else 0

        FP = cm[:, i].sum() - cm[i, i]
        TN = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i])
        fpr[classname] = FP / (FP + TN) if (FP + TN) > 0 else 0

    # Logging
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_lines = [f"===== Evaluation at {timestamp} =====\n"]

    if hyperparams:
        log_lines.append("Hyperparameters:")
        for key, value in hyperparams.items():
            log_lines.append(f"{key}: {value}")
    else:
        log_lines.append("No hyperparameters provided.")
    log_lines.append("")

    log_lines.append("Accuracy per class:")
    for classname, acc in accuracy_per_class.items():
        log_lines.append(f"{classname}: {acc:.4f}")
    log_lines.append("\nPrecision, Recall, F1:")
    for i, classname in enumerate(class_names):
        log_lines.append(f"{classname} → Precision: {precision[i]:.4f}, Recall: {recall[i]:.4f}, F1: {f1[i]:.4f}")
    log_lines.append("\nFalse Positive Rate:")
    for classname, rate in fpr.items():
        log_lines.append(f"{classname}: {rate:.4f}")
    log_lines.append(f"\nOverall Accuracy: {overall_accuracy:.4f}")
    log_lines.append(f"Average Inference Time per Sample: {avg_inference_time:.6f} seconds")
    log_lines.append("=" * 40 + "\n\n")

    report = "\n".join(log_lines)
    print(report)

    # Create the log directory on first use; the report contains a non-ASCII arrow,
    # so write it as UTF-8 regardless of the platform default encoding.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(report)

    return overall_accuracy


In [10]:
model.eval()
results = []

# Inference only: disable autograd for the whole pass
with torch.no_grad():
    for inputs, labels, filenames in tqdm(test_loader):
        outputs = model(inputs.to(device))
        # Bring each batch back to the host once instead of calling .item() per value
        probs = F.softmax(outputs, dim=1).cpu()
        prob_rows = probs.tolist()
        pred_labels = probs.argmax(dim=1).tolist()
        true_labels = labels.tolist()

        for filename, prob_row, true_label, pred_label in zip(filenames, prob_rows, true_labels, pred_labels):
            results.append({
                "filename": filename,  # Use filename directly from dataset
                "prob_ai": prob_row[0],
                "prob_human": prob_row[1],
                "true_label": true_label,
                "pred_label": pred_label
            })

# Convert the results to a DataFrame
df = pd.DataFrame(results)

# Save to CSV
df.to_csv("mfcc_test_large_predictions.csv", index=False)

# Preview results
print(df.head())


100%|█████████████████████████████████| 153/153 [07:36<00:00,  2.99s/it]

                    filename   prob_ai  prob_human  true_label  pred_label
0           H11709N-MFCC.png  0.499168    0.500832           1           1
1            H3140N-MFCC.png  0.006976    0.993024           1           1
2           H16928N-MFCC.png  0.266731    0.733269           1           1
3  S900RN_segment_1-MFCC.png  0.797132    0.202868           0           0
4           H17840N-MFCC.png  0.202964    0.797036           1           1





In [13]:
# Bundle the current run's settings so evaluate_model can log them next to the test metrics.
# The optimizer object itself is passed, so its full repr ends up in the report.
hyperparams = dict(
    batch_size=batch_size,
    learning_rate=lr,
    epochs=num_epochs,
    optimizer=optimizer,
    model=type(model).__name__,
)

evaluate_model(model, test_loader, device, hyperparams=hyperparams)


KeyboardInterrupt: 

In [23]:
# Define hyperparameter search space
learning_rates = [1e-3, 1e-4, 5e-4]
weight_decays = [1e-6, 1e-7, 1e-8]
epochs_list = [5, 10, 20]
num_trials = 10

# Generate hyperparameter combinations and randomly select up to num_trials of them
param_combinations = list(product(learning_rates, weight_decays, epochs_list))
random.shuffle(param_combinations)
hyperparam_trials = param_combinations[:num_trials]

# Run randomized search, keeping the model with the best validation accuracy
best_model = None
best_acc = 0

# The loop variable is named trial_lr so it does not overwrite the notebook-level `lr`
for i, (trial_lr, wd, epochs) in enumerate(hyperparam_trials):
    print(f"\n=== Trial {i+1}/{len(hyperparam_trials)}: LR={trial_lr}, WD={wd}, Epochs={epochs} ===")

    # Every trial starts from a fresh pre-trained model
    model = get_model()
    model = model.to(device)

    # Unlike the fc-only run, all parameters are handed to the optimizer here
    optimizer = torch.optim.SGD(model.parameters(), lr=trial_lr, momentum=0.9, weight_decay=wd)

    train(model, train_loader, val_loader, criterion, optimizer, epochs)

    hyperparams = {
        "batch_size": batch_size,
        "learning_rate": trial_lr,
        "epochs": epochs,
        "optimizer": optimizer,
        "model": model.__class__.__name__,
    }

    # Model selection uses the validation split; the test split stays untouched
    val_acc = evaluate_model(model, val_loader, device, hyperparams=hyperparams)
    if val_acc > best_acc:
        best_acc = val_acc
        best_model = model
        torch.save(model.state_dict(), 'best_model.pt')

print("\nBest validation accuracy:", best_acc)


=== Trial 1/10: LR=0.0001, WD=1e-06, Epochs=5 ===
Epoch [1/5], train loss: 0.3265, train acc: 0.8677, val loss: 0.2543, val acc: 0.8953
Epoch [2/5], train loss: 0.2356, train acc: 0.9053, val loss: 0.2355, val acc: 0.9068
Epoch [3/5], train loss: 0.2022, train acc: 0.9183, val loss: 0.2194, val acc: 0.9109
Epoch [4/5], train loss: 0.1782, train acc: 0.9295, val loss: 0.2280, val acc: 0.9047
Epoch [5/5], train loss: 0.1552, train acc: 0.9394, val loss: 0.1999, val acc: 0.9224
===== Evaluation at 2025-05-23 16:59:53 =====

Hyperparameters:
batch_size: 32
learning_rate: 0.0001
epochs: 5
optimizer: SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 1e-06
)
model: ResNet

Accuracy per class:
ai: 0.8105
human: 0.9560

Precision, Recall, F1:
ai → Precision: 0.8467, Recall: 0.8105, F1: 0.8282
human → Precision: 0.9439, Recall: 0.9560, F1: 0.9499

False P

In [24]:
model = get_model()
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict checkpoint needs.
model.load_state_dict(torch.load("best_model.pt", map_location=device, weights_only=True))
model.to(device)
model.eval()
results = []

# Inference only: disable autograd for the whole pass
with torch.no_grad():
    for inputs, labels, filenames in tqdm(test_loader):
        outputs = model(inputs.to(device))
        # Bring each batch back to the host once instead of calling .item() per value
        probs = F.softmax(outputs, dim=1).cpu()
        prob_rows = probs.tolist()
        pred_labels = probs.argmax(dim=1).tolist()
        true_labels = labels.tolist()

        for filename, prob_row, true_label, pred_label in zip(filenames, prob_rows, true_labels, pred_labels):
            results.append({
                "filename": filename,  # Use filename directly from dataset
                "prob_ai": prob_row[0],
                "prob_human": prob_row[1],
                "true_label": true_label,
                "pred_label": pred_label
            })

# Convert the results to a DataFrame
df = pd.DataFrame(results)

# Save to CSV
df.to_csv("best_mfcc_test_large_predictions.csv", index=False)

# Preview results
print(df.head())


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 153/153 [06:22<00:00,  2.50s/it]

           filename       prob_ai  prob_human  true_label  pred_label
0  H10451N-MFCC.png  1.072708e-08    1.000000           1           1
1  H14191N-MFCC.png  1.924204e-05    0.999981           1           1
2  H21973N-MFCC.png  3.239322e-06    0.999997           1           1
3   H2830N-MFCC.png  1.103561e-10    1.000000           1           1
4   H7264N-MFCC.png  2.370661e-11    1.000000           1           1



