In [None]:
# https://medium.com/@imabhi1216/fine-tuning-a-pre-trained-resnet-18-model-for-image-classification-on-custom-dataset-with-pytorch-02df12e83c2c

In [10]:
import os
import shutil
from sklearn.model_selection import train_test_split
import random

import torch
import torchvision.models as models

from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms
from torch.utils.data import Dataset, DataLoader

import time
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

import pandas as pd
import torch.nn.functional as F

from PIL import Image

from tqdm import tqdm

from itertools import product

In [11]:
def get_model(num_classes=2):
    """Build a pretrained ResNet-34 with a freshly initialised classification head.

    Args:
        num_classes: Number of outputs of the replacement final layer.
            Defaults to 2, the value that used to be hard-coded here.

    Returns:
        The torchvision ResNet-34 (loaded with its DEFAULT pretrained weights)
        whose ``fc`` layer has been replaced by a new ``torch.nn.Linear`` with
        ``num_classes`` outputs.
    """
    # Load the pre-trained ResNet-34 model
    model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT)

    # Replace the last layer so its output size matches the dataset's classes;
    # the input width is read from the layer being replaced
    model.fc = torch.nn.Linear(model.fc.in_features, num_classes)
    return model

In [12]:
# Directories holding the audio segment files (.mp3) of each class
ai_segments_path = "/data/sg2121/fypdataset/dataset_large/normal_data/ai_segments"
human_segments_path = "/data/sg2121/fypdataset/dataset_large/normal_data/human"
# Directories holding the corresponding mel-spectrogram images (.png) of each class
ai_mel_path = "/data/sg2121/fypdataset/dataset_large/features/ai/Mel_Spectrogram"
human_mel_path = "/data/sg2121/fypdataset/dataset_large/features/human/Mel_Spectrogram"

# Helper that loads a text file as a list of stripped lines
def read_file_paths(file_name):
    """Return every line of ``file_name`` with surrounding whitespace removed.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one string per line of the file, in file order. Blank
        lines are kept as empty strings.
    """
    stripped_lines = []
    with open(file_name, 'r') as handle:
        # Iterating the handle yields the same lines readlines() would return
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Load the paths listed (one per line) in the train / validation / test split files
train_files, val_files, test_files = map(read_file_paths, (
    '/data/sg2121/aimusicdetector/train_test_split/train_files_large.txt',
    '/data/sg2121/aimusicdetector/train_test_split/val_files_large.txt',
    '/data/sg2121/aimusicdetector/train_test_split/test_files_large.txt',
))


# Map a segment file path to the path of its mel-spectrogram image
def convert_to_mel_path(file_path, is_ai):
    """Translate a segment file path into the matching mel-spectrogram path.

    Args:
        file_path: Path of a segment file.
        is_ai: Truthy to treat the path as an AI segment, falsy as a human one.

    Returns:
        The image path inside the mel directory of the chosen class, or None
        when ``file_path`` does not start with that class's segment directory.
    """
    # Select the (segment directory, mel directory) pair of the requested class
    if is_ai:
        segment_root, base_mel_path = ai_segments_path, ai_mel_path
    else:
        segment_root, base_mel_path = human_segments_path, human_mel_path

    # A path outside the chosen class's segment directory is signalled with None
    if not file_path.startswith(segment_root):
        return None

    # Replace '.mp3' in the base name with the spectrogram image suffix
    mel_name = os.path.basename(file_path).replace('.mp3', '-Mel_Spectrogram.png')
    return os.path.join(base_mel_path, mel_name)


# Turn a list of segment paths into (mel_path, label) tuples for one class
def process_file_paths(file_paths, is_ai):
    """Pair each converted mel-spectrogram path with the label of the class.

    Args:
        file_paths: Iterable of segment file paths.
        is_ai: Truthy to process the paths as AI (label 0), falsy as human (label 1).

    Returns:
        A list of ``(mel_path, label)`` tuples in input order. ``mel_path`` is
        whatever ``convert_to_mel_path`` returns, so it can be None.
    """
    label = 0 if is_ai else 1
    labelled_paths = []
    for file_path in file_paths:
        labelled_paths.append((convert_to_mel_path(file_path, is_ai), label))
    return labelled_paths

# Run every split through both classes: first as AI, then as human.
# Entries whose path does not belong to the requested class come back with a
# None path from process_file_paths.
ai_train_files, human_train_files = (
    process_file_paths(train_files, is_ai=as_ai) for as_ai in (True, False)
)

ai_val_files, human_val_files = (
    process_file_paths(val_files, is_ai=as_ai) for as_ai in (True, False)
)

ai_test_files, human_test_files = (
    process_file_paths(test_files, is_ai=as_ai) for as_ai in (True, False)
)

def clean(paths):
    """Return the ``(path, label)`` pairs whose path is not None, in order.

    Args:
        paths: Iterable of two-item ``(path, label)`` pairs.

    Returns:
        A new list of ``(path, label)`` tuples without the None-path entries.
    """
    kept = []
    for mel_path, label in paths:
        if mel_path is None:
            continue
        kept.append((mel_path, label))
    return kept

# Merge the AI and human entries of each split, dropping entries without a mel path
train_files_combined = clean(ai_train_files) + clean(human_train_files)
val_files_combined = clean(ai_val_files) + clean(human_val_files)
test_files_combined = clean(ai_test_files) + clean(human_test_files)

# Shuffle with a dedicated, seeded generator so that the sample order (and the
# row order of anything exported from it) is identical on every run; using a
# private Random instance leaves the global `random` state untouched
split_shuffle_seed = 42
split_rng = random.Random(split_shuffle_seed)
split_rng.shuffle(train_files_combined)
split_rng.shuffle(val_files_combined)
split_rng.shuffle(test_files_combined)

# Report the size of each split
print(f"Training set size: {len(train_files_combined)}")
print(f"Validation set size: {len(val_files_combined)}")
print(f"Test set size: {len(test_files_combined)}")

Training set size: 15499
Validation set size: 3321
Test set size: 3324


In [13]:
# Dataset serving mel-spectrogram images described by (path, label) pairs
class MelSpectrogramDataset(Dataset):
    """Serve ``(image, label, filename)`` samples from ``(path, label)`` pairs.

    Args:
        data: Indexable collection of ``(image_path, label)`` pairs.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, data, transform=None):
        self.transform = transform
        self.data = data

    def __len__(self):
        """Return the number of ``(path, label)`` pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label, filename)``.

        ``filename`` is the base name of the image path, so callers can trace
        a sample back to its file.
        """
        img_path, label = self.data[idx]
        # Open the file and convert it to RGB mode
        sample = Image.open(img_path).convert('RGB')
        if self.transform:
            sample = self.transform(sample)
        return sample, label, os.path.basename(img_path)
        
# Preprocessing applied to every image, in order: resize to 256, centre-crop
# to 224, convert to a tensor, then normalise each of the three channels.
# NOTE(review): the mean/std values look like the usual statistics for
# ImageNet-pretrained torchvision models — confirm against the weights used.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)


In [14]:
# Batch size shared by all three loaders
batch_size = 32

# Wrap each split's (path, label) list in a dataset using the same preprocessing
train_dataset, val_dataset, test_dataset = (
    MelSpectrogramDataset(split_items, transform=transform)
    for split_items in (train_files_combined, val_files_combined, test_files_combined)
)

# Only the training loader reshuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

# Report how many samples each dataset holds
print(f"Train set: {len(train_dataset)} samples")
print(f"Val set: {len(val_dataset)} samples")
print(f"Test set: {len(test_dataset)} samples")

Train set: 15499 samples
Val set: 3321 samples
Test set: 3324 samples


In [15]:
# Pick the compute device: the MPS backend if available, otherwise CUDA,
# otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)
print(device)

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: train if an optimizer is given, else evaluate.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats averaged over the samples
        actually seen; both are NaN when the loader yields no samples.
    """
    is_training = optimizer is not None
    # Switch between training mode and evaluation mode
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    # Gradients are only needed while training
    with torch.set_grad_enabled(is_training):
        for inputs, labels, _filenames in loader:
            # Move inputs and labels to the target device
            inputs = inputs.to(device)
            labels = labels.to(device)

            if is_training:
                # Reset the gradients before the backward pass
                optimizer.zero_grad()

            # Forward pass and loss
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                # Backward pass, then update the parameters
                loss.backward()
                optimizer.step()

            n_batch = inputs.size(0)
            # Weight the batch loss by the batch size so the epoch figure is a
            # per-sample average (assumes criterion returns the batch mean)
            loss_sum += loss.item() * n_batch
            # Predicted class = index of the highest score
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += n_batch

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen


def train(model, train_loader, val_loader, criterion, optimizer, num_epochs, device=None):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: Network to train; it is left in evaluation mode afterwards.
        train_loader: Iterable of ``(inputs, labels, filenames)`` training batches.
        val_loader: Iterable of ``(inputs, labels, filenames)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of epochs to run.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has no parameters).

    Loss and accuracy are averaged over the samples actually seen in the
    epoch, and one summary line is printed per epoch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        # Print the results for the current epoch
        print(f'Epoch [{epoch+1}/{num_epochs}], train loss: {train_loss:.4f}, train acc: {train_acc:.4f}, val loss: {val_loss:.4f}, val acc: {val_acc:.4f}')

cuda


In [16]:
model = get_model()

# Only model.fc is handed to the optimizer below, so switch off gradient
# tracking for the rest of the network: the other weights stay exactly as they
# would have anyway, but autograd no longer computes gradients for them
model.requires_grad_(False)
model.fc.requires_grad_(True)

model = model.to(device)

# Define the loss function and an optimizer over the classification head only
lr = 0.001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.fc.parameters(), lr, momentum=0.9)
num_epochs = 5

train(model, train_loader, val_loader, criterion, optimizer, num_epochs)

Epoch [1/5], train loss: 0.4631, train acc: 0.7719, val loss: 0.4221, val acc: 0.7853
Epoch [2/5], train loss: 0.3990, train acc: 0.8149, val loss: 0.3866, val acc: 0.8139
Epoch [3/5], train loss: 0.3853, train acc: 0.8243, val loss: 0.4108, val acc: 0.8148
Epoch [4/5], train loss: 0.3819, train acc: 0.8250, val loss: 0.4666, val acc: 0.7772
Epoch [5/5], train loss: 0.3766, train acc: 0.8274, val loss: 0.3800, val acc: 0.8238


In [17]:
def evaluate_model(model, test_loader, device, hyperparams=None,
                   log_file="/data/sg2121/aimusicdetector/training_large_logfile.txt"):
    """Evaluate ``model`` on ``test_loader``, then print and log a metrics report.

    Args:
        model: Classifier to evaluate; it is put into evaluation mode.
        test_loader: Iterable of ``(inputs, labels, filenames)`` batches.
        device: Device the inputs are moved to before the forward pass.
        hyperparams: Optional mapping written to the report as ``key: value`` lines.
        log_file: File the report is appended to.

    Returns:
        The overall accuracy over all evaluated samples.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    # Label 0 is reported as 'ai', label 1 as 'human'
    class_names = ['ai', 'human']

    model.eval()

    all_labels = []
    all_preds = []

    # The timer spans the whole loop, so it includes batch loading as well as
    # the forward passes
    start_time = time.time()

    with torch.no_grad():
        for inputs, labels, _filenames in test_loader:
            outputs = model(inputs.to(device))
            # Predicted class = index of the highest score
            preds = outputs.argmax(dim=1)

            # One host transfer per batch instead of one per sample
            all_labels.extend(labels.tolist())
            all_preds.extend(preds.tolist())

    end_time = time.time()

    n_samples = len(all_labels)
    if n_samples == 0:
        raise ValueError("test_loader yielded no samples to evaluate")
    avg_inference_time = (end_time - start_time) / n_samples

    # Per-class accuracy: share of each true class that was predicted correctly
    correct_pred = {classname: 0 for classname in class_names}
    total_pred = {classname: 0 for classname in class_names}
    for label, prediction in zip(all_labels, all_preds):
        classname = 'ai' if label == 0 else 'human'
        if label == prediction:
            correct_pred[classname] += 1
        total_pred[classname] += 1

    accuracy_per_class = {
        classname: correct_pred[classname] / total_pred[classname]
        if total_pred[classname] > 0 else 0
        for classname in class_names
    }

    overall_accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average=None, labels=[0, 1])
    # Pin the label set so the matrix is always 2x2, even when a class is
    # absent from both the labels and the predictions
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

    # False positive rate per class: FP / (FP + TN), where FP is the class's
    # column without its diagonal entry and TN is everything outside the
    # class's row and column
    fpr = {}
    for i, classname in enumerate(class_names):
        FP = cm[:, i].sum() - cm[i, i]
        TN = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i])
        fpr[classname] = FP / (FP + TN) if (FP + TN) > 0 else 0

    # Logging
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_lines = [f"===== Evaluation at {timestamp} =====\n"]

    if hyperparams:
        log_lines.append("Hyperparameters:")
        for key, value in hyperparams.items():
            log_lines.append(f"{key}: {value}")
    else:
        log_lines.append("No hyperparameters provided.")
    log_lines.append("")

    log_lines.append("Accuracy per class:")
    for classname, acc in accuracy_per_class.items():
        log_lines.append(f"{classname}: {acc:.4f}")
    log_lines.append("\nPrecision, Recall, F1:")
    for i, classname in enumerate(class_names):
        log_lines.append(f"{classname} → Precision: {precision[i]:.4f}, Recall: {recall[i]:.4f}, F1: {f1[i]:.4f}")
    log_lines.append("\nFalse Positive Rate:")
    for classname, rate in fpr.items():
        log_lines.append(f"{classname}: {rate:.4f}")
    log_lines.append(f"\nOverall Accuracy: {overall_accuracy:.4f}")
    log_lines.append(f"Average Inference Time per Sample: {avg_inference_time:.6f} seconds")
    log_lines.append("=" * 40 + "\n\n")

    report = "\n".join(log_lines)
    print(report)
    # The report contains a non-ASCII arrow, so fix the encoding rather than
    # relying on the platform default
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(report)

    return overall_accuracy


In [18]:
# Score every test sample with the current model, keeping per-file probabilities
model.eval()
results = []

with torch.no_grad():
    for inputs, labels, filenames in tqdm(test_loader):
        probs = F.softmax(model(inputs.to(device)), dim=1)

        # Bring the whole batch back to the host once rather than item by item
        prob_rows = probs.tolist()
        pred_labels = probs.argmax(dim=1).tolist()
        true_labels = labels.tolist()

        for filename, prob_row, true_label, pred_label in zip(
            filenames, prob_rows, true_labels, pred_labels
        ):
            results.append({
                "filename": filename,  # file name supplied by the dataset
                "prob_ai": prob_row[0],
                "prob_human": prob_row[1],
                "true_label": true_label,
                "pred_label": pred_label
            })

# One row per test sample
df = pd.DataFrame(results)

# Persist the predictions
df.to_csv("mel-spec_test_large_predictions.csv", index=False)

# Preview results
print(df.head())


100%|██████████████████████████████████████████████████████████| 104/104 [00:50<00:00,  2.08it/s]

                                filename   prob_ai  prob_human  true_label  \
0    S97RN_segment_2-Mel_Spectrogram.png  0.831969    0.168030           0   
1             H3233N-Mel_Spectrogram.png  0.210405    0.789595           1   
2               H63N-Mel_Spectrogram.png  0.603652    0.396348           1   
3  U1692RN_segment_2-Mel_Spectrogram.png  0.392459    0.607541           0   
4  S4552RN_segment_2-Mel_Spectrogram.png  0.828288    0.171712           0   

   pred_label  
0           0  
1           1  
2           0  
3           1  
4           0  





In [19]:
# Settings recorded alongside the metrics in the evaluation report
hyperparams = dict(
    batch_size=batch_size,
    learning_rate=lr,
    epochs=num_epochs,
    optimizer=optimizer,
    model=type(model).__name__,
)

# Last expression of the cell, so the returned overall accuracy is displayed
evaluate_model(model, test_loader, device, hyperparams=hyperparams)


===== Evaluation at 2025-05-12 15:16:47 =====

Hyperparameters:
batch_size: 32
learning_rate: 0.001
epochs: 5
optimizer: SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    momentum: 0.9
    nesterov: False
    weight_decay: 0
)
model: ResNet

Accuracy per class:
ai: 0.9044
human: 0.7044

Precision, Recall, F1:
ai → Precision: 0.8440, Recall: 0.9044, F1: 0.8731
human → Precision: 0.8065, Recall: 0.7044, F1: 0.7520

False Positive Rate:
ai: 0.2956
human: 0.0956

Overall Accuracy: 0.8321
Average Inference Time per Sample: 0.013093 seconds




0.8321299638989169

In [20]:
# Define hyperparameter search space
learning_rates = [1e-3, 1e-4, 5e-4]
weight_decays = [1e-6, 1e-7, 1e-8]
epochs_list = [5, 10, 20]

# Number of randomly chosen combinations to try; used for both the selection
# and the progress message so the two cannot drift apart
n_trials = 10

# Generate hyperparameter combinations and randomly select n_trials of them
param_combinations = list(product(learning_rates, weight_decays, epochs_list))
random.shuffle(param_combinations)
hyperparam_trials = param_combinations[:n_trials]

# Run randomized search, keeping the model with the best validation accuracy
best_model = None
best_acc = 0

for i, (lr, wd, epochs) in enumerate(hyperparam_trials):
    print(f"\n=== Trial {i+1}/{len(hyperparam_trials)}: LR={lr}, WD={wd}, Epochs={epochs} ===")

    # Every trial starts from a fresh pretrained model
    model = get_model()
    model = model.to(device)

    # All parameters are optimised here (full fine-tuning)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=wd)

    train(model, train_loader, val_loader, criterion, optimizer, epochs)

    hyperparams = {
        "batch_size": batch_size,
        "learning_rate": lr,
        "epochs": epochs,
        "optimizer": optimizer,
        "model": model.__class__.__name__,
    }

    # Model selection uses the validation split; the weights of the best trial
    # so far are saved to disk
    val_acc = evaluate_model(model, val_loader, device, hyperparams=hyperparams)
    if val_acc > best_acc:
        best_acc = val_acc
        best_model = model
        torch.save(model.state_dict(), 'best_model.pt')

print("\nBest validation accuracy:", best_acc)


=== Trial 1/10: LR=0.001, WD=1e-07, Epochs=20 ===
Epoch [1/20], train loss: 0.2538, train acc: 0.8864, val loss: 0.1714, val acc: 0.9289
Epoch [2/20], train loss: 0.1218, train acc: 0.9517, val loss: 0.1661, val acc: 0.9292
Epoch [3/20], train loss: 0.0737, train acc: 0.9716, val loss: 0.1566, val acc: 0.9395
Epoch [4/20], train loss: 0.0412, train acc: 0.9850, val loss: 0.1838, val acc: 0.9322
Epoch [5/20], train loss: 0.0310, train acc: 0.9889, val loss: 0.1945, val acc: 0.9377
Epoch [6/20], train loss: 0.0267, train acc: 0.9908, val loss: 0.1623, val acc: 0.9437
Epoch [7/20], train loss: 0.0129, train acc: 0.9961, val loss: 0.1706, val acc: 0.9422
Epoch [8/20], train loss: 0.0136, train acc: 0.9951, val loss: 0.1654, val acc: 0.9440
Epoch [9/20], train loss: 0.0103, train acc: 0.9964, val loss: 0.2037, val acc: 0.9419
Epoch [10/20], train loss: 0.0048, train acc: 0.9988, val loss: 0.1757, val acc: 0.9509
Epoch [11/20], train loss: 0.0059, train acc: 0.9982, val loss: 0.2049, val ac

In [21]:
# Rebuild the architecture and restore the weights stored in best_model.pt
model = get_model()
best_state = torch.load("best_model.pt", map_location=device)
model.load_state_dict(best_state)
model.to(device)
model.eval()
results = []

for inputs, labels, filenames in tqdm(test_loader):
    inputs = inputs.to(device)
    labels = labels.to(device)

    # Inference only, so no gradients are tracked
    with torch.no_grad():
        probs = F.softmax(model(inputs), dim=1)

    # Predicted class = index of the larger probability in each row
    pred_labels = torch.argmax(probs, dim=1)

    for row, filename in enumerate(filenames):
        results.append({
            "filename": filename,  # file name supplied by the dataset
            "prob_ai": probs[row][0].item(),
            "prob_human": probs[row][1].item(),
            "true_label": labels[row].item(),
            "pred_label": pred_labels[row].item()
        })

# One row per test sample
df = pd.DataFrame(results)

# Persist the predictions of the best model
df.to_csv("best_mel-spec_test_large_predictions.csv", index=False)

# Preview results
print(df.head())


100%|███████████████████████████████████████████████████████████████████████████████| 104/104 [00:45<00:00,  2.29it/s]

                                filename   prob_ai    prob_human  true_label  \
0    S97RN_segment_2-Mel_Spectrogram.png  1.000000  1.422857e-08           0   
1             H3233N-Mel_Spectrogram.png  0.000010  9.999905e-01           1   
2               H63N-Mel_Spectrogram.png  0.000199  9.998007e-01           1   
3  U1692RN_segment_2-Mel_Spectrogram.png  0.999978  2.168724e-05           0   
4  S4552RN_segment_2-Mel_Spectrogram.png  0.999986  1.384977e-05           0   

   pred_label  
0           0  
1           1  
2           1  
3           0  
4           0  



