In [2]:
# Presumably a quick check that the kernel executes cells; the string is left
# untouched so it keeps matching the recorded output.
print("HEELO")

HEELO


In [3]:
import os
import random
from copy import deepcopy

import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_scheduler

In [4]:
# Load pre-trained DistilBERT tokenizer and model for binary classification
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def build_model():
    """Return a newly loaded DistilBERT sequence classifier with two labels.

    Every call loads the "distilbert-base-uncased" checkpoint again, so the
    returned object is independent of any model created earlier. The
    hyperparameter search later in this notebook passes this function as its
    ``model_class`` factory to get a separate model per trial.
    """
    return DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Module-level instance used by the baseline training and evaluation cells.
model = build_model()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)

cuda


In [6]:
# Directory paths for the segments and lyrics
# Segment directories: entries from the split files are matched against these
# prefixes to decide which class (AI / human) they belong to.
ai_segments_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/normal_data/ai_segments"
human_segments_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/normal_data/human"
# Lyric directories: where the matching "<name>_lyrics.txt" files are looked up.
ai_lyrics_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/lyrics/ai_clean"
human_lyrics_path = "/vol/bitbucket/sg2121/fypdataset/dataset_large2/lyrics/human_clean"

# Helper function to read file paths from a text file
def read_file_paths(file_name):
    """Return the lines of ``file_name`` with surrounding whitespace removed.

    Blank lines are kept (as empty strings), in file order.
    """
    stripped_lines = []
    with open(file_name, 'r') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Read all file paths from the text files
# Each split file is read as one entry per line. The same list is later
# filtered twice (once per class) by directory prefix, so a list may contain
# both AI and human segment paths.
train_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/train_files_large.txt')
val_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/val_files_large.txt')
test_files = read_file_paths('/vol/bitbucket/sg2121/fyp/aimusicdetector/train_test_split/bitbucket/test_files_large.txt')

# Function to convert segment file path to lyric file path
def convert_to_lyric_path(file_path, is_ai):
    """Map a segment path to its lyric file, or return None if there is none.

    Args:
        file_path: Path of an audio segment taken from a split file.
        is_ai: True to treat the path as an AI segment, False for human.

    Returns:
        The lyric file path when ``file_path`` lies under the segment
        directory of the requested class and the lyric file exists on disk;
        otherwise None.
    """
    # Select the (segment directory, lyric directory) pair for this class.
    if is_ai:
        segments_root, lyrics_root = ai_segments_path, ai_lyrics_path
    else:
        segments_root, lyrics_root = human_segments_path, human_lyrics_path

    # Paths belonging to the other class (or to neither) are rejected.
    if not file_path.startswith(segments_root):
        return None

    # "<name>.mp3" in the segment directory maps to "<name>_lyrics.txt".
    lyric_name = os.path.basename(file_path).replace('.mp3', '_lyrics.txt')
    candidate = os.path.join(lyrics_root, lyric_name)

    return candidate if os.path.exists(candidate) else None



# Process the file lists and create tuples of (lyric_path, label)
def process_file_paths(file_paths, is_ai):
    """Build ``(lyric_path, label)`` pairs for the segments of one class.

    Segments for which ``convert_to_lyric_path`` returns None are dropped.
    The label is 0 for AI and 1 for human.
    """
    label = 0 if is_ai else 1
    labelled = []
    for segment_path in file_paths:
        lyric_path = convert_to_lyric_path(segment_path, is_ai)
        if lyric_path is None:
            continue
        labelled.append((lyric_path, label))
    return labelled


# Convert all file paths from the train, validation, and test sets.
# process_file_paths already drops every segment without a lyric file, so the
# resulting lists never contain None and need no second filtering pass.
ai_train_files = process_file_paths(train_files, is_ai=True)
human_train_files = process_file_paths(train_files, is_ai=False)

ai_val_files = process_file_paths(val_files, is_ai=True)
human_val_files = process_file_paths(val_files, is_ai=False)

ai_test_files = process_file_paths(test_files, is_ai=True)
human_test_files = process_file_paths(test_files, is_ai=False)


# Combine all files into a single list for each split
train_files_combined = ai_train_files + human_train_files
val_files_combined = ai_val_files + human_val_files
test_files_combined = ai_test_files + human_test_files

# Shuffle in place so the two classes are interleaved rather than concatenated
random.shuffle(train_files_combined)
random.shuffle(val_files_combined)
random.shuffle(test_files_combined)

# Report how many (lyric_path, label) pairs ended up in each split
print(f"Training set size: {len(train_files_combined)}")
print(f"Validation set size: {len(val_files_combined)}")
print(f"Test set size: {len(test_files_combined)}")

Training set size: 14157
Validation set size: 2958
Test set size: 2967


In [7]:
class LyricsDataset(Dataset):
    """Dataset that reads and tokenizes one lyric file per item, on demand.

    Args:
        file_paths: Sequence of ``(lyric_file_path, label)`` pairs.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_length: Every item is padded or truncated to this many tokens.
    """

    def __init__(self, file_paths, tokenizer, max_length=512):
        self.file_paths = file_paths
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of ``(path, label)`` pairs."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return the tokenized lyric, label, and file name for item ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_length``), ``labels`` (scalar long tensor), and
            ``filename`` (base name of the lyric file).
        """
        file_path, label = self.file_paths[idx]
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # return_tensors='pt' yields tensors with a leading batch axis of size 1.
        # Drop only that axis: a bare squeeze() would also collapse the
        # sequence axis when max_length == 1 and hand the collate function
        # 0-d tensors.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long),
            # evaluate_model reads this key to name each prediction row.
            'filename': os.path.basename(file_path)
        }



In [8]:
class EarlyStopping:
    """Tracks a higher-is-better score and signals when it stops improving.

    A score counts as an improvement only if it is at least ``min_delta``
    above the best score seen so far. After ``patience`` consecutive
    non-improving scores, ``should_stop`` returns True.
    """

    def __init__(self, patience=2, min_delta=0.001):
        self.best_score = None
        self.counter = 0
        self.patience = patience
        self.min_delta = min_delta

    def should_stop(self, score):
        """Record ``score`` and return True once patience is exhausted."""
        # The very first score only establishes the baseline.
        if self.best_score is None:
            self.best_score = score
            return False

        stalled = score < self.best_score + self.min_delta
        if not stalled:
            # Real improvement: new baseline, streak of stalls is reset.
            self.best_score = score
            self.counter = 0
            return False

        self.counter += 1
        return self.counter >= self.patience


In [9]:
# Build a dataset and a batch-of-8 DataLoader for every split.
# Only the training loader shuffles; validation and test keep list order.
train_dataset = LyricsDataset(train_files_combined, tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)

val_dataset = LyricsDataset(val_files_combined, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=8)

test_dataset = LyricsDataset(test_files_combined, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [10]:
def train_model(model, train_dataloader, val_dataloader, device, num_epochs=3, lr=5e-5, weight_decay=0.01):
    """Fine-tune ``model`` in place, printing train/validation metrics per epoch.

    Optimizes with AdamW under a "linear" learning-rate schedule without
    warmup, sized for ``num_epochs * len(train_dataloader)`` optimizer steps.
    After each epoch the mean training loss, mean validation loss, and
    validation accuracy are printed. Nothing is returned.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``.
        val_dataloader: Same batch format, used for validation.
        device: Device the batch tensors are moved to.
        num_epochs: Number of passes over ``train_dataloader``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
    """
    def unpack(batch):
        # Move the three tensors the model consumes onto the target device.
        return (
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['labels'].to(device),
        )

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_epochs * len(train_dataloader),
    )

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
            token_ids, mask, targets = unpack(batch)

            optimizer.zero_grad()
            batch_loss = model(token_ids, attention_mask=mask, labels=targets).loss
            running_loss += batch_loss.item()

            batch_loss.backward()
            optimizer.step()
            lr_scheduler.step()  # the schedule advances once per optimizer step

        avg_train_loss = running_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1} Training Loss: {avg_train_loss:.4f}")

        # ---- validation pass (no gradients) ----
        model.eval()
        running_val_loss = 0
        hits = 0
        seen = 0

        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc="Validating"):
                token_ids, mask, targets = unpack(batch)

                outputs = model(token_ids, attention_mask=mask, labels=targets)
                running_val_loss += outputs.loss.item()
                predicted = torch.argmax(outputs.logits, dim=1)
                hits += (predicted == targets).sum().item()
                seen += targets.size(0)

        avg_val_loss = running_val_loss / len(val_dataloader)
        accuracy = hits / seen
        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

In [9]:
# Baseline run: fine-tunes the module-level `model` in place for 3 epochs with
# train_model's default learning rate and weight decay.
train_model(model, train_dataloader, val_dataloader, device, num_epochs=3)

Training Epoch 1: 100%|██████████████████████████████████████████████████████████| 1770/1770 [03:25<00:00,  8.61it/s]


Epoch 1 Training Loss: 0.3498


Validating: 100%|██████████████████████████████████████████████████████████████████| 370/370 [00:35<00:00, 10.30it/s]


Validation Loss: 0.3111, Accuracy: 0.8675


Training Epoch 2: 100%|██████████████████████████████████████████████████████████| 1770/1770 [02:11<00:00, 13.45it/s]


Epoch 2 Training Loss: 0.2228


Validating: 100%|██████████████████████████████████████████████████████████████████| 370/370 [00:17<00:00, 21.21it/s]


Validation Loss: 0.3167, Accuracy: 0.8749


Training Epoch 3: 100%|██████████████████████████████████████████████████████████| 1770/1770 [02:11<00:00, 13.47it/s]


Epoch 3 Training Loss: 0.0996


Validating: 100%|██████████████████████████████████████████████████████████████████| 370/370 [00:17<00:00, 21.29it/s]

Validation Loss: 0.4060, Accuracy: 0.8729





In [11]:
def train_model_with_early_stopping(model, train_dataloader, val_dataloader, device, num_epochs=3, lr=5e-5,
                                    weight_decay=0.01, patience=2):
    """Fine-tune ``model`` with accuracy-based early stopping and best-epoch restore.

    Trains with AdamW under a "linear" learning-rate schedule without warmup.
    After every epoch the validation accuracy is printed and passed to an
    ``EarlyStopping`` tracker. The weights of the best-scoring epoch are
    snapshotted and loaded back into ``model`` before returning.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels``.
        val_dataloader: Same batch format, used for validation.
        device: Device the batch tensors are moved to.
        num_epochs: Maximum number of training epochs.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        patience: Consecutive non-improving epochs tolerated before stopping.

    Returns:
        ``(model, best_val_accuracy)``. ``model`` is the object passed in.
        When no epoch ran (``num_epochs <= 0``) its weights are left untouched
        and the accuracy is 0.0.
    """
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    num_training_steps = num_epochs * len(train_dataloader)

    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    early_stopper = EarlyStopping(patience=patience)

    best_val_accuracy = 0.0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        # Validation
        model.eval()
        correct_predictions = 0
        total_samples = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)
                correct_predictions += (preds == labels).sum().item()
                total_samples += labels.size(0)

        # An empty validation loader produces no samples; score it as 0.0
        # instead of raising ZeroDivisionError.
        val_accuracy = correct_predictions / total_samples if total_samples else 0.0
        print(f"Epoch {epoch+1}: Validation Accuracy = {val_accuracy:.4f}")

        # Always snapshot the first epoch so a state exists even if accuracy
        # never rises above 0.0. state_dict() references the live parameter
        # tensors, hence the deep copy.
        if best_model_state is None or val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = deepcopy(model.state_dict())

        if early_stopper.should_stop(val_accuracy):
            print("Early stopping triggered.")
            break

    # Load best weights before returning (nothing to restore if no epoch ran)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model, best_val_accuracy


In [12]:
def evaluate_model(model, test_dataloader, device, output_csv_path="predictions.csv", save_model_path="best_model.pt"):
    """Score ``model`` on ``test_dataloader``, export predictions, save weights.

    For every sample a row is recorded with the file name, the softmax
    probability of class 0 (``prob_ai``) and class 1 (``prob_human``), the
    true label and the predicted label. The rows are written to
    ``output_csv_path``, the model's ``state_dict`` is saved to
    ``save_model_path``, and the accuracy is printed.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        test_dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask``, ``labels`` and ``filename``.
        device: Device the batch tensors are moved to.
        output_csv_path: Destination of the per-sample predictions CSV.
        save_model_path: Destination of the saved ``state_dict``.

    Returns:
        Fraction of samples whose predicted label equals the true label.
    """
    model.eval()
    rows = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating on Test Set"):
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(token_ids, attention_mask=mask).logits
            probs = F.softmax(logits, dim=1)
            preds = torch.argmax(probs, dim=1)

            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            # One output row per sample in the batch.
            for i, name in enumerate(batch['filename']):
                rows.append({
                    "filename": name,
                    "prob_ai": probs[i][0].item(),
                    "prob_human": probs[i][1].item(),
                    "true_label": labels[i].item(),
                    "pred_label": preds[i].item()
                })

    # Save results to CSV
    pd.DataFrame(rows).to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

    # Save model weights
    torch.save(model.state_dict(), save_model_path)
    print(f"Model weights saved to {save_model_path}")

    # Accuracy
    accuracy = n_correct / n_seen
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy

In [11]:
# Score the module-level `model` on the test split. Besides returning the
# accuracy, evaluate_model writes the per-file predictions CSV and saves that
# model's weights to the paths given here.
accuracy = evaluate_model(
    model,
    test_dataloader,
    device,
    output_csv_path="clean_lyrics_test_large_predictions.csv",
    save_model_path="clean_lyrics_model.pt"
)

Evaluating on Test Set: 100%|██████████████████████████████████████████████████████| 371/371 [00:34<00:00, 10.80it/s]


Predictions saved to clean_lyrics_test_large_predictions.csv
Model weights saved to clean_lyrics_model.pt
Test Accuracy: 0.8746


In [13]:
import random
from copy import deepcopy

def random_search(model_class, train_dataloader, val_dataloader, device,
                  param_distributions, n_trials=10, patience=2, save_path="best_model_random.pt"):
    """Run a random hyperparameter search and save the best model's weights.

    Each trial draws one value per key of ``param_distributions``
    (independently, so combinations may repeat), builds a new model with
    ``model_class()``, and trains it with ``train_model_with_early_stopping``.
    The ``state_dict`` of the trial with the highest validation accuracy is
    written to ``save_path``.

    Args:
        model_class: Zero-argument callable returning a new model.
        train_dataloader: Training batches, passed through to the trainer.
        val_dataloader: Validation batches, passed through to the trainer.
        device: Device the model and batches are moved to.
        param_distributions: Dict mapping ``'lr'``, ``'weight_decay'`` and
            ``'num_epochs'`` to sequences of candidate values.
        n_trials: Number of configurations to sample.
        patience: Early-stopping patience forwarded to the trainer.
        save_path: Destination of the best ``state_dict``.

    Returns:
        ``(best_params, best_accuracy, results)`` where ``results`` is a list
        of ``(sampled_params, val_accuracy)`` pairs in trial order. When no
        trial ran, ``best_params`` is None, ``best_accuracy`` is 0.0 and
        nothing is written to ``save_path``.
    """
    best_accuracy = 0.0
    best_params = None
    best_model_state = None
    results = []

    keys = list(param_distributions.keys())

    for i in range(n_trials):
        # Sample random hyperparameters
        sampled_params = {k: random.choice(param_distributions[k]) for k in keys}
        print(f"\nTrial {i + 1}: {sampled_params}")

        model = model_class().to(device)

        trained_model, val_accuracy = train_model_with_early_stopping(
            model=model,
            train_dataloader=train_dataloader,
            val_dataloader=val_dataloader,
            device=device,
            num_epochs=sampled_params['num_epochs'],
            lr=sampled_params['lr'],
            weight_decay=sampled_params['weight_decay'],
            patience=patience
        )

        results.append((sampled_params, val_accuracy))

        # The first trial is always recorded so there is a real state to save
        # even if no trial scores above 0.0.
        if best_model_state is None or val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_params = sampled_params
            best_model_state = deepcopy(trained_model.state_dict())

    if best_model_state is None:
        # No trial ran: do not write a checkpoint that contains only None.
        print("\nNo trials were run; no model was saved.")
    else:
        torch.save(best_model_state, save_path)
        print(f"\nBest model saved to {save_path}")
    print(f"Best Hyperparameters: {best_params}, Validation Accuracy: {best_accuracy:.4f}")
    return best_params, best_accuracy, results


In [14]:
# Discrete search space. random_search draws one value per key independently
# for each trial, so the same combination can be sampled more than once.
param_distributions = {
    'lr': [5e-5, 3e-5, 1e-5],
    'weight_decay': [0.01, 0.001],
    'num_epochs': [2, 3, 4]
}

# build_model is passed as the factory, so every trial trains a newly loaded
# checkpoint rather than the already fine-tuned module-level `model`.
best_params, best_acc, search_results = random_search(
    model_class=build_model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    param_distributions=param_distributions,
    n_trials=10,
    patience=2,
    save_path="best_model_randomsearch.pt"
)



Trial 1: {'lr': 3e-05, 'weight_decay': 0.001, 'num_epochs': 2}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8695
Epoch 2: Validation Accuracy = 0.8769

Trial 2: {'lr': 3e-05, 'weight_decay': 0.001, 'num_epochs': 2}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8742
Epoch 2: Validation Accuracy = 0.8753

Trial 3: {'lr': 1e-05, 'weight_decay': 0.01, 'num_epochs': 2}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8631
Epoch 2: Validation Accuracy = 0.8773

Trial 4: {'lr': 5e-05, 'weight_decay': 0.001, 'num_epochs': 3}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8634
Epoch 2: Validation Accuracy = 0.8692
Epoch 3: Validation Accuracy = 0.8766

Trial 5: {'lr': 3e-05, 'weight_decay': 0.01, 'num_epochs': 3}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8631
Epoch 2: Validation Accuracy = 0.8769
Epoch 3: Validation Accuracy = 0.8712

Trial 6: {'lr': 3e-05, 'weight_decay': 0.001, 'num_epochs': 4}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8702
Epoch 2: Validation Accuracy = 0.8577
Epoch 3: Validation Accuracy = 0.8654
Early stopping triggered.

Trial 7: {'lr': 3e-05, 'weight_decay': 0.001, 'num_epochs': 4}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8634
Epoch 2: Validation Accuracy = 0.8759
Epoch 3: Validation Accuracy = 0.8648
Epoch 4: Validation Accuracy = 0.8614
Early stopping triggered.

Trial 8: {'lr': 1e-05, 'weight_decay': 0.001, 'num_epochs': 4}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8763
Epoch 2: Validation Accuracy = 0.8790
Epoch 3: Validation Accuracy = 0.8769
Epoch 4: Validation Accuracy = 0.8749
Early stopping triggered.

Trial 9: {'lr': 5e-05, 'weight_decay': 0.01, 'num_epochs': 4}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8668
Epoch 2: Validation Accuracy = 0.8600
Epoch 3: Validation Accuracy = 0.8607
Early stopping triggered.

Trial 10: {'lr': 3e-05, 'weight_decay': 0.01, 'num_epochs': 4}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Validation Accuracy = 0.8732
Epoch 2: Validation Accuracy = 0.8807
Epoch 3: Validation Accuracy = 0.8749
Epoch 4: Validation Accuracy = 0.8719
Early stopping triggered.

Best model saved to best_model_randomsearch.pt
Best Hyperparameters: {'lr': 3e-05, 'weight_decay': 0.01, 'num_epochs': 4}, Validation Accuracy: 0.8807


In [None]:
# Search summary: winning configuration, its validation accuracy, and the
# (sampled_params, val_accuracy) pair recorded for every trial.
print(best_params)
print(best_acc)
print(search_results)