In [1]:
import torch

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_scheduler

import os
import random
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

from torch.optim import AdamW
from tqdm import tqdm

import pandas as pd
import torch.nn.functional as F

In [2]:
# Load pre-trained DistilBERT tokenizer and model for binary classification
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
print(device)

cuda


In [18]:
# Directory paths for the segments and lyrics
ai_segments_path = "/data/sg2121/fypdataset/dataset/normal_data/ai/segments"
human_segments_path = "/data/sg2121/fypdataset/dataset/normal_data/human"
ai_lyrics_path = "/data/sg2121/fypdataset/dataset/lyrics/ai"
human_lyrics_path = "/data/sg2121/fypdataset/dataset/lyrics/human"

# Helper function to read file paths from a text file
def read_file_paths(file_name):
    with open(file_name, 'r') as f:
        return [line.strip() for line in f.readlines()]

# Read all file paths from the text files
train_files = read_file_paths('/data/sg2121/aimusicdetector/train_test_split/train_files.txt')
val_files = read_file_paths('/data/sg2121/aimusicdetector/train_test_split/val_files.txt')
test_files = read_file_paths('/data/sg2121/aimusicdetector/train_test_split/test_files.txt')

# Function to convert segment file path to lyric file path
def convert_to_lyric_path(file_path, is_ai):
    if is_ai:
        if file_path.startswith(ai_segments_path):
            base_lyrics_path = ai_lyrics_path
        else:
            return
    else:
        if file_path.startswith(human_segments_path):
            base_lyrics_path = human_lyrics_path
        else:
            return

    # Convert filename to lyrics filename
    file_name = os.path.basename(file_path).replace('.mp3', '_lyrics.txt')
    return os.path.join(base_lyrics_path, file_name)


# Process the file lists and create tuples of (lyric_path, label)
def process_file_paths(file_paths, is_ai):
    return [(convert_to_lyric_path(file_path, is_ai), 0 if is_ai else 1) for file_path in file_paths]

# Convert all file paths from the train, validation, and test sets
ai_train_files = process_file_paths(train_files, is_ai=True)
human_train_files = process_file_paths(train_files, is_ai=False)

ai_val_files = process_file_paths(val_files, is_ai=True)
human_val_files = process_file_paths(val_files, is_ai=False)

ai_test_files = process_file_paths(test_files, is_ai=True)
human_test_files = process_file_paths(test_files, is_ai=False)

ai_train_files = [(path, label) for path, label in ai_train_files if path is not None]
human_train_files = [(path, label) for path, label in human_train_files if path is not None]

ai_val_files = [(path, label) for path, label in ai_val_files if path is not None]
human_val_files = [(path, label) for path, label in human_val_files if path is not None]

ai_test_files = [(path, label) for path, label in ai_test_files if path is not None]
human_test_files = [(path, label) for path, label in human_test_files if path is not None]


# Combine all files into a single list for each split
train_files_combined = ai_train_files + human_train_files
val_files_combined = ai_val_files + human_val_files
test_files_combined = ai_test_files + human_test_files

# Shuffle the data if needed
random.shuffle(train_files_combined)
random.shuffle(val_files_combined)
random.shuffle(test_files_combined)

# Example of how you might check the splits
print(f"Training set size: {len(train_files_combined)}")
print(f"Validation set size: {len(val_files_combined)}")
print(f"Test set size: {len(test_files_combined)}")

Training set size: 589
Validation set size: 126
Test set size: 128


In [20]:
class LyricsDataset(Dataset):
    def __init__(self, file_paths, tokenizer, max_length=512):
        self.file_paths = file_paths
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path, label = self.file_paths[idx]
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long),
            'filename': os.path.basename(file_path)  # This is important
        }



In [21]:
# Create datasets for training, validation, and testing
train_dataset = LyricsDataset(train_files_combined, tokenizer)
val_dataset = LyricsDataset(val_files_combined, tokenizer)
test_dataset = LyricsDataset(test_files_combined, tokenizer)

# Create DataLoader for each dataset
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Print a sample to check the dataset
sample = train_dataset[0]
print(f"Sample input IDs: {sample['input_ids']}")
print(f"Sample attention mask: {sample['attention_mask']}")
print(f"Sample label: {sample['labels']}")


Sample input IDs: tensor([ 101, 4658, 1010, 2023, 4569, 2729, 2395,  102,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   

In [22]:
# Set up the optimizer and training loop
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear",                     # "linear", "cosine", "cosine_with_restarts", etc.
    optimizer=optimizer,
    num_warmup_steps=0,               # Optional: small warmup period to avoid large initial updates
    num_training_steps=num_training_steps
)

# Training loop
for epoch in range(num_epochs):  # Number of epochs
    model.train()
    for batch in tqdm(train_dataloader):
        # Ensure batch is a dictionary of tensors
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Backpropagation
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Update learning rate
        lr_scheduler.step() 
    
    print(f"Epoch {epoch + 1} completed. Loss: {loss.item()}")

    # Validation loop
    model.eval()
    val_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()

    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / len(val_files)
    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")


100%|████████████████████████████████████████████████████████████████████████████| 74/74 [00:10<00:00,  6.87it/s]


Epoch 1 completed. Loss: 0.37104618549346924


100%|████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 22.27it/s]


Validation Loss: 0.4535, Accuracy: 0.7937


100%|████████████████████████████████████████████████████████████████████████████| 74/74 [00:10<00:00,  7.08it/s]


Epoch 2 completed. Loss: 0.12233859300613403


100%|████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 22.22it/s]


Validation Loss: 0.4206, Accuracy: 0.7857


100%|████████████████████████████████████████████████████████████████████████████| 74/74 [00:10<00:00,  7.02it/s]


Epoch 3 completed. Loss: 0.31677931547164917


100%|████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 22.09it/s]

Validation Loss: 0.4282, Accuracy: 0.8254





In [23]:
model.eval()
results = []
correct_predictions = 0
total_predictions = 0

for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    filenames = batch['filename']

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
        preds = torch.argmax(probs, dim=1)  # Predicted label

        # Calculate the number of correct predictions
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

        # Save results
        for i in range(len(filenames)):
            results.append({
                "filename": filenames[i],
                "prob_ai": probs[i][0].item(),
                "prob_human": probs[i][1].item(),
                "true_label": labels[i].item(),
                "pred_label": preds[i].item()
            })

# Save to CSV
df = pd.DataFrame(results)
df.to_csv("lyrics_test_predictions.csv", index=False)

# Calculate and print accuracy
accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {accuracy:.4f}")


100%|████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 20.79it/s]

Test Accuracy: 0.7891



