In [26]:
import torch

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_scheduler

import os
import random
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader

from torch.optim import AdamW
from tqdm import tqdm

import pandas as pd
import torch.nn.functional as F

In [27]:
# Load pre-trained DistilBERT tokenizer and model for binary classification
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
print(device)

cuda


In [29]:
# Directory paths for the lyrics
ai_lyrics_path = "/data/sg2121/fypdataset/dataset/lyrics/ai"
human_lyrics_path = "/data/sg2121/fypdataset/dataset/lyrics/human"

# Read all file paths
ai_files = [os.path.join(ai_lyrics_path, f) for f in os.listdir(ai_lyrics_path) if f.endswith('.txt')]
human_files = [os.path.join(human_lyrics_path, f) for f in os.listdir(human_lyrics_path) if f.endswith('.txt')]

# Combine all files into a single list with labels (0 for AI, 1 for Human)
all_files = [(file, 0) for file in ai_files] + [(file, 1) for file in human_files]

# Split into train and temp (temp is for validation and test)
train_files, temp_files = train_test_split(all_files, test_size=0.2, random_state=42)

# Further split temp into validation and test sets
val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

# Example of how you might check the splits
print(f"Training set size: {len(train_files)}")
print(f"Validation set size: {len(val_files)}")
print(f"Test set size: {len(test_files)}")


Training set size: 674
Validation set size: 84
Test set size: 85


In [42]:
class LyricsDataset(Dataset):
    def __init__(self, file_paths, tokenizer, max_length=512):
        self.file_paths = file_paths
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path, label = self.file_paths[idx]
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long),
            'filename': os.path.basename(file_path)  # This is important
        }



In [44]:
# Create datasets for training, validation, and testing
train_dataset = LyricsDataset(train_files, tokenizer)
val_dataset = LyricsDataset(val_files, tokenizer)
test_dataset = LyricsDataset(test_files, tokenizer)

# Create DataLoader for each dataset
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Print a sample to check the dataset
sample = train_dataset[0]
print(f"Sample input IDs: {sample['input_ids']}")
print(f"Sample attention mask: {sample['attention_mask']}")
print(f"Sample label: {sample['labels']}")


Sample input IDs: tensor([  101, 15333,  2033, 16220,  6906,  2072, 29316,  1037,  5673,  4372,
         8774,  4630,  4372,  7151, 10207, 10364,  2139,  4606,  5285,  4630,
        10861,  4649, 10364, 18033,  2121,  2474, 29086,  2474, 29086,  1010,
         2474, 29086,  1010,  2474, 29086,  1039,  1005,  9765, 25353,  8737,
         2050,  1010,  2474, 29086,  1010,  2474, 29086, 10613,  3540, 19445,
         1010,  2474, 29086,  1010,  2474, 29086,  1010,  2474, 29086,  1039,
         1005,  9765, 25353,  8737,  2050,  1010,  2474, 29086,  1010,  2474,
        29086, 10613,  3540, 19445,  1010, 15317, 22825,  1037,  2033,  4189,
         2063,  9610,  2121,  1010,  5003,  9004, 11022,  2139,  8872,  3170,
         1037,  5003,  8915,  6593,  2063,  2143,  2139, 12731,  2140, 15317,
        25636,  2063, 27830,  4886,  3672, 24209,  1005,  1037,  2474,  8632,
         3802,  7367,  2015,  2061, 21823,  6132, 14980,  3372,  4372, 21418,
        14774,  1010, 25175,  6335,  6904,  49

In [51]:
# Set up the optimizer and training loop
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear",                     # "linear", "cosine", "cosine_with_restarts", etc.
    optimizer=optimizer,
    num_warmup_steps=0,               # Optional: small warmup period to avoid large initial updates
    num_training_steps=num_training_steps
)

# Training loop
for epoch in range(num_epochs):  # Number of epochs
    model.train()
    for batch in tqdm(train_dataloader):
        # Ensure batch is a dictionary of tensors
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Backpropagation
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        # Update learning rate
        lr_scheduler.step() 
    
    print(f"Epoch {epoch + 1} completed. Loss: {loss.item()}")

    # Validation loop
    model.eval()
    val_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()

    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / len(val_files)
    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")


100%|█████████████████████████████████████████████████████████████████████| 85/85 [00:12<00:00,  7.00it/s]


Epoch 1 completed. Loss: 0.0036571966484189034


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 22.72it/s]


Validation Loss: 0.8289, Accuracy: 0.7500


100%|█████████████████████████████████████████████████████████████████████| 85/85 [00:12<00:00,  6.95it/s]


Epoch 2 completed. Loss: 0.003831487614661455


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 22.44it/s]


Validation Loss: 0.9162, Accuracy: 0.7381


100%|█████████████████████████████████████████████████████████████████████| 85/85 [00:12<00:00,  6.89it/s]


Epoch 3 completed. Loss: 0.002079684752970934


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 22.33it/s]

Validation Loss: 0.9843, Accuracy: 0.7619





In [52]:
model.eval()
results = []
correct_predictions = 0
total_predictions = 0

for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    filenames = batch['filename']

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
        preds = torch.argmax(probs, dim=1)  # Predicted label

        # Calculate the number of correct predictions
        correct_predictions += (preds == labels).sum().item()
        total_predictions += labels.size(0)

        # Save results
        for i in range(len(filenames)):
            results.append({
                "filename": filenames[i],
                "prob_ai": probs[i][0].item(),
                "prob_human": probs[i][1].item(),
                "true_label": labels[i].item(),
                "pred_label": preds[i].item()
            })

# Save to CSV
df = pd.DataFrame(results)
df.to_csv("lyrics_test_predictions.csv", index=False)

# Calculate and print accuracy
accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {accuracy:.4f}")


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 19.32it/s]

Test Accuracy: 0.7294



