In [4]:
import torch

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

import os
import random
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader


In [11]:
# Load pre-trained DistilBERT tokenizer and model for binary classification
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
print(device)

cuda


In [13]:
# Directory paths for the lyrics
ai_lyrics_path = "/data/sg2121/fypdataset/dataset/lyrics/ai"
human_lyrics_path = "/data/sg2121/fypdataset/dataset/lyrics/human"

# Read all file paths
ai_files = [os.path.join(ai_lyrics_path, f) for f in os.listdir(ai_lyrics_path) if f.endswith('.txt')]
human_files = [os.path.join(human_lyrics_path, f) for f in os.listdir(human_lyrics_path) if f.endswith('.txt')]

# Combine all files into a single list with labels (0 for AI, 1 for Human)
all_files = [(file, 0) for file in ai_files] + [(file, 1) for file in human_files]

# Split into train and temp (temp is for validation and test)
train_files, temp_files = train_test_split(all_files, test_size=0.2, random_state=42)

# Further split temp into validation and test sets
val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

# Example of how you might check the splits
print(f"Training set size: {len(train_files)}")
print(f"Validation set size: {len(val_files)}")
print(f"Test set size: {len(test_files)}")


Training set size: 674
Validation set size: 84
Test set size: 85


In [14]:
class LyricsDataset(Dataset):
    def __init__(self, file_paths, tokenizer, max_length=512):
        self.file_paths = file_paths
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path, label = self.file_paths[idx]
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].squeeze()  # Remove batch dimension
        attention_mask = encoding['attention_mask'].squeeze()  # Remove batch dimension
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = LyricsDataset(train_files, tokenizer)
val_dataset = LyricsDataset(val_files, tokenizer)
test_dataset = LyricsDataset(test_files, tokenizer)

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)


In [17]:
from torch.optim import AdamW
from tqdm import tqdm

# Set up the optimizer and training loop
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear",                     # "linear", "cosine", "cosine_with_restarts", etc.
    optimizer=optimizer,
    num_warmup_steps=0,               # Optional: small warmup period to avoid large initial updates
    num_training_steps=num_training_steps
)

# Training loop

for epoch in range(num_epochs):  # Number of epochs
    model.train()
    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step() 
    
    print(f"Epoch {epoch + 1} completed. Loss: {loss.item()}")

    # Validation loop
    model.eval()
    val_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()

    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = correct_predictions / len(val_files)
    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

100%|█████████████████████████████████████████████████████████████████████| 85/85 [00:12<00:00,  7.03it/s]


Epoch 1 completed. Loss: 1.8972280025482178


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 22.67it/s]


Validation Loss: 0.6628, Accuracy: 0.7143


100%|█████████████████████████████████████████████████████████████████████| 85/85 [00:12<00:00,  6.98it/s]


Epoch 2 completed. Loss: 0.011722455732524395


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 22.62it/s]


Validation Loss: 0.7844, Accuracy: 0.7381


100%|█████████████████████████████████████████████████████████████████████| 85/85 [00:12<00:00,  6.91it/s]


Epoch 3 completed. Loss: 0.008140352554619312


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 22.47it/s]

Validation Loss: 0.7463, Accuracy: 0.7381





In [18]:
# Evaluate on test set
model.eval()
correct = 0
total = 0

for batch in tqdm(test_dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")


100%|█████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 20.94it/s]

Test Accuracy: 0.7294



