In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from transformers import XLNetTokenizer, XLNetForSequenceClassification, BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F

# Load the lyrics/mood dataset from CSV.
file_path = 'datasets/cleaned_dataset_with_lyrics.csv'
df = pd.read_csv(file_path)

# Data Cleaning: keep only the text feature and the target column, and drop
# rows where either one is missing. TfidfVectorizer rejects NaN documents, and
# a NaN mood would be counted as an extra class by the later
# len(df['mood'].unique()) call that sizes the transformer heads.
df = df[['lyrics', 'mood']].dropna(subset=['lyrics', 'mood'])

# Split the dataset into training and test sets (80/20, fixed seed so the
# SVM and the transformer models are compared on the same split).
X_train, X_test, y_train, y_test = train_test_split(
    df['lyrics'], df['mood'], test_size=0.2, random_state=42
)

# Text Vectorization using TF-IDF: the vocabulary is fit on the training
# lyrics only and then reused to transform the test lyrics.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Machine Learning Model: linear-kernel SVM baseline
svm = SVC(kernel='linear', C=1, random_state=42)
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)

print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

# Prepare dataset for XLNet and BERT
class SongDataset(Dataset):
    """Dataset that tokenizes one lyric per item for sequence classification.

    Args:
        lyrics: Indexable collection of lyric texts; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of class ids, aligned with ``lyrics``.
            Each value must be convertible to a ``torch.long`` tensor.
        tokenizer: Object exposing an ``encode_plus`` method (the file uses
            Hugging Face tokenizers).
        max_length: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, lyrics, labels, tokenizer, max_length):
        self.lyrics = lyrics
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of lyric/label pairs."""
        return len(self.lyrics)

    def __getitem__(self, index):
        """Return the encoded lyric and its label as a dict of tensors.

        The dict has the keys ``input_ids``, ``attention_mask`` and
        ``labels``.
        """
        lyric = str(self.lyrics[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            lyric,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            # Padding alone only lengthens short inputs. Without truncation a
            # lyric longer than max_length keeps its full length, so items
            # would differ in size and could not be stacked into a batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten
            # it away so each item is a 1-D tensor.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_transformer_model(model_name, train_dataset, val_dataset, model_name_path=None):
    """Fine-tune a sequence-classification model and return its Trainer.

    Args:
        model_name: Model *class* exposing ``from_pretrained`` (for example
            ``BertForSequenceClassification``), not a string.
        train_dataset: Dataset handed to ``Trainer`` as ``train_dataset``.
        val_dataset: Dataset handed to ``Trainer`` as ``eval_dataset``.
        model_name_path: Checkpoint name or path passed to
            ``from_pretrained``. When omitted, a default checkpoint is chosen
            from the class name so that it matches the tokenizers used in
            this file.

    Returns:
        The ``Trainer`` instance after ``train()`` has completed.

    Raises:
        ValueError: If ``model_name_path`` is omitted and no default
            checkpoint is known for ``model_name``.
    """
    # The original body referenced an undefined ``model_name_path``; resolve
    # it here from the model class when the caller does not supply one.
    default_checkpoints = {
        'XLNetForSequenceClassification': 'xlnet-base-cased',
        'BertForSequenceClassification': 'bert-base-uncased',
    }
    if model_name_path is None:
        class_name = getattr(model_name, '__name__', None)
        model_name_path = default_checkpoints.get(class_name)
        if model_name_path is None:
            raise ValueError(
                f"No default checkpoint known for {model_name!r}; "
                "pass model_name_path explicitly."
            )

    # The classification head is sized from the module-level ``df``.
    model = model_name.from_pretrained(model_name_path, num_labels=len(df['mood'].unique()))

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy='epoch'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer

def compute_metrics(p):
    """Compute classification accuracy for a set of predictions.

    Args:
        p: Object with a ``predictions`` attribute (per-class scores, one row
            per example) and a ``label_ids`` attribute (the true class ids).

    Returns:
        A dict with a single ``'accuracy'`` entry holding a Python float.
    """
    # The predicted class is the highest-scoring column of each row.
    predicted_classes = np.argmax(p.predictions, axis=1)
    is_correct = predicted_classes == p.label_ids
    accuracy = is_correct.astype(np.float32).mean()
    return {'accuracy': accuracy.item()}

# Encode mood labels as contiguous integer class ids (0..n_classes-1).
# SongDataset converts every label to a torch.long tensor, so raw string or
# non-contiguous labels cannot be used directly. The encoder is fit on the
# full mood column so the id range matches the number of classes used to size
# the model heads, even if a class is absent from one split.
label_encoder = LabelEncoder()
label_encoder.fit(df['mood'])
train_labels = label_encoder.transform(y_train).tolist()
val_labels = label_encoder.transform(y_test).tolist()
train_lyrics = X_train.to_list()
val_lyrics = X_test.to_list()

# Tokenizer and datasets for XLNet
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
train_dataset_xlnet = SongDataset(train_lyrics, train_labels, xlnet_tokenizer, max_length=128)
val_dataset_xlnet = SongDataset(val_lyrics, val_labels, xlnet_tokenizer, max_length=128)

# Tokenizer and datasets for BERT
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset_bert = SongDataset(train_lyrics, train_labels, bert_tokenizer, max_length=128)
val_dataset_bert = SongDataset(val_lyrics, val_labels, bert_tokenizer, max_length=128)

# Train the XLNet and BERT models
trainer_xlnet = train_transformer_model(XLNetForSequenceClassification, train_dataset_xlnet, val_dataset_xlnet)
trainer_bert = train_transformer_model(BertForSequenceClassification, train_dataset_bert, val_dataset_bert)

# Evaluate models on their validation datasets
eval_results_xlnet = trainer_xlnet.evaluate()
eval_results_bert = trainer_bert.evaluate()

print("XLNet Evaluation Results:")
print(eval_results_xlnet)

print("BERT Evaluation Results:")
print(eval_results_bert)

# Compare results: all three models are scored on the same test split
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm)}")
print(f"XLNet Accuracy: {eval_results_xlnet['eval_accuracy']}")
print(f"BERT Accuracy: {eval_results_bert['eval_accuracy']}")