In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from tqdm import tqdm
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.tokenize import word_tokenize
from torch.cuda.amp import autocast, GradScaler

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
try:
    df = pd.read_csv("kaggle_processed.csv", on_bad_lines='skip', engine='python')
except FileNotFoundError:
    print("Error: 'kaggle_processed.csv' not found.")
    exit(1)

# Clean column names and drop NA
df.columns = df.columns.str.strip()
df = df.dropna(subset=['Lyrics'])

# Keep top 5 genres
top_genres = df['Genre'].value_counts().nlargest(5).index
df = df[df['Genre'].isin(top_genres)]
print(f"Top 5 genres: {top_genres.tolist()}")

# Handle class imbalance with oversampling
X_temp = df[['Lyrics']]
y_temp = df['Genre']
ros = RandomOverSampler(random_state=42)
X_temp, y_temp = ros.fit_resample(X_temp, y_temp)
df = pd.concat([pd.DataFrame(X_temp), pd.Series(y_temp, name='Genre')], axis=1)

# Encode labels
le = LabelEncoder()
df['Genre_encoded'] = le.fit_transform(df['Genre'])
print("Encoded genres:", le.classes_)

# Preprocessing with lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
remove_punct = str.maketrans('', '', punctuation + '’' + '‘' + '“' + '”' + '—' + '…')

def clean_text(text):
    text = str(text).lower()
    text = text.translate(remove_punct)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return ' '.join(tokens)

print("Preprocessing lyrics...")
df['Clean_Lyrics'] = df['Lyrics'].apply(clean_text)




# Split into train, validation, and test (80-10-10)
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['Genre_encoded'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['Genre_encoded'], random_state=42)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

# BERT Embeddings
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

max_len = 512

def get_token_level_embeddings(texts):
    embeddings = []
    for text in tqdm(texts, desc="Embedding"):
        encoded = tokenizer(text, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            output = bert_model(**encoded)
        emb = output.last_hidden_state.squeeze(0) 
        embeddings.append(emb.cpu())
    return torch.stack(embeddings) 

print("Encoding train set...")
X_train = get_token_level_embeddings(train_df['Clean_Lyrics'])
y_train = torch.tensor(train_df['Genre_encoded'].values)

print("Encoding validation set...")
X_val = get_token_level_embeddings(val_df['Clean_Lyrics'])
y_val = torch.tensor(val_df['Genre_encoded'].values)

print("Encoding test set...")
X_test = get_token_level_embeddings(test_df['Clean_Lyrics'])
y_test = torch.tensor(test_df['Genre_encoded'].values)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=16, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=16)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=16)

# Enhanced CNN Model
class TextCNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(TextCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=input_dim, out_channels=256, kernel_size=4, padding=2)
        self.conv3 = nn.Conv1d(in_channels=input_dim, out_channels=256, kernel_size=5, padding=2)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(256 * 3, num_classes)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x1 = self.pool(self.relu(self.conv1(x))).squeeze(-1)
        x2 = self.pool(self.relu(self.conv2(x))).squeeze(-1)
        x3 = self.pool(self.relu(self.conv3(x))).squeeze(-1)
        x = torch.cat((x1, x2, x3), dim=1)
        x = self.dropout(x)
        return self.fc(x)

# LSTM Model with Attention
class TextLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(TextLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)
        context_vector = torch.sum(lstm_out * attention_weights, dim=1)
        out = self.dropout(context_vector)
        return self.fc(out)

# Training function with mixed precision and early stopping
def train_model(model, train_loader, val_loader, epochs=10, patience=10):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()
    best_val_acc = 0
    trigger_times = 0

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        with autocast():
            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                pred = model(xb)
                loss = criterion(pred, yb)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                total_train_loss += loss.item()

        # Validation
        model.eval()
        total_val_loss = 0
        val_preds = []
        val_labels = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                loss = criterion(pred, yb)
                total_val_loss += loss.item()
                val_preds.append(torch.argmax(pred, dim=1).cpu())
                val_labels.append(yb.cpu())

        val_preds = torch.cat(val_preds)
        val_labels = torch.cat(val_labels)
        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)
        val_acc = accuracy_score(val_labels, val_preds)
        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}, Val Acc = {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            trigger_times = 0
            torch.save(model.state_dict(), f"best_{model.__class__.__name__}.pt")
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print("Early stopping triggered!")
                break

        scheduler.step(avg_val_loss)

    model.load_state_dict(torch.load(f"best_{model.__class__.__name__}.pt"))
    return model

# Evaluation function
def evaluate(model, loader, y_true):
    model.eval()
    preds = []
    with torch.no_grad():
        for xb, _ in loader:
            xb = xb.to(device)
            pred = model(xb)
            preds.append(torch.argmax(pred, dim=1).cpu())
    y_pred = torch.cat(preds)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=le.classes_))

# Run training and evaluation
print("\nTraining CNN with BERT...")
cnn = TextCNN(input_dim=X_train.shape[2], num_classes=len(le.classes_))
cnn_model = train_model(cnn, train_loader, val_loader)
print("\nCNN Evaluation with BERT:")
evaluate(cnn_model, test_loader, y_test)

print("\nTraining LSTM with BERT...")
lstm = TextLSTM(input_dim=X_train.shape[2], hidden_dim=256, num_classes=len(le.classes_))
lstm_model = train_model(lstm, train_loader, val_loader)
print("\nLSTM Evaluation with BERT:")
evaluate(lstm_model, test_loader, y_test)

[nltk_data] Downloading package punkt to /Users/shruthi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shruthi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shruthi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 5 genres: ['Pop', 'Rock', 'Country', 'R&B', 'Folk']
Encoded genres: ['Country' 'Folk' 'Pop' 'R&B' 'Rock']
Preprocessing lyrics...




Train size: 4000, Val size: 500, Test size: 500
Encoding train set...


Embedding: 100%|████████████████████████████| 4000/4000 [14:30<00:00,  4.60it/s]


Encoding validation set...


Embedding: 100%|██████████████████████████████| 500/500 [01:51<00:00,  4.47it/s]


Encoding test set...


Embedding: 100%|██████████████████████████████| 500/500 [01:51<00:00,  4.48it/s]



Training CNN with BERT...


  scaler = GradScaler()
  with autocast():


Epoch 1: Train Loss = 1.4134, Val Loss = 1.1878, Val Acc = 0.5120


  with autocast():


Epoch 2: Train Loss = 0.9992, Val Loss = 1.1100, Val Acc = 0.5500


  with autocast():


Epoch 3: Train Loss = 0.6662, Val Loss = 1.1513, Val Acc = 0.5720


  with autocast():


Epoch 4: Train Loss = 0.3787, Val Loss = 1.0629, Val Acc = 0.6240


  with autocast():


Epoch 5: Train Loss = 0.2216, Val Loss = 1.0627, Val Acc = 0.6060


  with autocast():


Epoch 6: Train Loss = 0.1089, Val Loss = 1.2167, Val Acc = 0.6080


  with autocast():


Epoch 7: Train Loss = 0.0845, Val Loss = 1.1480, Val Acc = 0.6380


  with autocast():


Epoch 8: Train Loss = 0.0550, Val Loss = 1.0920, Val Acc = 0.6340


  with autocast():


Epoch 9: Train Loss = 0.0640, Val Loss = 1.1187, Val Acc = 0.6560


  with autocast():


Epoch 10: Train Loss = 0.0374, Val Loss = 1.1893, Val Acc = 0.6320

CNN Evaluation with BERT:
Accuracy: 0.626
Confusion Matrix:
 [[79  9  2  7  3]
 [17 64  4  2 13]
 [ 8 11 55 14 12]
 [ 7  3 24 57  9]
 [15  9 10  8 58]]
              precision    recall  f1-score   support

     Country       0.63      0.79      0.70       100
        Folk       0.67      0.64      0.65       100
         Pop       0.58      0.55      0.56       100
         R&B       0.65      0.57      0.61       100
        Rock       0.61      0.58      0.59       100

    accuracy                           0.63       500
   macro avg       0.63      0.63      0.62       500
weighted avg       0.63      0.63      0.62       500


Training LSTM with BERT...


  scaler = GradScaler()
  with autocast():


Epoch 1: Train Loss = 1.4300, Val Loss = 1.3418, Val Acc = 0.4120


  with autocast():


Epoch 2: Train Loss = 1.2565, Val Loss = 1.3032, Val Acc = 0.4320


  with autocast():


Epoch 3: Train Loss = 1.1653, Val Loss = 1.2357, Val Acc = 0.5020


  with autocast():


Epoch 4: Train Loss = 1.0631, Val Loss = 1.2149, Val Acc = 0.5000


  with autocast():


Epoch 5: Train Loss = 0.9518, Val Loss = 1.2559, Val Acc = 0.5300


  with autocast():


Epoch 6: Train Loss = 0.8245, Val Loss = 1.3024, Val Acc = 0.5400


  with autocast():


Epoch 7: Train Loss = 0.7158, Val Loss = 1.3446, Val Acc = 0.5240


  with autocast():


Epoch 8: Train Loss = 0.5497, Val Loss = 1.3787, Val Acc = 0.5680


  with autocast():


Epoch 9: Train Loss = 0.3169, Val Loss = 1.5383, Val Acc = 0.6080


  with autocast():


Epoch 10: Train Loss = 0.1982, Val Loss = 1.7608, Val Acc = 0.6120

LSTM Evaluation with BERT:
Accuracy: 0.608
Confusion Matrix:
 [[62 22  6  4  6]
 [ 8 77  4  2  9]
 [ 7 15 62 10  6]
 [ 7  7 22 57  7]
 [15 22 13  4 46]]
              precision    recall  f1-score   support

     Country       0.63      0.62      0.62       100
        Folk       0.54      0.77      0.63       100
         Pop       0.58      0.62      0.60       100
         R&B       0.74      0.57      0.64       100
        Rock       0.62      0.46      0.53       100

    accuracy                           0.61       500
   macro avg       0.62      0.61      0.61       500
weighted avg       0.62      0.61      0.61       500

