In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


try:
    df = pd.read_csv("kaggle_processed.csv", on_bad_lines='skip', engine='python')
except FileNotFoundError:
    print("Error: 'kaggle_processed.csv' not found.")
    exit(1)


df.columns = df.columns.str.strip()
df = df.dropna(subset=['Lyrics'])

top_genres = df['Genre'].value_counts().nlargest(5).index
df = df[df['Genre'].isin(top_genres)]
print(f"Top 5 genres: {top_genres.tolist()}")


le = LabelEncoder()
df['Genre_encoded'] = le.fit_transform(df['Genre'])
print("Encoded genres:", le.classes_)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
remove_punct = str.maketrans('', '', punctuation)

def clean_text(text):
    text = str(text).lower()
    text = text.translate(remove_punct)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)


df['Clean_Lyrics'] = df['Lyrics'].apply(clean_text)


train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Genre_encoded'], random_state=42)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")


model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

max_len = 256

def get_token_level_embeddings(texts):
    embeddings = []
    for text in tqdm(texts, desc="Embedding"):
        encoded = tokenizer(text, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            output = bert_model(**encoded)
        emb = output.last_hidden_state.squeeze(0)
        embeddings.append(emb.cpu())
    return torch.stack(embeddings)

X_train = get_token_level_embeddings(train_df['Clean_Lyrics'])
y_train = torch.tensor(train_df['Genre_encoded'].values, dtype=torch.long)

X_test = get_token_level_embeddings(test_df['Clean_Lyrics'])
y_test = torch.tensor(test_df['Genre_encoded'].values,  dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=16, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=16)

class TextCNN(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(TextCNN, self).__init__()
        self.conv1 = nn.Conv1d(input_dim, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(input_dim, 128, kernel_size=4, padding=2)
        self.conv3 = nn.Conv1d(input_dim, 128, kernel_size=5, padding=2)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(128 * 3, num_classes)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x1 = self.pool(self.relu(self.conv1(x))).squeeze(-1)
        x2 = self.pool(self.relu(self.conv2(x))).squeeze(-1)
        x3 = self.pool(self.relu(self.conv3(x))).squeeze(-1)
        x = torch.cat((x1, x2, x3), dim=1)
        x = self.dropout(x)
        return self.fc(x)

class TextLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(TextLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True, dropout=0.4)
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        output, _ = self.lstm(x)
        out = output[:, -1, :]
        out = self.dropout(out)
        return self.fc(out)

class TextRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(TextRNN, self).__init__()
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=2, batch_first=True, nonlinearity='relu', dropout=0.4)
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        output, _ = self.rnn(x)
        out = output[:, -1, :]
        out = self.dropout(out)
        return self.fc(out)

def train_model(model, train_loader, val_loader, epochs=25, patience=5):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    best_val_loss = float('inf')
    trigger_times = 0

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()


        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                loss = criterion(pred, yb)
                total_val_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

 
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            trigger_times = 0
            torch.save(model.state_dict(), f"best_{model.__class__.__name__}.pt")
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print("Early stopping triggered!")
                break

    model.load_state_dict(torch.load(f"best_{model.__class__.__name__}.pt"))
    return model


def evaluate(model, loader, y_true):
    model.eval()
    preds = []
    with torch.no_grad():
        for xb, _ in loader:
            xb = xb.to(device)
            pred = model(xb)
            preds.append(torch.argmax(pred, dim=1).cpu())
    y_pred = torch.cat(preds)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=le.classes_))


train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['Genre_encoded'], random_state=42)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

X_val = get_token_level_embeddings(val_df['Clean_Lyrics'])
y_val = torch.tensor(val_df['Genre_encoded'].values, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=16, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=16)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=16)


print("\nTraining CNN...")
cnn = TextCNN(input_dim=X_train.shape[2], num_classes=len(le.classes_))
cnn_model = train_model(cnn, train_loader, val_loader)
print("\nCNN Evaluation:")
evaluate(cnn_model, test_loader, y_test)

print("\nTraining LSTM...")
lstm = TextLSTM(input_dim=X_train.shape[2], hidden_dim=128, num_classes=len(le.classes_))
lstm_model = train_model(lstm, train_loader, val_loader)
print("\nLSTM Evaluation:")
evaluate(lstm_model, test_loader, y_test)

print("\nTraining RNN...")
rnn = TextRNN(input_dim=X_train.shape[2], hidden_dim=128, num_classes=len(le.classes_))
rnn_model = train_model(rnn, train_loader, val_loader)
print("\nRNN Evaluation:")
evaluate(rnn_model, test_loader, y_test)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 5 genres: ['Pop', 'Rock', 'Country', 'R&B', 'Folk']
Encoded genres: ['Country' 'Folk' 'Pop' 'R&B' 'Rock']
Train size: 3925, Test size: 982


Embedding: 100%|██████████| 3925/3925 [11:18<00:00,  5.79it/s]
Embedding: 100%|██████████| 982/982 [01:28<00:00, 11.07it/s]


Train size: 3140, Val size: 785, Test size: 982


Embedding: 100%|██████████| 785/785 [01:10<00:00, 11.14it/s]



Training CNN...
Epoch 1: Train Loss = 1.5216, Val Loss = 1.2647
Epoch 2: Train Loss = 1.2765, Val Loss = 1.0959
Epoch 3: Train Loss = 1.1529, Val Loss = 0.9145
Epoch 4: Train Loss = 1.0117, Val Loss = 0.8095
Epoch 5: Train Loss = 0.8633, Val Loss = 0.6078
Epoch 6: Train Loss = 0.7423, Val Loss = 0.5077
Epoch 7: Train Loss = 0.6419, Val Loss = 0.3805
Epoch 8: Train Loss = 0.5308, Val Loss = 0.2659
Epoch 9: Train Loss = 0.4572, Val Loss = 0.2051
Epoch 10: Train Loss = 0.3911, Val Loss = 0.2256
Epoch 11: Train Loss = 0.3269, Val Loss = 0.1723
Epoch 12: Train Loss = 0.2905, Val Loss = 0.1279
Epoch 13: Train Loss = 0.2757, Val Loss = 0.0871
Epoch 14: Train Loss = 0.2445, Val Loss = 0.0697
Epoch 15: Train Loss = 0.2116, Val Loss = 0.0561
Epoch 16: Train Loss = 0.1785, Val Loss = 0.0598
Epoch 17: Train Loss = 0.1959, Val Loss = 0.0408
Epoch 18: Train Loss = 0.1667, Val Loss = 0.0379
Epoch 19: Train Loss = 0.1683, Val Loss = 0.0414
Epoch 20: Train Loss = 0.1793, Val Loss = 0.0404
Epoch 21: Tr

In [2]:
def evaluate_ensemble(cnn_model, lstm_model, loader, y_true):
    cnn_model.eval()
    lstm_model.eval()
    all_preds = []

    with torch.no_grad():
        for xb, _ in loader:
            xb = xb.to(device)


            prob_cnn = torch.softmax(cnn_model(xb), dim=1)
            prob_lstm = torch.softmax(lstm_model(xb), dim=1)


            ensemble_prob = (prob_cnn + prob_lstm) / 2
            preds = torch.argmax(ensemble_prob, dim=1).cpu()
            all_preds.append(preds)

    y_pred = torch.cat(all_preds)
    print("Ensemble Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=le.classes_))


In [3]:
print("\n Evaluating CNN + LSTM Ensemble:")
evaluate_ensemble(cnn_model, lstm_model, test_loader, y_test)



 Evaluating CNN + LSTM Ensemble:
Ensemble Accuracy: 0.5936863543788188
              precision    recall  f1-score   support

     Country       0.69      0.65      0.67       200
        Folk       0.54      0.64      0.59       182
         Pop       0.57      0.55      0.56       200
         R&B       0.64      0.62      0.63       200
        Rock       0.54      0.52      0.53       200

    accuracy                           0.59       982
   macro avg       0.60      0.59      0.59       982
weighted avg       0.60      0.59      0.59       982



In [4]:
def evaluate_weighted_ensemble(cnn_model, lstm_model, loader, y_true):
    cnn_model.eval()
    lstm_model.eval()

    best_f1 = 0
    best_w = None

    for w in [0.1 * i for i in range(1, 10)]:  
        preds = []
        with torch.no_grad():
            for xb, _ in loader:
                xb = xb.to(device)

                cnn_probs = torch.softmax(cnn_model(xb), dim=1)
                lstm_probs = torch.softmax(lstm_model(xb), dim=1)

                ensemble_prob = w * cnn_probs + (1 - w) * lstm_probs
                pred = torch.argmax(ensemble_prob, dim=1).cpu()
                preds.append(pred)

        y_pred = torch.cat(preds)
        f1 = accuracy_score(y_true, y_pred)
        print(f"w = {w:.1f} → Accuracy = {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            best_w = w

    print(f"\n Best Weight: {best_w:.1f} | Accuracy: {best_f1:.4f}")
    return best_w


In [5]:
best_w = evaluate_weighted_ensemble(cnn_model, lstm_model, test_loader, y_test)


w = 0.1 → Accuracy = 0.5428
w = 0.2 → Accuracy = 0.5438
w = 0.3 → Accuracy = 0.5499
w = 0.4 → Accuracy = 0.5662
w = 0.5 → Accuracy = 0.5937
w = 0.6 → Accuracy = 0.6171
w = 0.7 → Accuracy = 0.6141
w = 0.8 → Accuracy = 0.6141
w = 0.9 → Accuracy = 0.6090

 Best Weight: 0.6 | Accuracy: 0.6171


In [6]:
from torch.utils.data import Dataset

class SimpleTextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        return {k: v.squeeze() for k, v in encoding.items()}


In [7]:
def get_bert_probs(texts, model, tokenizer):
    dataset = SimpleTextDataset(texts, tokenizer)
    loader = DataLoader(dataset, batch_size=32)
    model.eval()

    all_probs = []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            probs = torch.softmax(outputs.logits, dim=1)
            all_probs.append(probs.cpu())

    return torch.cat(all_probs)


In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import torch


print("Generating BERT embeddings...")
bert_embedder = SentenceTransformer("all-MiniLM-L6-v2")

X_train_bert = bert_embedder.encode(train_df['Clean_Lyrics'].tolist(), show_progress_bar=True, batch_size=32)
X_test_bert = bert_embedder.encode(test_df['Clean_Lyrics'].tolist(), show_progress_bar=True, batch_size=32)

y_train_bert = train_df['Genre_encoded'].values
y_test_bert = test_df['Genre_encoded'].values

print("Training BERT + MLP...")
mlp_bert = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)
mlp_bert.fit(X_train_bert, y_train_bert)

y_pred_bert = mlp_bert.predict(X_test_bert)
bert_probs = torch.tensor(mlp_bert.predict_proba(X_test_bert))

print("\n Classification Report (BERT + MLP):")
print(classification_report(y_test_bert, y_pred_bert, target_names=le.classes_))
print("Accuracy:", accuracy_score(y_test_bert, y_pred_bert))


Generating BERT embeddings...


Batches: 100%|██████████| 99/99 [00:42<00:00,  2.33it/s]
Batches: 100%|██████████| 31/31 [00:13<00:00,  2.32it/s]


Training BERT + MLP...

 Classification Report (BERT + MLP):
              precision    recall  f1-score   support

     Country       0.61      0.57      0.59       200
        Folk       0.50      0.47      0.48       182
         Pop       0.52      0.54      0.53       200
         R&B       0.61      0.66      0.63       200
        Rock       0.45      0.46      0.46       200

    accuracy                           0.54       982
   macro avg       0.54      0.54      0.54       982
weighted avg       0.54      0.54      0.54       982

Accuracy: 0.539714867617108


In [9]:
def evaluate_ensemble_three_way(cnn_model, lstm_model, bert_probs, loader, y_true, weights=(0.8, 0.4, 0.8)):
    cnn_model.eval()
    lstm_model.eval()
    w_cnn, w_lstm, w_bert = weights

    preds = []
    i = 0  

    with torch.no_grad():
        for xb, _ in loader:
            xb = xb.to(device)

            cnn_probs = torch.softmax(cnn_model(xb), dim=1)
            lstm_probs = torch.softmax(lstm_model(xb), dim=1)

            batch_size = xb.size(0)
            bert_batch_probs = bert_probs[i:i+batch_size]
            i += batch_size

            final_probs = w_cnn * cnn_probs.cpu() + w_lstm * lstm_probs.cpu() + w_bert * bert_batch_probs
            preds.append(torch.argmax(final_probs, dim=1))

    y_pred = torch.cat(preds)
    print("Ensemble Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=le.classes_))


In [10]:
print("\n Evaluating CNN + LSTM + BERT Ensemble:")
evaluate_ensemble_three_way(cnn_model, lstm_model, bert_probs, test_loader, y_test, weights=(0.8, 0.7, 0.8))



 Evaluating CNN + LSTM + BERT Ensemble:
Ensemble Accuracy: 0.6262729124236253
              precision    recall  f1-score   support

     Country       0.72      0.66      0.68       200
        Folk       0.56      0.62      0.59       182
         Pop       0.62      0.59      0.61       200
         R&B       0.66      0.68      0.67       200
        Rock       0.59      0.58      0.59       200

    accuracy                           0.63       982
   macro avg       0.63      0.63      0.63       982
weighted avg       0.63      0.63      0.63       982

