In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
try:
    df = pd.read_csv("kaggle_processed.csv", on_bad_lines='skip', engine='python')
except FileNotFoundError:
    print("Error: 'kaggle_processed.csv' not found.")
    exit(1)

# Clean column names and drop NA
df.columns = df.columns.str.strip()
df = df.dropna(subset=['Lyrics'])

# Keep top 5 genres
top_genres = df['Genre'].value_counts().nlargest(5).index
df = df[df['Genre'].isin(top_genres)]
print(f"Top 5 genres: {top_genres.tolist()}")

# Encode labels
le = LabelEncoder()
df['Genre_encoded'] = le.fit_transform(df['Genre'])
print("Encoded genres:", le.classes_)

# Preprocessing with lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
remove_punct = str.maketrans('', '', punctuation)

def clean_text(text):
    text = str(text).lower()
    text = text.translate(remove_punct)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)

print("Preprocessing lyrics...")
df['Clean_Lyrics'] = df['Lyrics'].apply(clean_text)

# Split into train and test (80-20)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Genre_encoded'], random_state=42)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

# DistilBERT Embeddings
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

max_len = 256

def get_token_level_embeddings(texts):
    embeddings = []
    for text in tqdm(texts, desc="Embedding"):
        encoded = tokenizer(text, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            output = bert_model(**encoded)
        emb = output.last_hidden_state.squeeze(0)
        embeddings.append(emb.cpu())
    return torch.stack(embeddings)

print("Encoding train set...")
X_train = get_token_level_embeddings(train_df['Clean_Lyrics'])
y_train = torch.tensor(train_df['Genre_encoded'].values, dtype=torch.long)

print("Encoding test set...")
X_test = get_token_level_embeddings(test_df['Clean_Lyrics'])
y_test = torch.tensor(test_df['Genre_encoded'].values,  dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=16, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=16)

# RNN Model
class TextRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(TextRNN, self).__init__()
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers=2, batch_first=True, nonlinearity='relu', dropout=0.4)
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        output, _ = self.rnn(x)
        out = output[:, -1, :]
        out = self.dropout(out)
        return self.fc(out)

# Training function with early stopping
def train_model(model, train_loader, val_loader, epochs=25, patience=5):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    best_val_loss = float('inf')
    trigger_times = 0

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                loss = criterion(pred, yb)
                total_val_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            trigger_times = 0
            torch.save(model.state_dict(), f"best_{model.__class__.__name__}.pt")
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print("Early stopping triggered!")
                break

    # Load best model
    model.load_state_dict(torch.load(f"best_{model.__class__.__name__}.pt"))
    return model

# Evaluation function
def evaluate(model, loader, y_true):
    model.eval()
    preds = []
    with torch.no_grad():
        for xb, _ in loader:
            xb = xb.to(device)
            pred = model(xb)
            preds.append(torch.argmax(pred, dim=1).cpu())
    y_pred = torch.cat(preds)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=le.classes_))

# Create validation set
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['Genre_encoded'], random_state=42)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

print("Encoding validation set...")
X_val = get_token_level_embeddings(val_df['Clean_Lyrics'])
y_val = torch.tensor(val_df['Genre_encoded'].values, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=16, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=16)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=16)

print("\nTraining RNN...")
rnn = TextRNN(input_dim=X_train.shape[2], hidden_dim=128, num_classes=len(le.classes_))
rnn_model = train_model(rnn, train_loader, val_loader)
print("\nRNN Evaluation:")
evaluate(rnn_model, test_loader, y_test)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saksh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 5 genres: ['Pop', 'Rock', 'Country', 'R&B', 'Folk']
Encoded genres: ['Country' 'Folk' 'Pop' 'R&B' 'Rock']
Preprocessing lyrics...
Train size: 3925, Test size: 982
Encoding train set...


Embedding: 100%|██████████| 3925/3925 [11:51<00:00,  5.52it/s]


Encoding test set...


Embedding: 100%|██████████| 982/982 [02:58<00:00,  5.51it/s]


Train size: 3140, Val size: 785, Test size: 982
Encoding validation set...


Embedding: 100%|██████████| 785/785 [02:21<00:00,  5.53it/s]



Training CNN...
Epoch 1: Train Loss = 1.4979, Val Loss = 1.2231
Epoch 2: Train Loss = 1.2715, Val Loss = 1.0739
Epoch 3: Train Loss = 1.1406, Val Loss = 0.9036
Epoch 4: Train Loss = 1.0097, Val Loss = 0.8017
Epoch 5: Train Loss = 0.8818, Val Loss = 0.6456
Epoch 6: Train Loss = 0.7562, Val Loss = 0.5265
Epoch 7: Train Loss = 0.6551, Val Loss = 0.4066
Epoch 8: Train Loss = 0.5451, Val Loss = 0.3391
Epoch 9: Train Loss = 0.4815, Val Loss = 0.2604
Epoch 10: Train Loss = 0.4008, Val Loss = 0.1856
Epoch 11: Train Loss = 0.3601, Val Loss = 0.1382
Epoch 12: Train Loss = 0.3095, Val Loss = 0.1319
Epoch 13: Train Loss = 0.2644, Val Loss = 0.0848
Epoch 14: Train Loss = 0.2429, Val Loss = 0.0743
Epoch 15: Train Loss = 0.2170, Val Loss = 0.1212
Epoch 16: Train Loss = 0.2267, Val Loss = 0.0472
Epoch 17: Train Loss = 0.1909, Val Loss = 0.0348
Epoch 18: Train Loss = 0.1713, Val Loss = 0.0226
Epoch 19: Train Loss = 0.1730, Val Loss = 0.0353
Epoch 20: Train Loss = 0.1699, Val Loss = 0.0580
Epoch 21: Tr