<a href="https://colab.research.google.com/github/ronitavalani/467Project/blob/main/genre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load in the data set
# The CSV is fetched over HTTP from the project's GitHub repository, so this cell
# needs network access.
url = 'https://raw.githubusercontent.com/ronitavalani/467Project/main/songs_normalize.csv'
df = pd.read_csv(url)
# Last expression of the cell: display the first rows for a quick look at the columns
df.head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop


In [3]:
# Preprocess the data


# Mixed Genre Problem

# Attempt 1: If the first listed genre is pop and more genres follow, use the second genre instead - indicates that pop is why the model is performing well
# def get_preferred_genre(genre_str):
#     genres = genre_str.split(",")
#     genres = [g.strip() for g in genres if g.strip()]  # clean and remove empty entries
#     if not genres:
#         return "Unknown"
#     if genres[0].lower() == "pop" and len(genres) > 1:
#         return genres[1]
#     return genres[0]

# Attempt 2: Create a separate "mixed" genre for multiple genre songs
# def get_preferred_genre(genre_str):
#     genres = genre_str.split(",")
#     genres = [g.strip() for g in genres if g.strip()]  # clean and remove empty entries
#     if len(genres) > 1:
#         return "Mixed"
#     return genres[0] if genres else "Unknown"

# df['genre'] = df['genre'].astype(str).apply(get_preferred_genre)

# Keep only the first genre when a song is classified with multiple.
# The .str accessor leaves missing genres as NaN (astype(str) would turn them into the
# literal class 'nan'), so the dropna() further down can actually remove them.
df['genre'] = df['genre'].str.split(',').str[0].str.strip()


# Drop qualitative data (song name, artist name): every column that is neither numeric
# nor boolean, except the genre label itself.
non_numeric_cols = [
    col
    for col in df.select_dtypes(exclude=['number', 'bool']).columns
    if col != 'genre'
]
df = df.drop(columns=non_numeric_cols)

# Drop empty values and renumber the rows so positions and index labels agree
df = df.dropna().reset_index(drop=True)

# Create input and output
X = df.drop(columns=['genre'])
y = df['genre']

# Underrepresented Data Problem
# Drop genre classes with only 1 example.
# The mask is built from y itself so it shares X's and y's index; a mask carrying a
# different index cannot be aligned safely once any row has been dropped.
value_counts = y.value_counts()
valid_classes = value_counts[value_counts > 1].index
valid_mask = y.isin(valid_classes)
X = X.loc[valid_mask].reset_index(drop=True)
y = y.loc[valid_mask].reset_index(drop=True)

# Encode the remaining genre names as integer class ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
# Prepare data for training
# Train/test split
# Split the raw features first and fit the scaler on the training rows only, so that
# the mean/variance of the held-out test rows does not leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Normalize features using statistics learned from the training split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Create a PyTorch data set
class SongDataset(Dataset):
    """In-memory dataset that pairs each feature row with its class label.

    Args:
        features: array-like of numeric feature rows; stored as a float32 tensor.
        labels: array-like of integer class ids; stored as a long (int64) tensor.
    """

    def __init__(self, features, labels):
        # Convert once up front so that indexing later is a plain tensor lookup
        feature_tensor = torch.tensor(features, dtype=torch.float32)
        label_tensor = torch.tensor(labels, dtype=torch.long)
        self.X, self.y = feature_tensor, label_tensor

    def __len__(self):
        """Number of samples, taken from the first dimension of the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = (self.X[idx], self.y[idx])
        return sample

train_dataset = SongDataset(X_train, y_train)
test_dataset = SongDataset(X_test, y_test)

# Give the shuffling its own seeded generator so the order in which training batches
# are drawn is the same on every fresh run of the notebook.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(
    train_dataset, batch_size=32, shuffle=True, generator=shuffle_generator
)
# The test set is only scored, so it is iterated in its stored order
test_loader = DataLoader(test_dataset, batch_size=32)

In [5]:
# Define neural network
# Simple model with one hidden layer and a ReLU activation
class GenreNet(nn.Module):
    """Feed-forward classifier: linear -> ReLU -> linear.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the single hidden layer.
        output_dim: number of output scores (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Deeper model with 2 hidden layers and dropout - performs worse
class GenreNet2(nn.Module):
    """Feed-forward classifier with two hidden layers and dropout after each.

    Layer widths are ``input_dim -> hidden_dim -> hidden_dim // 2 -> output_dim``.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the first hidden layer; the second is half as wide.
        output_dim: number of output scores (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # Zero-argument super(): naming GenreNet here made construction fail, because
        # a GenreNet2 instance is not an instance of GenreNet.
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.drop1 = nn.Dropout(p=0.3)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.drop2 = nn.Dropout(p=0.3)
        self.fc3 = nn.Linear(hidden_dim // 2, output_dim)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        out = F.relu(self.fc1(x))
        out = self.drop1(out)  # dropout is only active in train() mode
        out = F.relu(self.fc2(out))
        out = self.drop2(out)
        out = self.fc3(out)
        return out

# Model, loss, optimizer
# Seed PyTorch's global random number generator before the model is built, so the
# initial weights are the same on every fresh run of the notebook.
torch.manual_seed(42)

input_dim = X_train.shape[1]                # one input unit per feature column
hidden_dim = 64
output_dim = len(np.unique(y_encoded))      # one output score per genre class

model = GenreNet(input_dim, hidden_dim, output_dim)
# CrossEntropyLoss takes the model's raw scores together with integer class ids
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [6]:
# Training
# Runs a fixed number of passes over train_loader and prints the mean per-batch loss
# after each pass.
epochs = 16
for epoch in range(epochs):
    # Put the model in training mode
    model.train()
    # Sum of the per-batch losses seen in this epoch
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        # Clear the gradients accumulated by the previous step before the new backward pass
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Average over the number of batches (not the number of samples)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

Epoch 1/16, Loss: 1.9430
Epoch 2/16, Loss: 1.2979
Epoch 3/16, Loss: 1.0883
Epoch 4/16, Loss: 1.0231
Epoch 5/16, Loss: 0.9944
Epoch 6/16, Loss: 0.9778
Epoch 7/16, Loss: 0.9645
Epoch 8/16, Loss: 0.9522
Epoch 9/16, Loss: 0.9425
Epoch 10/16, Loss: 0.9334
Epoch 11/16, Loss: 0.9254
Epoch 12/16, Loss: 0.9176
Epoch 13/16, Loss: 0.9104
Epoch 14/16, Loss: 0.9043
Epoch 15/16, Loss: 0.8968
Epoch 16/16, Loss: 0.8911


In [7]:
# Evaluation
# Consistently reaching around 67% classification accuracy - a solid start, but not good
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients are needed for scoring
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        # The predicted class is the index of the highest score in each row
        predicted = outputs.argmax(dim=1)
        total += len(y_batch)
        correct += int((predicted == y_batch).sum())

print(f"\nTest Accuracy: {100 * correct / total:.2f}%")


Test Accuracy: 67.00%
