In [22]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

import pytorch_lightning as pl
from pytorch_lightning import Trainer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [23]:
# Root folder of the GTZAN dataset: one sub-folder per genre, each holding .au clips.
data_path = os.path.join('..', 'datasets', 'GTZAN')

# os.listdir returns entries in arbitrary (filesystem-dependent) order. Sorting
# keeps the row order of `df` stable across machines, which in turn keeps the
# seeded train/validation split below reproducible.
genres = sorted(os.listdir(data_path))

# Collect one record per audio file; the genre label is the folder name.
file_paths = []
for genre in genres:
    genre_folder = os.path.join(data_path, genre)
    if not os.path.isdir(genre_folder):
        # Skip stray files (e.g. a README) sitting next to the genre folders.
        continue
    for file_name in sorted(os.listdir(genre_folder)):
        if file_name.endswith('.au'):
            file_paths.append({
                'file_path': os.path.join(genre_folder, file_name),
                'genre': genre
            })

# Explicit columns keep 'file_path' and 'genre' present even when no file was
# found, so later cells fail with a clear "no rows" symptom rather than a KeyError.
df = pd.DataFrame(file_paths, columns=['file_path', 'genre'])
print("DataFrame head:")
print(df.head())
print("Total files:", len(df))

DataFrame head:
                                file_path  genre
0  ..\datasets\GTZAN\blues\blues.00000.au  blues
1  ..\datasets\GTZAN\blues\blues.00001.au  blues
2  ..\datasets\GTZAN\blues\blues.00002.au  blues
3  ..\datasets\GTZAN\blues\blues.00003.au  blues
4  ..\datasets\GTZAN\blues\blues.00004.au  blues
Total files: 1000


In [24]:
class GTZANDataset(Dataset):
    """Map-style dataset yielding (log-mel spectrogram, encoded genre) pairs.

    Each item is computed on the fly: the audio file is loaded with librosa,
    converted to a mel spectrogram in decibels, padded or truncated to a fixed
    number of time frames, and returned as a float tensor of shape
    (1, n_mels, fixed_length) together with a long tensor holding the label.
    """

    def __init__(self, df, sr=22050, duration=10, n_mels=128, fixed_length=128, transform=None,
                 label_encoder=None):
        """
        df: DataFrame with columns 'file_path' and 'genre'
        sr: sampling rate
        duration: seconds to load from each audio file
        n_mels: number of Mel bands
        fixed_length: fixed number of time frames for spectrogram (will pad/truncate)
        transform: any additional transforms (if needed)
        label_encoder: optional already-fitted encoder exposing ``transform``
            (e.g. the ``le`` attribute of the training dataset). When given it is
            reused instead of fitting a new one, so several splits are guaranteed
            to share the same genre -> integer mapping. When None (default) a new
            LabelEncoder is fitted on this DataFrame's 'genre' column.
        """
        # reset_index returns a new frame, so adding a column below does not
        # touch the caller's DataFrame and positional lookups match __len__.
        self.df = df.reset_index(drop=True)
        self.sr = sr
        self.duration = duration
        self.n_mels = n_mels
        self.fixed_length = fixed_length
        self.transform = transform

        # Encode labels
        if label_encoder is None:
            self.le = LabelEncoder()
            self.df['genre_encoded'] = self.le.fit_transform(self.df['genre'])
        else:
            self.le = label_encoder
            self.df['genre_encoded'] = self.le.transform(self.df['genre'])

    def __len__(self):
        """Number of audio files in this split."""
        return len(self.df)

    @staticmethod
    def _fit_time_axis(spec, length):
        """Return `spec` (n_mels, frames) with exactly `length` frames along axis 1.

        Longer inputs are truncated. Shorter inputs are right-padded with the
        spectrogram's own minimum value: in a dB spectrogram referenced to its
        maximum, 0 is the loudest value, so zero-padding would look like
        full-scale energy rather than silence.
        """
        n_frames = spec.shape[1]
        if n_frames >= length:
            return spec[:, :length]
        pad_value = spec.min() if spec.size else 0.0
        return np.pad(
            spec,
            ((0, 0), (0, length - n_frames)),
            mode='constant',
            constant_values=pad_value,
        )

    def __getitem__(self, idx):
        """Load item `idx` and return (spectrogram tensor, label tensor)."""
        row = self.df.iloc[idx]
        file_path = row['file_path']
        label = row['genre_encoded']

        # Load audio (duration seconds)
        y, sr = librosa.load(file_path, sr=self.sr, duration=self.duration)

        # Compute mel spectrogram
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=self.n_mels)
        # Convert power spectrogram (amplitude squared) to decibel (log scale);
        # ref=np.max puts the loudest bin of the clip at 0 dB.
        S_db = librosa.power_to_db(S, ref=np.max)

        # Fix time dimension: pad or truncate along axis=1
        S_db = self._fit_time_axis(S_db, self.fixed_length)

        # Add channel dimension (for CNN: (1, n_mels, fixed_length))
        S_db = np.expand_dims(S_db, axis=0)
        # Convert to torch tensor
        S_db = torch.tensor(S_db, dtype=torch.float)
        label = torch.tensor(int(label), dtype=torch.long)

        if self.transform:
            S_db = self.transform(S_db)

        return S_db, label

In [25]:
# Split the DataFrame into train and validation (80/20), stratified by genre so
# both splits keep the same class balance.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42, stratify=df['genre'])

# Create dataset objects
train_dataset = GTZANDataset(df_train, duration=10, fixed_length=128)
val_dataset = GTZANDataset(df_val, duration=10, fixed_length=128)

# Each dataset fits its own label encoder. The integer labels are only
# comparable across splits if both encoders learned the same classes, so fail
# loudly here instead of silently training/evaluating on mismatched labels.
if list(train_dataset.le.classes_) != list(val_dataset.le.classes_):
    raise ValueError(
        "Train and validation label encoders disagree: "
        f"{list(train_dataset.le.classes_)} vs {list(val_dataset.le.classes_)}"
    )

# Create DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

print("Number of training samples:", len(train_dataset))
print("Number of validation samples:", len(val_dataset))


Number of training samples: 800
Number of validation samples: 200


In [26]:
class MusicClassifier(pl.LightningModule):
    """Small 3-block CNN classifying single-channel spectrograms into genres.

    Each block is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2),
    followed by two fully connected layers that output `num_classes` logits.
    """

    def __init__(self, num_classes=10, learning_rate=1e-3, n_mels=128, fixed_length=128):
        """
        num_classes: number of output logits
        learning_rate: learning rate passed to Adam in configure_optimizers
        n_mels: height of the input spectrogram (number of Mel bands)
        fixed_length: width of the input spectrogram (number of time frames)

        n_mels and fixed_length only size the first fully connected layer; the
        defaults reproduce the original 128x128 configuration.
        """
        super().__init__()
        if n_mels < 8 or fixed_length < 8:
            # Three 2x poolings shrink each spatial dimension by a factor of 8;
            # anything smaller would leave an empty feature map.
            raise ValueError(
                f"n_mels and fixed_length must both be >= 8, got {n_mels} and {fixed_length}"
            )
        self.learning_rate = learning_rate

        # Define a simple CNN architecture
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)  # input: (1, n_mels, fixed_length)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool1 = nn.MaxPool2d(2)  # halves both dimensions

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(2)

        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.pool3 = nn.MaxPool2d(2)

        # Flattened feature size. The padded 3x3 convolutions keep the spatial
        # size and each pooling floors it to half, so after three poolings each
        # dimension is d // 8. For the default (1, 128, 128) input:
        # after pool1: (16, 64, 64), after pool2: (32, 32, 32), after pool3: (64, 16, 16)
        flat_features = 64 * (n_mels // 8) * (fixed_length // 8)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a batch of shape (N, 1, n_mels, fixed_length) to logits (N, num_classes)."""
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        # Flatten everything except the batch dimension.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log cross-entropy loss and accuracy for one validation batch."""
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y).float().mean()
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        return {"val_loss": loss, "val_acc": acc}

    def configure_optimizers(self):
        """Use Adam over all parameters with the configured learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer


In [27]:
# Seed Python, NumPy and PyTorch before the model is built so that weight
# initialisation and DataLoader shuffling are repeatable from run to run.
pl.seed_everything(42, workers=True)

# Create an instance of the model
num_classes = 10  # GTZAN has 10 genres
model = MusicClassifier(num_classes=num_classes, learning_rate=1e-3)

# Create a PyTorch Lightning Trainer (use GPU if available)
trainer = Trainer(max_epochs=10, accelerator="auto", devices="auto")
trainer.fit(model, train_loader, val_loader)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

   | Name  | Type        | Params | Mode 
-----------------------------------------------
0  | conv1 | Conv2d      | 160    | train
1  | bn1   | BatchNorm2d | 32     | train
2  | pool1 | MaxPool2d   | 0      | train
3  | conv2 | Conv2d      | 4.6 K  | train
4  | bn2   | BatchNorm2d | 64     | train
5  | pool2 | MaxPool2d   | 0      | train
6  | conv3 | Conv2d      | 18.5 K | train
7  | bn3   | BatchNorm2d | 128    | train
8  | pool3 | MaxPool2d   | 0      | train
9  | fc1   | Linear      | 2.1 M  | train
10 | fc2   | Linear      | 1.3 K  | train
-----------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.488     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\sachi\OneDrive\Documents\music-genre-classification-and-recommendation\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\sachi\OneDrive\Documents\music-genre-classification-and-recommendation\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [28]:
# Evaluate the trained model on the validation loader. Lightning prints a
# metric table itself; the returned list of metric dicts is printed as well.
results = trainer.validate(model=model, dataloaders=val_loader)
print(results)

Validation: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val_acc            0.6100000143051147
        val_loss            1.5067682266235352
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
[{'val_loss': 1.5067682266235352, 'val_acc': 0.6100000143051147}]
