# **Convolutional VAE for Audio Features**
We enhanced the previous Variational Autoencoder (VAE) by using a convolutional architecture specifically designed for 2D audio representations, such as Mel-spectrograms or MFCCs.
Convolutional layers can efficiently capture local time-frequency patterns in audio, leading to a more expressive and compact latent representation suitable for downstream tasks like clustering.

In [1]:
# Import necessary libraries
import numpy as np
import soundfile as sf
import librosa
from pathlib import Path
import pandas as pd
from pathlib import Path
import os
import glob
import soundfile as sf
import librosa
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print("Using device:", device)

Using device: cpu


In [3]:
# Collect every MP3 that sits exactly one sub-directory below the audio root
audio_dir = Path("../data/audio")
audio_files = list(audio_dir.glob("*/*.mp3"))
audio_files.sort()  # deterministic ordering of the file list

print(f"Loaded {len(audio_files)} audio files")

Loaded 3554 audio files


In [4]:
# Load an audio file as a mono signal at the target sample rate
def load_audio(path, target_sr=22050):
    """Read an audio file and return a single-channel signal at ``target_sr``.

    Args:
        path: Location of the audio file; handed directly to ``sf.read``.
        target_sr: Sample rate the returned signal should have.

    Returns:
        The samples read as float32, averaged over axis 1 when the array has
        more than one dimension, and passed through ``librosa.resample`` when
        the file's sample rate differs from ``target_sr``.
    """
    samples, native_sr = sf.read(path, dtype='float32')

    # A multi-dimensional array means several channels: collapse them to mono
    is_multichannel = samples.ndim > 1
    if is_multichannel:
        samples = samples.mean(axis=1)

    # Already at the requested rate: nothing left to do
    if native_sr == target_sr:
        return samples

    return librosa.resample(y=samples, orig_sr=native_sr, target_sr=target_sr)

In [5]:
# Extract Mel-spectrogram features
def extract_mel(path, n_mels=64, fixed_len=1304):
    """Compute a fixed-width, min-max scaled log-Mel spectrogram for one file.

    Args:
        path: Audio file to decode; loaded by ``librosa.load`` at 22050 Hz.
        n_mels: Number of Mel bands requested from librosa.
        fixed_len: Number of columns in the result. Narrower spectrograms are
            zero-padded on the right, wider ones are cut off.

    Returns:
        A ``torch.float32`` tensor built from the 2-D spectrogram, whose
        second dimension equals ``fixed_len``.
    """
    signal, rate = librosa.load(path, sr=22050)
    spec = librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=n_mels, hop_length=512)

    # Log-compress; the small offset keeps log() finite for zero-energy bins
    log_spec = np.log(spec + 1e-9)

    # Rescale to [0, 1]; the epsilon avoids dividing by zero on a flat input
    lo = log_spec.min()
    hi = log_spec.max()
    scaled = (log_spec - lo) / (hi - lo + 1e-9)

    # Force the column count to exactly fixed_len
    n_frames = scaled.shape[1]
    if n_frames >= fixed_len:
        scaled = scaled[:, :fixed_len]
    else:
        missing = fixed_len - n_frames
        scaled = np.pad(scaled, ((0, 0), (0, missing)), mode='constant')

    return torch.tensor(scaled, dtype=torch.float32)

In [6]:
# Smoke-test extract_mel on the first file and show the resulting shape
first_clip = audio_files[0]
x = extract_mel(first_clip)
print(x.shape)

torch.Size([64, 1304])


In [7]:
# Define a custom Dataset for audio files
class AudioDataset(Dataset):
    """Dataset that turns each audio file into a normalised log-Mel tensor.

    Every item is log-compressed, min-max scaled to [0, 1] and padded or
    truncated to ``fixed_len`` columns. The scaling matters because the
    ConvVAE decoder in this notebook ends in a sigmoid, so it can only
    reproduce targets that lie in [0, 1].

    Args:
        audio_files: Sequence of audio file paths; indexed by ``__getitem__``.
        n_mels: Number of Mel bands requested from librosa.
        fixed_len: Number of spectrogram columns in every returned item.
    """

    def __init__(self, audio_files, n_mels=64, fixed_len=1300):
        self.audio_files = audio_files
        self.n_mels = n_mels
        self.fixed_len = fixed_len

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.audio_files)

    @staticmethod
    def _prepare_mel(mel, fixed_len):
        """Log-compress, scale to [0, 1] and pad/truncate a 2-D Mel array.

        Normalisation happens before padding so that the zero padding equals
        the minimum of the scaled spectrogram instead of an arbitrary value
        in the log domain.
        """
        mel = np.log(mel + 1e-9)
        # Epsilon in the denominator guards against a constant spectrogram
        mel = (mel - mel.min()) / (mel.max() - mel.min() + 1e-9)

        n_frames = mel.shape[1]
        if n_frames < fixed_len:
            mel = np.pad(mel, ((0, 0), (0, fixed_len - n_frames)), mode='constant')
        else:
            mel = mel[:, :fixed_len]
        return mel

    def __getitem__(self, idx):
        """Load file ``idx`` and return its spectrogram with a leading channel axis."""
        y, sr = librosa.load(self.audio_files[idx], sr=22050)
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=self.n_mels, hop_length=512)
        mel = self._prepare_mel(mel, self.fixed_len)
        # unsqueeze(0) adds the single input channel expected by Conv2d
        return torch.tensor(mel, dtype=torch.float32).unsqueeze(0)

In [8]:
# Define the ConvVAE model
class ConvVAE(nn.Module):
    """Convolutional variational autoencoder for single-channel 2-D inputs.

    The encoder is three stride-2 convolutions followed by two linear heads
    (mean and log-variance). The decoder mirrors it with three stride-2
    transposed convolutions and a final sigmoid, then resizes the result to
    ``(n_mels, fixed_len)``.

    Args:
        n_mels: Height of the input (number of Mel bands).
        fixed_len: Width of the input (number of frames).
        latent_dim: Size of the latent vector.
    """

    def __init__(self, n_mels=64, fixed_len=1300, latent_dim=32):
        super().__init__()
        self.n_mels = n_mels
        self.fixed_len = fixed_len
        self.latent_dim = latent_dim

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.BatchNorm2d(16), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.BatchNorm2d(32), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.BatchNorm2d(64), nn.ReLU()
        )

        # Probe the encoder output shape with a dummy input. The probe runs in
        # eval mode and without autograd so that it neither updates the
        # BatchNorm running statistics nor builds a throw-away graph.
        self.encoder.eval()
        with torch.no_grad():
            h = self.encoder(torch.zeros(1, 1, n_mels, fixed_len))
        self.encoder.train()  # restore the default training mode
        self.enc_shape = h.shape[1:]
        self.flattened_size = h.numel() // h.shape[0]

        # Latent space
        self.fc_mu = nn.Linear(self.flattened_size, latent_dim)
        self.fc_logvar = nn.Linear(self.flattened_size, latent_dim)

        # Decoder
        self.fc_decode = nn.Linear(latent_dim, self.flattened_size)
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, 3, stride=2, padding=1, output_padding=1), nn.BatchNorm2d(32), nn.ReLU(),
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1), nn.BatchNorm2d(16), nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1), nn.Sigmoid()
        )

    def encode(self, x):
        """Return ``(mu, logvar)`` for a batch ``x`` of encoder inputs."""
        h = self.encoder(x)
        h = h.view(h.size(0), -1)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps`` standard normal and ``std = exp(0.5 * logvar)``."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors ``z`` back to ``(n_mels, fixed_len)`` outputs."""
        h = self.fc_decode(z)
        h = h.view(-1, *self.enc_shape)
        recon = self.decoder(h)
        # The transposed convolutions need not land exactly on the input size,
        # so resize to the configured height and width.
        recon = F.interpolate(recon, size=(self.n_mels, self.fixed_len), mode='bilinear', align_corners=False)
        return recon

    def forward(self, x):
        """Encode, sample and decode; return ``(recon, mu, logvar)``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        recon = self.decode(z)
        return recon, mu, logvar

In [9]:
# VAE loss function
def vae_loss(recon_x, x, mu, logvar):
    """Return the summed VAE objective: reconstruction error plus KL term.

    Args:
        recon_x: Reconstruction produced by the model.
        x: Target the reconstruction is compared against.
        mu: Latent means.
        logvar: Latent log-variances.

    Returns:
        A scalar tensor: the squared error summed over all elements, plus
        ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``.
    """
    reconstruction = F.mse_loss(recon_x, x, reduction='sum')
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    divergence = -0.5 * torch.sum(kl_terms)
    return reconstruction + divergence

In [10]:
# Training loop for ConvVAE
def train_vae(audio_files, epochs=10, batch_size=8, latent_dim=32, lr=1e-3, n_mels=64, fixed_len=1300):
    """Train a fresh ConvVAE on ``audio_files`` and return it.

    Args:
        audio_files: Audio paths wrapped in an ``AudioDataset``.
        epochs: Number of passes over the data.
        batch_size: Items per batch.
        latent_dim: Size of the model's latent vector.
        lr: Learning rate for the Adam optimizer.
        n_mels: Spectrogram height, shared by dataset and model.
        fixed_len: Spectrogram width, shared by dataset and model.

    Returns:
        The trained ``ConvVAE``, left on ``device``.
    """
    loader = DataLoader(
        AudioDataset(audio_files, n_mels=n_mels, fixed_len=fixed_len),
        batch_size=batch_size,
        shuffle=True,
    )

    model = ConvVAE(n_mels=n_mels, fixed_len=fixed_len, latent_dim=latent_dim)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for spectrograms in loader:
            spectrograms = spectrograms.to(device)

            optimizer.zero_grad()
            recon, mu, logvar = model(spectrograms)
            batch_loss = vae_loss(recon, spectrograms, mu, logvar)
            batch_loss.backward()
            optimizer.step()

            # The loss is a sum over the batch; accumulate it as a float
            running_loss += batch_loss.item()
        # Reported value is the average of the per-batch sums
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(loader):.6f}")

    return model

In [11]:
# Build a model and count its trainable parameters
model = ConvVAE(n_mels=64, fixed_len=1300, latent_dim=32)
trainable_params = (p for p in model.parameters() if p.requires_grad)
sum(p.numel() for p in trainable_params)

8142145

In [12]:
# Train the ConvVAE and save the learned weights.
# Previously this cell built a fresh model and saved it straight away, so the
# checkpoint (and everything derived from it) held random-initialised weights.
model = train_vae(
    audio_files,
    epochs=10,
    batch_size=8,
    latent_dim=32,
    lr=1e-3,
    n_mels=64,
    fixed_len=1300,
)
# train_vae uses its own optimizer internally; this one is kept only so the
# name `optimizer` stays defined for any later cell that refers to it.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Make sure the output directory exists before writing the checkpoint
model_path = Path("../results/models/conv_audio_vae.pth")
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)

In [13]:
# Extract deterministic latent vectors from the ConvVAE
model.eval()
latent_vectors = []

# Dataset and DataLoader: take the input size from the model itself so the
# spectrograms have exactly the shape the network was built for
dataset = AudioDataset(audio_files, n_mels=model.n_mels, fixed_len=model.fixed_len)
loader = DataLoader(dataset, batch_size=1, shuffle=False)

# Encode every clip. The posterior mean is used as the embedding instead of a
# random sample from reparameterize(), so repeated runs give identical vectors.
with torch.no_grad():
    for batch in loader:
        batch = batch.to(device)
        mu, logvar = model.encode(batch)
        z = mu
        latent_vectors.append(z.cpu().numpy())

# Convert list to numpy array
z_audio = np.vstack(latent_vectors)
print("Audio latent vectors shape:", z_audio.shape)

Audio latent vectors shape: (3554, 32)


In [15]:
# Write the latent matrix to disk as a .npy file
latents_path = "../results/z_audio.npy"
np.save(latents_path, z_audio)
print("z_audio saved successfully!")

z_audio saved successfully!
