# **Audio Latent Space Clustering and Visualization**

This notebook loads a trained **Variational Autoencoder (VAE)**, encodes each audio clip's log-mel spectrogram into its latent mean vector, groups those vectors with **K-Means clustering**, and visualizes the resulting clusters in 2D using **t-SNE**.


In [2]:
# Library Imports
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
from pathlib import Path
import librosa
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import os

In [3]:
# Mel-spectrogram extraction
def extract_mel(path, n_mels=64, fixed_len=1304, sr=22050, hop_length=512):
    """Load an audio file and return a normalized log-mel spectrogram of fixed width.

    Args:
        path: Audio file location, passed straight to ``librosa.load``.
        n_mels: Number of mel bands requested from librosa.
        fixed_len: Number of time frames in the result. Shorter spectrograms
            are zero-padded on the right; longer ones are cropped.
        sr: Sample rate the audio is resampled to when loaded.
        hop_length: Hop size, in samples, between spectrogram frames.

    Returns:
        A 2-D ``torch.float32`` tensor whose second dimension is ``fixed_len``
        and whose values lie in ``[0, 1]``.
    """
    y, sr = librosa.load(path, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, hop_length=hop_length)

    # Log-compress; the epsilon keeps log() finite on all-zero bins.
    mel = np.log(mel + 1e-9)

    # Min-max scale using the statistics of the whole clip (i.e. before any
    # cropping); the epsilon guards against a constant spectrogram.
    mel = (mel - mel.min()) / (mel.max() - mel.min() + 1e-9)

    n_frames = mel.shape[1]
    if n_frames < fixed_len:
        # Right-pad along time with zeros, which equals the normalized minimum.
        mel = np.pad(mel, ((0, 0), (0, fixed_len - n_frames)), mode='constant')
    else:
        mel = mel[:, :fixed_len]
    return torch.tensor(mel, dtype=torch.float32)

In [4]:
# Dataset
class AudioDataset(Dataset):
    """Map-style dataset that turns a sequence of audio paths into spectrogram tensors."""

    def __init__(self, audio_files, fixed_len=1304):
        """Keep the file sequence and the frame count forwarded to ``extract_mel``."""
        self.fixed_len = fixed_len
        self.audio_files = audio_files

    def __len__(self):
        """Return how many audio files the dataset holds."""
        n_files = len(self.audio_files)
        return n_files

    def __getitem__(self, idx):
        """Return the spectrogram of file ``idx`` with a new leading axis of size 1."""
        audio_path = self.audio_files[idx]
        spectrogram = extract_mel(audio_path, fixed_len=self.fixed_len)
        # The extra leading axis is presumably the single input channel the
        # convolutional encoder expects.
        return spectrogram.unsqueeze(dim=0)

In [5]:
# VAE model (encoder only)
class VAE(torch.nn.Module):
    """Encoder half of a convolutional VAE for single-channel spectrogram input.

    Only the convolutional encoder and the two latent heads (mean and
    log-variance) are defined; there is no decoder or sampling step here.

    Args:
        latent_dim: Size of the latent mean / log-variance vectors.
        fixed_len: Width (last dimension) of the expected input.
        n_mels: Height (second-to-last dimension) of the expected input.
    """

    def __init__(self, latent_dim=32, fixed_len=1304, n_mels=64):
        super().__init__()
        self.n_mels = n_mels
        self.fixed_len = fixed_len

        # Three stride-2 convolutions. The layer order and Sequential indices
        # determine the parameter names (encoder.0.*, encoder.2.*, encoder.4.*),
        # so they must not change if saved checkpoints are to keep loading.
        self.encoder = torch.nn.Sequential(
            torch.nn.Conv2d(1, 16, 3, stride=2, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(16, 32, 3, stride=2, padding=1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 64, 3, stride=2, padding=1),
            torch.nn.ReLU(),
        )

        # Push one zero input through the encoder purely to discover its output
        # shape; no_grad avoids building an autograd graph for this probe.
        with torch.no_grad():
            h = self.encoder(torch.zeros(1, 1, n_mels, fixed_len))
        self.enc_shape = h.shape[1:]
        self.flattened_size = h.numel() // h.shape[0]

        self.fc_mu = torch.nn.Linear(self.flattened_size, latent_dim)
        self.fc_logvar = torch.nn.Linear(self.flattened_size, latent_dim)

    def encode(self, x):
        """Encode a batch and return ``(mu, logvar)``.

        Args:
            x: Tensor of shape ``(batch, 1, n_mels, fixed_len)``.

        Returns:
            Two tensors of shape ``(batch, latent_dim)``: the latent mean and
            the latent log-variance.
        """
        h = self.encoder(x)
        # flatten() copies when the memory layout requires it (e.g.
        # channels_last activations), where view() would raise.
        h = h.flatten(start_dim=1)
        return self.fc_mu(h), self.fc_logvar(h)


In [8]:
# Load trained VAE
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
latent_dim = 32
fixed_len = 1304

# Checkpoint location, relative to the notebook like the other ../data and
# ../results paths, so the notebook is not tied to one machine's drive layout.
model_path = Path("../results/models/audio_vae.pth")

vae_model = VAE(latent_dim=latent_dim, fixed_len=fixed_len).to(device)

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(model_path, map_location=device)

# Keep only the entries this encoder-only model defines; the checkpoint
# presumably also holds decoder weights, which are not needed here.
expected_keys = set(vae_model.state_dict())
filtered_dict = {k: v for k, v in state_dict.items() if k in expected_keys}

# Extra keys are already filtered out, so strict=True can only fail when an
# expected weight is missing -- which would otherwise leave that layer randomly
# initialised without any warning.
vae_model.load_state_dict(filtered_dict, strict=True)
vae_model.eval()

VAE(
  (encoder): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (5): ReLU()
  )
  (fc_mu): Linear(in_features=83456, out_features=32, bias=True)
  (fc_logvar): Linear(in_features=83456, out_features=32, bias=True)
)

In [9]:
# Load audio files
audio_dir = Path("../data/audio")
# Clips are expected one folder level down (../data/audio/<folder>/<clip>.mp3);
# sorting gives a stable order across runs.
audio_files = sorted(audio_dir.glob("*/*.mp3"))
if not audio_files:
    # Fail here rather than later with an obscure error when stacking zero batches.
    raise FileNotFoundError(f"No .mp3 files found under {audio_dir.resolve()} (pattern */*.mp3)")
dataset = AudioDataset(audio_files, fixed_len=fixed_len)
# shuffle=False keeps the batches in the same order as audio_files.
loader = DataLoader(dataset, batch_size=16, shuffle=False)

In [10]:
# Extract latent features
latent_batches = []

with torch.no_grad():
    for batch in loader:
        # Only the latent mean is used as the feature vector; the
        # log-variance head's output is discarded.
        mu, _logvar = vae_model.encode(batch.to(device))
        latent_batches.append(mu.cpu().numpy())

latent_features = np.vstack(latent_batches)

# The loader iterates the dataset in order, so row i of latent_features
# belongs to audio_files[i].
file_paths = list(audio_files)
assert len(file_paths) == latent_features.shape[0], "latent rows and file paths are out of sync"

print("Latent features shape:", latent_features.shape)

Latent features shape: (3554, 32)


In [11]:
# K-Means clustering
num_clusters = 10
# n_init is pinned explicitly because scikit-learn's default differs between
# releases; fixing it keeps the clustering reproducible across environments.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(latent_features)
print("Clusters assigned.")

Clusters assigned.


In [12]:
# Prepare results folder
output_dir = Path("../results/latent_visualization")
# Create the folder and any missing parents; do nothing if it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

In [13]:
# t-SNE Visualization and save
n_samples = latent_features.shape[0]
# t-SNE needs perplexity < n_samples; keep the usual 30 but cap it so the cell
# also works on a small dataset.
tsne = TSNE(n_components=2, perplexity=min(30.0, n_samples - 1), random_state=42)
tsne_2d = tsne.fit_transform(latent_features)

plot_path = output_dir / "tsne_clusters.png"

fig, ax = plt.subplots(figsize=(10, 6))
for c in range(num_clusters):
    # Boolean mask selecting the points assigned to cluster c.
    idxs = cluster_labels == c
    ax.scatter(tsne_2d[idxs, 0], tsne_2d[idxs, 1], label=f'Cluster {c}', alpha=0.7)
ax.set_title("t-SNE of Latent Features")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")
ax.legend()
fig.tight_layout()
fig.savefig(plot_path)
# Close this figure explicitly so it is released and not rendered inline.
plt.close(fig)
print("t-SNE plot saved at:", plot_path)

t-SNE plot saved at: ..\results\latent_visualization\tsne_clusters.png
