In [12]:
import os
from datasets import load_dataset
import torch
import torchaudio
from transformers import AutoProcessor, AutoModel
import numpy as np
from panns_inference import AudioTagging

In [None]:

# Load the dataset
dataset = load_dataset("lewtun/music_genres_small")

# Check a sample

samples = dataset['train'][0]
print(samples)

{'audio': {'path': None, 'array': array([1.00427963e-10, 1.75107051e-09, 7.35211780e-10, ...,
       5.83759369e-03, 8.89943959e-03, 1.40173053e-02]), 'sampling_rate': 44100}, 'song_id': 6824, 'genre_id': 9, 'genre': 'International'}


In [30]:
# Load PANNs model (Cnn14)
model = AudioTagging(checkpoint_path=None, device='cuda')  # Use 'cuda' if you have a GPU

def extract_embeddings(audio_sample):
    """Extract embeddings from a dataset sample."""
    waveform = torch.tensor(audio_sample["audio"]["array"]).float().unsqueeze(0).to("cuda")  # Convert to FloatTensor & move to GPU
    with torch.no_grad():
        embedding, _ = model.inference(waveform)  # ✅ Unpack tuple: (embedding, class_probabilities)
    return embedding  # ✅ Returns the correct feature embeddings

def extract_and_save_embeddings(dataset, save_path="embeddings-for-instruments-task.npy"):
    embeddings = []
    for sample in dataset:
        embeddings.append(extract_embeddings(sample))
    np.save(save_path, np.array(embeddings))
    return save_path

embedding_file = extract_and_save_embeddings(dataset['train'])



Checkpoint path: C:\Users\Prosp/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1


In [35]:
# Load the embeddings
embeddings = np.load("embeddings-for-instruments-task.npy")

# Check the shape of embeddings
print(embeddings.shape)  # (num_samples, embedding_dim)

(1000, 1, 527)


In [33]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Reduce embeddings to 2D
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
plt.title("t-SNE Visualization of Embeddings")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()


ValueError: Found array with dim 3. TSNE expected <= 2.