In [None]:
import os, numpy as np, librosa, laion_clap, torch, umap
# Directory that is walked recursively for audio files.
root='../webcache'
print("scanning for audio files...")
# Extensions (matched case-insensitively) that count as audio.
audio_exts = (".wav",".flac",".mp3",".aiff",".aif",".ogg")
paths = []
for dirpath, _subdirs, filenames in os.walk(root):
    for filename in filenames:
        if filename.lower().endswith(audio_exts):
            paths.append(os.path.join(dirpath, filename))
print(len(paths),'files')

In [None]:
import tqdm
# Decode every file once (resampled to 48 kHz) and keep only those that yield
# at least 100 samples; shorter ones are printed and dropped from `paths`.
good_paths = []
for path in tqdm.tqdm(paths):
    n_samples = len(librosa.load(path, sr=48000)[0])
    if n_samples >= 100:
        good_paths.append(path)
        continue
    print(path, n_samples)
print("filtered", len(paths)-len(good_paths), "files")
paths=good_paths



In [None]:
# Build the CLAP module (HTSAT-base audio model, fusion disabled) and load
# the checkpoint file into it.
ckpt_path = "../webcache/music_audioset_epoch_15_esc_90.14.pt"  # downloaded from HF
model = laion_clap.CLAP_Module(amodel='HTSAT-base', enable_fusion=False)
model.load_ckpt(ckpt_path)

In [None]:
from tqdm import tqdm
# Embed the files in slices of 100 paths per call, then stack the per-slice
# results vertically into a single matrix.
# NOTE(review): presumably one embedding row per path — confirm against laion_clap.
batches = [
    model.get_audio_embedding_from_filelist(x=paths[start:start + 100], use_tensor=False)
    for start in tqdm(range(0, len(paths), 100))
]
E_clap = np.vstack(batches)
Z = E_clap  # provisional alias; the feature-assembly cell rebuilds Z from E_clap


In [None]:
# Write the raw embedding matrix (before any standardization) to the working directory.
np.save(file="E_clap.npy", arr=E_clap)

In [None]:
def tiny_feats(p):
    """Compute a small hand-crafted descriptor for one audio file.

    The file at ``p`` is loaded as mono at 48 kHz and summarized as the
    concatenation of:

    * the time-averaged CQT chroma (12 values),
    * the mean and standard deviation of the onset-strength envelope (2 values),
    * the mean spectral centroid (1 value).

    Returns a 1-D array of 15 values.
    """
    y, sr = librosa.load(p, sr=48000, mono=True)
    chroma_mean = librosa.feature.chroma_cqt(y=y, sr=sr).mean(1)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    onset_stats = [onset_env.mean(), onset_env.std()]
    centroid_mean = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    return np.r_[chroma_mean, onset_stats, centroid_mean]
# Optional hand-crafted features: one tiny_feats row per path when enabled,
# otherwise F is None and only the CLAP embeddings are used downstream.
use_tiny_feats = False
F = np.vstack([tiny_feats(p) for p in tqdm(paths)]) if use_tiny_feats else None

In [None]:
# Assemble the feature matrix (CLAP embeddings, plus the tiny features when
# they were computed) and z-score every column.
Z = np.hstack([E_clap, F]) if F is not None else E_clap.copy()
Z = Z - Z.mean(0)
col_std = Z.std(0)
# A (numerically) constant column has std 0; dividing by it would fill the
# column with NaN/inf and break the projection and SVD steps that consume Z.
# Such columns are left at their centered value (~0) by using a unit scale.
col_std[col_std <= np.finfo(col_std.dtype).eps] = 1
Z = Z / col_std

In [None]:
# Project the standardized features Z to a 2-D map, with t-SNE or UMAP.
use_tsne = True
# Both projections are stochastic; a fixed seed keeps the layout (and anything
# exported from it) reproducible across kernel restarts.
SEED = 0
if use_tsne:
    from sklearn.manifold import TSNE
    # scikit-learn requires perplexity < n_samples, so cap the preferred value
    # of 100 for small collections instead of letting fit_transform raise.
    perplexity = min(100, max(1, len(Z) - 1))
    Z_map = TSNE(n_components=2, perplexity=perplexity, metric="cosine", init="pca",
                 random_state=SEED).fit_transform(Z)
else:
    # NOTE: a fixed random_state makes UMAP deterministic; UMAP may run
    # single-threaded as a consequence.
    Z_map = umap.UMAP(n_neighbors=250, min_dist=0.1, metric="cosine",
                      random_state=SEED).fit_transform(Z)


In [None]:
import numpy as np, matplotlib.pyplot as plt

def svd_cbcr_colors(Z, Y=0.5, robust=False):
    """Map each row of ``Z`` to an RGB color via its top-2 SVD scores.

    The two leading score columns (``U[:, :2] * S[:2]``) are min-max scaled to
    [0, 1] and used as the Cb and Cr chroma channels of a YCbCr color with a
    constant luma ``Y``; the result is converted to RGB and clipped to [0, 1].

    ``Z`` is decomposed as given (no centering happens here), so center it
    beforehand if PCA-style scores are wanted.

    Parameters
    ----------
    Z : 2-D array, one row per point (needs at least two columns).
    Y : constant luma used for every point.
    robust : if True, clip each score column to its 1st-99th percentile range
        before scaling, so a few outliers do not compress the color range.

    Returns
    -------
    Array of shape (n_rows, 3) holding R, G, B in [0, 1].
    """
    U, S, _ = np.linalg.svd(Z, full_matrices=False)
    X = U[:, :2] * S[:2]                             # scores on the top-2 components
    if robust:
        lo, hi = np.percentile(X, [1, 99], axis=0)
        X = np.clip(X, lo, hi)
    # Per-column range computed as max - min: the ndarray.ptp method was
    # removed in NumPy 2.0, so X.ptp(0) raises AttributeError there.
    x_min = X.min(0)
    span = X.max(0) - x_min
    C = (X - x_min) / (span + 1e-12)                 # scale each column to [0,1]
    Cb, Cr = C[:, 0], C[:, 1]

    # YCbCr (BT.601 full-range-ish) -> RGB
    cb, cr = Cb - 0.5, Cr - 0.5
    R = Y + 1.402 * cr
    G = Y - 0.344136 * cb - 0.714136 * cr
    B = Y + 1.772 * cb
    return np.clip(np.c_[R, G, B], 0, 1)

# Color every point from the top-2 SVD scores of Z.
RGB = svd_cbcr_colors(Z, Y=0.7, robust=True)        # set robust=False if you prefer raw min-max

# Activate the dark style BEFORE creating the figure: a figure takes its face
# color from rcParams at creation time, so styling afterwards leaves the first
# run with a default-colored figure behind dark axes (and only re-runs fully dark).
plt.style.use('dark_background')
plt.figure(figsize=(6,5))

plt.scatter(Z_map[:,0], Z_map[:,1], c=RGB, s=2, linewidths=0)
plt.axis('equal'); plt.xticks([]); plt.yticks([]); plt.tight_layout()


In [None]:
from scipy.spatial.distance import cdist
from scipy.spatial import cKDTree

# Euclidean distance from every map point to its nearest *other* point.
# A KD-tree query replaces the dense n-by-n distance matrix, whose memory
# grows quadratically with the number of files; the full matrix (`dists`)
# is therefore no longer materialized.
# k=2 because each query's closest hit is the point itself at distance 0;
# column 1 is the nearest other point (inf when there is only one point).
nn_dists, _nn_idx = cKDTree(Z_map).query(Z_map, k=2)
mindist = nn_dists[:, 1]
print(mindist[0])


In [None]:
import json

# One entry per file: key is the path with the "<root>/" prefix stripped,
# value is [map x, map y, R, G, B, nearest-neighbor distance].
prefix = root + '/'
j = {}
for i, path in enumerate(paths):
    j[path.replace(prefix, '')] = Z_map[i].tolist() + RGB[i].tolist() + [mindist[i]]
with open('../assets/umap_sounds.json', 'w') as f:
    json.dump(j, f, indent=2)
