In [59]:
import librosa, numpy as np
import numpy as np
from mnist.loader import MNIST
import matplotlib.pyplot as plt
import glob
import os, json, math, hashlib, random, pickle
import h5py
import librosa
import umap
import soundfile as sf
import pandas as pd
from pathlib2 import Path

#import torch.nn.functional as F
#from torch import nn
#import torch

import math
#from torch.utils.data import Dataset, DataLoader

# Single seed used for reproducibility across the notebook; split() also reads
# it to seed its own private RandomState.
SEED=88
# Seed the global RNGs of the stdlib `random` module and of NumPy.
random.seed(SEED)
np.random.seed(SEED)
#torch.manual_seed(88)


#### Feature functions (computed per frame, then aggregated into beat tokens)
- chroma
- tonnetz
- MFCC data
- mel data
- some representation of the tempogram (planned: 1D conv, perhaps with kernel 7, size 16; for now the beat-averaged tempogram is used directly)
- rolloff: brightness
- flatness: noisiness

(Note: keep the feature dimensions small.)

In [45]:
STEP = 512   # hop length (samples) shared by every frame-level feature below
SR = 44100   # sample rate every file is resampled to when loaded


def tokens_from_mp3(path):
    """Convert one audio file into a matrix of beat-synchronous feature tokens.

    The signal is loaded as mono at ``SR``, peak-normalised and split into
    harmonic / percussive parts. Beats are tracked on the percussive onset
    envelope; every frame-level feature is then averaged between consecutive
    beat frames with ``librosa.util.sync``.

    Parameters
    ----------
    path : str
        Path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    numpy.ndarray or None
        ``float32`` array of shape ``(n_tokens, dim)`` -- one row per
        beat-delimited segment, features along the columns, concatenated in
        this order: chroma (12), tonnetz (6), mel dB (96), MFCC (20),
        MFCC delta (20), tempogram, RMS (1), spectral centroid (1),
        roll-off (1), flatness (1).
        ``None`` when fewer than two beats are detected.

    Notes
    -----
    The manifest recorded in this notebook reports ``dim == 542``, i.e. the
    tempogram contributes 384 rows (not 128).
    """
    # The sample rate returned by load() equals SR because sr=SR is requested.
    y, _ = librosa.load(path, sr=SR, mono=True)
    y = librosa.util.normalize(y)
    y_h, y_p = librosa.effects.hpss(y)  # harmonic / percussive separation

    # Beat markers from the percussive onset envelope. hop_length is passed
    # explicitly so the beat frame indices stay aligned with the features
    # even if STEP is ever changed away from librosa's default.
    onsets = librosa.onset.onset_strength(y=y_p, sr=SR, hop_length=STEP)
    _, beat_frames = librosa.beat.beat_track(
        onset_envelope=onsets, sr=SR, hop_length=STEP
    )
    if len(beat_frames) < 2:
        return None

    # Harmonic content.
    chroma = librosa.feature.chroma_cqt(y=y_h, sr=SR, hop_length=STEP, bins_per_octave=36)
    tonnetz = librosa.feature.tonnetz(chroma=chroma)

    # Timbre.
    mel = librosa.feature.melspectrogram(y=y, sr=SR, n_mels=96, hop_length=STEP)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)
    mfcc_d1 = librosa.feature.delta(mfcc, order=1)

    # Rhythm, loudness and spectral shape.
    tempogram = librosa.feature.tempogram(onset_envelope=onsets, sr=SR, hop_length=STEP)
    rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=STEP)
    centroid = librosa.feature.spectral_centroid(y=y, sr=SR, hop_length=STEP)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=SR, hop_length=STEP)
    flatness = librosa.feature.spectral_flatness(y=y, hop_length=STEP)

    def sync(feat, agg=np.mean):
        # Aggregate frame-level columns between consecutive beat frames.
        return librosa.util.sync(feat, beat_frames, aggregate=agg)

    # Not part of the token (yet): per-beat max mel, tempogram peak lag and a
    # beat position index. They were computed but never used, so they are no
    # longer calculated here.
    feat_list = [
        sync(chroma),      # (12, n_tokens)
        sync(tonnetz),     # (6, n_tokens)
        sync(mel_db),      # (96, n_tokens)
        sync(mfcc),        # (20, n_tokens)
        sync(mfcc_d1),     # (20, n_tokens)
        sync(tempogram),   # (T, n_tokens); candidate for a later conv1d reduction
        sync(rms),         # (1, n_tokens)
        sync(centroid),    # (1, n_tokens)
        sync(rolloff),     # (1, n_tokens)
        sync(flatness),    # (1, n_tokens)
    ]
    # Stack along the feature axis, then transpose so that rows are tokens.
    X = np.concatenate(feat_list, axis=0).T.astype(np.float32)
    return X  # (n_tokens, 12+6+96+20+20+T+1+1+1+1)



Load audio paths and cache the beat tokens of each file as `.npz`

In [46]:
from pathlib import Path
# Locations used by the rest of the notebook, relative to the working directory.
ROOT = Path("./fma_small")          # directory scanned for audio files
CSV = Path("./cache/fma_small.csv")  # where the manifest is written
CACHE = Path("./cache/beat_tokens")  # root of the per-track .npz token cache


def paths(root_path):
    """Return all ``*.mp3`` files found recursively under ``root_path``, sorted."""
    mp3_files = list(root_path.rglob("*.mp3"))
    mp3_files.sort()
    return mp3_files


all_paths = paths(ROOT)

def relative_cache_path(audio_path, audio_dir, cache_dir):
    """Mirror an audio file's location under ``cache_dir`` with an ``.npz`` name.

    The sub-directory of ``audio_path`` relative to ``audio_dir`` is kept, and
    the file name becomes ``<stem>.npz``.
    """
    rel = audio_path.relative_to(audio_dir)
    npz_name = f"{rel.stem}.npz"
    return cache_dir.joinpath(rel.parent, npz_name)

def process_one_mp3(path, audio_dir, cache_dir):
    """Tokenise one audio file, cache the tokens as ``.npz`` and describe them.

    Parameters
    ----------
    path : pathlib.Path
        Audio file to process; must be located under ``audio_dir``.
    audio_dir : pathlib.Path
        Root of the audio tree, used to derive the mirrored cache location.
    cache_dir : pathlib.Path
        Root of the cache tree; missing sub-directories are created.

    Returns
    -------
    dict or None
        Manifest row with keys ``audio``, ``npz_path``, ``npz``, ``n_beats``
        and ``dim``. ``None`` when the track yields fewer than two tokens or
        when any exception occurs; this is best-effort, so the failure is
        printed and one bad file does not abort a long batch run.
    """
    try:
        tokens = tokens_from_mp3(str(path))
        # len(tokens) and tokens.shape[0] are the same number, so one check suffices.
        if tokens is None or tokens.shape[0] < 2:
            return None

        npz_path = relative_cache_path(path, audio_dir, cache_dir)
        # Create the mirrored sub-directory first, otherwise the save fails.
        npz_path.parent.mkdir(parents=True, exist_ok=True)

        np.savez_compressed(npz_path, tokens=tokens.astype(np.float32))

        return dict(
            audio=str(path),
            npz_path=str(npz_path),
            npz=str(npz_path.name),
            n_beats=int(tokens.shape[0]),
            dim=int(tokens.shape[1]),
        )
    except Exception as e:
        # Include the exception type: some decoder errors carry an empty
        # message, which previously produced a log line with no information.
        print(f"failed to process mp3 at {path}: {type(e).__name__}: {e}")
        return None



In [47]:
import tqdm

def create_manifest(paths, root_path, cache_dir, batch_size=None):
    """Process audio files one by one and collect the results in a manifest.

    Parameters
    ----------
    paths : sequence of pathlib.Path
        Audio files to process.
    root_path : pathlib.Path
        Root of the audio tree (forwarded to ``process_one_mp3``).
    cache_dir : pathlib.Path
        Root of the token cache (forwarded to ``process_one_mp3``).
    batch_size : int, optional
        When given, only the first ``batch_size`` entries of ``paths`` are
        processed (a limit, not a chunk size). ``None`` processes everything.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed file, de-duplicated on ``audio``
        with a fresh 0..n-1 index. If nothing could be processed, an empty
        frame with the manifest columns is returned.
    """
    will_iterate = paths if batch_size is None else paths[:batch_size]

    rows = []
    for path in tqdm.tqdm(will_iterate):
        row = process_one_mp3(path, root_path, cache_dir)
        if row is not None:
            rows.append(row)

    if not rows:
        # pd.DataFrame([]) has no "audio" column, so drop_duplicates(subset=...)
        # would raise KeyError; return an empty manifest with the usual schema.
        return pd.DataFrame(columns=["audio", "npz_path", "npz", "n_beats", "dim"])

    df = pd.DataFrame(rows).drop_duplicates(subset=["audio"]).reset_index(drop=True)
    return df

# Build the manifest for the full dataset (no batch_size limit).
# NOTE: the recorded run below needed roughly 9.5 hours for 7999 files, and
# this cell recomputes every file each time it is executed.
manifest_df = create_manifest(all_paths, ROOT, CACHE)
print(f"Processed {len(manifest_df)}")
manifest_df.head(3)

  6%|▌         | 490/7999 [22:56<5:42:39,  2.74s/it][src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!
 11%|█▏        | 901/7999 [41:52<5:16:32,  2.68s/it][src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
 15%|█▍        | 1181/7999 [54:25<5:01:44,  2.66s/it][src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
 28%|██▊       | 2264/7999 [2:34:06<4:36:20,  2.89s/it]   [src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3360) too large for available bit count (3240)
 28%|██▊       | 2266/7999 [2:34:12<4:33:33,  2.86s/it][src/libmpg123/layer3.c:INT123_do_layer3():1776] error: part2_3_length (3328) too large for available bit count (3240)
 55%|█████▌    | 4422/7999 [6:09:13<3:38:27,  3.66s/it]   Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
  y, sr = librosa.load(path, sr=SR, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in

failed to process mp3 at fma_small/099/099134.mp3/




failed to process mp3 at fma_small/108/108925.mp3/




failed to process mp3 at fma_small/133/133297.mp3/


100%|██████████| 7999/7999 [9:34:16<00:00,  4.31s/it]


Processed 7995


Unnamed: 0,audio,npz_path,npz,n_beats,dim
0,fma_small/000/000002.mp3,cache/beat_tokens/000/000002.npz,000002.npz,82,542
1,fma_small/000/000005.mp3,cache/beat_tokens/000/000005.npz,000005.npz,51,542
2,fma_small/000/000010.mp3,cache/beat_tokens/000/000010.npz,000010.npz,56,542


In [None]:
# Persist the manifest and print a short summary of it.
# The target directory is created first so the save also works when no track
# was cached (in which case ./cache may not exist yet).
CSV.parent.mkdir(parents=True, exist_ok=True)
manifest_df.to_csv(CSV)
print("location:", CSV.resolve())

if len(manifest_df):
    # Single quotes inside the f-strings: reusing the outer quote character is
    # only accepted from Python 3.12 on.
    print(f"tokens w size size {manifest_df.iloc[0]['dim']}")
    print(f"median file: {manifest_df['n_beats'].median()}")

location: /Users/user/Desktop/coding comps/transition/cache/fma_small.csv
tokens w size size 542
median file: 61.0


Then split the manifest into train/val/test sets (80/10/10 by default).

In [None]:
def split(tokens_df, train=0.8, val=0.1, seed=None):
    """Assign every manifest row to the "train", "val" or "test" split.

    Rows are first sorted by ``npz_path`` so the assignment does not depend on
    the incoming row order, then a seeded permutation decides the split.

    Parameters
    ----------
    tokens_df : pandas.DataFrame
        Manifest with an ``npz_path`` column. It is not modified.
    train : float
        Fraction of rows labelled "train".
    val : float
        Fraction of rows labelled "val"; the remainder is labelled "test".
    seed : int, optional
        Seed of the private ``RandomState``. Defaults to the notebook-wide
        ``SEED`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``tokens_df`` with a fresh index and an added ``split``
        column. An empty input returns an empty copy that still carries the
        ``split`` column.
    """
    if seed is None:
        seed = SEED

    if len(tokens_df) == 0:
        # Keep the output schema consistent even when there is nothing to split.
        empty = tokens_df.copy()
        empty["split"] = np.array([], dtype=object)
        return empty

    ordered = tokens_df.sort_values("npz_path").reset_index(drop=True)
    n_rows = len(ordered)

    idx = np.arange(n_rows)
    np.random.RandomState(seed).shuffle(idx)

    train_end = int(train * n_rows)
    val_end = int((train + val) * n_rows)

    # Everything is "test" unless the permutation puts it into train or val.
    labels = np.array(["test"] * n_rows, dtype=object)
    labels[idx[:train_end]] = "train"
    labels[idx[train_end:val_end]] = "val"

    ordered["split"] = labels
    return ordered

    
    
# Attach the split labels, then overwrite the manifest CSV so the saved file
# carries the "split" column as well.
manifest_df = split(manifest_df)
manifest_df.to_csv(CSV)

manifest_df.head(3)

Unnamed: 0,audio,npz_path,npz,n_beats,dim,split
0,fma_small/000/000002.mp3,cache/beat_tokens/000/000002.npz,000002.npz,82,542,train
1,fma_small/000/000005.mp3,cache/beat_tokens/000/000005.npz,000005.npz,51,542,val
2,fma_small/000/000010.mp3,cache/beat_tokens/000/000010.npz,000010.npz,56,542,train
