In [34]:
import librosa
import soundfile as sf
import numpy as np

STRIDE_SIZE = 10.
FRAME_SIZE = 25.
N_MFCC = 16

def get_mfcc_features(song_data, sample_rate, stride_size = STRIDE_SIZE, frame_size = FRAME_SIZE):
    mfccs = librosa.feature.mfcc(song_data, sample_rate, 
                                 n_mfcc=N_MFCC,
                                 hop_length=int(STRIDE_SIZE / 1000. * sample_rate), 
                                 n_fft=int(FRAME_SIZE / 1000. * sample_rate))
    mfcc_delta1 = librosa.feature.delta(mfccs, order=1)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)
    mfccs = np.asarray(mfccs)
    mfcc_delta1 = np.asarray(mfcc_delta1)
    mfcc_delta2 = np.asarray(mfcc_delta2)

    return np.concatenate((mfccs.T, mfcc_delta1.T, mfcc_delta2.T), axis=1)


In [35]:
file = "../wavs/pop/pop.00008.wav"
data, samplerate = sf.read(file)
mfcc_fts = get_mfcc_features(data, samplerate)
file = "../wavs/pop/pop.00007.wav"
data, samplerate = sf.read(file)
mfcc_fts2 = get_mfcc_features(data, samplerate)
mfcc_tot = np.concatenate((mfcc_fts, mfcc_fts2), axis=0)
print(np.shape(mfcc_tot))

(6014, 16)


In [37]:
import os

ROOT_DIRECTORY = "../wavs"
GENRE_COUNT = 5
SONGS_PER_GENRE = 25
SOUND_SIZE = 3
X = np.empty((GENRE_COUNT * SONGS_PER_GENRE * 1000, SOUND_SIZE, N_MFCC))

song_count = 0
for directory in next(os.walk(ROOT_DIRECTORY))[1][:GENRE_COUNT]:
    for filename in os.listdir(ROOT_DIRECTORY + "/" + directory)[:SONGS_PER_GENRE]:
        fp = ROOT_DIRECTORY + "/" + directory + "/" + filename
        data, sr = sf.read(fp)
        mfcc_fts = get_mfcc_features(data, samplerate)[:3000, :]
        mfcc_fts_grouped = [mfcc_fts[i:i+SOUND_SIZE] for i in range(len(mfcc_fts)//SOUND_SIZE)]
        X[song_count * 1000 : (song_count+1) * 1000, :] = mfcc_fts_grouped

In [38]:
#Initialize Sound HMMs
from hmmlearn import hmm
from sklearn.cluster import KMeans

NUM_SOUND_HMMS = 20
X_means = np.mean(X, axis=1)

print("Initial clustering of sounds via KMeans")
kmeans = KMeans(n_clusters=NUM_SOUND_HMMS, n_init=10).fit(X_means)
clusters = [ [] for c in range(NUM_SOUND_HMMS)]
for index, label in enumerate(kmeans.labels_):
    clusters[label].append(X[index])

Initial clustering of sounds via KMeans


In [41]:
print("Clusters:", len(clusters))

sound_HMMs = []

print("Initializing GMMHMM Models")
for i in range(NUM_SOUND_HMMS):
    model = hmm.GMMHMM(n_components=3, n_mix=100, covariance_type="full")
    sound_HMMs.append(model)

print("Fitting GMMHMM Models using initial sound clustering")
for index, cluster in enumerate(clusters):
    print(np.shape(cluster))
    soundHMMs[index] = sound_HMMs[index].fit(cluster)

Clusters: 20
Initializing GMMHMM Models
Fitting GMMHMM Models using initial sound clustering
(124000, 3, 16)


ValueError: Found array with dim 3. Estimator expected <= 2.

In [40]:
new_clusters = [ [] for c in range(NUM_SOUND_HMMS)]
for x in X:
    best_model = None
    best_model_index = -1
    best_score = -1e9
    for index, model in enumerate(sound_HMMs):
        curr_score = model.score(x)
        if curr_score > best_score:
            best_score = curr_score
            best_model = model
            best_model_index = index
    new_labels[best_model_index].append(best_model_index)

for index, cluster in enumerate(new_clusters):
    soundHMMs[index] = sound_HMMs[index].fit(cluster)

NotFittedError: This GMMHMM instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.