In [1]:
from pathlib import Path
import scipy.io.wavfile as wav
import numpy as np

genres = {'reggae': 0, 'metal': 1, 'blues': 2, 'country': 3,
         'classical': 4, 'pop': 5, 'jazz': 6, 'rock': 7,
         'hiphop': 8, 'disco': 9}
#Genres
# 0: reggae
# 1: metal
# 2: blues
# 3: country
# 4: classical
# 5: pop
# 6: jazz
# 7: rock
# 8: hiphop
# 9: disco

rate = 22050
length = 28 #seconds
sig_length = rate*length

song_sigs = []
song_genres = []
pathlist = Path('../genres').glob('**/*.wav')

i = 0
for path in pathlist:
    p = str(path)
    genre = p.split('/')[2]
#     print(genre)
    (rate, sig) = wav.read(p)
    sig = sig[:sig_length]
    song_sigs.append(sig)
    song_genres.append(genres[genre])
    
print(len(song_sigs))
song_sigs = np.array(song_sigs)
song_genres = np.array(song_genres)
print(song_sigs.shape)
print(song_genres.shape)

# data = {'signals': song_sigs, 'genres': song_genres}
# np.savez('raw_signal', data)

1000
(1000, 617400)
(1000,)


In [22]:
from python_speech_features import mfcc
from python_speech_features import delta

frame_list = []
for song in song_sigs:
    mfcc_feat = mfcc(song, rate, nfft=551)
    d_mfcc_feat = delta(mfcc_feat, 2)
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    ret = np.hstack((mfcc_feat, d_mfcc_feat))
    ret = np.hstack((ret, dd_mfcc_feat))
    
    frame_list.append(ret.T)
print(frame_list[0].shape)

(39, 2793)


In [23]:
frame_list = np.concatenate(frame_list, axis=1).T
print(frame_list.shape)

(2793000, 39)


In [16]:
from sklearn.ensemble import RandomForestClassifier

perm = np.random.permutation(np.arange(frame_list.shape[0]))
train = perm[0:100000]
test = perm[500000:550000]
frame_genre = np.repeat(np.arange(10), frame_list.shape[0]/10)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features='sqrt', oob_score=True)
rf.fit(frame_list[train], frame_genre[train])

print("RF Out-of-bag error rate: ", 1-rf.oob_score_)
rf.score(frame_list[test], frame_genre[test])

RF Out-of-bag error rate:  0.51718


0.51168

In [19]:
(rate, sig) = wav.read('../test/HipHopTest/2.wav')
mfcc_feat = mfcc(sig, rate, nfft=551)
d_mfcc_feat = delta(mfcc_feat, 2)
dd_mfcc_feat = delta(d_mfcc_feat, 2)
tmp = np.hstack((mfcc_feat, d_mfcc_feat))
tmp = np.hstack((tmp, dd_mfcc_feat))
print(mfcc_feat.shape)

s = rf.predict(tmp)

print(np.argsort(np.bincount(s))[::-1])
print(np.bincount(s))

(28221, 13)
[5 0 3 8 6 4 9 7 2 1]
[7380   99  371 2734 1833 8636 2352  496 2648 1672]


In [None]:
s = rf.predict(mfcc_feat)


In [3]:
import matplotlib.pyplot as plt
from python_speech_features import mfcc
from python_speech_features import delta
from sklearn.preprocessing import StandardScaler

mfcc_list = []
scaler = StandardScaler()
# Convert the song clip into a stacked vector
for song in song_sigs:
    mfcc_feat = mfcc(song, rate, nfft=551)
    scaled_mfcc_feat = scaler.fit_transform(mfcc_feat.T).T
    
    d_mfcc_feat = delta(mfcc_feat, 2)
    scaled_d_mfcc_feat = scaler.fit_transform(d_mfcc_feat.T).T
    
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    scaled_dd_mfcc_feat = scaler.fit_transform(dd_mfcc_feat.T).T
    
    ret = np.vstack((scaled_mfcc_feat, scaled_d_mfcc_feat))
    ret = np.vstack((ret, scaled_dd_mfcc_feat))
    
    mfcc_list.append(ret)
    
#     break
mfcc_list = np.array(mfcc_list)


KeyboardInterrupt: 

In [None]:
flat_list = []
for m in mfcc_list:
    flat_list.append(m.flatten())
flat_list = np.array(flat_list)

In [None]:
print(np.max(flat_list), np.min(flat_list))

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_data = pca.fit_transform(flat_list)

In [None]:
y_song=np.repeat(np.arange(10), 100)

plt.scatter(pca_data[:,0], pca_data[:,1],
           c=y_song, cmap=plt.cm.get_cmap('Paired', 10))
plt.title('PCA')
plt.colorbar(ticks=range(10))
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, n_estimators=500, max_features='sqrt', oob_score=True)
rf.fit(flat_list[train], y_song[train])

print("RF Out-of-bag error rate: ", 1-rf.oob_score_)
rf.score(flat_list[test], y_song[test])

In [None]:
from sklearn import svm

perm = np.random.permutation(np.arange(1000))
train = perm[0:700]
test = perm[700:1000]

clf = svm.SVC()
clf.fit(flat_list[train], y_song[train])
clf.score(flat_list[test], y_song[test])