In [69]:
from pathlib import Path
import scipy.io.wavfile as wav
import numpy as np
from python_speech_features import mfcc
from python_speech_features import delta

# Genre name -> integer class label. A genre's label is its position in the
# tuple below:
#   0 reggae, 1 metal, 2 blues, 3 country, 4 classical,
#   5 pop, 6 jazz, 7 rock, 8 hiphop, 9 disco
genres = {
    name: label
    for label, name in enumerate((
        'reggae', 'metal', 'blues', 'country', 'classical',
        'pop', 'jazz', 'rock', 'hiphop', 'disco',
    ))
}

def avgNSamples(a, N=10):
    """Average every N consecutive entries of ``a`` along axis 0.

    The first ``len(a) // N`` complete groups of N rows are each replaced by
    their mean; trailing rows that do not fill a whole group are dropped.

    Args:
        a: array-like with at least one dimension; groups are formed along
            axis 0 and any further axes are kept as they are.
        N: group size, a positive integer.

    Returns:
        Array of shape ``(len(a) // N,) + a.shape[1:]`` holding the group
        means (empty along axis 0 when ``a`` has fewer than N rows).

    Raises:
        ValueError: if N is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer")
    a = np.asarray(a)
    n_groups = a.shape[0] // N
    # Average each group directly. Differencing a running cumulative sum
    # gives the same result on paper, but a small group that follows large
    # values is lost to rounding in the running total.
    grouped = a[:n_groups * N].reshape((n_groups, N) + a.shape[1:])
    return grouped.mean(axis=1)

rate = 22050
length = 27 #seconds
sig_length = rate*length

sample_features = []
sample_genre = []
# NOTE(review): glob yields files in filesystem order, and later cells index
# songs by position (blocks of 100 songs, 9 clips each) -- confirm the order
# is stable on the machine this runs on.
genres_root = Path('../genres')
pathlist = genres_root.glob('**/*.wav')

for path in pathlist:
    # The genre is the first folder below the dataset root. Using path parts
    # instead of splitting the string on '/' keeps this working regardless of
    # the platform's path separator.
    genre = path.relative_to(genres_root).parts[0]
    # `rate` is rebound to the file's own sample rate here.
    (rate, sig) = wav.read(str(path))
    sig = sig[:sig_length]

    # Cut the first `length` seconds into consecutive 3-second clips; every
    # clip becomes one sample carrying the genre label of its song.
    clip_len = 3*rate
    for i in range(sig_length // clip_len):
        sample_genre.append(genres[genre])

        part = sig[i*clip_len:(i+1)*clip_len]
        # MFCCs plus first and second order deltas of the clip.
        mfcc_feat = mfcc(part, rate, nfft=551)
        d_mfcc_feat = delta(mfcc_feat, 2)
        dd_mfcc_feat = delta(d_mfcc_feat, 2)

        #Average every 10 samples
        mfcc_feat = avgNSamples(mfcc_feat)
        d_mfcc_feat = avgNSamples(d_mfcc_feat)
        dd_mfcc_feat = avgNSamples(dd_mfcc_feat)
        # Flatten and join the three blocks into one feature vector per clip
        # (length 1131 in the recorded run of this cell).
        sample = np.hstack((mfcc_feat.flatten(),
                            d_mfcc_feat.flatten(),
                            dd_mfcc_feat.flatten()))
        sample_features.append(sample)

sample_genre = np.array(sample_genre)
sample_features = np.array(sample_features)
print(sample_features.shape)
print(sample_genre.shape)

# Split our data into test and train where we split on samples
# So all the samples of one song are included in either test or train
# for i in range(10):
#     t = np.random.permutation((i+1)*100)
#     print(t)

# traindata = {}
# testdata = {}
# data = {'signals': song_sigs, 'genres': song_genres}
# np.savez('raw_signal', data)

(9000, 1131)
(9000,)


In [70]:
# Song-level split: songs are addressed in blocks of 100 consecutive indices
# (one block per loop pass), each block is shuffled, and its first `trainsize`
# songs go to the training set while the rest go to the test set. All clips of
# a song therefore end up on the same side of the split.
p_song = 9       # clips (rows of sample_features) per song
trainsize = 70   # training songs taken from every block of 100

train_features, train_genre = [], []
test_features, test_genre = [], []

for block in range(10):
    perm = block*100 + np.random.permutation(100)
    train, test = perm[:trainsize], perm[trainsize:]
    destinations = ((train, train_features, train_genre),
                    (test, test_features, test_genre))
    for song_ids, feature_rows, genre_rows in destinations:
        for t in song_ids:
            clip_rows = slice(t*p_song, (t+1)*p_song)
            feature_rows.extend(sample_features[clip_rows])
            genre_rows.extend(sample_genre[clip_rows])

train_features, train_genre = np.array(train_features), np.array(train_genre)
test_features, test_genre = np.array(test_features), np.array(test_genre)

traindata = dict(features=train_features, genre=train_genre)
testdata = dict(features=test_features, genre=test_genre)

In [71]:
from sklearn.ensemble import RandomForestClassifier
# StandardScaler is used below; import it in this cell so the cell also runs
# on a fresh kernel instead of relying on an import executed elsewhere.
from sklearn.preprocessing import StandardScaler

# Standardise the features with statistics taken from the training set only;
# the same fitted scaler is applied to the test set below.
scaler = StandardScaler().fit(traindata['features'])
scaled_features = scaler.transform(traindata['features'])

rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, max_features='sqrt', oob_score=True)
rf.fit(scaled_features, traindata['genre'])

print("RF Out-of-bag error rate: ", 1-rf.oob_score_)
# Accuracy on the held-out rows (one row per clip); shown as the cell output.
rf.score(scaler.transform(testdata['features']), testdata['genre'])

RF Out-of-bag error rate:  0.4280952380952381


0.5385185185185185

In [None]:
from sklearn.decomposition import PCA
# pyplot is needed for the scatter plot; import it here so the cell does not
# depend on an import made in another cell.
import matplotlib.pyplot as plt

# Project every clip's feature vector onto the first two principal components.
pca = PCA(n_components=2, whiten=True)
pca_data = pca.fit_transform(sample_features)

# Colour each point by the genre label of its own clip. The label array has
# exactly one entry per row of sample_features, so it always matches the
# number of points (a fixed per-genre repeat count does not).
y_song = sample_genre
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib no longer has.
plt.scatter(pca_data[:,0], pca_data[:,1],
           c=y_song, cmap=plt.get_cmap('Paired', 10))
plt.title('PCA')
plt.colorbar(ticks=range(10))
plt.show()

In [72]:
def predict(songs):
    """Predict one label per song by majority vote over the song's rows.

    Every element of ``songs`` is passed to the module-level classifier
    ``rf``; the label predicted most often becomes the prediction for that
    song. For each song the vote counts per label and the three most voted
    labels are printed.

    Args:
        songs: iterable whose elements are each accepted by ``rf.predict``.
            ``rf.predict`` must return non-negative integer labels, because
            the votes are tallied with ``np.bincount``.
            NOTE(review): the forest fitted in this notebook is trained on
            standardised features, so inputs presumably need the same
            scaling first -- verify against the caller.

    Returns:
        List with the winning label of every song, in input order.
    """
    predictions = []
    for song in songs:
        # Run the classifier once per song and reuse the tally for the
        # prediction and for both diagnostic prints.
        votes = np.bincount(rf.predict(song))
        predictions.append(np.argmax(votes))
        print(votes)
        print(np.argsort(votes)[::-1][:3])

    return predictions

# Song-level evaluation: every test song contributes 9 consecutive rows
# (clips) to testdata; the song's prediction is the majority vote of the
# clip predictions.
clips_per_song = 9
test_scaled = scaler.transform(testdata['features'])
n_test_songs = len(test_scaled) // clips_per_song

predictions = []
for i in range(n_test_songs):
    pred = rf.predict(test_scaled[i*clips_per_song:(i+1)*clips_per_song])
    predictions.append(np.argmax(np.bincount(pred)))

# True label of each song = label stored with its first clip. Reading it from
# the data avoids assuming that the i-th block of test songs carries label i,
# which only holds when the files happened to be read in label order.
correct = testdata['genre'][::clips_per_song]
print(np.count_nonzero(np.asarray(predictions) == correct)/n_test_songs)

0.5533333333333333


In [53]:
from sklearn.preprocessing import MinMaxScaler
# StandardScaler is what this cell actually uses; import it here so the cell
# runs without relying on an import executed in another cell.
from sklearn.preprocessing import StandardScaler
from sklearn import svm

# Standardise with statistics from the training set only.
scaler = StandardScaler().fit(traindata['features'])
scaled_features = scaler.transform(traindata['features'])
print('sc')

# Shuffle the training rows. The permutation is sized from the data instead
# of a hard-coded row count, so it stays valid if the split changes.
perm = np.random.permutation(len(scaled_features))
clf = svm.SVC(C=0.001, kernel='poly', gamma=2)
classifier = clf.fit(scaled_features[perm], traindata['genre'][perm])


sc


KeyboardInterrupt: 

In [16]:
# Accuracy of the current `clf` on the held-out rows; the test features go
# through the same fitted scaler before scoring.
test_features_scaled = scaler.transform(testdata['features'])
clf.score(test_features_scaled, testdata['genre'])

0.37296296296296294

In [39]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers (300 and 100 units), at most 100 training iterations.
mlp_settings = dict(max_iter=100, hidden_layer_sizes=(300, 100))
clf = MLPClassifier(**mlp_settings)

# Trained on the scaled training rows in the shuffled order given by `perm`;
# the fitted estimator is the cell's displayed output.
clf.fit(scaled_features[perm], traindata['genre'][perm])

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(300, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [40]:
# Accuracy of the current `clf` on the held-out rows; the test features go
# through the same fitted scaler before scoring.
test_features_scaled = scaler.transform(testdata['features'])
clf.score(test_features_scaled, testdata['genre'])

0.4792592592592593

In [14]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

# Standardise the training features; `scaler` is rebound to this new instance.
scaler = StandardScaler().fit(traindata['features'])
X = scaler.transform(traindata['features'])

# Search grid: three log-spaced candidates each for C and gamma.
C_range = np.logspace(-2, 10, num=3)
gamma_range = np.logspace(-9, 3, num=3)
param_grid = {'gamma': gamma_range, 'C': C_range}

# Five random stratified 80/20 splits of the training rows.
# NOTE(review): the rows are clips, so clips of one song can fall on both
# sides of a split -- confirm that is acceptable for model selection.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, traindata['genre'])

best = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.2f"
      % best)

KeyboardInterrupt: 

In [None]:
from python_speech_features import mfcc
from python_speech_features import delta

# Build one feature matrix per song: MFCCs with their first and second order
# deltas placed side by side, stored transposed.
# NOTE(review): `song_sigs` is not assigned in any cell visible here (it only
# appears in commented-out code) -- confirm where it comes from.
frame_list = []
for song in song_sigs:
    mfcc_feat = mfcc(song, rate, nfft=551)
    d_mfcc_feat = delta(mfcc_feat, 2)
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    ret = np.hstack((mfcc_feat, d_mfcc_feat, dd_mfcc_feat))

    frame_list.append(ret.T)
print(frame_list[0].shape)

In [None]:
# Join the per-song matrices along axis 1, then transpose the result.
# NOTE(review): this rebinds `frame_list` from a list to a 2-D array, so the
# cell presumably cannot be re-run without rebuilding the list first.
joined = np.concatenate(frame_list, axis=1)
frame_list = joined.T
print(frame_list.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random, non-overlapping subsets of frame indices for training and testing.
perm = np.random.permutation(np.arange(frame_list.shape[0]))
train = perm[0:100000]
test = perm[500000:550000]
# One label per frame. np.repeat needs an integer repeat count, so use floor
# division (true division yields a float, which np.repeat rejects).
# NOTE(review): assumes the frames are ordered in 10 equally sized label
# blocks -- confirm against how frame_list was built.
frame_genre = np.repeat(np.arange(10), frame_list.shape[0]//10)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features='sqrt', oob_score=True)
rf.fit(frame_list[train], frame_genre[train])

print("RF Out-of-bag error rate: ", 1-rf.oob_score_)
# Accuracy on the held-out frames; shown as the cell output.
rf.score(frame_list[test], frame_genre[test])

In [None]:
# Classify one external file: build the same MFCC + delta + delta-delta
# feature rows and let the forest label every row.
(rate, sig) = wav.read('../test/HipHopTest/2.wav')
mfcc_feat = mfcc(sig, rate, nfft=551)
d_mfcc_feat = delta(mfcc_feat, 2)
dd_mfcc_feat = delta(d_mfcc_feat, 2)
tmp = np.hstack((mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
print(mfcc_feat.shape)

s = rf.predict(tmp)

# Labels ranked from most to least predicted, followed by the raw counts.
label_counts = np.bincount(s)
print(np.argsort(label_counts)[::-1])
print(label_counts)

In [None]:
# Predict a label for every row of the plain MFCC matrix.
# NOTE(review): the preceding cell predicts on the MFCCs stacked with both
# delta blocks, whereas this call passes the MFCCs alone -- confirm that `rf`
# was fitted on the matching number of features.
s = rf.predict(mfcc_feat)


In [None]:
import matplotlib.pyplot as plt
from python_speech_features import mfcc
from python_speech_features import delta
from sklearn.preprocessing import StandardScaler

# For every song: compute MFCCs and their first/second order deltas, pass the
# transpose of each block through StandardScaler, transpose back, and stack
# the three scaled blocks vertically.
# NOTE: `scaler` is rebound here and refitted on every block, so afterwards it
# holds the statistics of the last block processed.
# NOTE(review): `song_sigs` is not assigned in any cell visible here -- confirm
# where it comes from.
mfcc_list = []
scaler = StandardScaler()
for song in song_sigs:
    mfcc_feat = mfcc(song, rate, nfft=551)
    d_mfcc_feat = delta(mfcc_feat, 2)
    dd_mfcc_feat = delta(d_mfcc_feat, 2)

    scaled_mfcc_feat, scaled_d_mfcc_feat, scaled_dd_mfcc_feat = (
        scaler.fit_transform(block.T).T
        for block in (mfcc_feat, d_mfcc_feat, dd_mfcc_feat)
    )

    ret = np.vstack((scaled_mfcc_feat, scaled_d_mfcc_feat, scaled_dd_mfcc_feat))
    mfcc_list.append(ret)

mfcc_list = np.array(mfcc_list)


In [None]:
# Turn every per-song matrix into a single 1-D vector; one row per song.
flat_list = np.array([song_matrix.flatten() for song_matrix in mfcc_list])

In [None]:
# Largest and smallest value over all flattened song vectors.
largest, smallest = np.max(flat_list), np.min(flat_list)
print(largest, smallest)

In [None]:
from sklearn.decomposition import PCA

# Reduce every flattened song vector to its first two principal components.
n_plot_dims = 2
pca = PCA(n_components=n_plot_dims)
pca_data = pca.fit_transform(flat_list)

In [None]:
# One label per song: 10 labels, 100 consecutive songs each.
# NOTE(review): assumes the songs are ordered in label blocks of 100 -- confirm
# against how the song list was built.
y_song=np.repeat(np.arange(10), 100)

# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib no longer has.
plt.scatter(pca_data[:,0], pca_data[:,1],
           c=y_song, cmap=plt.get_cmap('Paired', 10))
plt.title('PCA')
plt.colorbar(ticks=range(10))
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define the song-level hold-out split in this cell (70% train, 30% test).
# Without it the cell would pick up whatever `train`/`test` index arrays were
# left behind by another cell, which need not fit the rows of flat_list.
perm = np.random.permutation(len(flat_list))
n_train = (7*len(flat_list))//10
train = perm[:n_train]
test = perm[n_train:]

rf = RandomForestClassifier(n_jobs=-1, n_estimators=500, max_features='sqrt', oob_score=True)
rf.fit(flat_list[train], y_song[train])

print("RF Out-of-bag error rate: ", 1-rf.oob_score_)
# Accuracy on the held-out songs; shown as the cell output.
rf.score(flat_list[test], y_song[test])

In [None]:
from sklearn import svm

# Random 700/300 split over the 1000 song indices.
perm = np.random.permutation(np.arange(1000))
train, test = perm[:700], perm[700:1000]

# SVC with default settings on the flattened song vectors; the accuracy on
# the held-out songs is the cell's displayed output.
clf = svm.SVC()
clf.fit(flat_list[train], y_song[train])
clf.score(flat_list[test], y_song[test])