In [1]:
import copy
from pathlib import Path

import scipy.io.wavfile as wav
import numpy as np
from sklearn import preprocessing

from python_speech_features import mfcc
from python_speech_features import delta

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score


In [2]:
# Genre name -> integer class label. The label is simply the position of the
# name in the tuple below:
#   0 reggae, 1 metal, 2 blues, 3 country, 4 classical,
#   5 pop, 6 jazz, 7 rock, 8 hiphop, 9 disco
# Key insertion order equals label order; testModel passes list(genres.keys())
# as target_names, so do not reorder the names.
genres = {
    name: label
    for label, name in enumerate((
        'reggae', 'metal', 'blues', 'country', 'classical',
        'pop', 'jazz', 'rock', 'hiphop', 'disco',
    ))
}

# Sample rate handed to mfcc() and used to convert seconds into sample counts.
def_rate = 22050

def readSongs(duration):
    """Load every ``.wav`` under ``../genres`` and split each genre 70/30.

    Each file is read with ``scipy.io.wavfile.read`` and truncated to its first
    ``duration`` seconds (using the file's own sample rate). The genre of a
    file is the first directory component below ``../genres``.

    Parameters
    ----------
    duration : int or float
        Number of seconds to keep from the start of every clip.

    Returns
    -------
    (train_sigs, test_sigs) : tuple of dict
        Both dicts map genre name -> ``np.array`` of the selected signals.
        For each genre a random permutation (global ``np.random`` state) puts
        the first 70% of the clips (rounded down) into train and the rest into
        test. Genres with no files map to empty arrays.

    Raises
    ------
    KeyError
        If a file sits under a directory that is not one of the known genres.
    """
    root = Path('../genres')

    song_sigs = {'reggae': [], 'metal': [], 'blues': [], 'country': [],
                 'classical': [], 'pop': [], 'jazz': [], 'rock': [],
                 'hiphop': [], 'disco': []}

    # Sorted so the file order (and hence the split under a fixed seed) does
    # not depend on filesystem enumeration order.
    for path in sorted(root.glob('**/*.wav')):
        # First path component below the root; independent of the OS path
        # separator and of how the root itself is spelled.
        genre = path.relative_to(root).parts[0]
        (rate, sig) = wav.read(str(path))
        # int() so a fractional duration still yields a valid slice bound.
        song_sigs[genre].append(sig[:int(rate * duration)])

    train_sigs = {}
    test_sigs = {}
    for key, songs in song_sigs.items():
        # Size the split from the clips actually found instead of assuming
        # exactly 100 per genre; 100 clips still gives 70 train / 30 test.
        n_songs = len(songs)
        n_train = (n_songs * 7) // 10
        perm = np.random.permutation(n_songs)

        stacked = np.array(songs)
        train_sigs[key] = stacked[perm[:n_train]]
        test_sigs[key] = stacked[perm[n_train:]]
    return train_sigs, test_sigs

def extractFeatures(signals, sample_len):
    """Turn per-genre signals into flat MFCC feature vectors with labels.

    Every signal is cut into consecutive, non-overlapping chunks of
    ``sample_len`` seconds (measured at ``def_rate``); a trailing remainder
    shorter than one chunk is ignored. For each chunk the MFCCs, their deltas
    and the deltas of those deltas are flattened and joined into one vector.

    Parameters
    ----------
    signals : dict
        Genre name -> iterable of 1-D signals.
    sample_len : int or float
        Chunk length in seconds.

    Returns
    -------
    (features, labels) : tuple of list
        ``features[i]`` is the 1-D vector for chunk ``i`` and ``labels[i]`` is
        ``genres[<genre of that chunk>]``.
    """
    features, labels = [], []

    for genre_name, genre_sigs in signals.items():
        label = genres[genre_name]
        for sig in genre_sigs:
            n_chunks = int((len(sig)/def_rate)/sample_len)
            chunk_len = int(def_rate*sample_len)

            for idx in range(n_chunks):
                start = idx*chunk_len
                chunk = sig[start:start + chunk_len]

                base = mfcc(chunk, def_rate, nfft=551)
                first_delta = delta(base, 2)
                second_delta = delta(first_delta, 2)

                # One flat vector: [mfcc | delta | delta-delta]
                vector = np.concatenate((base.flatten(),
                                         first_delta.flatten(),
                                         second_delta.flatten()))
                features.append(vector)
                labels.append(label)

    return features, labels

def getData(duration, sample_len):
    """Load the songs, extract features and standardise them.

    The ``StandardScaler`` is fitted on the training features only and then
    applied to both the training and the test features.

    Parameters
    ----------
    duration : seconds of each clip to load (forwarded to ``readSongs``).
    sample_len : chunk length in seconds (forwarded to ``extractFeatures``).

    Returns
    -------
    (train_feat, train_label, test_feat, test_label, scaler)
        Scaled feature matrices, their label lists, and the fitted scaler.
    """
    train_signal, test_signal = readSongs(duration)

    raw_train, train_label = extractFeatures(train_signal, sample_len)
    raw_test, test_label = extractFeatures(test_signal, sample_len)

    scaler = preprocessing.StandardScaler()
    scaler.fit(raw_train)

    train_feat = scaler.transform(raw_train)
    test_feat = scaler.transform(raw_test)
    return train_feat, train_label, test_feat, test_label, scaler

def createForest(duration, sample_len):
    """Fit a random forest on the training split produced by ``getData``.

    The forest uses 500 trees, ``sqrt`` feature sub-sampling, out-of-bag
    scoring, and ``n_jobs=-1``.

    Returns
    -------
    (rf, scaler, tef, tel)
        The fitted classifier, the scaler from ``getData``, and the scaled
        test features with their labels.
    """
    trf, trl, tef, tel, scaler = getData(duration, sample_len)

    forest_params = dict(n_jobs=-1,
                         n_estimators=500,
                         max_features='sqrt',
                         oob_score=True)
    rf = RandomForestClassifier(**forest_params)
    rf.fit(trf, trl)

    return rf, scaler, tef, tel

def songWeightedPred(rf, feat, label, no_samples):
    """Predict one class per song by averaging chunk-level probabilities.

    Rows of ``feat`` are taken in consecutive groups of ``no_samples``; each
    group is treated as one song. The class probabilities returned by
    ``rf.predict_proba`` for the group are summed, divided by ``no_samples``,
    and the arg-max is the song's prediction. Leftover rows that do not fill
    a whole group are ignored.

    Parameters
    ----------
    rf : object exposing ``predict_proba(rows)``.
    feat : 2-D array, one row per chunk.
    label : per-row labels; the label of a group's first row is used as the
        song's ground truth.
    no_samples : number of rows per song.

    Returns
    -------
    (truth, pred) : tuple of list
        Ground-truth label and predicted class index for every song.
    """
    n_songs = int(feat.shape[0]/no_samples)

    truth, pred = [], []
    for song_idx in range(n_songs):
        start = song_idx*no_samples
        stop = start + no_samples

        chunk_probs = rf.predict_proba(feat[start:stop, :])
        song_probs = np.sum(chunk_probs, axis=0)/no_samples

        truth.append(label[start])
        pred.append(np.argmax(song_probs))
    return truth, pred

def testModel(duration, sample_len):
    """Train a forest and print chunk-level and song-level evaluation.

    Prints, in order: the chunk-level confusion matrix and classification
    report, the song-level accuracy, then the song-level confusion matrix and
    classification report. Song-level predictions come from
    ``songWeightedPred`` with ``int(duration/sample_len)`` chunks per song.

    Parameters
    ----------
    duration : seconds of each clip to use.
    sample_len : chunk length in seconds.

    Returns
    -------
    (rf, scaler) : the fitted classifier and the feature scaler.
    """
    rf, scaler, tef, tel = createForest(duration, sample_len)

    tar_names = list(genres.keys())
    pred = rf.predict(tef)
    print(confusion_matrix(tel, pred))
    print(classification_report(tel, pred, target_names=tar_names))

    tr, pr = songWeightedPred(rf, tef, tel, int(duration/sample_len))

    # Divide by the number of songs actually scored rather than a hard-coded
    # 300, so the accuracy stays correct if the test split size changes.
    n_songs = len(tr)
    n_correct = int(np.count_nonzero(np.asarray(tr) == np.asarray(pr)))
    print(n_correct / n_songs if n_songs else float('nan'))

    print(confusion_matrix(tr, pr))
    print(classification_report(tr, pr, target_names=tar_names))
    return rf, scaler


In [3]:
# Sweep over (clip duration, chunk length) pairs, both in seconds. Each run
# prints a banner, the evaluation from testModel, and a blank-line separator.
# Results of these runs are intentionally not kept.
for sweep_duration, sweep_chunk in [(28, 28), (28, 14), (25, 5), (27, 3),
                                    (28, 2), (28, 1), (28, 0.5), (28, 0.25)]:
    print('=============== {}/{} ================='.format(sweep_duration, sweep_chunk))
    testModel(sweep_duration, sweep_chunk)
    print('\n\n\n\n')

# Final configuration: keep its classifier and scaler for later cells.
print('=============== 28/0.05 =================')
rf, scaler = testModel(28, 0.05)

[[ 5109   516  1190  1645   551  2447  1063   558  2440  1281]
 [  254 13847   485    84    34   256   197   641   429   573]
 [ 1086  1905  8820   989   671     9  1608   732   342   638]
 [ 1227  1027  2084  5090  1077  1304  1580  1830   404  1177]
 [  363    83   206   342 14108    43  1097   344    36   178]
 [  515    83    19   629   208 12664   398   310  1196   778]
 [  735   424  1502  1257  3275   370  7658   787   299   493]
 [ 1145  2030  2123  2131   709  1441  1025  3106   534  2556]
 [ 1248  2007  1133   564   238  2799   458   536  5746  2071]
 [  938  1252   585  1383   447  4669   764  1350   933  4479]]
             precision    recall  f1-score   support

     reggae       0.40      0.30      0.35     16800
      metal       0.60      0.82      0.69     16800
      blues       0.49      0.53      0.50     16800
    country       0.36      0.30      0.33     16800
  classical       0.66      0.84      0.74     16800
        pop       0.49      0.75      0.59     168

In [24]:
# Scratch check: expand window start offsets into full index windows and
# select some of the windows by position.
sample_len = 3
y = np.arange(sample_len)

x = np.arange(4)
print(x)
x = 3 * x
print(x)


def fun(x):
    return print(x)


# One array of `sample_len` consecutive indices per start offset in x.
t = [start + y for start in x]
print(np.array(t))

te_idx = [0, 3]
new = np.array(t)[te_idx]
print(new.ravel())

[0 1 2 3]
[0 3 6 9]
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[ 0  1  2  9 10 11]


In [26]:
# Scratch check: row i holds the indices i, i+1, ..., i+no_samples-1.
no_samples = 2
signals = [0, 1, 2, 3]
list_len = np.arange(no_samples)

# Broadcasting a column of row starts against the offset vector.
t = np.arange(len(signals))[:, np.newaxis] + list_len

print(t)

[[0 1]
 [1 2]
 [2 3]
 [3 4]]


In [36]:
def avgNSamples(a, N=2):
    """Average consecutive, non-overlapping groups of ``N`` rows of ``a``.

    Rows are grouped along axis 0 as ``[0:N]``, ``[N:2N]``, ...; trailing rows
    that do not fill a complete group are dropped.

    Parameters
    ----------
    a : array-like
        Input with at least one dimension.
    N : int, default 2
        Number of rows per group; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(a) // N,) + a.shape[1:]`` holding the mean of
        each group.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError('N must be a positive integer')

    a = np.asarray(a)
    n_groups = a.shape[0] // N

    # Average each group directly. Differencing a running cumulative sum
    # instead would subtract two large totals and lose precision for groups
    # that follow large values.
    grouped = a[:n_groups * N].reshape((n_groups, N) + a.shape[1:])
    return grouped.mean(axis=1)

# Scratch check: average pairs of probability rows, then take the arg-max
# class of each averaged row.
x = np.array([
    [0.7, 0.2, 0.3],
    [0.2, 0.3, 0.4],
    [0.4, 0.5, 0.6],
    [0.4, 0.5, 0.6],
])
t = avgNSamples(x)
print(t)
print(t.argmax(axis=1))

[[0.45 0.25 0.35]
 [0.4  0.5  0.6 ]]
[0 2]


In [37]:
# Scratch check: a slice step of 3 keeps every third element, starting at 0.
t = list(range(1, 7))
print(t[0::3])

[1, 4]
