In [138]:
import copy
from pathlib import Path

import scipy.io.wavfile as wav
import numpy as np
from sklearn import preprocessing

from python_speech_features import mfcc
from python_speech_features import delta

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score


In [None]:
genres = {'reggae': 0, 'metal': 1, 'blues': 2, 'country': 3,
         'classical': 4, 'pop': 5, 'jazz': 6, 'rock': 7,
         'hiphop': 8, 'disco': 9}
#Genres
# 0: reggae
# 1: metal
# 2: blues
# 3: country
# 4: classical
# 5: pop
# 6: jazz
# 7: rock
# 8: hiphop
# 9: disco

def_rate = 22050

def readSongs(duration):
    pathlist = Path('../genres').glob('**/*.wav')
    
    song_sigs = {'reggae': [], 'metal': [], 'blues': [], 'country': [],
         'classical': [], 'pop': [], 'jazz': [], 'rock': [],
         'hiphop': [], 'disco': []}
    
    train_sigs = copy.deepcopy(song_sigs)
    test_sigs = copy.deepcopy(song_sigs)
    
    for path in pathlist:
        p = str(path)
        genre = p.split('/')[2]
        (rate, sig) = wav.read(p)
        sig = sig[:rate*duration]
        song_sigs[genre].append(sig)
    
    for key in song_sigs:
        #################
        #100-70-70 breyting
        perm = np.random.permutation(100)
        train = perm[:70]
        test = perm[70:]
        ###############3
        
        train_sigs[key] = np.array(song_sigs[key])[train]
        test_sigs[key] = np.array(song_sigs[key])[test]
    return train_sigs, test_sigs

def extractFeatures(signals, sample_len):
    features = []
    labels = []
    for key in signals:
        print(key)
        for sig in signals[key]:
            no_samples = int((len(sig)/def_rate)/sample_len)
            part_len = def_rate*sample_len
            for i in range(no_samples):
                part = sig[i*part_len:(i+1)*part_len]
                mfcc_feat = mfcc(part, def_rate, nfft=551)
                d_mfcc_feat = delta(mfcc_feat, 2)
                dd_mfcc_feat = delta(d_mfcc_feat, 2)
                
                #Cast to single vector
                sample = np.hstack((np.hstack((mfcc_feat.flatten(), d_mfcc_feat.flatten())), dd_mfcc_feat.flatten()))
                features.append(sample)
                labels.append(genres[key])
                
    return features, labels

def getData(duration, sample_len):
    train_signal, test_signal = readSongs(duration)
    
    train_feat, train_label = extractFeatures(train_signal, sample_len)
    test_feat, test_label = extractFeatures(test_signal, sample_len)
    
    scaler = preprocessing.StandardScaler().fit(train_feat)
    train_feat = scaler.transform(train_feat)
    test_feat = scaler.transform(test_feat)
    
    return train_feat, train_label, test_feat, test_label

def createForest(duration, sample_len):
    trf, trl, tef, tel = getData(duration, sample_len)
    
    rf = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features='sqrt', oob_score=True)
    rf.fit(trf, trl)
    
    return rf, tef, tel

def songWeightedPred(rf, feat, label, no_samples):
    truth = []
    pred = []
    for i in range(int(feat.shape[0]/no_samples)):
        if i%10 == 0:
            print(i)
        truth.append(label[i*7])
        
        pr = rf.predict_proba(feat[i*no_samples:(i+1)*no_samples,:])
        pr = np.sum(pr, axis=0)/no_samples
        pred.append(np.argmax(pr))
    return truth, pred

def testModel(duration, sample_len):
    rf, tef, tel = createForest(duration, sample_len)
    
    tar_names = list(genres.keys())
    pred = rf.predict(tef)
    print(confusion_matrix(tel, pred))
    print(classification_report(tel, pred, target_names=tar_names))
          
    tr, pr = songWeightedPred(rf, tef, tel, int(duration/sample_len))
    print(np.where(tr==np.array(pr))[0].shape[0]/300)

    print(confusion_matrix(tr, pr))
    print(classification_report(tr, pr, target_names=tar_names))

rf_21_3 = testModel(21, 3)

reggae
metal
blues
country
classical
pop
jazz


In [76]:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, max_features='sqrt', oob_score=True)
rf.fit(trf_21_3, trl_21_3)
print("RF Out-of-bag error rate: ", 1-rf.oob_score_)

rf.score(tef_21_3, tel_21_3)

RF Out-of-bag error rate:  0.4648979591836735


0.5352380952380953

In [87]:
pred = rf.predict(tef_21_3)
cm = confusion_matrix(tel_21_3, pred)
print(cm)

target_names = ['reggae', 'metal', 'blues', 'country', 'classical',
               'pop', 'jazz', 'rock', 'hiphop', 'disco']
print(classification_report(tel_21_3, pred, target_names=target_names))

[[ 79   3   8  14   5  32  12   6  39  12]
 [  0 179   3   0   0   0   0  25   0   3]
 [  2  39 117   6  10   0  17  12   3   4]
 [  7  13  16  66   2  12  39  35   6  14]
 [  0   0   0   3 188   0  10   7   0   2]
 [  6   0   0  16   1 172   7   1   4   3]
 [  1   0  11  13  24  27 127   1   3   3]
 [  6  34  35  14   6  25  31  44   4  11]
 [ 27  15   1   1   4  50   7  12  84   9]
 [ 19  17   4   7   0  48   8  23  16  68]]
             precision    recall  f1-score   support

     reggae       0.54      0.38      0.44       210
      metal       0.60      0.85      0.70       210
      blues       0.60      0.56      0.58       210
    country       0.47      0.31      0.38       210
  classical       0.78      0.90      0.84       210
        pop       0.47      0.82      0.60       210
       jazz       0.49      0.60      0.54       210
       rock       0.27      0.21      0.23       210
     hiphop       0.53      0.40      0.46       210
      disco       0.53      0.32      

In [123]:
def songPred(forest, feat, label):
    truth = []
    pred = []
    for i in range(int(feat.shape[0]/7)):
        if i%10 == 0:
            print(i)
        truth.append(label[i*7])
        
        pr = rf.predict_proba(feat[i*7:(i+1)*7,:])
#         print(pred)
        pr = np.sum(pr, axis=0)/7
#         print(pred)
#         print(np.argmax(pred))
        pred.append(np.argmax(pr))
    return truth, pred
    
tr, pr = songPred(rf, tef_21_3, tel_21_3)

print(np.where(tr==pred)[0].shape[0])


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
0




In [143]:
print(np.where(tr==np.array(pr))[0].shape[0]/300)

cm = confusion_matrix(tr, pr)
print(cm)

# target_names = ['reggae', 'metal', 'blues', 'country', 'classical',
#                'pop', 'jazz', 'rock', 'hiphop', 'disco']
print(classification_report(tr, pr, target_names=target_names))

0.5766666666666667
[[12  0  1  1  0  5  3  1  6  1]
 [ 0 26  0  0  0  0  0  4  0  0]
 [ 0  6 16  1  2  0  2  2  1  0]
 [ 1  1  1 12  0  2  5  5  0  3]
 [ 0  0  0  0 29  0  1  0  0  0]
 [ 0  0  0  3  0 26  1  0  0  0]
 [ 0  0  1  1  4  4 19  0  0  1]
 [ 0  4  5  1  0  3  6  8  0  3]
 [ 4  1  0  0  0  8  1  3 12  1]
 [ 4  0  0  1  0  7  1  2  2 13]]
             precision    recall  f1-score   support

     reggae       0.57      0.40      0.47        30
      metal       0.68      0.87      0.76        30
      blues       0.67      0.53      0.59        30
    country       0.60      0.40      0.48        30
  classical       0.83      0.97      0.89        30
        pop       0.47      0.87      0.61        30
       jazz       0.49      0.63      0.55        30
       rock       0.32      0.27      0.29        30
     hiphop       0.57      0.40      0.47        30
      disco       0.59      0.43      0.50        30

avg / total       0.58      0.58      0.56       300



In [146]:
print(list(genres.keys()))

['reggae', 'metal', 'blues', 'country', 'classical', 'pop', 'jazz', 'rock', 'hiphop', 'disco']
