# music audio tagging example

### musicnn allows predicting music tags with just two lines of code

-------

In the following, you will learn how to use `musicnn` to automatically tag your music.
To start, let's consider this music clip:

In [None]:
import glob
import os
from musicnn.tagger import top_tags
from musicnn.extractor import extractor
from musicnn.configuration import MTT_LABELS_DICT_IDX, MSD_LABELS_DICT_IDX

import deepdish as dd

%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize, minmax_scale, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier


In [2]:
#file_name = './audio/joram-moments_of_clarity-08-solipsism-59-88.mp3'
#file_name = '/media/dirceusilva/EAAC-94C8/BasesDados/GenerosMusicais/FonogramasPorGeneroMusical/BAIAO/3715207.ogg'
file_name = '/media/dirceusilva/EAAC-94C8/BasesDados/GenerosMusicais/FonogramasPorGeneroMusical/CLASSICA/11560486.ogg'

dir_base = '/media/dirceusilva/EAAC-94C8/BasesDados/GenerosMusicais/FonogramasPorGeneroMusical'
dir_out = '/home/dirceusilva/Documentos/teste/coversbr'

dir_out_results = '/mnt/HD2T/Testes/coversbr/MTT_musicnn' #os.path.join(dir_out,'results.h5')

In [None]:

extensions = ['**/*.ogg','**/*.wav','**/*.mp3']
files = []
for e in extensions:
    files_path = os.path.join(dir_base,e)
    files.extend(glob.glob(files_path, recursive = True))

print(len(files))

Run these two code lines to predict the `top3` most likely tags according to our `MTT model`:

In [3]:

# 'MTT_musicnn', 'MSD_musicnn', and 'MSD_musicnn_big'.
model_type='MTT_musicnn'
if model_type == 'MSD_musicnn':
    tags_idx = MSD_LABELS_DICT_IDX
else:
    tags_idx = MTT_LABELS_DICT_IDX
    
    
# tags = top_tags(file_name, model=model_type, topN=3)
# print(tags)

-----------------------------
### Are you interested in the temporal evolution of these tags? 

Instead of predicting song-level tags, you can also plot the **Taggram**:

In [None]:
fon_by_genres = {}

for i,filename in enumerate(files):
    print(i,filename)
    taggram, tags, features = extractor(filename, model=model_type, extract_features=True)
    tags_likelihood_mean = np.mean(taggram, axis=0)
        
    #path = filename.split(os.path.sep)
    file = os.path.splitext(os.path.basename(filename))[0]
    #genre = path[-2]
    
    filepath = os.path.join(dir_out_results, file + '.h5')
    
    #if not genre in fon_by_genres:
    #    fon_by_genres[genre] = []
    
    results = {}
    results['filename'] = filename
    results['taggram'] = taggram
    results['likelihood'] = tags_likelihood_mean
    results['tags'] = tags
    results['features'] = features
        
    #fon_by_genres[genre].append(results)
    
    dd.io.save(filepath,results)

In [None]:

fon_by_genres = {}

files_path = os.path.join(dir_out_results,'**/*.h5')

for filepath in glob.iglob(files_path, recursive = True):
    
    print(filepath)
    
    fonogram = dd.io.load(filepath)
    filename = fonogram['filename']
    
    path = filename.split(os.path.sep)
    genre = path[-2]
    #file = os.path.splitext(os.path.basename(filename))[0]

    if not genre in list(fon_by_genres.keys()):
        fon_by_genres[genre] = []
        
    fon_by_genres[genre].append(filepath)
    
    
print(list(fon_by_genres.keys()))

## Cria vetor de likelihoods

In [None]:
genres = list(fon_by_genres.keys())

seq_fon = [] 
seq_genres = []
likelihood = []
penultimate = []

for g in genres:
    for f in fon_by_genres[g]:
        
        fonogram = dd.io.load(f)
        LLK = fonogram['likelihood']
        embedded = fonogram['features']['penultimate']
        
        seq_fon.append(os.path.basename(fonogram['filename']))
        seq_genres.append(g)
        
        likelihood.append(LLK) #[tags_idx['genre']])
        penultimate.append(embedded)
        
        
likelihood = np.asarray(likelihood)
penultimate = np.asarray(penultimate)


In [None]:
dd.io.save('vectors_' + model_type + '.npy',(likelihood,penultimate, seq_fon, seq_genres))



## Leitura Arquivos

In [20]:
likelihood,penultimate, seq_fon, seq_genres = dd.io.load('vectors_' + model_type + '.npy')
    
print(penultimate[0].shape)
print(len(seq_fon))

(39, 200)
840


## prepara matriz

In [21]:
le = LabelEncoder()
le.fit(embedded_labels)
labels = le.transform(embedded_labels)

In [22]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
#X_train, X_test = train_test_split(np.arange(len(seq_genres)), train_size=0.80, random_state=42)
#print(X_train)
#print(X_test)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
sss.get_n_splits(seq_fon,seq_genres)
for train_index, test_index in sss.split(seq_fon, seq_genres):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))


TRAIN: 672 TEST: 168


In [25]:
embedded_train = None
embedded_train_labels = []

for idx in train_index:
    emb = penultimate[idx]
    g = seq_genres[idx]
    
    samples = emb.shape[0]
    print(emb.shape)
    
    if embedded_train is None:
        embedded_train = emb.copy()
    else:
        embedded_train = np.concatenate((embedded_train,emb),axis=0)
        
    labels = [g]*samples
    embedded_train_labels.extend(labels)    

print(embedded_train.shape)
print(len(embedded_train_labels))

embedded_tst = None
embedded_tst_labels = []

for idx in test_index:
    emb = penultimate[idx]
    g = seq_genres[idx]
    
    samples = emb.shape[0]
    print(emb.shape)
    
    if embedded_tst is None:
        embedded_tst = emb.copy()
    else:
        embedded_tst = np.concatenate((embedded_tst,emb),axis=0)
        
    labels = [g]*samples
    embedded_tst_labels.extend(labels)    

print(embedded_tst.shape)
print(len(embedded_tst_labels))


(73, 200)
(70, 200)
(64, 200)
(80, 200)
(60, 200)
(64, 200)
(89, 200)
(49, 200)
(88, 200)
(85, 200)
(67, 200)
(76, 200)
(133, 200)
(44, 200)
(63, 200)
(72, 200)
(61, 200)
(82, 200)
(94, 200)
(44, 200)
(77, 200)
(65, 200)
(153, 200)
(71, 200)
(95, 200)
(77, 200)
(83, 200)
(61, 200)
(73, 200)
(90, 200)
(83, 200)
(86, 200)
(39, 200)
(65, 200)
(121, 200)
(71, 200)
(71, 200)
(62, 200)
(67, 200)
(65, 200)
(82, 200)
(93, 200)
(59, 200)
(62, 200)
(75, 200)
(62, 200)
(65, 200)
(90, 200)
(80, 200)
(61, 200)
(131, 200)
(86, 200)
(92, 200)
(72, 200)
(87, 200)
(66, 200)
(97, 200)
(51, 200)
(69, 200)
(69, 200)
(62, 200)
(65, 200)
(4, 200)
(58, 200)
(70, 200)
(55, 200)
(71, 200)
(64, 200)
(75, 200)
(68, 200)
(80, 200)
(88, 200)
(59, 200)
(70, 200)
(74, 200)
(59, 200)
(57, 200)
(47, 200)
(56, 200)
(101, 200)
(101, 200)
(50, 200)
(66, 200)
(51, 200)
(58, 200)
(40, 200)
(82, 200)
(68, 200)
(57, 200)
(71, 200)
(80, 200)
(56, 200)
(62, 200)
(104, 200)
(133, 200)
(65, 200)
(56, 200)
(50, 200)
(146, 200)
(1

(72, 200)
(59, 200)
(104, 200)
(102, 200)
(135, 200)
(73, 200)
(147, 200)
(83, 200)
(55, 200)
(74, 200)
(88, 200)
(55, 200)
(65, 200)
(63, 200)
(75, 200)
(57, 200)
(104, 200)
(78, 200)
(63, 200)
(164, 200)
(104, 200)
(122, 200)
(73, 200)
(64, 200)
(13477, 200)
13477


In [26]:
knn = KNeighborsClassifier(n_neighbors=11,weights='distance', n_jobs = -1)
knn.fit(embedded_train, embedded_train_labels)


KNeighborsClassifier(n_jobs=-1, n_neighbors=11, weights='distance')

In [55]:
from collections import Counter
def my_mode(sample):
    c = Counter(sample)
    return [k for k, v in c.items() if v == c.most_common(1)[0][1]]

In [68]:
from sklearn.metrics import plot_confusion_matrix
from statistics import mode

y_pred_all = []
labels_genres = []

for idx in test_index:
    emb = penultimate[idx]
    g = seq_genres[idx]
    
    y_pred = knn.predict(emb)
    y_summary = my_mode(y_pred)
    print(y_summary, g)
    
    y_pred_all.append(y_summary[0])
    labels_genres.append(g)    
    
#print(y_pred_all)


# plt.rcParams["figure.figsize"] = (15,15)
# plot_confusion_matrix(knn, embedded_tst, embedded_tst_labels)  
# plt.xticks(rotation=90)
# plt.savefig('/home/dirceusilva/Documentos/teste/coversbr/confusion_matrix_knn.eps', format='eps')
#plt.show() 
#print(cm)

['ELECTRONIC'] DANCE MUSIC
['AXE MUSIC'] WORLD MUSIC
['CATOLICO'] EVANGELICA
['AXE MUSIC'] SAMBA
['GOSPEL'] SOUL
['SAMBA ENREDO'] PARTIDO ALTO
['MPB'] PAGODE
['EVANGELICA'] BALADA
['ROCK'] BLUES
['Piano Tenso'] Piano Tenso
['EVANGELICA'] AXE MUSIC
['CLASSICA'] INSTRUMENTAL
['ROCK'] ROCK
['BOSSA NOVA'] BOSSA NOVA
['CATOLICO'] BLUES
['PARTIDO ALTO'] JAZZ
['FREVO', 'BAIAO'] CHORO
['SAMBA'] MPB
['SAMBA'] AXE MUSIC
['XOTE'] TANGO
['SAMBA'] SAMBA
['CALYPSO'] PAGODE
['CHORO', 'FORRO', 'SAMBA'] BOSSA NOVA
['EVANGELICA'] ROCK POP
['BALADA'] CATOLICO
['SAMBA ENREDO'] SAMBA ENREDO
['SAMBA'] BOSSA NOVA
['AXE MUSIC'] POP ROCK
['CLASSICA'] CLASSICA
['BALADA'] CALYPSO
['RAP'] RAP
['CATOLICO'] GOSPEL
['FORRO'] FORRO
['BLUES', 'BAIAO', 'SAMBA'] BAIAO
['BOSSA NOVA'] ROCK
['FUNK'] RAP
['CATOLICO'] EVANGELICA
['ELECTRONIC'] DANCE MUSIC
['RAP'] GOSPEL
['SAMBA ENREDO'] SAMBA ENREDO
['CLASSICA'] BOLERO
['GOSPEL'] CALYPSO
['FUNK'] FUNK
['FREVO'] CHORO
['BOLERO', 'AXE MUSIC', 'BAIAO'] WORLD MUSIC
['ROCK'] CATO

NameError: name 'fon_by_genres' is not defined

In [71]:
labels = list(set(seq_genres))
cm = confusion_matrix(labels_genres, y_pred_all,labels=labels)

f = open('confusion_matrix.csv',"w")

f.write(",")
for l in labels:
    f.write("%s," % l)
f.write('\n')

for i in range(len(labels)):
    f.write("%s,"% labels[i])
    for j in range(len(labels)):
        f.write("%d,"% cm[i,j])
    f.write('\n')

f.close()


#np.savetxt('confusion_matrix.csv',cm,delimiter=',',fmt='%d')
#np.savetxt('seq_genres.csv',seq_genres,delimiter=',')

## Normalização

In [None]:
likelihood = minmax_scale(likelihood, axis=1)

## Cálculo conforme artigo DA-Tacos

In [None]:
def f_measure(u,v, pct=1.0, topN=10):
    
    u_idx = u.argsort()[-topN:][::-1]
    v_idx = v.argsort()[-topN:][::-1]
    
    #u_pct = np.percentile(u, 90)
    #v_pct = np.percentile(v, 90)
        
    #u_idx = list(np.where(u >= pct)[0])   
    #v_idx = list(np.where(v >= pct)[0])

    common = set(u_idx).intersection(v_idx)
    
    if len(common) > 0:
        if len(u) > 0 and len(v) > 0:
            p = len(common)/len(u_idx)
            r = len(common)/len(v_idx)
        else:
            return 0.0
    
        return 2*p*r/(p+r) 
    else:
        return 0.0


def fmetric(p,top):
    return lambda u, v: f_measure(u,v,p,top)


## Cálculo de Distância e cria vetor de true_scores e false_scores

In [None]:
pct = np.percentile(likelihood.flatten(), 90)
topN = 10

D = squareform(pdist(likelihood, metric=fmetric(pct,topN)))

same = []
impostor = []

for i in range(D.shape[0]):
    for j in range(i+1,D.shape[1]):
        if seq_genres[i] == seq_genres[j]:
            same.append(D[i,j])
        else:
            impostor.append(D[i,j])

            
true_scores = np.array(same)
false_scores = np.array(impostor)

## Funções para plotagem do DET

In [None]:
def __DETsort__(x, col=''):
    """DETsort Sort rows, the first in ascending, the remaining in descending
    thereby postponing the false alarms on like scores.
    based on SORTROWS
    
    :param x: the array to sort
    :param col: not used here

    :return: a sorted vector of scores
    """
    assert x.ndim > 1, 'x must be a 2D matrix'
    if col == '':
        list(range(1, x.shape[1]))

    ndx = np.arange(x.shape[0])

    # sort 2nd column ascending
    ind = np.argsort(x[:, 1], kind='mergesort')
    ndx = ndx[ind]

    # reverse to descending order
    ndx = ndx[::-1]

    # now sort first column ascending
    ind = np.argsort(x[ndx, 0], kind='mergesort')

    ndx = ndx[ind]
    sort_scores = x[ndx, :]
    return sort_scores

def __compute_roc__(true_scores, false_scores):
    """Computes the (observed) miss/false_alarm probabilities
    for a set of detection output scores.
    
    true_scores (false_scores) are detection output scores for a set of
    detection trials, given that the target hypothesis is true (false).
    (By convention, the more positive the score,
    the more likely is the target hypothesis.)
    
    :param true_scores: a 1D array of target scores
    :param false_scores: a 1D array of non-target scores

    :return: a tuple of two vectors, Pmiss,Pfa
    """
    num_true = true_scores.shape[0]
    num_false = false_scores.shape[0]
    assert num_true > 0, "Vector of target scores is empty"
    assert num_false > 0, "Vector of nontarget scores is empty"

    total = num_true + num_false

    Pmiss = np.zeros((total + 1))
    Pfa = np.zeros((total + 1))

    scores = np.zeros((total, 2))
    scores[:num_false, 0] = false_scores
    scores[:num_false, 1] = 0
    scores[num_false:, 0] = true_scores
    scores[num_false:, 1] = 1

    scores = __DETsort__(scores)

    sumtrue = np.cumsum(scores[:, 1], axis=0)
    sumfalse = num_false - (np.arange(1, total + 1) - sumtrue)

    Pmiss[0] = 0
    Pfa[0] = 1
    Pmiss[1:] = sumtrue / num_true
    Pfa[1:] = sumfalse / num_false
    return Pmiss, Pfa

def DETCurve(true_scores, false_scores):
    """
    Given false positive and false negative rates, produce a DET Curve.
    The false positive rate is assumed to be increasing while the false
    negative rate is assumed to be decreasing.
    """

    fns, fps = __compute_roc__(true_scores, false_scores)

    axis_min = min(fps[0],fns[-1])

    #plt.figure()
    fig,ax = plt.subplots()
    plt.plot(fps,fns)
    plt.yscale('log')
    plt.xscale('log')
    ticks_to_use = [0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5,1,2,5,10,20,50]
    ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
    ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
    ax.set_xticks(ticks_to_use)
    ax.set_yticks(ticks_to_use)
    plt.axis([0.001,50,0.001,50])

## Ploagem da curva DET

In [None]:
 
    
# same_bin = np.zeros((len(same),))
# impostor_bin = np.ones((len(impostor),))

# y_true = np.concatenate((same_bin,impostor_bin))
# y_score = np.concatenate((np.array(same),np.array(impostor)))

DETCurve(true_scores, false_scores)

#plt.show()

## Plotagem de histograma

In [None]:

plt.figure()

plt.rcParams["figure.figsize"] = (10,8) 


minbin, maxbin = 0.0,1.0
bins = np.linspace(minbin, maxbin, 50)

kwargs = dict(histtype='stepfilled', alpha=0.3, density=True, bins=bins, ec="k")

plt.hist(true_scores,label='true', **kwargs)
plt.hist(false_scores,label='false', **kwargs)
plt.legend(loc='upper left')
plt.title('Distances')

plt.show()


In [None]:
in_length = 3 # seconds  by default, the model takes inputs of 3 seconds with no overlap

plt.rcParams["figure.figsize"] = (10,8) # set size of the figures
fontsize = 12 # set figures font size

In [None]:
fig, ax = plt.subplots()

taggram = results[0]['taggram']
tags = results[0]['tags']

# title
ax.title.set_text('Taggram')
ax.title.set_fontsize(fontsize)

# x-axis title
ax.set_xlabel('(seconds)', fontsize=fontsize)

# y-axis
y_pos = np.arange(len(tags))
ax.set_yticks(y_pos)
ax.set_yticklabels(tags, fontsize=fontsize-1)

# x-axis
x_pos = np.arange(taggram.shape[0])
x_label = np.arange(in_length/2, in_length*taggram.shape[0], 3)
ax.set_xticks(x_pos)
ax.set_xticklabels(x_label, fontsize=fontsize)

# depict taggram
ax.imshow(taggram.T, interpolation=None, aspect="auto")
plt.show()

----------------------------------
### How did we compute the top3 tags?

Note that the Taggram can be interpreted as the temporal evolution of the tags likelihood.

From the Taggram, we can compute the **song-level tags likelihood** by simply averaging through time:

In [None]:
tags_likelihood_mean = np.mean(taggram, axis=0) # averaging the Taggram through time 

This song-level tags likelihood has the following shape:

In [None]:
fig, ax = plt.subplots()

# title
ax.title.set_text('Tags likelihood (mean of the taggram)')
ax.title.set_fontsize(fontsize)

# y-axis title
ax.set_ylabel('(likelihood)', fontsize=fontsize)

# y-axis
ax.set_ylim((0, 1))
ax.tick_params(axis="y", labelsize=fontsize)

# x-axis
ax.tick_params(axis="x", labelsize=fontsize-1)
pos = np.arange(len(tags))
ax.set_xticks(pos)
ax.set_xticklabels(tags, rotation=90)

# depict song-level tags likelihood
ax.bar(pos, tags_likelihood_mean)
plt.show()

From the above tags likelihood, the `top_tag()` function computes the **top3** tags.