In [4]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd
import glob, os
from sklearn import preprocessing
import matplotlib.pyplot as plt
from librosa import display
import librosa
import numpy as np
import datetime
from collections import deque
import progressbar
from sklearn.model_selection import cross_val_score
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import tree
from sklearn import ensemble
from sklearn import svm
import numpy as np
import scipy.stats.stats as st

# Matriculation number: 01425616
random_state = 1425616

In [5]:
# We need to construct our data set; unfortunately, we don't simply have a "loadGTZanDataSet()" function in SK-learn...
# So we need to 
## Download our data set & extract it (one-time effort)
## Run an audio feature extraction
## Create the create the ground truth (label assignment, target, ...) 


# path to our audio folder
# For the first run, download the images from http://kronos.ifs.tuwien.ac.at/GTZANmp3_22khz.zip, and unzip them to your folder
#imagePath="../../ML_Data/GTZANmp3_22khz/"
imagePath="..\\..\\ML_Data\\GTZANmp3_22khz_sub\\"


# Find all songs in that folder; there are like 1.000 different ways to do this in Python, we chose this one :-)
os.chdir(imagePath)
fileNames = glob.glob("*/*.mp3")
numberOfFiles=len(fileNames)
targetLabels=[]

print( 'Found ' + str(numberOfFiles) + " files\n")

# The first step - create the ground truth (label assignment, target, ...) 
# For that, iterate over the files, and obtain the class label for each file
# Basically, the class name is in the full path name, so we simply use that
for fileName in fileNames:
    pathSepIndex = fileName.index("\\")
    targetLabels.append(fileName[:pathSepIndex])

# sk-learn can only handle labels in numeric format - we have them as strings though...
# Thus we use the LabelEncoder, which does a mapping to Integer numbers
le = preprocessing.LabelEncoder()
le.fit(targetLabels) # this basically finds all unique class names, and assigns them to the numbers
print( "Found the following classes: " + str(list(le.classes_)))

# now we transform our labels to integers
target = le.transform(targetLabels); 
print( "Transformed labels (first elements: " + str(target[0:150]))

# If we want to find again the label for an integer value, we can do something like this:
# print list(le.inverse_transform([0, 18, 1]))

print( "... done label encoding")

Found 22 files

Found the following classes: ['disco', 'metal']
Transformed labels (first elements: [0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1]
... done label encoding


In [6]:
# Now we do the actual feature extraction

# This is a helper function that computes the differences between adjacent array values
def differences(seq):
    iterable = iter(seq)
    prev = next(iterable)
    for element in iterable:
        yield element - prev
        prev = element

# This is a helper function that computes various statistical moments over a series of values, including mean, median, var, min, max, skewness and kurtosis (a total of 7 values)
def statistics(numericList):
    return [np.mean(numericList), np.median(numericList), np.var(numericList), np.float64(st.skew(numericList)), np.float64(st.kurtosis(numericList)), np.min(numericList), np.max(numericList)]



print( "Extracting features using librosa" + " (" + str(datetime.datetime.now()) + ")")

# compute some features based on BPMs, MFCCs, Chroma
data_bpm=[]
data_bpm_statistics=[]
data_mfcc=[]
data_chroma=[]

# This takes a bit, so let's show it with a progress bar
with progressbar.ProgressBar(max_value=len(fileNames)) as bar:
    for indexSample, fileName in enumerate(fileNames):
        # Load the audio as a waveform `y`, store the sampling rate as `sr`
        y, sr = librosa.load(fileName)

        # run the default beat tracker
        tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
        # from this, we simply use the tempo as BPM feature
        data_bpm.append([tempo])

        # Then we compute a few statistics on the beat timings
        beat_times = librosa.frames_to_time(beat_frames, sr=sr)
        # from the timings, compute the time differences between the beats
        beat_intervals = np.array(deque(differences(beat_times)))

        # And from this, take some statistics
        # There might be a few files where the beat timings are not determined properly; we ignore them, resp. give them 0 values
        if len(beat_intervals) < 1:
            print( "Errors with beat interval in file " + fileName + ", index " + str(indexSample) + ", using 0 values instead")
            data_bpm_statistics.append([tempo, 0, 0, 0, 0, 0, 0, 0])
        else:
            bpm_statisticsVector=[]
            bpm_statisticsVector.append(tempo) # we also include the raw value of tempo
            for stat in statistics(beat_intervals):  # in case the timings are ok, we actually compute the statistics
                bpm_statisticsVector.append(stat) # and append it to the vector, which finally has 1 + 7 features
            data_bpm_statistics.append(bpm_statisticsVector)

        # Next feature are MFCCs; we take 12 coefficients; for each coefficient, we have around 40 values per second
        mfccs=librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12)
        mfccVector=[]
        for mfccCoefficient in mfccs: # we transform this time series by taking again statistics over the values
            mfccVector.append(statistics(mfccCoefficient))

        # Finally, this vector should have 12 * 7 features
        data_mfcc.append(np.array(mfccVector).flatten())


        # Last feature set - chroma (which is roughly similar to actual notes)
        chroma=librosa.feature.chroma_stft(y=y, sr=sr);
        chromaVector=[]
        for chr in chroma: # similar to before, we get a number of time-series
            chromaVector.append(statistics(chr)) # and we resolve that by taking statistics over the time series
        # Finally, this vector should be be 12 * 7 features
        data_chroma.append(np.array(chromaVector).flatten())

        bar.update(indexSample)

print( ".... done" + " (" + str(datetime.datetime.now()) + ")")

Extracting features using librosa (2022-05-22 12:26:39.104238)


  return f(*args, **kwargs)
  0% (0 of 22) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  4% (1 of 22) |#                        | Elapsed Time: 0:00:03 ETA:   0:01:11
  9% (2 of 22) |##                       | Elapsed Time: 0:00:06 ETA:   0:01:09
 13% (3 of 22) |###                      | Elapsed Time: 0:00:10 ETA:   0:01:05
 18% (4 of 22) |####                     | Elapsed Time: 0:00:13 ETA:   0:01:02
 22% (5 of 22) |#####                    | Elapsed Time: 0:00:17 ETA:   0:00:58
 27% (6 of 22) |######                   | Elapsed Time: 0:00:20 ETA:   0:00:54
 31% (7 of 22) |#######                  | Elapsed Time: 0:00:24 ETA:   0:00:50
 36% (8 of 22) |#########                | Elapsed Time: 0:00:27 ETA:   0:00:47
 40% (9 of 22) |##########               | Elapsed Time: 0:00:30 ETA:   0:00:45
 45% (10 of 22) |##########              | Elapsed Time: 0:00:34 ETA:   0:00:42
 50% (11 of 22) |############            | Elapsed Time: 0:00:38 ETA:   0:00:39
 54% (12 of 

.... done (2022-05-22 12:27:56.156840)


In [7]:
# k-NN
# kd tree was chosen to gain results within a reasonable amount of time

def kNN (dataSetName, X_train, X_test, y_train, y_test, n_neighbors_values):

    results = []

    for n_neighbors in n_neighbors_values:
            classifier = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

            # Train classifier
            start_time = datetime.datetime.now()
            
            #classifier.fit(X_train, y_train.values.ravel())
            classifier.fit(X_train, y_train.ravel())

            end_time = datetime.datetime.now()
            training_time_sec = (end_time - start_time).total_seconds()

            # Predict test set on trained classifier
            start_time = datetime.datetime.now()
            y_test_predicted = classifier.predict(X_test)
            end_time = datetime.datetime.now()
            testing_time_sec = (end_time - start_time).total_seconds()

            # Compute metrics
            acc = metrics.accuracy_score(y_test, y_test_predicted)
            f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

            # Store results
            result = type('',(object,),{'n_neigbors': n_neighbors, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
            results.append(result)

    # Print results
    print(dataSetName)
    print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
    for res in results:
        print('k-NN (' + str(res.n_neigbors) + '-NN) | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')
    print()


In [8]:
def perceptron(dataSetName, X_train, X_test, y_train, y_test, alpha_values):

        results = []

        for alpha in alpha_values:
                classifier = Perceptron(alpha=alpha, random_state=random_state)

                # Train classifier
                start_time = datetime.datetime.now()
                classifier.fit(X_train, y_train.ravel())
                end_time = datetime.datetime.now()
                training_time_sec = (end_time - start_time).total_seconds()

                # Predict test set on trained classifier
                start_time = datetime.datetime.now()
                y_test_predicted = classifier.predict(X_test)
                end_time = datetime.datetime.now()
                testing_time_sec = (end_time - start_time).total_seconds()

                # Compute metrics
                acc = metrics.accuracy_score(y_test, y_test_predicted)
                f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

                # Store results
                result = type('',(object,),{'alpha': alpha, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
                results.append(result)

        # Print results
        print(dataSetName)
        print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
        for res in results:
                print('Perceptron (alpha: ' + str(res.alpha) + ') | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')
        print()

In [12]:
# Finally, we do classification

# These are our feature sets; we will use each of them individually to train classifiers
trainingSets = [ 
                 ('music_bmp', data_bpm),
                 ('music_bpm_statistics', data_bpm_statistics),
                 ('music_chroma', data_chroma),
                 ('music_mfcc', data_mfcc)
               ]

# set up a number of classifiers
#classifiers = [neighbors.KNeighborsClassifier(),
 #              naive_bayes.GaussianNB(),
  #             tree.DecisionTreeClassifier(),
   #            ensemble.RandomForestClassifier(),
    #           svm.SVC(),
     #          svm.LinearSVC(),
      #        ]

def perceptron_music(data_set_name, X_train, X_test, y_train, y_test):
    alpha_values = [0.0001, 0.001, 0.01]
    perceptron(data_set_name, X_train, X_test, y_train, y_test, alpha_values)

def kNN_music(data_set_name, X_train, X_test, y_train, y_test):
    n_neighbors_values = [1, 2, 3]
    kNN(data_set_name, X_train, X_test, y_train, y_test, n_neighbors_values)
      
classifiers = [
                #('', ),
                #('', ),
                #('', ),
                perceptron_music,
                kNN_music
              ]

data_set_y = target

for indexDataset, train_data_set in enumerate(trainingSets):

    data_set_X = train_data_set[1]

    X, y = shuffle(data_set_X, data_set_y, random_state=random_state)

    # Prepare a train/test set split: split 2/3 1/3 into training & test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_state)

    for indexClassifier, classifier in enumerate(classifiers):
        # do the actual classification
        classifier(train_data_set[0], X_train, X_test, y_train, y_test)   
    

music_bmp
Algorithm | acc | f1 | training_time_sec | testing_time_sec
Perceptron (alpha: 0.0001) | 0.25 | 0.1 | 0.000986 sec | 0.0 sec
Perceptron (alpha: 0.001) | 0.25 | 0.1 | 0.000392 sec | 0.0 sec
Perceptron (alpha: 0.01) | 0.25 | 0.1 | 0.001 sec | 0.0 sec

music_bmp
Algorithm | acc | f1 | training_time_sec | testing_time_sec
k-NN (1-NN) | 0.625 | 0.645 | 0.000565 sec | 0.000957 sec
k-NN (2-NN) | 0.5 | 0.5 | 0.0 sec | 0.001536 sec
k-NN (3-NN) | 0.5 | 0.533 | 0.0 sec | 0.001529 sec

music_bpm_statistics
Algorithm | acc | f1 | training_time_sec | testing_time_sec
Perceptron (alpha: 0.0001) | 0.375 | 0.325 | 0.000999 sec | 0.00097 sec
Perceptron (alpha: 0.001) | 0.375 | 0.325 | 0.000998 sec | 0.0 sec
Perceptron (alpha: 0.01) | 0.375 | 0.325 | 0.000996 sec | 0.0 sec

music_bpm_statistics
Algorithm | acc | f1 | training_time_sec | testing_time_sec
k-NN (1-NN) | 0.625 | 0.645 | 0.0 sec | 0.000999 sec
k-NN (2-NN) | 0.625 | 0.577 | 0.0 sec | 0.002045 sec
k-NN (3-NN) | 0.5 | 0.533 | 0.00098 s