# Machine Learning
# Exercise 2 - More Comparative Evaluation
<br/>Student:
<br/>se21m024
<br/>Matriculation number: 1425616
<br/>Thomas Stummer
<br/><br/>The interpretation of the data can be found in the document <b><i>se21m024_Stummer_ml_ex2_comp_eval.pdf</i></b>.
<br/><br/>
The library <i>Surprise</i> (https://surprise.readthedocs.io/en/stable/index.html) was used to create the following results. The code is heavily inspired by the example code provided by the library's official documentation.
<br/><br/>
Small data set: Heart Failure Prediction<br>
The data set was provided by Davide Chicco, Giuseppe Jurman: Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone. BMC Medical Informatics and Decision Making 20, 16 (2020) (https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5) and downloaded from https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data.
<br/><br/>
Big data set: Covertype<br>
The data set was provided by Jock A. Blackard and Colorado State University and downloaded from https://archive.ics.uci.edu/ml/datasets/Covertype.
<br/><br/>
Music data set<br>
Downloaded from Moodle

## Import Dependencies

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd
import glob, os
from sklearn import preprocessing
import matplotlib.pyplot as plt
from librosa import display
import librosa
import numpy as np
import datetime
from collections import deque
import progressbar
from sklearn.model_selection import cross_val_score
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import tree
from sklearn import ensemble
from sklearn import svm
import numpy as np
import scipy.stats.stats as st
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Seed passed as random_state to the shuffles, the train/test split and the
# classifiers in this notebook.
# Matriculation number: 1425616
random_state = 1425616

## Data Set 1: Small Data Set: Heart Failure Prediction

In [None]:
# Relative alternative to the absolute path used below:
#data_set = pd.read_csv("..\\Exercise 5\\Data\\heartfailure\\heart_failure_clinical_records_dataset.csv")

# The path is written as a raw string: it contains backslash sequences that
# are not valid escapes, which a normal string literal only tolerates with an
# invalid-escape warning. The resulting runtime value is unchanged.
heart_failure_data_set = pd.read_csv(r"C:\Repositories\_Tom\FH\Sem2\DataScience\Exercise 5\Data\heartfailure\heart_failure_clinical_records_dataset.csv")

# Split data into input features (X) and target feature (y).
# The target feature is 'DEATH_EVENT', which indicates whether the person has died.
# Column 'time' is not used as a feature due to its direct connection to the target feature: https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data/discussion/178372
# The label slice ends at (and includes) 'smoking'; this leaves 'time' out
# assuming the published column order, where 'time' follows 'smoking'.
heart_failure_data_set_X = heart_failure_data_set.loc[:, :'smoking']
heart_failure_data_set_y = heart_failure_data_set.loc[:, 'DEATH_EVENT':]

## Data Set 2: Big Data Set: Covertype

In [None]:
# Relative alternative to the absolute path used below:
#covertype_data_set = pd.read_csv("./Data/covtype/covtype.data", header=None)

# Load the file (it has no header row, so columns get integer labels), shuffle
# it with the notebook seed and keep only the first 10 000 rows: the whole data
# set took far too long to process on the author's notebook.
covertype_data_set = shuffle(
    pd.read_csv("C:\\Repositories\\_Tom\\FH\\Sem2\\DataScience\\Exercise 5\\Data\\covtype\\covtype.data", header=None),
    random_state=random_state,
)[:10000]

# Split data into target feature (y) and input features (X).
# The target feature is the 'Forest cover type class' in column 54, which can be
# any value between 1 and 7 and indicates which type of vegetation mainly grows there.
covertype_data_set_y = covertype_data_set.loc[:, 54:]
covertype_data_set_X = covertype_data_set.loc[:, :53]

## Data Set 3: Music

In [None]:
# scikit-learn offers no ready-made "loadGTZanDataSet()" function, so the data
# set is assembled by hand:
## Download the data set & extract it (one-time effort)
## Run an audio feature extraction
## Create the ground truth (label assignment, target, ...)

# Path to the audio folder.
# For the first run, download the audio files from http://kronos.ifs.tuwien.ac.at/GTZANmp3_22khz.zip, and unzip them to your folder
imagePath="../../ML_Data/GTZANmp3_22khz/"

# Find all songs in that folder.
# NOTE: os.chdir changes the working directory of the whole process; the file
# names collected here are relative to it.
os.chdir(imagePath)
fileNames = glob.glob("*/*.mp3")
numberOfFiles=len(fileNames)

print( 'Found ' + str(numberOfFiles) + " files\n")

# The first step - create the ground truth (label assignment, target, ...)
# The class name is the sub-folder each file lives in, i.e. the directory part
# of the relative file name. os.path.dirname copes with whichever separator the
# platform's glob produces; searching for "\\" only worked on Windows and
# raised ValueError elsewhere.
targetLabels = [os.path.dirname(fileName) for fileName in fileNames]

# sk-learn can only handle labels in numeric format - we have them as strings though...
# Thus we use the LabelEncoder, which does a mapping to integer numbers
le = preprocessing.LabelEncoder()
le.fit(targetLabels) # finds all unique class names and assigns them to numbers
print( "Found the following classes: " + str(list(le.classes_)))

# now we transform our labels to integers
target = le.transform(targetLabels)
music_target = target
print( "Transformed labels (first elements: " + str(target[0:150]))

# If we want to find again the label for an integer value, we can do something like this:
# print list(le.inverse_transform([0, 18, 1]))

print( "... done label encoding")

In [None]:
# Now we do the actual feature extraction

# This is a helper function that computes the differences between adjacent array values
def differences(seq):
    """Yield the difference between each element of *seq* and its predecessor.

    For an input of n elements, n - 1 values are produced
    (``seq[1] - seq[0]``, ``seq[2] - seq[1]``, ...). An empty or
    single-element input yields nothing.
    """
    iterable = iter(seq)
    # A bare next() on an exhausted iterator raises StopIteration, which inside
    # a generator is converted to RuntimeError (PEP 479). Use a sentinel so an
    # empty input simply produces an empty sequence of differences.
    missing = object()
    prev = next(iterable, missing)
    if prev is missing:
        return
    for element in iterable:
        yield element - prev
        prev = element

# This is a helper function that computes various statistical moments over a series of values: mean, median, var, skewness, kurtosis, min and max, in that order (a total of 7 values)
def statistics(numericList):
    """Summarise a numeric series as a list of seven descriptive statistics.

    The returned list is ordered as: mean, median, variance, skewness,
    kurtosis, minimum, maximum. Skewness and kurtosis are computed with the
    default arguments of the ``skew`` / ``kurtosis`` functions of ``st`` and
    converted to ``np.float64``.
    """
    location_and_spread = [
        np.mean(numericList),
        np.median(numericList),
        np.var(numericList),
    ]
    shape = [
        np.float64(st.skew(numericList)),
        np.float64(st.kurtosis(numericList)),
    ]
    extremes = [np.min(numericList), np.max(numericList)]
    return location_and_spread + shape + extremes

print( "Extracting features using librosa" + " (" + str(datetime.datetime.now()) + ")")

# Feature sets computed per audio file, based on BPMs, MFCCs and Chroma.
# Each list receives one entry per file, in the order of fileNames.
data_bpm=[]
data_bpm_statistics=[]
data_mfcc=[]
data_chroma=[]

# This takes a bit, so let's show it with a progress bar
with progressbar.ProgressBar(max_value=len(fileNames), ) as bar:
    for indexSample, fileName in enumerate(fileNames):
        # Load the audio as a waveform `y`, store the sampling rate as `sr`
        y, sr = librosa.load(fileName)

        # Run the default beat tracker; the tempo alone is the BPM feature
        tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
        data_bpm.append([tempo])

        # Convert the beat frames to timings and take the time differences
        # between consecutive beats
        beat_times = librosa.frames_to_time(beat_frames, sr=sr)
        beat_intervals = np.array(deque(differences(beat_times)))

        # There might be a few files where the beat timings are not determined
        # properly; they get 0 values instead of statistics
        if len(beat_intervals) < 1:
            print( "Errors with beat interval in file " + fileName + ", index " + str(indexSample) + ", using 0 values instead")
            data_bpm_statistics.append([tempo, 0, 0, 0, 0, 0, 0, 0])
        else:
            # Raw tempo followed by the 7 interval statistics: 1 + 7 features
            data_bpm_statistics.append([tempo] + statistics(beat_intervals))

        # Next feature set: MFCCs with 12 coefficients. Each coefficient is a
        # time series, which is reduced to its 7 statistics.
        mfccs=librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12)
        mfccVector = [statistics(mfccCoefficient) for mfccCoefficient in mfccs]

        # Flattened, this vector should have 12 * 7 features
        data_mfcc.append(np.array(mfccVector).flatten())

        # Last feature set - chroma (which is roughly similar to actual notes).
        # The loop variable is deliberately not called `chr`: at module level
        # that would replace the builtin chr() for the rest of the notebook.
        chroma=librosa.feature.chroma_stft(y=y, sr=sr)
        chromaVector = [statistics(chromaBand) for chromaBand in chroma]

        # One row of 7 statistics per chroma time series, flattened
        data_chroma.append(np.array(chromaVector).flatten())

        bar.update(indexSample)

print( ".... done" + " (" + str(datetime.datetime.now()) + ")")

## Classifiers

In [None]:
# k-NN
# kd tree was chosen to gain results within a reasonable amount of time
def kNN(dataSetName, X_train, X_test, y_train, y_test, n_neighbors_values):
    """Train and evaluate one k-NN classifier per value in *n_neighbors_values*.

    Each classifier uses the kd-tree algorithm, is fitted on the training data
    and scored on the test data (accuracy and weighted F1). A result table for
    all runs is printed under the heading *dataSetName*.

    Args:
        dataSetName: Label printed above the result table.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels; any array-like (ndarray, list, pandas
            Series or single-column DataFrame). Flattened to 1-D before fitting.
        y_test: True labels of the test set.
        n_neighbors_values: Iterable of neighbour counts to try.

    Returns:
        The result object of the run with the highest weighted F1 score (the
        earliest run wins ties), or None if *n_neighbors_values* is empty.
        Attributes: algorithm, n_neighbors (also available under the legacy
        spelling n_neigbors), training_time_sec, testing_time_sec, acc, f1,
        y_test_predicted.
    """
    from types import SimpleNamespace

    # Convert once so callers do not have to pre-convert pandas objects;
    # calling .ravel() directly only works for NumPy arrays.
    y_train_flat = np.asarray(y_train).ravel()

    results = []
    bestResult = None

    for n_neighbors in n_neighbors_values:
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train_flat)
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = SimpleNamespace(
            algorithm='k-NN',
            n_neighbors=n_neighbors,
            n_neigbors=n_neighbors,  # misspelled name kept for existing readers
            training_time_sec=training_time_sec,
            testing_time_sec=testing_time_sec,
            acc=acc,
            f1=f1,
            y_test_predicted=y_test_predicted,
        )
        results.append(result)

        # Cache best result for confusion matrix return value
        if bestResult is None or bestResult.f1 < result.f1:
            bestResult = result

    # Print results
    print(dataSetName)
    print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
    for res in results:
        print(f"k-NN ({res.n_neighbors}-NN) | {round(res.acc, 3)} | {round(res.f1, 3)} | {res.training_time_sec} sec | {res.testing_time_sec} sec")
    print()

    return bestResult


In [None]:
# perceptron
def perceptron(dataSetName, X_train, X_test, y_train, y_test, alpha_values, penalty=None):
    """Train and evaluate one Perceptron per value in *alpha_values*.

    Each classifier is fitted on the training data and scored on the test data
    (accuracy and weighted F1). A result table for all runs is printed under
    the heading *dataSetName*.

    NOTE(review): scikit-learn documents ``alpha`` as the multiplier of the
    regularization term, which is only used when a penalty is configured. With
    the default ``penalty=None`` every alpha value is therefore expected to
    train the same model; pass e.g. ``penalty='l2'`` to make the sweep
    meaningful.

    Args:
        dataSetName: Label printed above the result table.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels; any array-like (ndarray, list, pandas
            Series or single-column DataFrame). Flattened to 1-D before fitting.
        y_test: True labels of the test set.
        alpha_values: Iterable of alpha values to try.
        penalty: Regularization penalty forwarded to ``Perceptron``. The
            default None matches the previous behaviour (no regularization).

    Returns:
        The result object of the run with the highest weighted F1 score (the
        earliest run wins ties), or None if *alpha_values* is empty.
        Attributes: algorithm, alpha, penalty, training_time_sec,
        testing_time_sec, acc, f1, y_test_predicted.
    """
    from types import SimpleNamespace

    # Convert once so callers do not have to pre-convert pandas objects;
    # calling .ravel() directly only works for NumPy arrays.
    y_train_flat = np.asarray(y_train).ravel()

    results = []
    bestResult = None

    for alpha in alpha_values:
        classifier = Perceptron(penalty=penalty, alpha=alpha, random_state=random_state)

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train_flat)
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = SimpleNamespace(
            algorithm='perceptron',
            alpha=alpha,
            penalty=penalty,
            training_time_sec=training_time_sec,
            testing_time_sec=testing_time_sec,
            acc=acc,
            f1=f1,
            y_test_predicted=y_test_predicted,
        )
        results.append(result)

        # Cache best result for confusion matrix return value
        if bestResult is None or bestResult.f1 < result.f1:
            bestResult = result

    # Print results
    print(dataSetName)
    print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
    for res in results:
        print(f"Perceptron (alpha: {res.alpha}) | {round(res.acc, 3)} | {round(res.f1, 3)} | {res.training_time_sec} sec | {res.testing_time_sec} sec")
    print()

    return bestResult

In [None]:
# decision tree
def decision_tree(dataSetName, X_train, X_test, y_train, y_test, max_features_values):
    """Train and evaluate one decision tree per value in *max_features_values*.

    Each classifier is fitted on the training data and scored on the test data
    (accuracy and weighted F1). A result table for all runs is printed under
    the heading *dataSetName*.

    Args:
        dataSetName: Label printed above the result table.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels; any array-like (ndarray, list, pandas
            Series or single-column DataFrame). Flattened to 1-D before fitting.
        y_test: True labels of the test set.
        max_features_values: Iterable of ``max_features`` settings to try.

    Returns:
        The result object of the run with the highest weighted F1 score (the
        earliest run wins ties), or None if *max_features_values* is empty.
        Attributes: algorithm, max_features, training_time_sec,
        testing_time_sec, acc, f1, y_test_predicted.
    """
    from types import SimpleNamespace

    # Convert once so callers do not have to pre-convert pandas objects;
    # calling .ravel() directly only works for NumPy arrays.
    y_train_flat = np.asarray(y_train).ravel()

    results = []
    bestResult = None

    for max_features in max_features_values:
        classifier = DecisionTreeClassifier(max_features=max_features, random_state=random_state)

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train_flat)
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = SimpleNamespace(
            algorithm='decision_tree',
            max_features=max_features,
            training_time_sec=training_time_sec,
            testing_time_sec=testing_time_sec,
            acc=acc,
            f1=f1,
            y_test_predicted=y_test_predicted,
        )
        results.append(result)

        # Cache best result for confusion matrix return value
        if bestResult is None or bestResult.f1 < result.f1:
            bestResult = result

    # Print results
    print(dataSetName)
    print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
    for res in results:
        print(f"Decision Tree (max features: {res.max_features}) | {round(res.acc, 3)} | {round(res.f1, 3)} | {res.training_time_sec} sec | {res.testing_time_sec} sec")
    print()

    return bestResult

In [None]:
# svm
def svm_svc(dataSetName, X_train, X_test, y_train, y_test):
    """Train and evaluate a single standard-scaled SVC.

    The classifier is a pipeline of ``StandardScaler`` followed by ``SVC``. It
    is fitted on the training data and scored on the test data (accuracy and
    weighted F1). A one-row result table is printed under the heading
    *dataSetName*.

    Args:
        dataSetName: Label printed above the result table.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels; any array-like (ndarray, list, pandas
            Series or single-column DataFrame). Flattened to 1-D before fitting.
        y_test: True labels of the test set.

    Returns:
        The result object of the run. Attributes: algorithm,
        training_time_sec, testing_time_sec, acc, f1, y_test_predicted.
    """
    from types import SimpleNamespace

    # Convert once so callers do not have to pre-convert pandas objects;
    # calling .ravel() directly only works for NumPy arrays.
    y_train_flat = np.asarray(y_train).ravel()

    classifier = make_pipeline(StandardScaler(), SVC(random_state=random_state))

    # Train classifier
    start_time = datetime.datetime.now()
    classifier.fit(X_train, y_train_flat)
    end_time = datetime.datetime.now()
    training_time_sec = (end_time - start_time).total_seconds()

    # Predict test set on trained classifier
    start_time = datetime.datetime.now()
    y_test_predicted = classifier.predict(X_test)
    end_time = datetime.datetime.now()
    testing_time_sec = (end_time - start_time).total_seconds()

    # Compute metrics
    acc = metrics.accuracy_score(y_test, y_test_predicted)
    f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

    # There is only one configuration, so this run is also the best result
    result = SimpleNamespace(
        algorithm='svm',
        training_time_sec=training_time_sec,
        testing_time_sec=testing_time_sec,
        acc=acc,
        f1=f1,
        y_test_predicted=y_test_predicted,
    )

    # Print results
    print(dataSetName)
    print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
    print(f"SVM | {round(result.acc, 3)} | {round(result.f1, 3)} | {result.training_time_sec} sec | {result.testing_time_sec} sec")
    print()

    return result

In [None]:
# random forests
def random_forests(dataSetName, X_train, X_test, y_train, y_test, numbers_of_trees, max_features_values):
    """Train and evaluate a random forest for every parameter combination.

    One classifier is built per pair of (*numbers_of_trees* value,
    *max_features_values* value), iterating over the tree counts in the outer
    loop. Each classifier is fitted on the training data and scored on the
    test data (accuracy and weighted F1). A result table for all runs is
    printed under the heading *dataSetName*.

    Args:
        dataSetName: Label printed above the result table.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels; any array-like (ndarray, list, pandas
            Series or single-column DataFrame). Flattened to 1-D before fitting.
        y_test: True labels of the test set.
        numbers_of_trees: Iterable of ``n_estimators`` settings to try.
        max_features_values: Iterable of ``max_features`` settings to try.
            It is iterated once per tree count, so it must be re-iterable
            (e.g. a list).

    Returns:
        The result object of the run with the highest weighted F1 score (the
        earliest run wins ties), or None if no combination was run.
        Attributes: algorithm, number_of_trees, max_features,
        training_time_sec, testing_time_sec, acc, f1, y_test_predicted.
    """
    from types import SimpleNamespace

    # Convert once so callers do not have to pre-convert pandas objects;
    # calling .ravel() directly only works for NumPy arrays.
    y_train_flat = np.asarray(y_train).ravel()

    results = []
    bestResult = None

    for number_of_trees in numbers_of_trees:
        for max_features in max_features_values:

            classifier = RandomForestClassifier(n_estimators=number_of_trees, max_features=max_features, random_state=random_state)

            # Train classifier
            start_time = datetime.datetime.now()
            classifier.fit(X_train, y_train_flat)
            end_time = datetime.datetime.now()
            training_time_sec = (end_time - start_time).total_seconds()

            # Predict test set on trained classifier
            start_time = datetime.datetime.now()
            y_test_predicted = classifier.predict(X_test)
            end_time = datetime.datetime.now()
            testing_time_sec = (end_time - start_time).total_seconds()

            # Compute metrics
            acc = metrics.accuracy_score(y_test, y_test_predicted)
            f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

            # Store results
            result = SimpleNamespace(
                algorithm='random_forests',
                number_of_trees=number_of_trees,
                max_features=max_features,
                training_time_sec=training_time_sec,
                testing_time_sec=testing_time_sec,
                acc=acc,
                f1=f1,
                y_test_predicted=y_test_predicted,
            )
            results.append(result)

            # Cache best result for confusion matrix return value
            if bestResult is None or bestResult.f1 < result.f1:
                bestResult = result

    # Print results
    print(dataSetName)
    print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
    for res in results:
        print(f"Random Forests (num trees: {res.number_of_trees}, max features: {res.max_features}) | {round(res.acc, 3)} | {round(res.f1, 3)} | {res.training_time_sec} sec | {res.testing_time_sec} sec")
    print()

    return bestResult
    


## Classification Application

In [None]:
def perceptron_with_parameters(data_set_name, X_train, X_test, y_train, y_test):
    """Run perceptron() with the fixed alpha grid 0.0001, 0.001 and 0.01.

    Returns whatever perceptron() returns for that grid.
    """
    return perceptron(
        data_set_name,
        X_train,
        X_test,
        y_train,
        y_test,
        alpha_values=[0.0001, 0.001, 0.01],
    )

def kNN_with_parameters(data_set_name, X_train, X_test, y_train, y_test):
    """Run kNN() with the fixed neighbour counts 1, 2 and 3.

    Returns whatever kNN() returns for those counts.
    """
    return kNN(
        data_set_name,
        X_train,
        X_test,
        y_train,
        y_test,
        n_neighbors_values=[1, 2, 3],
    )

def decision_tree_with_parameters(dataSetName, X_train, X_test, y_train, y_test):
    """Run decision_tree() with max_features set to None, 'sqrt' and 'log2'.

    Returns whatever decision_tree() returns for those settings.
    """
    return decision_tree(
        dataSetName,
        X_train,
        X_test,
        y_train,
        y_test,
        max_features_values=[None, 'sqrt', 'log2'],
    )

def random_forests_with_parameters(dataSetName, X_train, X_test, y_train, y_test):
    """Run random_forests() with 10 and 100 trees, each with 'sqrt' and 'log2' max_features.

    Returns whatever random_forests() returns for that grid.
    """
    return random_forests(
        dataSetName,
        X_train,
        X_test,
        y_train,
        y_test,
        numbers_of_trees=[10, 100],
        max_features_values=['sqrt', 'log2'],
    )

# Experiments to run on every data set. Each entry is called as
# classifier(data_set_name, X_train, X_test, y_train, y_test).
classifiers = [
    perceptron_with_parameters,
    kNN_with_parameters,
    decision_tree_with_parameters,
    svm_svc,
    random_forests_with_parameters,
]

# (name, input features, target) per data set; the four music entries share
# the same target labels and differ only in the extracted feature set.
trainingSets = [
    ('heart_failure_prediction', heart_failure_data_set_X, heart_failure_data_set_y),
    ('covertype', covertype_data_set_X, covertype_data_set_y),
    ('music_bmp', data_bpm, music_target),
    ('music_bpm_statistics', data_bpm_statistics, music_target),
    ('music_chroma', data_chroma, music_target),
    ('music_mfcc', data_mfcc, music_target),
]

# Data sets whose target is a pandas object rather than a NumPy array
pandas_data_set_names = ('heart_failure_prediction', 'covertype')

# Best (highest weighted F1) result over all music data sets and classifiers
bestResult = None

for train_data_set in trainingSets:
    data_set_name, data_set_X, data_set_y = train_data_set
    is_pandas_data_set = data_set_name in pandas_data_set_names

    X, y = shuffle(data_set_X, data_set_y, random_state=random_state)

    # Prepare a train/test set split: roughly 2/3 training and 1/3 test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_state)

    # The classifier functions call .ravel() on y_train, so hand them the
    # underlying NumPy array of the pandas target
    if is_pandas_data_set:
        y_train = y_train.values

    for classifier in classifiers:
        # do the actual classification
        result = classifier(data_set_name, X_train, X_test, y_train, y_test)

        # Only music results are candidates for the confusion matrix
        if is_pandas_data_set:
            continue

        # Cache best result, together with the true labels it was scored on
        if bestResult is None or bestResult.f1 < result.f1:
            bestResult = result
            bestResult.y_test = y_test
    

## Confusion Matrix

In [None]:
# Confusion matrix of the best music result: rows are the true classes,
# columns the predicted classes, both in the order of the label encoder.
class_names = list(le.classes_)

# Map the integer labels back to their class names
y_test = le.inverse_transform(bestResult.y_test)
y_test_predicted = le.inverse_transform(bestResult.y_test_predicted)

conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_test_predicted, labels=class_names),
    index=[f"true:{label}" for label in class_names],
    columns=[f"pred:{label}" for label in class_names],
)

print(conf_matrix)
