# Data Science
# Exercise 5 - Comparative Experimentation
<br/>Student:
<br/>se21m024
<br/>Thomas Stummer
<br/><br/>The interpretation of the data can be found in the document <b><i>se21m024_Stummer_ex5_comp_exp.pdf</i></b>.
<br/><br/>
The library <i>Surprise</i> (https://surprise.readthedocs.io/en/stable/index.html) was used to create the following results. The code is highly inspired by the example code provided by the libries official documentation.
<br/><br/>
Big data set: Covertype<br>
The data set was provided by Jock A. Blackard and Colorado State University and downloaded from https://archive.ics.uci.edu/ml/datasets/Covertype.
<br/><br/>
Small data set: Heart Failure Prediction<br>
The data set was provided by Davide Chicco, Giuseppe Jurman: Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone. BMC Medical Informatics and Decision Making 20, 16 (2020) (https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5) and downloaded from https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data.

# Import necessary dependencies

In [43]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd

# Matriculation number: 01425616

# Small data set: Heart Failure Prediction

## Import data set

In [44]:
data_set = pd.read_csv("./Data/heartfailure/heart_failure_clinical_records_dataset.csv")

# The target feature is 'DEATH_EVENT' that indicates weither the person has died
# Column 'time' is not used as feature due to the direct connection to the target feature 'death_event': https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data/discussion/178372
data_set_X = data_set.loc[:,:'smoking']
data_set_y = data_set.loc[:,'DEATH_EVENT':]


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## k-NN

In [45]:
results = []
n_neighbors_values = [5, 10, 15]

# kd tree was chosen to gain results within a reasonable amount of time
for n_neighbors in n_neighbors_values:
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = type('',(object,),{'n_neigbors': n_neighbors, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
        results.append(result)

# Print results
print('Heart Failure Prediction')
print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
for res in results:
    print('k-NN (' + str(res.n_neigbors) + '-NN) | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')

Heart Failure Prediction
Algorithm | acc | f1 | training_time_sec | testing_time_sec
k-NN (5-NN) | 0.616 | 0.549 | 0.001997 sec | 0.003997 sec
k-NN (10-NN) | 0.677 | 0.554 | 0.001999 sec | 0.004003 sec
k-NN (15-NN) | 0.687 | 0.577 | 0.002001 sec | 0.003998 sec


## Perceptron

In [46]:
results = []
alpha_values = [0.0001, 0.001, 0.01]

for alpha in alpha_values:
        classifier = Perceptron(alpha=alpha, random_state=random_state)

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = type('',(object,),{'alpha': alpha, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
        results.append(result)

# Print results
print('Heart Failure Prediction')
print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
for res in results:
    print('Perceptron (alpha: ' + str(res.alpha) + ') | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')

Heart Failure Prediction
Algorithm | acc | f1 | training_time_sec | testing_time_sec
Perceptron (alpha: 0.0001) | 0.687 | 0.559 | 0.002 sec | 0.000998 sec
Perceptron (alpha: 0.001) | 0.687 | 0.559 | 0.002 sec | 0.001001 sec
Perceptron (alpha: 0.01) | 0.687 | 0.559 | 0.002 sec | 0.000999 sec


## Decision tree

In [47]:
results = []
max_features_values = [None, 'auto', 'sqrt', 'log2'] # auto = sqrt

for max_features in max_features_values:
        classifier = DecisionTreeClassifier(max_features=max_features, random_state=random_state) 

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = type('',(object,),{'max_features': max_features, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
        results.append(result)

# Print results
print('Heart Failure Prediction')
print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
for res in results:
    print('Decision tree (max features: ' + str(res.max_features) + ') | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')

Heart Failure Prediction
Algorithm | acc | f1 | training_time_sec | testing_time_sec
Decision tree (max features: None) | 0.697 | 0.702 | 0.003001 sec | 0.001 sec
Decision tree (max features: auto) | 0.626 | 0.628 | 0.002002 sec | 0.001999 sec
Decision tree (max features: sqrt) | 0.626 | 0.628 | 0.001999 sec | 0.000998 sec
Decision tree (max features: log2) | 0.626 | 0.628 | 0.001 sec | 0.001 sec


# Big data set: Covertype

## Import data set

In [48]:
# Student ID: se21m024 -> random_state = 21024
random_state = 21024

data_set = pd.read_csv("./Data/covtype/covtype.data", header=None)

# Split data in input features (X) and target (y) feature
# The target feature is 'Forest cover type class' in column 54 than can be any value between 1 and 7 and indicates which type of vegetation is growing there mainly.
data_set_X = data_set.loc[:,:53]
data_set_y = data_set.loc[:,54:]

X, y = shuffle(data_set_X, data_set_y, random_state=random_state)

# Prepare a train/test set split: split 2/3 1/3 into training & test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_state)

data_set.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


## k-NN

In [49]:
results = []
n_neighbors_values = [5, 10, 15]

# kd tree was chosen to gain results within a reasonable amount of time
for n_neighbors in n_neighbors_values:
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = type('',(object,),{'n_neigbors': n_neighbors, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
        results.append(result)

# Print results
print('Covertype')
print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
for res in results:
    print('k-NN (' + str(res.n_neigbors) + '-NN) | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')

Covertype
Algorithm | acc | f1 | training_time_sec | testing_time_sec
k-NN (5-NN) | 0.965 | 0.965 | 12.033436 sec | 20.152396 sec
k-NN (10-NN) | 0.955 | 0.954 | 12.148761 sec | 24.076917 sec
k-NN (15-NN) | 0.946 | 0.946 | 12.482787 sec | 31.48592 sec


## Perceptron

In [50]:
results = []
alpha_values = [0.0001, 0.001, 0.01]

for alpha in alpha_values:
        classifier = Perceptron(alpha=alpha, random_state=random_state)

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = type('',(object,),{'alpha': alpha, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
        results.append(result)

# Print results
print('Covertype')
print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
for res in results:
    print('Perceptron (alpha: ' + str(res.alpha) + ') | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')

Covertype
Algorithm | acc | f1 | training_time_sec | testing_time_sec
Perceptron (alpha: 0.0001) | 0.584 | 0.563 | 9.840077 sec | 0.047 sec
Perceptron (alpha: 0.001) | 0.584 | 0.563 | 9.665985 sec | 0.041996 sec
Perceptron (alpha: 0.01) | 0.584 | 0.563 | 9.708394 sec | 0.045001 sec


## Decision tree

In [51]:
results = []
max_features_values = [None, 'auto', 'sqrt', 'log2']    # auto = sqrt

for max_features in max_features_values:
        classifier = DecisionTreeClassifier(max_features=max_features, random_state=random_state) 

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        f1 = f1_score(y_true=y_test, y_pred=y_test_predicted, average='weighted')

        # Store results
        result = type('',(object,),{'max_features': max_features, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'f1': f1})()
        results.append(result)

# Print results
print('Covertype')
print('Algorithm | acc | f1 | training_time_sec | testing_time_sec')
for res in results:
    print('Decision tree (max features: ' + str(res.max_features) + ') | ' + str(round(res.acc, 3)) + ' | ' + str(round(res.f1, 3)) + ' | ' + str(res.training_time_sec) + ' sec | ' + str(res.testing_time_sec) + ' sec')

Covertype
Algorithm | acc | f1 | training_time_sec | testing_time_sec
Decision tree (max features: None) | 0.933 | 0.933 | 5.58124 sec | 0.096001 sec
Decision tree (max features: auto) | 0.877 | 0.877 | 1.483151 sec | 0.085 sec
Decision tree (max features: sqrt) | 0.877 | 0.877 | 1.212917 sec | 0.084999 sec
Decision tree (max features: log2) | 0.87 | 0.87 | 0.959233 sec | 0.095001 sec
