# Data Science
# Exercise 5 - Comparative Experimentation
<br/>Student:
<br/>se21m024
<br/>Thomas Stummer
<br/><br/>The interpretation of the data can be found in the document <b><i>se21m024_Stummer_ex5_comp_exp.pdf</i></b>.
<br/><br/>
The library <i>Surprise</i> (https://surprise.readthedocs.io/en/stable/index.html) was used to create the following results. The code is highly inspired by the example code provided by the libries official documentation.
<br/><br/>
Big data set: Covertype<br>
The data set was provided by Jock A. Blackard and Colorado State University and downloaded from https://archive.ics.uci.edu/ml/datasets/Covertype.
<br/><br/>
Small data set: Heart Failure Prediction<br>
The data set was provided by Davide Chicco, Giuseppe Jurman: Machine learning can predict survival of patients with heart failure from serum creatinine and ejection fraction alone. BMC Medical Informatics and Decision Making 20, 16 (2020) (https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5) and downloaded from https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data.

# Import necessary dependencies

In [35]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import datetime
import pandas as pd

# Small data set: Heart Failure Prediction

## Import data set

In [36]:
# Student ID: se21m024 -> random_state = 21024
random_state = 21024

data_set = pd.read_csv("./Data/heartfailure/heart_failure_clinical_records_dataset.csv")

# Split data in input features (X) and target (y) feature
# The target feature is 'DEATH_EVENT' that indicates weither the person has died
# Column 'time' is not used as feature due to the direct connection to the target feature 'death_event': https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data/discussion/178372
data_set_X = data_set.loc[:,:'smoking']
data_set_y = data_set.loc[:,'DEATH_EVENT':]

X, y = shuffle(data_set_X, data_set_y, random_state=random_state)

# Prepare a train/test set split: split 2/3 1/3 into training & test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_state)

data_set.head(5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## k-NN

In [31]:
results = []
n_neighbors_values = [5, 10, 15]

for n_neighbors in n_neighbors_values:
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors)

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        precision = metrics.precision_score(y_test, y_test_predicted, average="micro")

        # Store results
        result = type('',(object,),{'n_neigbors': n_neighbors, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'precision': precision})()
        results.append(result)

# Print results
print('Heart Failure Prediction')
print('k-NN')
print('n_neighbors | training_time_sec | testing_time_sec | acc | precision')
for res in results:
    print(str(res.n_neigbors) + ' | ' + str(res.training_time_sec) + ' | ' + str(res.testing_time_sec) + ' | ' + str(res.acc) + ' | ' + str(res.precision))


k-NN
n_neighbors | training_time_sec | testing_time_sec | acc | precision
5 | 0.003016 | 0.005243 | 0.6565656565656566 | 0.6565656565656566
10 | 0.001586 | 0.003549 | 0.6767676767676768 | 0.6767676767676768
15 | 0.001992 | 0.003529 | 0.6565656565656566 | 0.6565656565656566


## Perceptron

In [33]:
results = []
penalty_values = [None, 'l2', 'l1', 'elasticnet']
alpha_values = [0.0001, 0.001, 0.01, 0.1, 1]

for penalty in penalty_values:
        for alpha in alpha_values:
                classifier = Perceptron(penalty=penalty, alpha=alpha, random_state=random_state)

                # Train classifier
                start_time = datetime.datetime.now()
                classifier.fit(X_train, y_train.values.ravel())
                end_time = datetime.datetime.now()
                training_time_sec = (end_time - start_time).total_seconds()

                # Predict test set on trained classifier
                start_time = datetime.datetime.now()
                y_test_predicted = classifier.predict(X_test)
                end_time = datetime.datetime.now()
                testing_time_sec = (end_time - start_time).total_seconds()

                # Compute metrics
                acc = metrics.accuracy_score(y_test, y_test_predicted)
                precision = metrics.precision_score(y_test, y_test_predicted, average="micro")

                # Store results
                result = type('',(object,),{'penalty': penalty, 'alpha': alpha, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'precision': precision})()
                results.append(result)

# Print results
print('Heart Failure Prediction')
print('Perceptron')
print('penalty | alpha | training_time_sec | testing_time_sec | acc | precision')
for res in results:
    print(str(res.penalty) + ' | ' + str(res.alpha) + ' | ' + str(res.training_time_sec) + ' | ' + str(res.testing_time_sec) + ' | ' + str(res.acc) + ' | ' + str(res.precision))

Perceptron
penalty | alpha | training_time_sec | testing_time_sec | acc | precision
None | 0.0001 | 0.002002 | 0.000999 | 0.7070707070707071 | 0.7070707070707071
None | 0.001 | 0.001 | 0.001 | 0.7070707070707071 | 0.7070707070707071
None | 0.01 | 0.002003 | 0.000999 | 0.7070707070707071 | 0.7070707070707071
None | 0.1 | 0.000999 | 0.001019 | 0.7070707070707071 | 0.7070707070707071
None | 1 | 0.002 | 0.001 | 0.7070707070707071 | 0.7070707070707071
l2 | 0.0001 | 0.001 | 0.001 | 0.7070707070707071 | 0.7070707070707071
l2 | 0.001 | 0.001 | 0.001 | 0.7070707070707071 | 0.7070707070707071
l2 | 0.01 | 0.001 | 0.002 | 0.7070707070707071 | 0.7070707070707071
l2 | 0.1 | 0.001999 | 0.001 | 0.7070707070707071 | 0.7070707070707071
l2 | 1 | 0.001996 | 0.001001 | 0.7070707070707071 | 0.7070707070707071
l1 | 0.0001 | 0.001985 | 0.001002 | 0.7070707070707071 | 0.7070707070707071
l1 | 0.001 | 0.002 | 0.001998 | 0.7070707070707071 | 0.7070707070707071
l1 | 0.01 | 0.003001 | 0.001001 | 0.7070707070707071 

## Decision tree

In [34]:
results = []
splitter_values = ['best', 'random']
max_features_values = [None, 'auto', 'sqrt', 'log2']

for splitter in splitter_values:
    for max_features in max_features_values:
            classifier = DecisionTreeClassifier(splitter=splitter, max_features=max_features, random_state=random_state) 

            # Train classifier
            start_time = datetime.datetime.now()
            classifier.fit(X_train, y_train.values.ravel())
            end_time = datetime.datetime.now()
            training_time_sec = (end_time - start_time).total_seconds()

            # Predict test set on trained classifier
            start_time = datetime.datetime.now()
            y_test_predicted = classifier.predict(X_test)
            end_time = datetime.datetime.now()
            testing_time_sec = (end_time - start_time).total_seconds()

            # Compute metrics
            acc = metrics.accuracy_score(y_test, y_test_predicted)
            precision = metrics.precision_score(y_test, y_test_predicted, average="micro")

            # Store results
            result = type('',(object,),{'splitter': splitter, 'max_features': max_features, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'precision': precision})()
            results.append(result)

# Print results
print('Heart Failure Prediction')
print('Decision tree')
print('splitter | max_features | training_time_sec | testing_time_sec | acc | precision')
for res in results:
    print(str(res.splitter) + ' | ' + str(res.max_features) + ' | ' + str(res.training_time_sec) + ' | ' + str(res.testing_time_sec) + ' | ' + str(res.acc) + ' | ' + str(res.precision))

Perceptron
splitter | max_features | training_time_sec | testing_time_sec | acc | precision
best | None | 0.004004 | 0.001044 | 0.7878787878787878 | 0.7878787878787878
best | auto | 0.002477 | 0.002 | 0.6666666666666666 | 0.6666666666666666
best | sqrt | 0.001999 | 0.001514 | 0.6666666666666666 | 0.6666666666666666
best | log2 | 0.002515 | 0.001 | 0.6666666666666666 | 0.6666666666666666
random | None | 0.001954 | 0.001513 | 0.7373737373737373 | 0.7373737373737373
random | auto | 0.001 | 0.001002 | 0.6767676767676768 | 0.6767676767676768
random | sqrt | 0.001952 | 0.0 | 0.6767676767676768 | 0.6767676767676768
random | log2 | 0.000956 | 0.001 | 0.6767676767676768 | 0.6767676767676768


# Big data set: Covertype

## Import data set

In [37]:
# Student ID: se21m024 -> random_state = 21024
random_state = 21024

data_set = pd.read_csv("./Data/covtype/covtype.data", header=None)

# Split data in input features (X) and target (y) feature
# The target feature is 'Forest cover type class' in column 54 than can be any value between 1 and 7 and indicates which types of vegetation is growing there mainly.
data_set_X = data_set.loc[:,:53]
data_set_y = data_set.loc[:,54:]

X, y = shuffle(data_set_X, data_set_y, random_state=random_state)

# Prepare a train/test set split: split 2/3 1/3 into training & test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_state)

data_set.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


## k-NN

In [45]:
results = []
n_neighbors_values = [5, 10, 15]

# kd tree was chosen to gain results within a reasonable amount of time
for n_neighbors in n_neighbors_values:
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

        # Train classifier
        start_time = datetime.datetime.now()
        classifier.fit(X_train, y_train.values.ravel())
        end_time = datetime.datetime.now()
        training_time_sec = (end_time - start_time).total_seconds()

        # Predict test set on trained classifier
        start_time = datetime.datetime.now()
        y_test_predicted = classifier.predict(X_test)
        end_time = datetime.datetime.now()
        testing_time_sec = (end_time - start_time).total_seconds()

        # Compute metrics
        acc = metrics.accuracy_score(y_test, y_test_predicted)
        precision = metrics.precision_score(y_test, y_test_predicted, average="micro")

        # Store results
        result = type('',(object,),{'n_neigbors': n_neighbors, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'precision': precision})()
        results.append(result)

# Print results
print('Covertype')
print('k-NN')
print('n_neighbors | training_time_sec | testing_time_sec | acc | precision')
for res in results:
    print(str(res.n_neigbors) + ' | ' + str(res.training_time_sec) + ' | ' + str(res.testing_time_sec) + ' | ' + str(res.acc) + ' | ' + str(res.precision))

Covertype
k-NN
n_neighbors | training_time_sec | testing_time_sec | acc | precision
100 | 11.895484 | 75.03818 | 0.8544337467533145 | 0.8544337467533145


## Perceptron

In [42]:
results = []
penalty_values = [None, 'l2', 'l1', 'elasticnet']
alpha_values = [0.0001, 0.001, 0.01, 0.1, 1]

for penalty in penalty_values:
        for alpha in alpha_values:
                classifier = Perceptron(penalty=penalty, alpha=alpha, random_state=random_state)

                # Train classifier
                start_time = datetime.datetime.now()
                classifier.fit(X_train, y_train.values.ravel())
                end_time = datetime.datetime.now()
                training_time_sec = (end_time - start_time).total_seconds()

                # Predict test set on trained classifier
                start_time = datetime.datetime.now()
                y_test_predicted = classifier.predict(X_test)
                end_time = datetime.datetime.now()
                testing_time_sec = (end_time - start_time).total_seconds()

                # Compute metrics
                acc = metrics.accuracy_score(y_test, y_test_predicted)
                precision = metrics.precision_score(y_test, y_test_predicted, average="micro")

                # Store results
                result = type('',(object,),{'penalty': penalty, 'alpha': alpha, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'precision': precision})()
                results.append(result)

# Print results
print('Covertype')
print('Perceptron')
print('penalty | alpha | training_time_sec | testing_time_sec | acc | precision')
for res in results:
    print(str(res.penalty) + ' | ' + str(res.alpha) + ' | ' + str(res.training_time_sec) + ' | ' + str(res.testing_time_sec) + ' | ' + str(res.acc) + ' | ' + str(res.precision))

KeyboardInterrupt: 

## Decision tree

In [43]:
results = []
splitter_values = ['best', 'random']
max_features_values = [None, 'auto', 'sqrt', 'log2']

for splitter in splitter_values:
    for max_features in max_features_values:
            classifier = DecisionTreeClassifier(splitter=splitter, max_features=max_features, random_state=random_state) 

            # Train classifier
            start_time = datetime.datetime.now()
            classifier.fit(X_train, y_train.values.ravel())
            end_time = datetime.datetime.now()
            training_time_sec = (end_time - start_time).total_seconds()

            # Predict test set on trained classifier
            start_time = datetime.datetime.now()
            y_test_predicted = classifier.predict(X_test)
            end_time = datetime.datetime.now()
            testing_time_sec = (end_time - start_time).total_seconds()

            # Compute metrics
            acc = metrics.accuracy_score(y_test, y_test_predicted)
            precision = metrics.precision_score(y_test, y_test_predicted, average="micro")

            # Store results
            result = type('',(object,),{'splitter': splitter, 'max_features': max_features, 'training_time_sec': training_time_sec, 'testing_time_sec': testing_time_sec, 'acc': acc, 'precision': precision})()
            results.append(result)

# Print results
print('Covertype')
print('Decision tree')
print('splitter | max_features | training_time_sec | testing_time_sec | acc | precision')
for res in results:
    print(str(res.splitter) + ' | ' + str(res.max_features) + ' | ' + str(res.training_time_sec) + ' | ' + str(res.testing_time_sec) + ' | ' + str(res.acc) + ' | ' + str(res.precision))

Covertype
Decision tree
splitter | max_features | training_time_sec | testing_time_sec | acc | precision
best | None | 5.29174 | 0.075005 | 0.9334546820073644 | 0.9334546820073644
best | auto | 1.11917 | 0.084 | 0.877319619890056 | 0.877319619890056
best | sqrt | 1.132032 | 0.086001 | 0.877319619890056 | 0.877319619890056
best | log2 | 0.927012 | 0.086954 | 0.8704037885820981 | 0.8704037885820981
random | None | 2.187817 | 0.079999 | 0.9280930872980275 | 0.9280930872980275
random | auto | 0.735 | 0.108 | 0.8607602198879698 | 0.8607602198879698
random | sqrt | 0.734016 | 0.105982 | 0.8607602198879698 | 0.8607602198879698
random | log2 | 0.639484 | 0.113994 | 0.8411914423107013 | 0.8411914423107013
