#### Load the dataset.

In [1]:

import time
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample

df = pd.read_csv("../dataset/original-noneeg-dataset.csv", sep="|", dtype = {"hr": "float64", "label": "int8"})


#### Display the dataset size.

In [2]:
df.shape

(41992, 2)

#### Build a helper function to convert the set data to the required format to perform the undersampling.

In [3]:

number_of_steps = 30

def build_time_window_structure(df):
    # Splits the dataset into "time windows" to be used as a time series list.
    # The function groups each 30 dataset records (CSV lines) into one record.
    # Each record contains 30 steps and each step contains 1 feature value.
    # Parameters:
    #    df: Dataframe to be splitted.
    # Return:
    #    First list contains all time windows.
    #    Second list contains all target values.
    print("\nStarting build_time_window_structure function.")
    initial_line_number = 0
    first_feat_index = 0
    last_feat_index = 1
    X_array = []
    y_array = []
    while initial_line_number < len(df["label"]):
        target_value = df["label"][initial_line_number]
        sub_matrix = df.iloc[initial_line_number : (initial_line_number + number_of_steps), first_feat_index : last_feat_index]
        sub_matrix_values = sub_matrix.values
        new_line = sub_matrix_values.flatten()
        size_diff = number_of_steps - len(new_line)
        if size_diff > 0:
            last_value = new_line[len(new_line) - 1]
            new_line = np.append(new_line, [last_value] * size_diff)
        X_array.append(new_line)
        y_array.append(target_value)
        initial_line_number += number_of_steps
    print("Quantity of samples (features) => ", len(X_array))
    print("Quantity os samples (labels) => ", len(y_array))
    print("Finishing build_time_window_structure function.")
    return X_array, y_array


#### Perform undersampling for balancing the dataset.

In [4]:

# Correct dataset imbalance through undersampling.
print("\nStarting undersampling process.")
X_list, y_list = build_time_window_structure(df)
rus = RandomUnderSampler(random_state = 42)
X_arr = np.array(X_list)
y_arr = np.array(y_list)
X_resampled, y_resampled = rus.fit_resample(X_arr, y_arr)
print("\nQuantity of resampled samples => ", len(y_resampled))



Starting undersampling process.

Starting build_time_window_structure function.
Quantity of samples (features) =>  1400
Quantity os samples (labels) =>  1400
Finishing build_time_window_structure function.

Quantity of resampled samples =>  800


#### GridSearch to evaluate KNN hyperparameters.

In [5]:

# Normalize data using box-cox method.
pt = PowerTransformer()
pt.fit(X_resampled)
X = pt.transform(X_resampled)
y = y_resampled

model = KNeighborsClassifier(n_jobs = 4)
params = {
    "n_neighbors": [3, 5, 7, 9, 11, 13, 15],
    "p": [1, 2],
    "leaf_size" : [1, 5]
}
strat_k_fold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)
grid_search_cv = GridSearchCV(model, cv = strat_k_fold, param_grid = params, n_jobs = 4, verbose = 5)

# Train the KNN model and show the best hyperparameter values.
start_time = time.time()
print("\nStarting training at: ", time.strftime("%H:%M:%S", time.localtime()))
grid_search_cv.fit(X, y)
elapsed_seconds = time.time() - start_time
print("\nTime taken for training: ", time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds)))
print("\nBest values:")
print(grid_search_cv.best_params_)



Starting training at:  22:52:08
Fitting 10 folds for each of 28 candidates, totalling 280 fits

Time taken for training:  00:00:01

Best values:
{'leaf_size': 1, 'n_neighbors': 11, 'p': 1}


#### Train the model using cross validation (10 fold) and display metrics.

In [6]:

# Normalize data using box-cox method.
pt = PowerTransformer()
pt.fit(X_resampled)
X = pt.transform(X_resampled)
y = y_resampled

strat_k_fold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

# Train the KNN model and show the best hyperparameter values.
start_time = time.time()
print("\nStarting training at: ", time.strftime("%H:%M:%S", time.localtime()))

train_accuracy_by_fold = []
test_accuracy_by_fold = []
y_predclass_for_report = []
y_testclass_for_report = []
fold_number = 1
for train_index, test_index in strat_k_fold.split(X, y):
    print("Training fold {}".format(fold_number))
    model = KNeighborsClassifier(n_neighbors = 11, p = 1, leaf_size = 1) 
    model.fit(X[train_index], y[train_index])
    train_pred_classes = model.predict(X[train_index])
    test_pred_classes = model.predict(X[test_index])
    train_accuracy = accuracy_score(y[train_index] , train_pred_classes)
    test_accuracy = accuracy_score(y[test_index] , test_pred_classes)
    train_accuracy_by_fold.append(train_accuracy)
    test_accuracy_by_fold.append(test_accuracy)
    y_predclass_for_report.extend(test_pred_classes)
    y_testclass_for_report.extend(y[test_index])
    fold_number += 1

elapsed_seconds = time.time() - start_time
print("\nTime taken for training: ", time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds)))
print("\n")

# Show metrics.
for i in range(len(train_accuracy_by_fold)):
    print("Fold {} - Train Accuracy {:.4f} - Test Accuracy {:.4f}".format((i + 1),
                            train_accuracy_by_fold[i], test_accuracy_by_fold[i]))
print("\nMean Train Accuracy: {:.4f} - Std: {:.4f} ".format(np.mean(train_accuracy_by_fold), np.std(train_accuracy_by_fold)))
print("Mean Test Accuracy: {:.4f} - Std: {:.4f} ".format(np.mean(test_accuracy_by_fold), np.std(test_accuracy_by_fold)))

print("\nEvaluate other metrics:")
print(classification_report(y_testclass_for_report, y_predclass_for_report))



Starting training at:  22:53:17
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Training fold 6
Training fold 7
Training fold 8
Training fold 9
Training fold 10

Time taken for training:  00:00:00


Fold 1 - Train Accuracy 0.5042 - Test Accuracy 0.5000
Fold 2 - Train Accuracy 0.5181 - Test Accuracy 0.4125
Fold 3 - Train Accuracy 0.5181 - Test Accuracy 0.4625
Fold 4 - Train Accuracy 0.5042 - Test Accuracy 0.5000
Fold 5 - Train Accuracy 0.5028 - Test Accuracy 0.4625
Fold 6 - Train Accuracy 0.5250 - Test Accuracy 0.4625
Fold 7 - Train Accuracy 0.5167 - Test Accuracy 0.4375
Fold 8 - Train Accuracy 0.5153 - Test Accuracy 0.4125
Fold 9 - Train Accuracy 0.5236 - Test Accuracy 0.3875
Fold 10 - Train Accuracy 0.5250 - Test Accuracy 0.4000

Mean Train Accuracy: 0.5153 - Std: 0.0082 
Mean Test Accuracy: 0.4437 - Std: 0.0380 

Evaluate other metrics:
              precision    recall  f1-score   support

           0       0.40      0.45      0.42       200
       

#### Train the model using cross validation (10 fold) and display metrics.
- Data augmentation (5x).

In [11]:

# Data augmentation (5x).
print("\nStarting data augmentation.")
X_all_oversampled = []
y_all_oversampled = []
X_resampled_values = X_resampled
for count in range(0, 4):
    X_oversampled, y_oversampled = resample(X_resampled_values[y_resampled == count],
                                            y_resampled[y_resampled == count],
                                            replace = True,
                                            n_samples = 1000,
                                            random_state = 42)
    X_all_oversampled.extend(X_oversampled)
    y_all_oversampled.extend(y_oversampled)
X_resampled_arr = np.array(X_all_oversampled)
y_resampled_arr = np.array(y_all_oversampled)
print("\nQuantity of samples generated by oversampling => ", len(y_resampled_arr))

# Normalize data using box-cox method.
pt = PowerTransformer()
pt.fit(X_resampled_arr)
X = pt.transform(X_resampled_arr)
y = y_resampled_arr

strat_k_fold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

# Train the KNN model and show the best hyperparameter values.
start_time = time.time()
print("\nStarting training at: ", time.strftime("%H:%M:%S", time.localtime()))

train_accuracy_by_fold = []
test_accuracy_by_fold = []
y_predclass_for_report = []
y_testclass_for_report = []
fold_number = 1
for train_index, test_index in strat_k_fold.split(X, y):
    print("Training fold {}".format(fold_number))
    model = KNeighborsClassifier(n_neighbors = 11, p = 1, leaf_size = 1) 
    model.fit(X[train_index], y[train_index])
    train_pred_classes = model.predict(X[train_index])
    test_pred_classes = model.predict(X[test_index])
    train_accuracy = accuracy_score(y[train_index] , train_pred_classes)
    test_accuracy = accuracy_score(y[test_index] , test_pred_classes)
    train_accuracy_by_fold.append(train_accuracy)
    test_accuracy_by_fold.append(test_accuracy)
    y_predclass_for_report.extend(test_pred_classes)
    y_testclass_for_report.extend(y[test_index])
    fold_number += 1

elapsed_seconds = time.time() - start_time
print("\nTime taken for training: ", time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds)))
print("\n")

# Show metrics.
for i in range(len(train_accuracy_by_fold)):
    print("Fold {} - Train Accuracy {:.4f} - Test Accuracy {:.4f}".format((i + 1),
                            train_accuracy_by_fold[i], test_accuracy_by_fold[i]))
print("\nMean Train Accuracy: {:.4f} - Std: {:.4f} ".format(np.mean(train_accuracy_by_fold), np.std(train_accuracy_by_fold)))
print("Mean Test Accuracy: {:.4f} - Std: {:.4f} ".format(np.mean(test_accuracy_by_fold), np.std(test_accuracy_by_fold)))

print("\nEvaluate other metrics:")
print(classification_report(y_testclass_for_report, y_predclass_for_report))



Starting data augmentation.

Quantity of samples generated by oversampling =>  4000

Starting training at:  23:19:05
Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Training fold 6
Training fold 7
Training fold 8
Training fold 9
Training fold 10

Time taken for training:  00:00:01


Fold 1 - Train Accuracy 0.7594 - Test Accuracy 0.6350
Fold 2 - Train Accuracy 0.7711 - Test Accuracy 0.6675
Fold 3 - Train Accuracy 0.7722 - Test Accuracy 0.6775
Fold 4 - Train Accuracy 0.7797 - Test Accuracy 0.6825
Fold 5 - Train Accuracy 0.7664 - Test Accuracy 0.6650
Fold 6 - Train Accuracy 0.7686 - Test Accuracy 0.6850
Fold 7 - Train Accuracy 0.7619 - Test Accuracy 0.6850
Fold 8 - Train Accuracy 0.7611 - Test Accuracy 0.6700
Fold 9 - Train Accuracy 0.7789 - Test Accuracy 0.6600
Fold 10 - Train Accuracy 0.7656 - Test Accuracy 0.7000

Mean Train Accuracy: 0.7685 - Std: 0.0067 
Mean Test Accuracy: 0.6727 - Std: 0.0169 

Evaluate other metrics:
              precision    reca