In [None]:
import import_ipynb as ipynb
import os
from pathlib import Path
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

##### Change directories and load Lab 1

In [None]:
# Load the Lab 1 notebook as a module from inside its own folder.
# NOTE(review): presumably the loader resolves 'lab1' relative to the current
# working directory — confirm against import_ipynb.
os.chdir("../Lab 1")
try:
    lab_1 = ipynb.NotebookLoader().load_module('lab1')
finally:
    # Always return to the Lab 2 folder, even when loading fails, so the relative
    # paths used by later cells are not resolved against the wrong directory.
    os.chdir("../Lab 2")

# Required Preparations

In [None]:
# Position names handed to the Lab 1 helpers in the cells that follow.
lab_2_positions = [
    'Sitting',
    'Running',
    'Jumping',
]

##### 'recordings' are used if we want to plot each recording.

In [None]:
# Recordings returned by Lab 1's read_recordings() for the Lab 2 positions.
# The annotation uses `dict[str, pd.DataFrame]`; the previous `dict[str: pd.DataFrame]`
# built a slice object rather than a key/value type pair.
# NOTE(review): key/value types are taken from the original annotation — confirm
# against lab_1.read_recordings.
recordings: dict[str, pd.DataFrame] = lab_1.read_recordings(lab_2_positions)

##### merger() will read files based on the positions, concatenate the accelerometer and rate_gyro columns, and return a dictionary

In [None]:
# Build the merged data for the Lab 2 positions with Lab 1's merger().
merged_dfs : dict = lab_1.merger(lab_2_positions)
# NOTE(review): the return value is discarded — presumably the helper persists the
# split or works in place; confirm against lab_1.split_train_test_sets.
lab_1.split_train_test_sets(merged_dfs)

##### Display the four classes

In [None]:
# Result of Lab 1's merge_recordings_by_rows() for the Lab 2 positions.
# NOTE(review): the return type is not visible here; it is iterated when printed
# in the next cell.
class_dfs = lab_1.merge_recordings_by_rows(lab_2_positions)

In [None]:
# Show every element of class_dfs on its own line.
print('\n'.join(str(item) for item in class_dfs))

# Classification of different positions

#### We will use the K-nearest neighbor algorithm and decision trees in this lab. In this part you should only work with the stationary positions:
- Standing
- Sitting
- Laying down

In [None]:
# Names used to pick the stationary-position pickle files by path.
stationary_positions = [
    'Standing_up',
    'Sitting',
    'Laying_down',
]

In [None]:
# Collect one DataFrame per pickled class file whose path mentions a stationary position.
stationary_positions_dfs = []

# rglob() yields paths in a filesystem-dependent order; sorting keeps the row order
# of the concatenated frame (and therefore the seeded train/test split) reproducible.
# NOTE: pd.read_pickle can execute arbitrary code — only load pickles you trust.
for _class in sorted(Path("../Binaries/Class dataframes").rglob("*.pkl")):
    # Append to the *_dfs list (not to the list of position names) and load each
    # matching file exactly once.
    if any(stat_pos in str(_class) for stat_pos in stationary_positions):
        stationary_positions_dfs.append(pd.read_pickle(_class))

# Fail here with a clear message instead of a bare concat error in a later cell.
if not stationary_positions_dfs:
    raise FileNotFoundError(
        "No class dataframe pickles matching the stationary positions were found "
        "under '../Binaries/Class dataframes'."
    )

#### We start by working with the KNN algorithm. You should use cross-validation for evaluating the model.

In [None]:
# KNN classifier created with default arguments; n_neighbors is supplied later
# through the grid search's param_grid.
knn = KNeighborsClassifier()

In [None]:
# Stack the stationary-position frames on top of each other with a fresh 0..n-1 index.
df = pd.concat(
    stationary_positions_dfs,
    axis=0,
    ignore_index=True,
)

In [None]:
# Target: the 'class' column. Features: every other column of df.
y = df['class']
X = df.drop(columns='class')

In [None]:
# Display the combined frame as this cell's output.
df

##### Make a choice of the number of subsets that you use for your crossvalidation. Motivate the choice

In [None]:
# Number of cross-validation folds handed to GridSearchCV via cv=folds.
folds = 10

#### For the grid search, use GridSearchCV, which you will find in the previous week's exercise. The hyperparameter you will use for the grid search is k, that is, the number of neighbours.

##### Make a choice of the range of k-values you will use. Motivate the choice.

In [None]:
# Candidate k values for the grid search: 1 through 49 inclusive.
k_range = [k for k in range(1, 50)]

##### Train the model.

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=100
)

# Grid search over n_neighbors, scored by accuracy with `folds`-fold cross-validation.
grid = GridSearchCV(
    estimator=knn,
    param_grid={'n_neighbors': k_range},
    cv=folds,
    scoring='accuracy',
    return_train_score=False,
)

In [None]:
# Run the cross-validated grid search on the training split only; the test split
# is not used until the final evaluation cells.
grid_search = grid.fit(X_train, y_train)

##### What is the optimal k-value?

In [None]:
# Parameter setting selected by the grid search (shown as the cell output).
grid.best_params_

##### What is the accuracy?

In [None]:
# Score of the selected setting; the search was configured with scoring='accuracy'.
grid.best_score_

##### Plot the accuracy as a function of the k-value. You need to extract that information from the model. How sensitive is the performance of the model to different k-values?

In [None]:
# Mean cross-validated accuracy per candidate k, as a one-column table.
# The index holds the actual n_neighbors value of each candidate, read from the
# search results, so the axis labelled 'k' shows real k values instead of 0-based
# row positions (which were off by one against the candidate list).
mean_test_score = pd.DataFrame(
    {"Accuracy": grid.cv_results_['mean_test_score']},
    index=pd.Index(
        [params['n_neighbors'] for params in grid.cv_results_['params']],
        name='k',
    ),
)

In [None]:
# Plot the accuracy table, using its index for the horizontal axis.
mean_test_score.plot(title = 'Accuracy')

#### Now it is time to look at the test set with the optimal k-value

##### Use the model on the test set to obtain the accuracy and plot the confusion matrix

In [None]:
# Predict labels for the held-out test features with the fitted grid search.
y_pred = grid.predict(X_test)

In [None]:
# Accuracy of the test-set predictions against the true test labels.
accuracy_score(y_true=y_test, y_pred=y_pred)

##### Explain the difference between the accuracy for the validation set and the test set

In [None]:
# Print the grid search's best cross-validation score next to the test-set accuracy
# so the two can be compared directly.
print(f"Validation set accuracy: {grid.best_score_}\nTest set accuracy: {accuracy_score(y_test, y_pred)}")

##### Explain the result in the confusion matrix

In [None]:
# Confusion matrix of the true test labels versus the predictions (cell output).
confusion_matrix(y_test, y_pred)