In [None]:
import pandas as pd
from common import get_set, sns_cm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier

##### Hyperparameters

In [None]:
# Cross-validation splitter shared by both grid searches (20 consecutive folds).
folds = KFold(n_splits=20)

# Candidate neighbour counts for KNN: 1 through 49.
k_range = [*range(1, 50)]

# Decision-tree search space: depths 1 through 49 crossed with both split criteria.
param_grid = dict(
    max_depth=[*range(1, 50)],
    criterion=['gini', 'entropy'],
)

In [None]:
# Settings common to both searches: same folds, accuracy as the selection metric,
# and no training-score bookkeeping.
_shared_search_kwargs = dict(
    cv=folds,
    scoring='accuracy',
    return_train_score=False,
)

# Exhaustive search over the number of neighbours for KNN.
grid_knn = GridSearchCV(
    KNeighborsClassifier(),
    {'n_neighbors': k_range},
    **_shared_search_kwargs,
)

# Exhaustive search over depth and split criterion for the decision tree.
grid_dt = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid,
    **_shared_search_kwargs,
)

In [None]:
# Load the per-class dataframes via the project helper.
# NOTE(review): presumably returns one DataFrame per activity, each with a 'class'
# column (the loop below indexes `_class['class']`) — confirm against `common.get_set`.
classes = get_set("Class Dataframes")

# Feature Transformation

#### Transform all features using `.rolling()` with a window of 10. Tip: store the transformed dataframes in new pickle files; you never know when you might need them later.

In [None]:
# Rolling-window size (in rows) used for the feature transformation below.
ROLLING_WINDOW = 10

In [None]:
for _class in classes:
    # Smooth only the feature columns; the label column is carried over unchanged.
    # NOTE(review): the rolling *mean* is assumed to be the intended aggregation, and
    # every non-'class' column is assumed to be a numeric feature — confirm.
    features = _class.drop(columns='class')
    smoothed = features.rolling(window=ROLLING_WINDOW).mean()
    smoothed['class'] = _class['class']

    # The first ROLLING_WINDOW - 1 rows have no complete window and come out as NaN.
    transformed_df = smoothed.dropna()

    # One pickle per frame, named after the label found in the frame's first row.
    # `.iloc[0]` is positional, so it works even if the index does not start at 0.
    label = _class['class'].iloc[0]
    transformed_df.to_pickle(f"../Binaries/Transformed/{label}.pkl")

#### What does the parameter (.rolling(10)) mean?

In [None]:
# Read the pickled, transformed frames back and stack them row-wise into one
# frame with a fresh 0..n-1 index.
transformed_classes = get_set(folder="Transformed")
transformed_df = pd.concat(
    transformed_classes,
    axis=0,
    ignore_index=True,
)

# Run the classification again (same classes as above) with KNN or Decision Trees, as before. Compare the results with those from the previous, untransformed features. Has the result improved?

In [None]:
# Features are every column except the label. `columns=` is required because
# DataFrame.drop defaults to axis=0 and would look for a *row* labelled 'class'.
X = transformed_df.drop(columns='class')
y = transformed_df['class']

# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, shuffle=True, random_state=42
)

# KNN with transformed

In [None]:
# Run the cross-validated search over n_neighbors on the training split.
grid_knn.fit(X_train, y_train)

##### What is the optimal k-value?

In [None]:
# Parameter setting with the best mean cross-validated score.
grid_knn.best_params_

##### What is the accuracy?

In [None]:
# Mean cross-validated accuracy of the best candidate (not the test-set accuracy).
grid_knn.best_score_

##### Display scores

In [None]:
# Tabulate the KNN search results: one row per candidate n_neighbors.
scores = pd.DataFrame(grid_knn.cv_results_)

In [None]:
# Display the full KNN results table.
scores

##### Plot the accuracy as a function of the k-value.

In [None]:
# Index the scores by the actual k of each candidate so the x-axis shows k itself
# rather than the positional row number (which is off by one, since k starts at 1).
accuracy_by_k = scores.set_index(scores['param_n_neighbors'].astype(int))['mean_test_score']
accuracy_by_k.plot(title='Accuracy', xlabel='k', ylabel='mean CV accuracy')

##### Predict

In [None]:
# Predict the held-out test rows with the KNN search's best estimator.
y_pred_knn = grid_knn.predict(X=X_test)

In [None]:
# Fraction of test rows whose KNN prediction matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_knn)

##### Confusion matrix

In [None]:
# Confusion matrix for the KNN predictions, drawn by the project's `sns_cm` helper.
sns_cm(y_test, y_pred_knn)

# Decision tree transformed

In [None]:
# Run the cross-validated search over max_depth and criterion on the training split.
grid_dt.fit(X_train, y_train)

##### Best hyper parameters

In [None]:
# Depth/criterion combination with the best mean cross-validated score.
grid_dt.best_params_

##### What is the accuracy?

In [None]:
# Mean cross-validated accuracy of the best candidate (not the test-set accuracy).
grid_dt.best_score_

##### Display scores

In [None]:
# Tabulate the decision-tree search results: one row per (max_depth, criterion)
# candidate. Reads from `grid_dt`, not the KNN search.
scores_dt = pd.DataFrame(grid_dt.cv_results_)

##### Does the criterion have an impact on the accuracy? Investigate so that you can answer this question.

In [None]:
# Display the results table held in `scores_dt`.
scores_dt

##### Tree

In [None]:
# Draw the top levels of the best decision tree found by the grid search.
plot_tree(
    decision_tree=grid_dt.best_estimator_,
    # NOTE(review): must match the column order of the training features — confirm.
    feature_names=['ax', 'ay', 'az', 'gx', 'gy', 'gz'],
    # NOTE(review): plot_tree expects names in ascending order of the fitted class
    # labels — confirm this ordering matches `grid_dt.best_estimator_.classes_`.
    class_names=['Squats', 'Jumping', 'Running', 'Walking'],
    fontsize=7,
    max_depth=3  # limits how deep the drawing goes, not the fitted tree itself
)

##### Predict

In [None]:
# Predict the held-out test rows with the decision-tree search's best estimator
# (previously this called the KNN search, so the tree was never evaluated).
y_pred_dt = grid_dt.predict(X=X_test)

In [None]:
# Fraction of test rows where `y_pred_dt` matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_dt)

##### Confusion matrix

In [None]:
# Confusion matrix for `y_pred_dt`, drawn by the project's `sns_cm` helper.
sns_cm(y_test, y_pred_dt)

#### Try a larger window, for example 50. Does the result improve?