In [8]:
##############################################################
#                                                            #
#    Mark Hoogendoorn and Burkhardt Funk (2017)              #
#    Machine Learning for the Quantified Self                #
#    Springer                                                #
#    Chapter 7                                               #
#                                                            #
##############################################################

import os
import copy
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import time
start = time.time()

from sklearn.model_selection import train_test_split

from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from Chapter7.LearningAlgorithms import RegressionAlgorithms
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.FeatureSelection import FeatureSelectionRegression
from util import util
from util.VisualizeDataset import VisualizeDataset

# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATASET_PATH = Path('./intermediate_datafiles/125')
N_FORWARD_SELECTION = 20

dataset = pd.read_csv(DATASET_PATH / 'dataset_result_fe.csv', index_col=0)
dataset.index = pd.to_datetime(dataset.index)

# Create one target column
activityColumns = dataset.filter(regex='^activity').columns.tolist()
dataset['activity'] = dataset[activityColumns].idxmax(axis=1)
dataset.drop(activityColumns, axis=1, inplace=True)
dataset = dataset.dropna()

prepare = PrepareDatasetForLearning()

train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(dataset, ['activity'], 
                                                                               'exact', 0.7, filter=False, temporal=False)

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:

basic_features = ['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z',
       'linear_x', 'linear_y', 'linear_z', 'loc_Latitude', 'loc_Longitude',
       'loc_Height', 'loc_Velocity', 'loc_Direction', 'loc_Horizontal',
       'loc_Vertical', 'mag_x', 'mag_y', 'mag_z', 'prox_Distance']

time_features = [name for name in dataset.columns if '_temp_' in name]
freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
print('#basic features: ', len(basic_features))

print('#time features: ', len(time_features))
print('#frequency features: ', len(freq_features))

features_after_chapter_3 = list(set().union(basic_features))
features_after_chapter_4 = list(set().union(basic_features, time_features, freq_features))

Training set length is:  1584
Test set length is:  679
#basic features:  20
#time features:  60
#frequency features:  168


In [9]:
fs = FeatureSelectionClassification()

features, ordered_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
                                                                  train_X[features_after_chapter_4],
                                                                  test_X[features_after_chapter_4],
                                                                  train_y,
                                                                  test_y,
                                                                  gridsearch=False)

Added feature0
Added feature1
Added feature2
Added feature3
Added feature4
Added feature5
Added feature6
Added feature7
Added feature8
Added feature9
Added feature10
Added feature11
Added feature12
Added feature13
Added feature14
Added feature15
Added feature16
Added feature17
Added feature18
Added feature19


In [10]:
for idx in range(min(len(ordered_features), len(ordered_scores))):
    print(ordered_features[idx], ordered_scores[idx])

acc_z_temp_min_ws_120 0.7128129602356407
acc_y_temp_min_ws_120 0.8350515463917526
linear_z_temp_median_ws_120 0.8792341678939617
mag_y_temp_max_ws_120 0.8895434462444771
loc_Longitude 0.9027982326951399
mag_y 0.9189985272459499
gyr_z_freq_4.0_Hz_ws_20 0.9189985272459499
acc_y_freq_1.2_Hz_ws_20 0.9189985272459499
gyr_z_freq_1.2_Hz_ws_20 0.9189985272459499
gyr_y_pse 0.9189985272459499
linear_z_freq_1.2_Hz_ws_20 0.9189985272459499
linear_y_freq_2.0_Hz_ws_20 0.9189985272459499
linear_x_freq_0.0_Hz_ws_20 0.9189985272459499
mag_z_pse 0.9189985272459499
linear_x_freq_0.4_Hz_ws_20 0.9189985272459499
linear_z_freq_1.6_Hz_ws_20 0.9189985272459499
mag_x_freq_0.4_Hz_ws_20 0.9189985272459499
gyr_z_freq_0.8_Hz_ws_20 0.9189985272459499
linear_x_freq_0.8_Hz_ws_20 0.9189985272459499
mag_z_freq_3.2_Hz_ws_20 0.9189985272459499


In [11]:
features

['acc_z_temp_min_ws_120',
 'acc_y_temp_min_ws_120',
 'linear_z_temp_median_ws_120',
 'mag_y_temp_max_ws_120',
 'loc_Longitude',
 'mag_y',
 'gyr_z_freq_4.0_Hz_ws_20',
 'acc_y_freq_1.2_Hz_ws_20',
 'gyr_z_freq_1.2_Hz_ws_20',
 'gyr_y_pse',
 'linear_z_freq_1.2_Hz_ws_20',
 'linear_y_freq_2.0_Hz_ws_20',
 'linear_x_freq_0.0_Hz_ws_20',
 'mag_z_pse',
 'linear_x_freq_0.4_Hz_ws_20',
 'linear_z_freq_1.6_Hz_ws_20',
 'mag_x_freq_0.4_Hz_ws_20',
 'gyr_z_freq_0.8_Hz_ws_20',
 'linear_x_freq_0.8_Hz_ws_20',
 'mag_z_freq_3.2_Hz_ws_20']