Using TSFEL to create more relevant features


In [26]:
#importing necessary libraries
import pandas as pd
import numpy as np
import tsfel
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
import os


In [27]:
#paths to train data and test data
train_folder = '/Users/shreya/Documents/fork-it/HAR/Combined/Train'
test_folder = '/Users/shreya/Documents/fork-it/HAR/Combined/Test'

In [28]:
#function to load data
def load_data(folder):
    data = []
    labels = []

    #iterate through the activities
    for activity in os.listdir(folder):
        activity_path = os.path.join(folder,activity)

        #checking if the path is a directory
        if os.path.isdir(activity_path):
            #iterating through the files of each subject's folder
            for file in os.listdir(activity_path):
                if file.endswith('.csv'):
                    file_path = os.path.join(activity_path, file)
                    df = pd.read_csv(file_path)
                    #appending accelerometer data
                    data.append(df[['accx','accy','accz']])
                    #adding activity label
                    labels.extend([activity]*len(df))
    
    #concatenating all the data in a single dataframe
    df_combined = pd.concat(data, ignore_index=True)
    df_combined['labels'] = labels

    return df_combined

In [32]:
#loading training data
df_train = load_data(train_folder)

#loading test data
df_test = load_data(test_folder)

#calculating total acceleration
df_train['total_acc'] = np.sqrt(df_train['accx']**2 + df_train['accy']**2 + df_train['accz']**2)
df_test['total_acc'] = np.sqrt(df_test['accx']**2 + df_test['accy']**2 + df_test['accz']**2)

#rearranging
df_train = df_train[['accx','accy','accz','total_acc','labels']]
df_test = df_test[['accx','accy','accz','total_acc','labels']]

print(df_train)

            accx      accy      accz  total_acc   labels
0       0.569501 -0.203172 -0.152077   0.623489  WALKING
1       0.584063 -0.208670 -0.059063   0.623026  WALKING
2       0.643652 -0.227615  0.029840   0.683364  WALKING
3       0.736044 -0.272858  0.038750   0.785947  WALKING
4       0.862503 -0.288378  0.021114   0.909681  WALKING
...          ...       ...       ...        ...      ...
470523  0.117154  0.937592 -0.346806   1.006518   LAYING
470524  0.119710  0.937131 -0.346094   1.006144   LAYING
470525  0.109197  0.937901 -0.344062   1.004967   LAYING
470526  0.100659  0.937448 -0.344856   1.003925   LAYING
470527  0.104422  0.938384 -0.345324   1.005345   LAYING

[470528 rows x 5 columns]


In [61]:
cfg = tsfel.get_features_by_domain()

X = tsfel.time_series_features_extractor(cfg, df_train[['accx','accy','accz']])

*** Feature extraction started ***


  X = tsfel.time_series_features_extractor(cfg, df_train[['accx','accy','accz']])



*** Feature extraction finished ***


In [62]:
print(X)

   accx_Absolute energy  accx_Area under the curve  accx_Autocorrelation  \
0         385367.432935                 3839.91369               45177.0   

   accx_Average power  accx_Centroid  accx_ECDF Percentile Count_0  \
0           81.901237    1926.615951                       94105.0   

   accx_ECDF Percentile Count_1  accx_ECDF Percentile_0  \
0                      376422.0                0.471498   

   accx_ECDF Percentile_1  accx_ECDF_0  ...  accz_Wavelet variance_0  \
0                1.023845     0.000002  ...                 0.003636   

   accz_Wavelet variance_1  accz_Wavelet variance_2  accz_Wavelet variance_3  \
0                 0.014017                 0.022586                 0.033519   

   accz_Wavelet variance_4  accz_Wavelet variance_5  accz_Wavelet variance_6  \
0                 0.047699                 0.062864                 0.074835   

   accz_Wavelet variance_7  accz_Wavelet variance_8  accz_Zero crossing rate  
0                 0.081612               