# How to Use Feature Extraction Toolbox

In [1]:
import pandas as pd

from tifex_py.feature_extraction import settings, extraction

In [2]:
# Read the whole CSV into memory
filename = "/home/scai9/feature_dataset/USCHAD_data.csv"
all_rows = pd.read_csv(filename)

# Keep a small test subset: the first 5000 rows (by position) belonging to subject 1
is_subject_one = all_rows["subject"] == 1
dataset = all_rows.loc[is_subject_one].iloc[:5000]

### Parameter Settings for Feature Extraction

There are three categories of features: Statistical, Spectral, and Time-Frequency. Each has its own class holding the parameters required for feature calculation. A given configuration can be saved in either JSON or YAML format.

In [3]:
# One parameter object per feature category, each built with the same positional
# argument. NOTE(review): the meaning of 25 is not visible here — confirm against
# the `settings` class signatures.
statistical_params = settings.StatisticalFeatureParams(25)
spectral_params = settings.SpectralFeatureParams(25)
time_freq_params = settings.TimeFrequencyFeatureParams(25)

# Round-trip the statistical parameters through a JSON file
params_file = "statistical_params.json"
statistical_params.to_json(params_file)
statistical_params_2 = settings.StatisticalFeatureParams.from_json(params_file)

# The reloaded object must report exactly the same settings as the one that was saved
saved_settings = statistical_params.get_settings_as_dict()
reloaded_settings = statistical_params_2.get_settings_as_dict()
assert saved_settings == reloaded_settings

### Data Format for Feature Calculators

The individual statistical feature calculator functions support univariate series inputs. See the example below.

In [4]:
from tifex_py.feature_extraction.statistical_feature_calculators import calculate_area_under_squared_curve

# Call a single calculator directly on one univariate signal (the `accx` column as a 1D array)
accx_signal = dataset["accx"].values
area = calculate_area_under_squared_curve(accx_signal)
print(f"Area Under Squared Curve: {area}")

Area Under Squared Curve: 2488.43278078524


### Using Feature Extraction Functions

In `tifex_py.feature_extraction.extraction` there are functions to extract features from each category: Statistical, Spectral, and Time-Frequency. There is also a function to extract features from all three categories at once. The `njobs` argument specifies the number of cores to use. A basic example of each is shown below.


In [5]:
# Statistical features for the `accx` column only
features = extraction.calculate_statistical_features(
    dataset,
    statistical_params,
    columns=["accx"],
    njobs=-1,
)

statistical_preview = features.head()
print(statistical_preview)

          mean  geometric_mean  harmonic_mean  trimmed_mean_0.1  \
accx -0.704724             NaN            NaN         -0.709874   

      trimmed_mean_0.15  trimmed_mean_0.2  trimmed_mean_0.25  \
accx          -0.711949         -0.714596          -0.717405   

      trimmed_mean_0.3  mean_of_abs  geometric_mean_of_abs  ...  adf_usedlag  \
accx         -0.719037     0.704724               0.703829  ...         32.0   

      has_duplicates  max_has_duplicates  min_has_duplicates  large_std  \
accx            True               False               False      False   

      lempel_ziv_complexity     cid_ce  benford_correlation  \
accx                 0.0598  34.857094            -0.313439   

      number_cwt_peaks_1  number_cwt_peaks_5  
accx                 400                 311  

[1 rows x 187 columns]


In [6]:
# Spectral features for the `accx` column only
features = extraction.calculate_spectral_features(
    dataset,
    spectral_params,
    columns=["accx"],
    njobs=-1,
)

spectral_preview = features.head()
print(spectral_preview)

Error calculating feature(s) ['spectral_cumulative_frequency_below_threshold_0.5', 'spectral_cumulative_frequency_below_threshold_0.75']: index -1 is out of bounds for axis 0 with size 0
Feature(s) ['spectral_cumulative_frequency_below_threshold_0.5', 'spectral_cumulative_frequency_below_threshold_0.75'] will be set to Nan.


  valley_width_mode = mode(valley_widths)[0]
  valley_width_mode = mode(valley_widths)[0]


      spectral_centroid_order_1  spectral_centroid_order_2  \
accx                   1.881512                   14.26679   

      spectral_centroid_order_3  spectral_centroid_order_4  \
accx                 125.264091                1189.120649   

      spectral_centroid_order_5  spectral_variance  spectral_skewness  \
accx               11875.828418          10.726702           1.652527   

      spectral_kurtosis  median_frequency  spectral_flatness  ...  \
accx           4.448157          0.585938           0.303453  ...   

      spectral_valley_width_std  spectral_subdominant_valley  \
accx                   0.884647                    17.464206   

      spectral_valley_count  spectral_peak_broadness  \
accx                    784                 3.185422   

      spectral_valley_broadness  spectral_range  spectral_trimmed_mean  \
accx                   3.186462            12.5               6.519935   

      harmonic_product_spectrum    smoothness    roughness  
accx        

In [7]:
# Time-frequency features for three columns, computed on the first 3000 rows only
first_rows = dataset.iloc[:3000]
features = extraction.calculate_time_frequency_features(
    first_rows,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    njobs=1,
)

time_freq_preview = features.head()
print(time_freq_preview)

      tkeo_mean  tkeo_geometric_mean  tkeo_harmonic_mean  \
accx  -0.000237             0.001189        1.536900e-14   
accy  -0.000109             0.000479        7.894626e-15   
accz  -0.000313             0.000374        6.615248e-15   

      tkeo_trimmed_mean_0.1  tkeo_trimmed_mean_0.15  tkeo_trimmed_mean_0.2  \
accx              -0.000003                0.000027               0.000038   
accy              -0.000037               -0.000036              -0.000035   
accz              -0.000003                0.000010               0.000016   

      tkeo_trimmed_mean_0.25  tkeo_trimmed_mean_0.3  tkeo_mean_of_abs  \
accx                0.000048               0.000051          0.008779   
accy               -0.000040              -0.000048          0.003708   
accz                0.000014               0.000012          0.005751   

      tkeo_geometric_mean_of_abs  ...  stft_adf_usedlag  stft_has_duplicates  \
accx                         0.0  ...               8.0                 T

In [8]:
# All three feature categories in a single call, for three columns
features = extraction.calculate_all_features(
    dataset,
    statistical_params,
    spectral_params,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    njobs=6,
)

all_features_preview = features.head()
print(all_features_preview)

  valley_width_mode = mode(valley_widths)[0]
  valley_width_mode = mode(valley_widths)[0]


Error calculating feature(s) ['spectral_cumulative_frequency_below_threshold_0.5', 'spectral_cumulative_frequency_below_threshold_0.75']: index -1 is out of bounds for axis 0 with size 0
Feature(s) ['spectral_cumulative_frequency_below_threshold_0.5', 'spectral_cumulative_frequency_below_threshold_0.75'] will be set to Nan.


  valley_width_mode = mode(valley_widths)[0]
  valley_width_mode = mode(valley_widths)[0]
  valley_width_mode = mode(valley_widths)[0]
  valley_width_mode = mode(valley_widths)[0]


          mean  geometric_mean  harmonic_mean  trimmed_mean_0.1  \
accx -0.704724             NaN            NaN         -0.709874   
accy  0.583404        0.577975       0.572182          0.587889   
accz -0.506294             NaN            NaN         -0.499774   

      trimmed_mean_0.15  trimmed_mean_0.2  trimmed_mean_0.25  \
accx          -0.711949         -0.714596          -0.717405   
accy           0.586373          0.581839           0.581099   
accz          -0.496107         -0.495148          -0.494809   

      trimmed_mean_0.3  mean_of_abs  geometric_mean_of_abs  ...  \
accx         -0.719037     0.704724               0.703829  ...   
accy          0.581090     0.583404               0.577975  ...   
accz         -0.494613     0.506294               0.502510  ...   

      stft_adf_usedlag  stft_has_duplicates  stft_max_has_duplicates  \
accx              16.0                 True                    False   
accy              14.0                 True                  

Arrays, DataFrames and Series are all acceptable input data formats. If the input is a DataFrame, the `columns` parameter specifies the columns to analyze (as seen in the previous examples). Otherwise, it gives the ordered names of the signal's components.

In [9]:
# 2D array input: the frame is converted to a plain array, so `columns` supplies
# the component names in order.
component_names = ["accx", "accy", "accz"]
acc_array = dataset[component_names].values  # extracted once, reused for the shape and the call
print(acc_array.shape)
features = extraction.calculate_statistical_features(acc_array, statistical_params, columns=component_names, njobs=1)

print(features.head())

# 1D array input: a single univariate signal. `.values` converts the pandas Series
# to a NumPy array, so this call receives an array, not a Series object.
features = extraction.calculate_statistical_features(dataset["accx"].values, statistical_params, columns=["accx"], njobs=1)

print(features.head())

(5000, 3)
          mean  geometric_mean  harmonic_mean  trimmed_mean_0.1  \
accx -0.704724             NaN            NaN         -0.709874   
accy  0.583404        0.577975       0.572182          0.587889   
accz -0.506294             NaN            NaN         -0.499774   

      trimmed_mean_0.15  trimmed_mean_0.2  trimmed_mean_0.25  \
accx          -0.711949         -0.714596          -0.717405   
accy           0.586373          0.581839           0.581099   
accz          -0.496107         -0.495148          -0.494809   

      trimmed_mean_0.3  mean_of_abs  geometric_mean_of_abs  ...  adf_usedlag  \
accx         -0.719037     0.704724               0.703829  ...         32.0   
accy          0.581090     0.583404               0.577975  ...         31.0   
accz         -0.494613     0.506294               0.502510  ...         32.0   

      has_duplicates  max_has_duplicates  min_has_duplicates  large_std  \
accx            True               False               False      Fa

### Extracting a Subset of Features

Optionally, a subset of the available features can be selected for extraction through a list of feature names in the parameter classes.

In [10]:
# Rebuild each parameter object with an explicit list of calculators to run.
# Note: this rebinds the parameter names created earlier in the notebook.
statistical_params = settings.StatisticalFeatureParams(
    25,
    calculators=["detrended_fluctuation_analysis", "mode", "std"],
)
spectral_params = settings.SpectralFeatureParams(
    25,
    calculators=["spectral_variance"],
)
# The restricted statistical parameters defined just above are passed as `tkeo_sf_params`
time_freq_params = settings.TimeFrequencyFeatureParams(
    25,
    calculators=["tkeo_features"],
    tkeo_sf_params=statistical_params,
)

features = extraction.calculate_all_features(
    dataset,
    statistical_params,
    spectral_params,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    njobs=1,
)
subset_preview = features.head()
print(subset_preview)

           std      mode            detrended_fluctuation_analysis_segments  \
accx  0.033738 -0.725808  [22, 25, 30, 33, 45, 50, 55, 66, 75, 90, 99, 1...   
accy  0.077468  0.577970  [22, 25, 30, 33, 45, 50, 55, 66, 75, 90, 99, 1...   
accz  0.062115 -0.486109  [22, 25, 30, 33, 45, 50, 55, 66, 75, 90, 99, 1...   

                  detrended_fluctuation_analysis_values  spectral_variance  \
accx  [0.030752754807630688, 0.03365113703450742, 0....          10.726702   
accy  [0.04164051951457158, 0.047843961327387836, 0....           5.883212   
accz  [0.03748312789928865, 0.0479917901052525, 0.04...           9.877341   

      tkeo_std  tkeo_mode       tkeo_detrended_fluctuation_analysis_segments  \
accx  0.016841        0.0  [22, 25, 30, 33, 45, 50, 55, 66, 75, 90, 99, 1...   
accy  0.007834        0.0  [22, 25, 30, 33, 45, 50, 55, 66, 75, 90, 99, 1...   
accz  0.012551        0.0  [22, 25, 30, 33, 45, 50, 55, 66, 75, 90, 99, 1...   

             tkeo_detrended_fluctuation_analysis_