# Feature extraction

Objective: Extract time-series features using the tsfresh library.

In [1]:
import pandas as pd
import numpy as np

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

### Data retrieval

#### Read the files of all generators and concatenate them into a single DataFrame

In [2]:
# Columns removed from every generator file before the frames are combined.
dropped_columns = [
    'Gerador - Energia Consumida',
    'Gerador - Energia Ativa Fornecida',
    'Gerador - Energia Reativa Fornecida',
    'Gerador - Horímetro Elétrico',
    'Gerador - Horímetro Mecânico',
]

df_l = []

# One prepared CSV per generator, numbered 1 to 4.
for generator in range(1, 5):
    generator_df = pd.read_csv(f'../data/prepared/ap{generator}_data.csv')

    # 'time' is the 0-based position of each row inside its 'Run' group.
    generator_df['time'] = generator_df.groupby('Run').cumcount(ascending=True)
    # Tag every row with the generator number it was read from.
    generator_df['generator'] = generator

    # Use a single spelling for the turbined-flow column, then drop the
    # excluded columns.
    generator_df = (
        generator_df
        .rename(columns={'Vazão Turbinado': 'Vazão Turbinada'})
        .drop(columns=dropped_columns)
    )

    df_l.append(generator_df)

# Stack the per-generator frames; each frame keeps its own row labels.
df = pd.concat(df_l)

df.head()

Unnamed: 0,E3TimeStamp,Gerador - Corrente Média,Gerador - Corrente R,Gerador - Corrente S,Gerador - Corrente T,Gerador - Frequência,Gerador - Potência Ativa Média,Gerador - Potência Reativa Média,Gerador - Tensão RN,Gerador - Tensão SN,...,Posição da Comporta,Vazão Turbinada,RegV - Rotor,Gerador - Potência Aparente Média,EVI,isMissing,Breakdown,Run,time,generator
0,2018-07-01 00:00:00,228.0,227.0,230.0,227.0,60.0,5660.0,126.0,8.27,8.25,...,100.0,0.0,90.300003,5662.0,0.690869,0.0,False,0,0,1
1,2018-07-01 00:05:00,233.0,233.0,235.0,232.0,59.990002,5708.0,123.0,8.26,8.24,...,100.0,0.0,91.599998,5709.0,0.873441,0.0,False,0,1,1
2,2018-07-01 00:10:00,235.0,234.0,237.0,234.0,60.0,5838.0,112.0,8.27,8.25,...,100.0,0.0,91.300003,5839.0,0.705691,0.0,False,0,2,1
3,2018-07-01 00:15:00,232.0,227.0,230.0,231.0,60.0,5636.0,132.0,8.26,8.24,...,100.0,0.0,89.199997,5638.0,0.691231,0.0,False,0,3,1
4,2018-07-01 00:20:00,235.0,233.0,237.0,234.0,60.009998,5794.0,130.0,8.27,8.25,...,100.0,0.0,91.199997,5795.0,0.817129,0.0,False,0,4,1


#### Create the ID column and select the columns used for feature extraction

In [3]:
# Series identifier: "<generator>-<Run>", e.g. "1-0".
generator_label = df['generator'].astype(str)
run_label = df['Run'].astype(str)
df['id'] = generator_label + '-' + run_label

# Columns at positions 1..47 plus the identifier and the sort key.
# NOTE(review): the 1:48 slice depends on the exact column layout of the
# prepared CSVs -- confirm it covers only the intended signal columns.
ts_columns = [*df.columns[1:48], 'id', 'time']

# Keep the selected columns, discard rows with any missing value, then keep
# at most the first 100 remaining rows of every series.
df_ts = (
    df.loc[:, ts_columns]
    .dropna()
    .groupby('id')
    .head(100)
)

### Higher-order statistics (HOS) feature extraction

- Mean, variance, skewness and kurtosis

In [4]:
%%time
# Per-series higher-order statistics: mean, variance, skewness and kurtosis of
# every remaining column, grouped by 'id' ('time' is only the sort key, so it
# is dropped first).
#
# With a list of aggregations, each callable is applied column by column and
# receives a Series for every group. pd.Series.kurt is therefore the matching
# reducer; the DataFrame method called on a Series is not guaranteed to work
# across pandas versions. The resulting column label is still 'kurt'.
hos_aggregations = ['mean', 'var', 'skew', pd.Series.kurt]

df_hos_feature = df_ts.drop(columns='time').groupby('id').agg(hos_aggregations)

# Persist the feature table as a zip-compressed CSV.
df_hos_feature.to_csv('../data/preprocessed/hos_features_rev2.zip', compression='zip')

CPU times: user 4.68 s, sys: 46.8 ms, total: 4.73 s
Wall time: 4.73 s


### Time Series FeatuRe Extraction on basis of Scalable Hypothesis tests (tsfresh)

#### Extract features

In [5]:
%%time
# Run tsfresh on the long-format frame: 'id' identifies each series and 'time'
# orders the rows inside it; every other column is treated as a signal.
extracted_features = extract_features(
    df_ts,
    column_sort="time",
    column_id="id",
)

# Persist the full (unfiltered) feature matrix as a zip-compressed CSV.
extracted_features.to_csv(
    '../data/preprocessed/tsfresh_features_all_rev2.zip',
    compression='zip',
)

Feature Extraction: 100%|███████████████████████| 40/40 [03:31<00:00,  5.28s/it]


CPU times: user 1min 7s, sys: 28.4 s, total: 1min 35s
Wall time: 4min 41s


#### Select features based on breakdown status

In [6]:
%%time
# Target per series: the last 'Breakdown' value recorded for each id.
breakdown_by_id = df.groupby('id')['Breakdown'].last()

# Replace NaN/inf entries of the feature matrix in place; select_features
# rejects non-finite values.
impute(extracted_features)

# The labels come from the full frame, while the features were extracted from
# df_ts (rows with missing values removed). Align the labels to the feature
# rows by id instead of relying on both sides having the same length and
# order, and fail loudly if a feature row has no label.
breakdown_by_id = breakdown_by_id.reindex(extracted_features.index)
assert breakdown_by_id.notna().all(), "some feature rows have no 'Breakdown' label"

# Kept as a plain array, in feature-row order, for any later use of `y`.
y = breakdown_by_id.values

# Keep only the features that are statistically relevant for the target.
features_filtered = select_features(extracted_features, breakdown_by_id)

# Persist the selected features as a zip-compressed CSV.
features_filtered.to_csv('../data/preprocessed/tsfresh_features_sel_rev2.zip', compression='zip')

 'Gerador - Potência Ativa Média__fft_coefficient__attr_"real"__coeff_52'
 'Gerador - Potência Ativa Média__fft_coefficient__attr_"real"__coeff_53'
 ... 'Gerador - Frequência__fft_coefficient__attr_"angle"__coeff_98'
 'Gerador - Frequência__fft_coefficient__attr_"angle"__coeff_99'
 'Gerador - Frequência__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


CPU times: user 26.6 s, sys: 3.4 s, total: 30 s
Wall time: 46.9 s
