In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Use fully-qualified option names: the bare pattern 'max_columns' is ambiguous on
# pandas versions that also define 'styler.render.max_columns' and raises OptionError there.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

### Synopsis

1. We use the `tsfresh` and `catch22` Python packages to extract 2500+ features.
2. We use the `tsfresh` feature filtering module to identify "relevant" features from the 2500+ features, resulting in a smaller set of 1200+ features.
3. We use the statistical measure of mutual information (MI) to rank the 2500+ features. In particular, out of the top 250 features according to MI, over 75% survive the `tsfresh` filtering.
4. We save both feature sets to datasets for use in a subsequent classification task.

# Introduction

The TPS April 2022 competition is about binary classification of sequences each of which is associated with 13 sensor time series. Visual inspection of most of these time series does not reveal obvious features that distinguish between the two classes. Subtle statistical features are non-intuitive to the human mind and are best discovered by generating as many features as possible mechanistically and then resorting to feature selection and/or machine learning algorithms downstream.

In this notebook, we use the tsfresh library (already used in the kaggle [benchmark notebook](https://www.kaggle.com/code/ryanholbrook/tps-april-2022-benchmark)) and the catch22 library to generate features mechanistically. The only non-time-series feature we include is the "repeated subject count" which has been found to correlate with the target variable (e.g., see [this discussion](https://www.kaggle.com/competitions/tabular-playground-series-apr-2022/discussion/318527)).

The main goal of this notebook is to generate and save datasets with a large number of features for the subsequent classification task. Since feature extraction takes a long time, it makes sense to separate the feature generation step from the classification step so that different classification algorithms can be tried on the same dataset. No classifier will be built in this notebook, although there will be an indication of feature importance using the statistical measure of mutual information (MI).

In [None]:
# Load the training series, the per-sequence training labels and the test series.
train_data, labels, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
        '../input/tabular-playground-series-apr-2022/test.csv',
    )
)

In [None]:
# Preview the raw training frame (the cell's last expression is rendered as its output).
train_data

In [None]:
# Preview the raw test frame (the cell's last expression is rendered as its output).
test_data

# catch22 features

catch22 is a set of 22 time-series features; the name is an acronym for [CAnonical Time-series CHaracteristics](https://github.com/chlubba/catch22).

In [None]:
pip install catch22

In [None]:
import catch22

def catch22_seq(x):
    """Compute the catch22 feature vector of every sensor in one sequence.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows belonging to a single sequence; must contain a ``step`` column
        and the columns ``sensor_00`` ... ``sensor_12``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the catch22 values of ``sensor_00``
        first, then ``sensor_01``, and so on.
    """
    def sensor_values(column):
        # Order the readings by step before handing them to catch22.
        ordered = x[['step', column]].sort_values(by='step')
        return catch22.catch22_all(ordered[column].to_numpy())['values']

    sensor_columns = ['sensor_{:02d}'.format(idx) for idx in range(13)]
    return np.concatenate([sensor_values(column) for column in sensor_columns])

In [None]:
# Probe catch22 with a constant 60-sample placeholder series purely to recover
# the names of the features it returns.
placeholder_series = [0] * 60
catch22_names = catch22.catch22_all(placeholder_series)['names']
catch22_names

# tsfresh features

Most of our features will come from the [tsfresh](https://tsfresh.readthedocs.io/en/latest/) package. The main challenge is to avoid running out of memory. Certain features alone (e.g., cwt) would generate hundreds of features *per sensor*. Our solution is to call the extraction module separately for certain voluminous features and optimize memory usage of the output right away.

In [None]:
# Settings for the bulk of the tsfresh features (sans cwt and ar, which
# make_features extracts in separate passes).
# Each key is a tsfresh feature calculator name; the value is either None
# (no parameters) or a list of parameter dicts, one per feature variant.
# NOTE: 'c3' used to be listed twice with identical parameters; a duplicate
# dict key is silently overridden, so it is now declared exactly once.
tsfresh_default_settings = {
    'abs_energy': None,
    'absolute_maximum': None,
    'absolute_sum_of_changes': None,
    'agg_autocorrelation': [{'f_agg': f_agg, 'maxlag': 10} for f_agg in ('mean', 'median', 'var', 'ptp')],
    'augmented_dickey_fuller': [{'attr': 'teststat'}, {'attr': 'pvalue'}, {'attr': 'usedlag'}],
    'autocorrelation': [{'lag': lag} for lag in range(10)],
    'benford_correlation': None,
    'c3': [{'lag': 1}, {'lag': 2}, {'lag': 3}],
    'binned_entropy': [{'max_bins': 10}],
    'cid_ce': [{'normalize': True}, {'normalize': False}],
    'fft_aggregated': [{'aggtype': 'centroid'}, {'aggtype': 'variance'}, {'aggtype': 'skew'}, {'aggtype': 'kurtosis'}],
    'fourier_entropy': [{'bins': 2}, {'bins': 3}, {'bins': 5}, {'bins': 10}, {'bins': 100}],
    # q=0.5 is intentionally absent from this list (it was absent in the original settings too).
    'index_mass_quantile': [{'q': 0.1}, {'q': 0.2}, {'q': 0.3}, {'q': 0.4}, {'q': 0.6}, {'q': 0.7}, {'q': 0.8}, {'q': 0.9}],
    'kurtosis': None,
    'maximum': None,
    'mean': None,
    'mean_abs_change': None,
    'mean_second_derivative_central': None,
    'minimum': None,
    'number_crossing_m': [{'m': 0}],
    'number_cwt_peaks': [{'n': 1}, {'n': 5}],
    'permutation_entropy': [{'tau': 1, 'dimension': dimension} for dimension in range(3, 8)],
    'partial_autocorrelation': [{'lag': lag} for lag in range(10)],
    'quantile': [{'q': 0.1}, {'q': 0.3}, {'q': 0.5}, {'q': 0.7}, {'q': 0.9}],
    'sample_entropy': None,
    'skewness': None,
    'spkt_welch_density': [{'coeff': 2}, {'coeff': 5}, {'coeff': 8}],
    'time_reversal_asymmetry_statistic': [{'lag': 1}, {'lag': 2}, {'lag': 3}],
    'variance': None,
    'variation_coefficient': None,
}

In [None]:
from tsfresh.feature_extraction.extraction import extract_features
from tsfresh.utilities.dataframe_functions import impute

def optimize_memory(df):
    """Downcast the float and integer columns of ``df`` to smaller dtypes.

    The frame is modified in place (its columns are reassigned) and the same
    object is returned, so the call can be used inline in an expression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be shrunk.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with float columns downcast first and integer
        columns second.
    """
    # (dtype selector, pd.to_numeric downcast target), processed in this order.
    for selector, target in (('float', 'float'), ('int', 'integer')):
        columns = df.select_dtypes(include=[selector]).columns.tolist()
        df[columns] = df[columns].apply(pd.to_numeric, downcast=target)
    return df

def _extract_tsfresh(data, settings, n_jobs):
    """Run one tsfresh extraction pass over ``data`` (minus ``subject``) and downcast the result."""
    return optimize_memory(extract_features(data.drop(['subject'],axis=1),
                                            column_id='sequence', column_sort='step',
                                            default_fc_parameters=settings,
                                            n_jobs=n_jobs, impute_function=impute))


def make_features(data):
    """Build the per-sequence feature table for ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Sensor readings with the columns ``sequence``, ``subject``, ``step``
        and ``sensor_00`` ... ``sensor_12`` (several rows per sequence).

    Returns
    -------
    pandas.DataFrame
        One row per sequence containing ``sequence``, ``subject``, the tsfresh
        features (cwt coefficients, ar coefficients and everything listed in
        ``tsfresh_default_settings``), ``subject_frequency`` and the catch22
        features named ``sensor_XX__c22_<name>``.
    """
    # One row per sequence; the mean of `subject` within a sequence is cast back to int.
    X = data[['sequence','subject']].groupby('sequence').mean()
    X['subject'] = X['subject'].astype('int')
    X = X.reset_index()

    # tsfresh features: the voluminous cwt and ar families are extracted in their
    # own passes and downcast right away.
    cwt_settings = {'cwt_coefficients': [{'widths': (2, 5, 10, 20), 'coeff': i, 'w': j}
                                         for i in range(0,60,3) for j in [2,5,10, 20]]}
    X = X.join(_extract_tsfresh(data, cwt_settings, n_jobs=1), on=['sequence'])
    ar_settings = {'ar_coefficient': [{'coeff': i, 'k': 10} for i in range(11)]}
    X = X.join(_extract_tsfresh(data, ar_settings, n_jobs=1), on=['sequence'])
    features_df = _extract_tsfresh(data, tsfresh_default_settings, n_jobs=4)
    # multi-threaded extraction may result in random ordering of columns
    features_df = features_df.reindex(columns=sorted(features_df.columns))
    X = X.join(features_df, on=['sequence'])
    del features_df

    # subject frequency: number of rows of `data` recorded for each subject
    subjects, counts = np.unique(data.subject,return_counts=True)
    X = X.join(pd.DataFrame(counts.reshape((-1,1)),columns=['subject_frequency'],index=subjects),on=['subject'])

    # catch22 features: split the frame by sequence in a single groupby pass
    # instead of re-filtering the whole frame with a boolean mask per sequence.
    # sort=False keeps the sequences in order of first appearance.
    from tqdm import tqdm
    by_sequence = data.groupby('sequence', sort=False)
    sequences, rows = [], []
    for seq, seq_rows in tqdm(by_sequence, total=by_sequence.ngroups, desc='Catch22 Extraction'):
        sequences.append(seq)
        rows.append(catch22_seq(seq_rows))
    # Column order mirrors catch22_seq: all features of sensor_00, then sensor_01, ...
    c22_columns = ['sensor_{0:02d}__c22_{1}'.format(i, name) for i in range(13) for name in catch22_names]
    X = X.join(optimize_memory(pd.DataFrame(np.array(rows), columns=c22_columns, index=sequences)),
               on=['sequence'])

    return X

Let's run it on the training data. This takes several hours.

In [None]:
%%time

# Extract the full feature table for the training sequences (slow; timed by %%time above).
X = make_features(train_data)

In [None]:
# Inspect the extracted training feature table.
X

# Feature relevance

Some classification algorithms work better when unimportant features are removed. There are different algorithms for feature selection. `tsfresh` provides its own feature selection module that calculates "relevance" based on statistical hypothesis testing. Let's give it a try.

One caveat is that some catch22 features contain `NaN`. For the purpose of calculating relevance (and later on, mutual information), we fill these missing values with a dummy value `1e9`. We leave the `NaN`s alone when we export the data to files.

In [None]:
%%time

from tsfresh.feature_selection.relevance import calculate_relevance_table

# Index both the feature matrix and the target by sequence id, so the two are
# matched on the same labels instead of relying on `labels`' positional index
# happening to coincide with the sequence ids.
# NaNs (present in some catch22 features) are replaced by the dummy value 1e9.
relevance_features = X.fillna(1e9).drop(['subject'],axis=1).set_index('sequence').sort_index()
relevance_target = labels.set_index('sequence')['state'].sort_index()
rtable = calculate_relevance_table(relevance_features, relevance_target,
                                   ml_task='classification',n_jobs=4)
rtable.head()

According to the relevance table, about 50% of the features are relevant.

In [None]:
# Share of features flagged as relevant (mean of the `relevant` column).
rtable.relevant.mean()

In [None]:
# Build the feature -> relevant lookup once instead of re-indexing the relevance
# table for every column, and read the column names without copying X.
relevant_flags = rtable.set_index('feature')['relevant']
relevant_cols = [c for c in X.columns.drop(['sequence','subject']) if relevant_flags[c]]

# Feature importance by mutual information

We briefly evaluate feature importance using the statistical notion of mutual information (MI). It captures the relationship between a single feature and the target variable. It does not address whether the features are mutually dependent.

Here are the top 250 features according to MI.

In [None]:
from sklearn.feature_selection import mutual_info_classif

N = 250
# Feature matrix for MI; NaNs are replaced by the dummy value 1e9.
mi_features = X.drop(['sequence','subject'],axis=1).fillna(1e9)
features = mi_features.columns
# Look the labels up by sequence id so each row of X is paired with its own
# label, rather than assuming `labels` is stored in the same row order as X.
mi_target = labels.set_index('sequence').loc[X['sequence'], 'state'].to_numpy()
topN = pd.Series(mutual_info_classif(mi_features, mi_target,
                                     discrete_features=False,random_state=42),
                 index=features).sort_values(ascending=False).head(N)
topN

More than 75% of the top 250 survive the `tsfresh` relevance test.

In [None]:
# Fraction of the top-N MI features that are also in the tsfresh-relevant set.
sum(1 for feature in topN.index if feature in relevant_cols) / N

# Saving to files

OK, time to save the full feature set and the reduced set to files for later use.

In [None]:
# Write the full training feature table, then the subset limited to the id
# columns plus the tsfresh-relevant features; finally release the frame.
X.to_csv('tps042022_train.csv', index=False)
X[['sequence', 'subject', *relevant_cols]].to_csv('tps042022_train_r.csv', index=False)
del X

Finally, generate features for the test data.

In [None]:
%%time

# Extract the same feature table for the test sequences (timed by %%time above).
X_test = make_features(test_data)

In [None]:
# Inspect the extracted test feature table.
X_test

In [None]:
# Write the full test feature table, then the subset limited to the id columns
# plus the features selected as relevant on the training data.
X_test.to_csv('tps042022_test.csv', index=False)
X_test[['sequence', 'subject', *relevant_cols]].to_csv('tps042022_test_r.csv', index=False)