In [1]:
# Directory containing the competition CSV files that the loaders below read
# (train.csv, train_labels.csv, sample_submission.csv).
input_path = '../input/tabular-playground-series-apr-2022/'
# Output directory (the current working directory).
output_path = './'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def load_raw_data(train_or_test='train'):
    """Read ``<input_path>/<train_or_test>.csv`` into a DataFrame.

    Args:
        train_or_test: Base name of the CSV file without its extension
            (``'train'`` by default).

    Returns:
        The parsed file as a ``pandas.DataFrame``.
    """
    return pd.read_csv(f'{input_path}/{train_or_test}.csv')

def load_label(train_or_test='train'):
    """Return the ``state`` column of the label file as a NumPy array.

    Args:
        train_or_test: ``'train'`` reads ``train_labels.csv``; any other value
            reads ``sample_submission.csv`` (whose ``state`` column is returned
            as-is).

    Returns:
        ``numpy.ndarray`` with one entry per row of the chosen file.
    """
    file_name = 'train_labels.csv' if train_or_test == 'train' else 'sample_submission.csv'
    # Insert the separator explicitly (as load_raw_data does) so the path is valid
    # whether or not input_path ends with a slash.
    labels = pd.read_csv(f'{input_path}/{file_name}')
    return labels['state'].values

def competition_metric(y_true, y_score):
    """Score predictions with the competition metric (ROC AUC).

    Args:
        y_true: Ground-truth labels.
        y_score: Predicted scores, in the same order as ``y_true``.

    Returns:
        The value returned by ``roc_auc_score(y_true, y_score)``.
    """
    auc = roc_auc_score(y_true, y_score)
    return auc

def evaluate(model, X, y):
    """Compute the competition metric for a fitted classifier.

    Args:
        model: Object exposing ``predict_proba(X)`` that returns a 2-D array.
        X: Feature matrix passed to ``predict_proba``.
        y: Ground-truth labels aligned with the rows of ``X``.

    Returns:
        ``competition_metric`` applied to ``y`` and column 1 of the
        ``predict_proba`` output.
    """
    scores = model.predict_proba(X)[:, 1]
    return competition_metric(y, scores)

In [3]:
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.stats import kurtosis
from tsfresh.feature_extraction.extraction import extract_features

class ElementaryExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer producing per-sequence summary statistics.

    ``transform`` reshapes the columns ``sensor_00`` .. ``sensor_12`` to
    ``(n_sequences, 60, 13)``, i.e. it assumes every sequence occupies 60
    consecutive rows of the input frame. For each of the 13 channels it computes
    statistics of the values and of their first/second differences, plus a few
    rise/fall statistics for sensor 02, and returns the columns named in
    ``features_to_use``.
    """

    # Columns returned by transform, in this order. Every name must be a key
    # produced in transform.
    features_to_use = ['med_abs_val_00',
        'max_abs_val_00',
        'sum_abs_diff_00',
        'l2_sum_00',
        'l2_sum_diff_00',
        'l2_sum_diff2_00',
        'kurt_00',
        'sm_00',
        'kurt_diff_00',
        'mean_01',
        'med_abs_val_01',
        'l2_sum_diff2_01',
        'sm_01',
        'iqr_diff_01',
        'mean_02',
        'med_abs_val_02',
        'max_abs_val_02',
        'med_abs_diff_02',
        'max_abs_diff_02',
        'l2_sum_diff_02',
        'l2_sum_diff2_02',
        'std_02',
        'kurt_02',
        'std_diff_02',
        'iqr_diff_02',
        'kurt_diff_02',
        'med_abs_val_03',
        'med_abs_diff_03',
        'max_abs_diff_03',
        'sum_abs_diff_03',
        'sm_03',
        'iqr_diff_03',
        'mean_04',
        'med_abs_val_04',
        'max_abs_val_04',
        'med_abs_diff_04',
        'max_abs_diff_04',
        'l2_sum_04',
        'l2_sum_diff2_04',
        'iqr_04',
        'kurt_04',
        'sm_04',
        'kurt_diff_04',
        'mean_05',
        'med_abs_diff_05',
        'sum_abs_diff_05',
        'sm_05',
        'mean_06',
        'med_abs_val_06',
        'med_abs_diff_06',
        'max_abs_diff_06',
        'l2_sum_diff2_06',
        'kurt_06',
        'iqr_diff_06',
        'kurt_diff_06',
        'med_abs_val_07',
        'sum_abs_diff_07',
        'l2_sum_07',
        'l2_sum_diff_07',
        'l2_sum_diff2_07',
        'iqr_07',
        'sm_07',
        'iqr_diff_07',
        'kurt_diff_07',
        'max_abs_diff_08',
        'sum_abs_diff_08',
        'l2_sum_08',
        'l2_sum_diff_08',
        'l2_sum_diff2_08',
        'iqr_08',
        'kurt_08',
        'iqr_diff_08',
        'kurt_diff_08',
        'mean_09',
        'max_abs_diff_09',
        'sum_abs_diff_09',
        'l2_sum_09',
        'l2_sum_diff2_09',
        'sm_09',
        'iqr_diff_09',
        'kurt_diff_09',
        'mean_10',
        'med_abs_val_10',
        'max_abs_diff_10',
        'l2_sum_diff2_10',
        'std_10',
        'kurt_10',
        'sm_10',
        'std_diff_10',
        'kurt_diff_10',
        'mean_11',
        'sum_abs_diff_11',
        'l2_sum_diff_11',
        'sm_11',
        'iqr_diff_11',
        'kurt_diff_11',
        'max_abs_diff_12',
        'sum_abs_diff_12',
        'l2_sum_12',
        'l2_sum_diff2_12',
        'iqr_12',
        'kurt_12',
        'sm_12',
        'kurt_diff_12',
        'up_sum_02',
        'up_max_02',
        'up_mean_02',
        'down_count_02']

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state.

        ``y`` is accepted (and ignored) so that ``fit(X, y)`` and
        ``fit_transform(X, y)`` work, matching ``TsfreshExtractor.fit``.
        """
        return self

    def transform(self, X, y=None):
        """Compute the selected summary features for every sequence in ``X``.

        Args:
            X: DataFrame with a ``sequence`` column and the contiguous column
                range ``sensor_00`` .. ``sensor_12``; rows are taken in blocks of
                60 per sequence.
            y: Ignored.

        Returns:
            DataFrame with one row per block of 60 input rows, indexed by the
            ``sequence`` value of the block's first row, with columns
            ``features_to_use``.
        """
        seq_no = X['sequence'].iloc[::60]
        x = X.loc[:, 'sensor_00':'sensor_12'].values.reshape(-1, 60, 13)
        features = dict()
        for i in range(13):
            tag = f'{i:0>2}'
            channel = x[:, :, i]
            # Intermediates shared by several features below.
            abs_val = np.abs(channel)
            diff = np.diff(channel, axis=1)
            abs_diff = np.abs(diff)
            mean = np.mean(channel, axis=1)
            std = np.std(channel, axis=1)

            # mean
            features[f'mean_{tag}'] = mean
            # median / maximum of absolute values
            features[f'med_abs_val_{tag}'] = np.median(abs_val, axis=1)
            features[f'max_abs_val_{tag}'] = np.max(abs_val, axis=1)
            # median / maximum / sum of absolute first differences
            features[f'med_abs_diff_{tag}'] = np.median(abs_diff, axis=1)
            features[f'max_abs_diff_{tag}'] = np.max(abs_diff, axis=1)
            features[f'sum_abs_diff_{tag}'] = np.sum(abs_diff, axis=1)
            # Euclidean (L2) norm of the values, of the first differences and
            # of the second differences
            features[f'l2_sum_{tag}'] = np.linalg.norm(channel, axis=1)
            features[f'l2_sum_diff_{tag}'] = np.linalg.norm(diff, axis=1)
            features[f'l2_sum_diff2_{tag}'] = np.linalg.norm(np.diff(diff, axis=1), axis=1)
            # standard deviation, interquartile range, kurtosis of the values
            features[f'std_{tag}'] = std
            q25, q75 = np.quantile(channel, [0.25, 0.75], axis=1)
            features[f'iqr_{tag}'] = q75 - q25
            features[f'kurt_{tag}'] = kurtosis(channel, axis=1)
            # std / |mean|. A zero mean yields inf or nan, which nan_to_num and
            # clip turn into finite values, so the floating-point warnings for
            # that division are suppressed.
            with np.errstate(divide='ignore', invalid='ignore'):
                features[f'sm_{tag}'] = np.nan_to_num(std / np.abs(mean)).clip(-1e30, 1e30)

            # standard deviation, interquartile range, kurtosis of the first differences
            features[f'std_diff_{tag}'] = np.std(diff, axis=1)
            dq25, dq75 = np.quantile(diff, [0.25, 0.75], axis=1)
            features[f'iqr_diff_{tag}'] = dq75 - dq25
            features[f'kurt_diff_{tag}'] = kurtosis(diff, axis=1)

        # Rise/fall statistics of sensor 02, based on its first differences.
        diff_02 = np.diff(x[:, :, 2], axis=1)
        rises = np.clip(diff_02, 0, None)
        falls = np.clip(diff_02, None, 0)

        features['up_count_02'] = np.sum(diff_02 >= 0, axis=1)
        features['up_sum_02'] = np.sum(rises, axis=1)
        features['up_max_02'] = np.max(rises, axis=1)
        # NOTE(review): this is max / count, not sum / count. It is kept as-is
        # because 'up_mean_02' is one of the selected output features and changing
        # it would alter the values returned to callers - confirm the intent.
        with np.errstate(divide='ignore', invalid='ignore'):
            features['up_mean_02'] = np.nan_to_num(features['up_max_02'] / features['up_count_02'], posinf=40)

        features['down_count_02'] = np.sum(diff_02 < 0, axis=1)
        features['down_sum_02'] = np.sum(falls, axis=1)
        # Largest single drop; this was previously computed with np.sum and so
        # merely duplicated down_sum_02.
        features['down_min_02'] = np.min(falls, axis=1)
        # Mean drop = total drop / number of drops. A zero count gives 0/0 -> nan,
        # which nan_to_num maps to 0.
        with np.errstate(divide='ignore', invalid='ignore'):
            features['down_mean_02'] = np.nan_to_num(features['down_sum_02'] / features['down_count_02'], neginf=-40)

        return pd.DataFrame(features, index=seq_no)[self.features_to_use]
    
class TsfreshExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a fixed set of tsfresh features per sensor.

    ``sensorwise_fcs[i]`` is the tsfresh feature-calculator configuration
    (calculator name -> list of parameter dicts) applied to column
    ``sensor_{i:02d}``.
    """

    # One configuration dict per sensor, in the order sensor_00 .. sensor_12.
    sensorwise_fcs = [{'agg_autocorrelation': [{'f_agg': 'var', 'maxlag': 40}],
        'agg_linear_trend': [{'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'max'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'skewness': [{}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 1, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 3, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10},
        {'coeff': 9, 'k': 10}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'spkt_welch_density': [{'coeff': 2}],
        'variation_coefficient': [{}]},
        {'absolute_sum_of_changes': [{}],
        'agg_linear_trend': [{'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'var'},
        {'attr': 'intercept', 'chunk_len': 50, 'f_agg': 'var'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'var'},
        {'attr': 'stderr', 'chunk_len': 5, 'f_agg': 'max'},
        {'attr': 'stderr', 'chunk_len': 5, 'f_agg': 'var'}],
        'change_quantiles': [{'ql': 0.0, 'qh': 0.4, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.0, 'qh': 1.0, 'isabs': False, 'f_agg': 'var'},
        {'ql': 0.0, 'qh': 1.0, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.2, 'qh': 0.6, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 0.6, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 1.0, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.4, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.4, 'qh': 1.0, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.6, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.6, 'qh': 1.0, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.8, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'}],
        'cid_ce': [{'normalize': True}],
        'cwt_coefficients': [{'widths': (2, 5, 10, 20), 'coeff': 1, 'w': 2}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'abs'}],
        'matrix_profile': [{'threshold': 0.98, 'feature': 'min'}],
        'partial_autocorrelation': [{'lag': 2}],
        'permutation_entropy': [{'tau': 1, 'dimension': 4}],
        'quantile': [{'q': 0.1}],
        'ratio_value_number_to_time_series_length': [{}],
        'spkt_welch_density': [{'coeff': 2}],
        'standard_deviation': [{}],
        'time_reversal_asymmetry_statistic': [{'lag': 1}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}]},
        {'agg_linear_trend': [{'attr': 'rvalue', 'chunk_len': 10, 'f_agg': 'min'},
        {'attr': 'rvalue', 'chunk_len': 10, 'f_agg': 'var'},
        {'attr': 'rvalue', 'chunk_len': 5, 'f_agg': 'max'},
        {'attr': 'rvalue', 'chunk_len': 5, 'f_agg': 'var'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'max'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'min'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 10, 'k': 10},
        {'coeff': 2, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 2}, {'lag': 6}],
        'cid_ce': [{'normalize': True}],
        'energy_ratio_by_chunks': [{'num_segments': 10, 'segment_focus': 1},
        {'num_segments': 10, 'segment_focus': 3},
        {'num_segments': 10, 'segment_focus': 5},
        {'num_segments': 10, 'segment_focus': 6},
        {'num_segments': 10, 'segment_focus': 7},
        {'num_segments': 10, 'segment_focus': 9}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}, {'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 0, 'attr': 'abs'},
        {'coeff': 0, 'attr': 'real'},
        {'coeff': 3, 'attr': 'abs'},
        {'coeff': 4, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'friedrich_coefficients': [{'coeff': 1, 'm': 3, 'r': 30},
        {'coeff': 3, 'm': 3, 'r': 30}],
        'index_mass_quantile': [{'q': 0.2}, {'q': 0.3}, {'q': 0.7}],
        'kurtosis': [{}],
        'large_standard_deviation': [{'r': 0.25}],
        'number_peaks': [{'n': 10}, {'n': 5}],
        'partial_autocorrelation': [{'lag': 4}, {'lag': 9}],
        'permutation_entropy': [{'tau': 1, 'dimension': 5}],
        'ratio_beyond_r_sigma': [{'r': 0.5}, {'r': 1}, {'r': 2}],
        'skewness': [{}],
        'spkt_welch_density': [{'coeff': 2}],
        'time_reversal_asymmetry_statistic': [{'lag': 2}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'cwt_coefficients': [{'widths': (2, 5, 10, 20), 'coeff': 10, 'w': 20}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}],
        'fft_coefficient': [{'coeff': 0, 'attr': 'abs'},
        {'coeff': 4, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'partial_autocorrelation': [{'lag': 9}],
        'permutation_entropy': [{'tau': 1, 'dimension': 4}]},
        {'agg_linear_trend': [{'attr': 'rvalue', 'chunk_len': 5, 'f_agg': 'max'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'agg_linear_trend': [{'attr': 'intercept', 'chunk_len': 5, 'f_agg': 'min'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 1, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'change_quantiles': [{'ql': 0.0, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'abs'},
        {'coeff': 1, 'attr': 'imag'}],
        'number_crossing_m': [{'m': 0}],
        'skewness': [{}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'kurtosis': [{}]},
        {'agg_linear_trend': [{'attr': 'intercept', 'chunk_len': 50, 'f_agg': 'var'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 3, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10},
        {'coeff': 8, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 6}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'quantile': [{'q': 0.9}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'agg_autocorrelation': [{'f_agg': 'var', 'maxlag': 40}],
        'agg_linear_trend': [{'attr': 'rvalue', 'chunk_len': 10, 'f_agg': 'var'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10}, {'coeff': 10, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'pvalue'}, {'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 1}, {'lag': 2}, {'lag': 5}, {'lag': 6}],
        'change_quantiles': [{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 0.8, 'isabs': True, 'f_agg': 'var'}],
        'cid_ce': [{'normalize': True}],
        'fft_aggregated': [{'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 4, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'friedrich_coefficients': [{'coeff': 3, 'm': 3, 'r': 30}],
        'kurtosis': [{}],
        'linear_trend': [{'attr': 'pvalue'}],
        'partial_autocorrelation': [{'lag': 3}, {'lag': 4}, {'lag': 9}],
        'permutation_entropy': [{'tau': 1, 'dimension': 4}],
        'quantile': [{'q': 0.2}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}, {'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'agg_linear_trend': [{'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'max'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'min'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 1, 'k': 10},
        {'coeff': 10, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 1}, {'lag': 2}],
        'binned_entropy': [{'max_bins': 10}],
        'change_quantiles': [{'ql': 0.0, 'qh': 0.2, 'isabs': False, 'f_agg': 'var'},
        {'ql': 0.0, 'qh': 1.0, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.4, 'qh': 0.6, 'isabs': True, 'f_agg': 'mean'}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}, {'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 0, 'attr': 'abs'},
        {'coeff': 1, 'attr': 'abs'},
        {'coeff': 22, 'attr': 'abs'},
        {'coeff': 23, 'attr': 'abs'},
        {'coeff': 24, 'attr': 'abs'},
        {'coeff': 25, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'kurtosis': [{}],
        'partial_autocorrelation': [{'lag': 2}, {'lag': 3}],
        'ratio_beyond_r_sigma': [{'r': 2}],
        'spkt_welch_density': [{'coeff': 2}]}]

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def transform(self, X, y=None):
        """Run tsfresh on each sensor column and join the results column-wise.

        Args:
            X: DataFrame with ``sequence`` (series id), ``step`` (sort key) and
                the columns ``sensor_00`` .. ``sensor_12``.
            y: Ignored.

        Returns:
            The 13 per-sensor ``extract_features`` results concatenated along
            the column axis.
        """
        per_sensor = [
            extract_features(
                X[['sequence', 'step', f'sensor_{idx:0>2}']],
                self.sensorwise_fcs[idx],
                column_id='sequence',
                column_sort='step',
            )
            for idx in range(13)
        ]
        return pd.concat(per_sensor, axis=1)

In [4]:
from lightgbm import LGBMClassifier
from MyFeatureExtractor import MyFeatureExtractor

# Raw training rows (train.csv) and the matching label array (the 'state'
# column of train_labels.csv).
df = load_raw_data('train')
y = load_label('train')

def group_splitter(df, nfold=5, random_state=None):
    """Yield validation masks for a subject-grouped split into ``nfold`` folds.

    Every distinct ``subject`` is assigned independently and uniformly at random
    to one of the folds, so fold sizes are not balanced.

    Args:
        df: DataFrame with a ``subject`` column.
        nfold: Number of folds (and of yielded mask pairs).
        random_state: Seed passed to ``np.random.default_rng``.

    Yields:
        ``(mask_df_val, mask_y_val)`` per fold: a boolean Series marking the
        rows of ``df`` whose subject belongs to the fold, and the same Series
        restricted to every 60th row (one entry per block of 60 rows).
    """
    subjects = df['subject'].unique()
    fold_of_subject = np.random.default_rng(random_state).integers(0, nfold, subjects.shape[0])
    for fold in range(nfold):
        in_val = df['subject'].isin(subjects[fold_of_subject == fold])
        yield in_val, in_val.iloc[::60]
    
# Subject-grouped 5-fold cross-validation with the elementary and tsfresh features.
for mask_df_val, mask_y_val in group_splitter(df, nfold=5, random_state=42):
    # Rows / labels outside the validation mask form the training part of the fold.
    df_train, df_val = df[~mask_df_val], df[mask_df_val]
    y_train, y_val = y[~mask_y_val], y[mask_y_val]

    # Each extractor's output is placed side by side to build the feature matrix.
    extractors = [ElementaryExtractor(), TsfreshExtractor()]
    X_train = pd.concat(
        [extractor.fit_transform(df_train) for extractor in extractors],
        axis=1,
    )
    X_val = pd.concat(
        [extractor.transform(df_val) for extractor in extractors],
        axis=1,
    )
    print(X_train.shape, X_val.shape)

    clf = LGBMClassifier(num_leaves=31, max_depth=-1, n_estimators=100, random_state=42)
    clf.fit(X_train.values, y_train)
    # Training AUC first, then validation AUC.
    for features, labels in ((X_train, y_train), (X_val, y_val)):
        print(evaluate(clf, features, labels))

  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:21<00:00,  1.82it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:11<00:00,  3.49it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:19<00:00,  2.04it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.13it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:40<00:00,  1.01s/it]
Feature Extraction: 100%|██████████████████████

(20817, 321) (5151, 321)
0.9962656843776108
0.9670068784102586


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:21<00:00,  1.87it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:10<00:00,  3.78it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.11it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.14it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:40<00:00,  1.00s/it]
Feature Extraction: 100%|██████████████████████

(21369, 321) (4599, 321)
0.9960108021296352
0.9650876845766595


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:21<00:00,  1.88it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:10<00:00,  3.87it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:17<00:00,  2.23it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:17<00:00,  2.24it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:40<00:00,  1.02s/it]
Feature Extraction: 100%|██████████████████████

(19964, 321) (6004, 321)
0.9971422543442928
0.9500655485586287


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:20<00:00,  1.96it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:10<00:00,  3.81it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.13it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.18it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:38<00:00,  1.04it/s]
Feature Extraction: 100%|██████████████████████

(20714, 321) (5254, 321)
0.9963278422051424
0.965404456326952


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:20<00:00,  1.92it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:10<00:00,  3.83it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.12it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:18<00:00,  2.18it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:39<00:00,  1.01it/s]
Feature Extraction: 100%|██████████████████████

(21008, 321) (4960, 321)
0.9963736898606519
0.9640361098966531


In [None]:
from MBOP import MBOP
from CorrExtractor import CorrExtractor  

# Same subject-grouped 5-fold cross-validation, now with all four extractors.
for mask_df_val, mask_y_val in group_splitter(df, nfold=5, random_state=42):
    # Rows / labels outside the validation mask form the training part of the fold.
    df_train, df_val = df[~mask_df_val], df[mask_df_val]
    y_train, y_val = y[~mask_y_val], y[mask_y_val]

    # Each extractor's output is placed side by side to build the feature matrix.
    extractors = [CorrExtractor(), ElementaryExtractor(), TsfreshExtractor(), MBOP()]
    X_train = pd.concat(
        [extractor.fit_transform(df_train) for extractor in extractors],
        axis=1,
    )
    X_val = pd.concat(
        [extractor.transform(df_val) for extractor in extractors],
        axis=1,
    )
    print(X_train.shape, X_val.shape)

    clf = LGBMClassifier(num_leaves=31, max_depth=-1, n_estimators=100, random_state=42)
    clf.fit(X_train.values, y_train)
    # Training AUC first, then validation AUC.
    for features, labels in ((X_train, y_train), (X_val, y_val)):
        print(evaluate(clf, features, labels))

  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:22<00:00,  1.81it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:11<00:00,  3.60it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:19<00:00,  2.04it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:19<00:00,  2.10it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:41<00:00,  1.03s/it]
Feature Extraction: 100%|██████████████████████

0-th machine fitted
1-th machine fitted
2-th machine fitted
3-th machine fitted
4-th machine fitted
5-th machine fitted
6-th machine fitted
7-th machine fitted
8-th machine fitted
9-th machine fitted
10-th machine fitted
11-th machine fitted
12-th machine fitted
reducing
fit_transform result has been saved as instance variable ft_X
all fitted


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:10<00:00,  3.91it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:04<00:00,  8.78it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:06<00:00,  6.05it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:06<00:00,  6.07it/s]
Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [00:11<00:00,  3.40it/s]
Feature Extraction: 100%|██████████████████████

0-th channel finished
number of pure features of 0 BOP=(74,)
1-th channel finished
number of pure features of 1 BOP=(76,)
2-th channel finished
number of pure features of 2 BOP=(130,)
3-th channel finished
number of pure features of 3 BOP=(75,)
4-th channel finished
number of pure features of 4 BOP=(972,)
5-th channel finished
number of pure features of 5 BOP=(310,)
6-th channel finished
number of pure features of 6 BOP=(74,)
7-th channel finished
number of pure features of 7 BOP=(74,)
8-th channel finished
number of pure features of 8 BOP=(79,)
9-th channel finished
number of pure features of 9 BOP=(77,)
10-th channel finished
number of pure features of 10 BOP=(1102,)
11-th channel finished
number of pure features of 11 BOP=(72,)
12-th channel finished
number of pure features of 12 BOP=(1287,)
shape=(5151, 406)
(20817, 909) (5151, 909)
0.9970522374592594
0.9671036682442504


In [None]:
from sklearn.model_selection import GroupKFold
from powershap import PowerShap

# Only the first of the two subject-grouped splits is used: the subjects it
# marks are used to fit the extractors.
mask_df_fit, mask_y_fit = next(group_splitter(df, nfold=2, random_state=42))
df_fit, y_fit = df[mask_df_fit], y[mask_y_fit]
extractors = [CorrExtractor(), ElementaryExtractor(), TsfreshExtractor(), MBOP()]
for extractor in extractors:
    extractor.fit(df_fit)

# The remaining subjects are transformed and handed to the feature selector.
df_trans, y_trans = df[~mask_df_fit], y[~mask_y_fit]
splitter = GroupKFold(n_splits=5)
selector = PowerShap()

X_trans = pd.concat([extractor.transform(df_trans) for extractor in extractors], axis=1)
# One subject id per block of 60 rows, used as the grouping key for GroupKFold.
groups = df_trans['subject'].iloc[::60]
selector.fit(X_trans, y_trans, cv=list(splitter.split(X_trans, y_trans, groups)))
selector.get_support()