# Non-NN models

We should study these notebooks:

https://www.kaggle.com/code/jeroenvdd/tpsapr22-best-non-dl-model-tsflex-powershap?scriptVersionId=94240450

https://www.kaggle.com/code/ambrosm/tpsapr22-best-model-without-nn

In [1]:
input_path = '../input/tabular-playground-series-apr-2022/'
output_path = './'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def load_raw_data(train_or_test='train'):
    file_name = f'{input_path}/{train_or_test}.csv'
    df = pd.read_csv(file_name)
    return df

def load_label(train_or_test='train'):
    file_name = input_path + ('train_labels.csv' if train_or_test=='train' else 'sample_submission.csv')
    df = pd.read_csv(file_name)
    return df['state'].values

def competition_metric(y_true, y_score):
    return roc_auc_score(y_true, y_score)

def evaluate(model, X, y):
    return competition_metric(y, model.predict_proba(X)[:, 1])

def submit(arr):
    df = pd.read_csv(f'{input_path}/sample_submission.csv')
    df['state'] = arr
    df.to_csv(f'{output_path}/submission.csv', index=False)

In [3]:
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.stats import kurtosis
from tsfresh.feature_extraction.extraction import extract_features

class ElementaryExtractor(BaseEstimator, TransformerMixin):
    features_to_use = ['med_abs_val_00',
        'max_abs_val_00',
        'sum_abs_diff_00',
        'l2_sum_00',
        'l2_sum_diff_00',
        'l2_sum_diff2_00',
        'kurt_00',
        'sm_00',
        'kurt_diff_00',
        'mean_01',
        'med_abs_val_01',
        'l2_sum_diff2_01',
        'sm_01',
        'iqr_diff_01',
        'mean_02',
        'med_abs_val_02',
        'max_abs_val_02',
        'med_abs_diff_02',
        'max_abs_diff_02',
        'l2_sum_diff_02',
        'l2_sum_diff2_02',
        'std_02',
        'kurt_02',
        'std_diff_02',
        'iqr_diff_02',
        'kurt_diff_02',
        'med_abs_val_03',
        'med_abs_diff_03',
        'max_abs_diff_03',
        'sum_abs_diff_03',
        'sm_03',
        'iqr_diff_03',
        'mean_04',
        'med_abs_val_04',
        'max_abs_val_04',
        'med_abs_diff_04',
        'max_abs_diff_04',
        'l2_sum_04',
        'l2_sum_diff2_04',
        'iqr_04',
        'kurt_04',
        'sm_04',
        'kurt_diff_04',
        'mean_05',
        'med_abs_diff_05',
        'sum_abs_diff_05',
        'sm_05',
        'mean_06',
        'med_abs_val_06',
        'med_abs_diff_06',
        'max_abs_diff_06',
        'l2_sum_diff2_06',
        'kurt_06',
        'iqr_diff_06',
        'kurt_diff_06',
        'med_abs_val_07',
        'sum_abs_diff_07',
        'l2_sum_07',
        'l2_sum_diff_07',
        'l2_sum_diff2_07',
        'iqr_07',
        'sm_07',
        'iqr_diff_07',
        'kurt_diff_07',
        'max_abs_diff_08',
        'sum_abs_diff_08',
        'l2_sum_08',
        'l2_sum_diff_08',
        'l2_sum_diff2_08',
        'iqr_08',
        'kurt_08',
        'iqr_diff_08',
        'kurt_diff_08',
        'mean_09',
        'max_abs_diff_09',
        'sum_abs_diff_09',
        'l2_sum_09',
        'l2_sum_diff2_09',
        'sm_09',
        'iqr_diff_09',
        'kurt_diff_09',
        'mean_10',
        'med_abs_val_10',
        'max_abs_diff_10',
        'l2_sum_diff2_10',
        'std_10',
        'kurt_10',
        'sm_10',
        'std_diff_10',
        'kurt_diff_10',
        'mean_11',
        'sum_abs_diff_11',
        'l2_sum_diff_11',
        'sm_11',
        'iqr_diff_11',
        'kurt_diff_11',
        'max_abs_diff_12',
        'sum_abs_diff_12',
        'l2_sum_12',
        'l2_sum_diff2_12',
        'iqr_12',
        'kurt_12',
        'sm_12',
        'kurt_diff_12',
        'up_sum_02',
        'up_max_02',
        'up_mean_02',
        'down_count_02']
    
    def fit(self, X):
        return self
    
    def transform(self, X, y=None):
        seq_no = X['sequence'].iloc[::60]
        x = X.loc[:, 'sensor_00':'sensor_12'].values.reshape(-1, 60, 13)
        features = dict()
        for i in range(13):
            channel = x[:, :, i]
            # mean
            features[f'mean_{i:0>2}'] = np.mean(channel, axis=1)
            # median of absolute values
            features[f'med_abs_val_{i:0>2}'] = np.median(np.abs(channel), axis=1)
            # maximum of absolute values
            features[f'max_abs_val_{i:0>2}'] = np.max(np.abs(channel), axis=1)
            #median of absolute diff
            features[f'med_abs_diff_{i:0>2}'] = np.median(np.abs(np.diff(channel, axis=1)), axis=1)
            # maximum of absolute diff
            features[f'max_abs_diff_{i:0>2}'] = np.max(np.abs(np.diff(channel, axis=1)), axis=1)
            # absolute sum of difference
            features[f'sum_abs_diff_{i:0>2}'] = np.sum(np.abs(np.diff(channel, axis=1)), axis=1)
            # square sum
            features[f'l2_sum_{i:0>2}'] = np.linalg.norm(channel, axis=1)
            # square sum of difference
            features[f'l2_sum_diff_{i:0>2}'] = np.linalg.norm(np.diff(channel, axis=1), axis=1)
            # square sum of 2-diff
            features[f'l2_sum_diff2_{i:0>2}'] = np.linalg.norm(np.diff(np.diff(channel, axis=1), axis=1), axis=1)
            # standard deviation
            features[f'std_{i:0>2}'] = np.std(channel, axis=1)
            features[f'iqr_{i:0>2}'] = np.quantile(channel, 0.75, axis=1) - np.quantile(channel, 0.25, axis=1)
            features[f'kurt_{i:0>2}'] = kurtosis(channel, axis=1)
            features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)

            features[f'std_diff_{i:0>2}'] = np.std(np.diff(channel, axis=1), axis=1)
            features[f'iqr_diff_{i:0>2}'] = np.quantile(np.diff(channel, axis=1), 0.75, axis=1) - np.quantile(np.diff(channel, axis=1), 0.25, axis=1)
            features[f'kurt_diff_{i:0>2}'] = kurtosis(np.diff(channel, axis=1), axis=1)

        sensor_02 = x[:, :, 2]
        features[f'up_count_02'] = np.sum(np.diff(sensor_02, axis=1) >= 0, axis=1)
        features[f'up_sum_02'] = np.sum(np.clip(np.diff(sensor_02, axis=1), 0, None), axis=1)
        features[f'up_max_02'] = np.max(np.clip(np.diff(sensor_02, axis=1), 0, None), axis=1)
        features[f'up_mean_02'] = np.nan_to_num(features[f'up_max_02'] / features[f'up_count_02'], posinf=40)

        features[f'down_count_02'] = np.sum(np.diff(sensor_02, axis=1) < 0, axis=1)
        features[f'down_sum_02'] = np.sum(np.clip(np.diff(sensor_02, axis=1), None, 0), axis=1)
        features[f'down_min_02'] = np.sum(np.clip(np.diff(sensor_02, axis=1), None, 0), axis=1)
        features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
        
        return pd.DataFrame(features, index=seq_no)[self.features_to_use]
    
class TsfreshExtractor(BaseEstimator, TransformerMixin):
    sensorwise_fcs = [{'agg_autocorrelation': [{'f_agg': 'var', 'maxlag': 40}],
        'agg_linear_trend': [{'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'max'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'skewness': [{}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 1, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 3, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10},
        {'coeff': 9, 'k': 10}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'spkt_welch_density': [{'coeff': 2}],
        'variation_coefficient': [{}]},
        {'absolute_sum_of_changes': [{}],
        'agg_linear_trend': [{'attr': 'intercept', 'chunk_len': 10, 'f_agg': 'var'},
        {'attr': 'intercept', 'chunk_len': 50, 'f_agg': 'var'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'var'},
        {'attr': 'stderr', 'chunk_len': 5, 'f_agg': 'max'},
        {'attr': 'stderr', 'chunk_len': 5, 'f_agg': 'var'}],
        'change_quantiles': [{'ql': 0.0, 'qh': 0.4, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.0, 'qh': 1.0, 'isabs': False, 'f_agg': 'var'},
        {'ql': 0.0, 'qh': 1.0, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.2, 'qh': 0.6, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 0.6, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 1.0, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.4, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.4, 'qh': 1.0, 'isabs': True, 'f_agg': 'mean'},
        {'ql': 0.6, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.6, 'qh': 1.0, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.8, 'qh': 1.0, 'isabs': False, 'f_agg': 'mean'}],
        'cid_ce': [{'normalize': True}],
        'cwt_coefficients': [{'widths': (2, 5, 10, 20), 'coeff': 1, 'w': 2}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'abs'}],
        'matrix_profile': [{'threshold': 0.98, 'feature': 'min'}],
        'partial_autocorrelation': [{'lag': 2}],
        'permutation_entropy': [{'tau': 1, 'dimension': 4}],
        'quantile': [{'q': 0.1}],
        'ratio_value_number_to_time_series_length': [{}],
        'spkt_welch_density': [{'coeff': 2}],
        'standard_deviation': [{}],
        'time_reversal_asymmetry_statistic': [{'lag': 1}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}]},
        {'agg_linear_trend': [{'attr': 'rvalue', 'chunk_len': 10, 'f_agg': 'min'},
        {'attr': 'rvalue', 'chunk_len': 10, 'f_agg': 'var'},
        {'attr': 'rvalue', 'chunk_len': 5, 'f_agg': 'max'},
        {'attr': 'rvalue', 'chunk_len': 5, 'f_agg': 'var'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'max'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'min'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 10, 'k': 10},
        {'coeff': 2, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 2}, {'lag': 6}],
        'cid_ce': [{'normalize': True}],
        'energy_ratio_by_chunks': [{'num_segments': 10, 'segment_focus': 1},
        {'num_segments': 10, 'segment_focus': 3},
        {'num_segments': 10, 'segment_focus': 5},
        {'num_segments': 10, 'segment_focus': 6},
        {'num_segments': 10, 'segment_focus': 7},
        {'num_segments': 10, 'segment_focus': 9}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}, {'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 0, 'attr': 'abs'},
        {'coeff': 0, 'attr': 'real'},
        {'coeff': 3, 'attr': 'abs'},
        {'coeff': 4, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'friedrich_coefficients': [{'coeff': 1, 'm': 3, 'r': 30},
        {'coeff': 3, 'm': 3, 'r': 30}],
        'index_mass_quantile': [{'q': 0.2}, {'q': 0.3}, {'q': 0.7}],
        'kurtosis': [{}],
        'large_standard_deviation': [{'r': 0.25}],
        'number_peaks': [{'n': 10}, {'n': 5}],
        'partial_autocorrelation': [{'lag': 4}, {'lag': 9}],
        'permutation_entropy': [{'tau': 1, 'dimension': 5}],
        'ratio_beyond_r_sigma': [{'r': 0.5}, {'r': 1}, {'r': 2}],
        'skewness': [{}],
        'spkt_welch_density': [{'coeff': 2}],
        'time_reversal_asymmetry_statistic': [{'lag': 2}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'cwt_coefficients': [{'widths': (2, 5, 10, 20), 'coeff': 10, 'w': 20}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}],
        'fft_coefficient': [{'coeff': 0, 'attr': 'abs'},
        {'coeff': 4, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'partial_autocorrelation': [{'lag': 9}],
        'permutation_entropy': [{'tau': 1, 'dimension': 4}]},
        {'agg_linear_trend': [{'attr': 'rvalue', 'chunk_len': 5, 'f_agg': 'max'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'agg_linear_trend': [{'attr': 'intercept', 'chunk_len': 5, 'f_agg': 'min'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 1, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'change_quantiles': [{'ql': 0.0, 'qh': 0.8, 'isabs': True, 'f_agg': 'mean'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'abs'},
        {'coeff': 1, 'attr': 'imag'}],
        'number_crossing_m': [{'m': 0}],
        'skewness': [{}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'kurtosis': [{}]},
        {'agg_linear_trend': [{'attr': 'intercept', 'chunk_len': 50, 'f_agg': 'var'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 3, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10},
        {'coeff': 8, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 6}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'quantile': [{'q': 0.9}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'agg_autocorrelation': [{'f_agg': 'var', 'maxlag': 40}],
        'agg_linear_trend': [{'attr': 'rvalue', 'chunk_len': 10, 'f_agg': 'var'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10}, {'coeff': 10, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'pvalue'}, {'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 1}, {'lag': 2}, {'lag': 5}, {'lag': 6}],
        'change_quantiles': [{'ql': 0.2, 'qh': 0.8, 'isabs': False, 'f_agg': 'mean'},
        {'ql': 0.2, 'qh': 0.8, 'isabs': True, 'f_agg': 'var'}],
        'cid_ce': [{'normalize': True}],
        'fft_aggregated': [{'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 4, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'friedrich_coefficients': [{'coeff': 3, 'm': 3, 'r': 30}],
        'kurtosis': [{}],
        'linear_trend': [{'attr': 'pvalue'}],
        'partial_autocorrelation': [{'lag': 3}, {'lag': 4}, {'lag': 9}],
        'permutation_entropy': [{'tau': 1, 'dimension': 4}],
        'quantile': [{'q': 0.2}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 4, 'k': 10},
        {'coeff': 5, 'k': 10},
        {'coeff': 6, 'k': 10},
        {'coeff': 7, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}, {'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 1, 'attr': 'imag'}],
        'spkt_welch_density': [{'coeff': 2}]},
        {'agg_linear_trend': [{'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'max'},
        {'attr': 'stderr', 'chunk_len': 10, 'f_agg': 'min'}],
        'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 1, 'k': 10},
        {'coeff': 10, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 6, 'k': 10}],
        'augmented_dickey_fuller': [{'attr': 'usedlag'}],
        'autocorrelation': [{'lag': 1}, {'lag': 2}],
        'binned_entropy': [{'max_bins': 10}],
        'change_quantiles': [{'ql': 0.0, 'qh': 0.2, 'isabs': False, 'f_agg': 'var'},
        {'ql': 0.0, 'qh': 1.0, 'isabs': True, 'f_agg': 'var'},
        {'ql': 0.4, 'qh': 0.6, 'isabs': True, 'f_agg': 'mean'}],
        'fft_aggregated': [{'aggtype': 'kurtosis'}, {'aggtype': 'skew'}],
        'fft_coefficient': [{'coeff': 0, 'attr': 'abs'},
        {'coeff': 1, 'attr': 'abs'},
        {'coeff': 22, 'attr': 'abs'},
        {'coeff': 23, 'attr': 'abs'},
        {'coeff': 24, 'attr': 'abs'},
        {'coeff': 25, 'attr': 'abs'}],
        'fourier_entropy': [{'bins': 100}],
        'kurtosis': [{}],
        'partial_autocorrelation': [{'lag': 2}, {'lag': 3}],
        'ratio_beyond_r_sigma': [{'r': 2}],
        'spkt_welch_density': [{'coeff': 2}]}]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        df_tsf = []
        for i in range(13):
            sensor_name = f'sensor_{i:0>2}'
            ts = X[['sequence', 'step', sensor_name]]
            features = extract_features(
                ts, 
                self.sensorwise_fcs[i],
                column_id='sequence', 
                column_sort='step'
            )
            df_tsf.append( features )
        df_tsf = pd.concat(df_tsf, axis=1)
        return df_tsf
    

def group_splitter(df, nfold=5, random_state=None):
    subject_nums = df['subject'].unique()
    rng = np.random.default_rng(random_state)
    subject_to_setnum = rng.integers(0, nfold, subject_nums.shape[0])
    for i in range(nfold):
        val_subjects = subject_nums[subject_to_setnum == i]
        mask_df_val = df['subject'].isin(val_subjects)
        mask_y_val = mask_df_val.iloc[::60]
        yield mask_df_val, mask_y_val

In [4]:
from sklearn.base import TransformerMixin, BaseEstimator

class CorrExtractor(BaseEstimator, TransformerMixin):
    #def __init__(self):
        #self.features = {}
        #self.list_df = []
        #self.length = 0
        #self.corrs = pd.DataFrame()
    def fit(self, X):
        return self

    def transform(self, X, y=None):
        def df_autocorr(df, lag=1, axis=0):
            # Compute full-sample column-wise autocorrelation for a DataFrame.
            return df.apply(lambda col: col.autocorr(lag), axis=axis)
        features = {}
        list_df = []
        for n,g in X.groupby('sequence'):
            list_df.append(g)
        length = len(list_df)
        corrs = pd.DataFrame()
        for k in range(0,length):
            features[k] = {}
            for i in range(0,13):
                features[k][f'autocorr_{i:0>2}'] = df_autocorr(list_df[k])[i+3]
                features[k][f'autocorr_diff_{i:0>2}'] = df_autocorr(list_df[k].diff().drop(list_df[k].diff().index[[0]]))[i+3]
                for j in range(0,13):
                    if i>j:
                        features[k][f'corr_{j:0>2}_{i:0>2}'] = list_df[k].loc[:, 'sensor_00':'sensor_12'].corr().iat[i,j]
                    elif j>i:
                        features[k][f'corr_diff_{i:0>2}_{j:0>2}'] = list_df[k].diff().drop(list_df[k].diff().index[[0]]).loc[:, 'sensor_00':'sensor_12'].corr().iat[i,j]
            corrs = pd.concat([corrs, pd.DataFrame(features[k], index = [list_df[k].iloc[0,0]])], axis=0)
        return corrs.replace(np.nan, 0)
    
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
! pip install pyts
from pyts.transformation import BagOfPatterns as BOP

class MBOP(BaseEstimator, TransformerMixin):
    """Multivariate Bag of patterns.
    Given multivariate time series , MBOP splits indidual time series, BOP-transform 
    them and gather them in one dataframe data frame. further note is documnet of BOP.
    
    BOP
    This algorithm uses a sliding window to extract subsequences from the
    time series and transforms each subsequence into a word using the
    Piecewise Aggregate Approximation and the Symbolic Aggregate approXimation
    algorithms. Thus it transforms each time series into a bag of words.
    Then it derives the frequencies of each word for each time series.

    Parameters of MBOP
    ----------
    
    n_channels : non negative int (default = 13)
        number of time series
        
    m_occur: positive float strictly under 1 (default = 0.01)
        parameter for reduction of dimension of features.
        ignores feature of pattern of trivial occurrence.
        while normal BOP produces features every pattern,
        MBOP will drop column with mean of occurence less than m_occur 
        i.e. patterns that appear less than (sample * m_occur)
    
    
    
    
    window_size : int or float (default = 0.5)
        Length of the sliding window. If float, it represents
        a percentage of the size of each time series and must be
        between 0 and 1.

    word_size : int or float (default = 0.5)
        Length of the words. If float, it represents
        a percentage of the length of the sliding window and must be
        between 0. and 1.

    n_bins : int (default = 4)
        The number of bins to produce. It must be between 2 and
        ``min(window_size, 26)``.

    strategy : 'uniform', 'quantile' or 'normal' (default = 'normal')
        Strategy used to define the widths of the bins:

        - 'uniform': All bins in each sample have identical widths
        - 'quantile': All bins in each sample have the same number of points
        - 'normal': Bin edges are quantiles from a standard normal distribution

    numerosity_reduction : bool (default = True)
        If True, delete sample-wise all but one occurence of back to back
        identical occurences of the same words.

    window_step : int or float (default = 1)
        Step of the sliding window. If float, it represents the percentage of
        the size of each time series and must be between 0 and 1. The step of
        sliding window will be computed as
        ``ceil(window_step * n_timestamps)``.
    
    
    norm_mean 
    is not supported for initiallizing MBOP
        

    norm_std : bool (default = True)
    is not supported for initiallizing MBOP

    sparse : bool (default = True)
        Return a sparse matrix if True, else return an array.

    overlapping : bool (default = True)
        If True, time points may belong to two bins when decreasing the size
        of the subsequence with the Piecewise Aggregate Approximation
        algorithm. If False, each time point belong to one single bin, but
        the size of the bins may vary.

    alphabet :
    is not supported for initiallizing MBOP    
    
    """
    def __init__(self, n_channels=13, m_occur=0.01,
                 window_size=4 , word_size=4 ,  n_bins =6, strategy = "quantile" ,  sparse  = False, 
                 numerosity_reduction=True, window_step=1 ,overlapping=True  ):      
        self.window_size = window_size
        self.word_size = word_size
        self.n_bins = n_bins
        self.strategy = strategy
        self.numerosity_reduction = numerosity_reduction
        self.window_step = window_step
        self.sparse = sparse
        self.overlapping = overlapping    # BOP parameters til here
        self.m_occur= m_occur           # minimum of mean occurence columns with mean lower than minimum occur will be dropped out for size of feature 
        self.n_channels = n_channels
        self.col_list=[]   #list of columns index with nontrivial occurrence called in self.reducer in self.fitting
        self.MACHINES=[]   #stores n_channel Bop machines in this list
        self.idces=[]
        self.ft_X=None
    
        
    def reducer (self, X,save_trans_X=False): 
        """part of fitting. function used in self.fit
        create instance variable reducing dimension of features.
        
        X :  3d arrary with (sample, time, channel)"""
        temp_col_list=[]
        trans_X_list=[]
        for i in range(self.n_channels):
            #acutal transform happens here.
            #thus when fitting take save_trans_X= True then we can use self.recycle to recycle this result 
            temp_col=self.MACHINES[i].transform(X[:,:,i])    
            self.col_list.append(temp_col.mean(axis=0)>self.m_occur)
            if save_trans_X:
                trans_X_list.append(temp_col[:,self.col_list[i]])                
        if save_trans_X:
            self.X_of_fit_list=trans_X_list
        return self
    
        
    def fit(self, X,save_trans_X=False): 
        """
        Fits BOP machines, given suitable dataframe.
        There are n_channel number of different BOP Machines to fit.
        Note that fit() actualy calculates result of fit_transform(X) 
        during it's process.
        Hence if user is willing to save this calculation
        take save_trans_X=True
        then return of fit_transform(X) will be stored as instance
        variable ft_X
        
        X : DataFrame with first column : index,
                           last n_channel column: data of interest 
            For fit takes first and last n_channel-columns of data
            and transform data with index by first column of X.
             (n_samples*time rows, alpha) alpha: integer larger than n_channel.
             
            MBOP considers first column to be index and
            last n_channel columns to be data of interest
            fit will take first column as index of resultant dataframe
            must make sure that X.iloc[:,0] is series of index and
            last n_channel columns store data of interest
        
        Creates
        -------
        (when save_trans_X=True) self.tf_X : result of fit_transform(X)         
        """
        self.MACHINES=[]  
        self.col_list=[]                #resets col_list
        data_3d=X.iloc[:,-self.n_channels :].to_numpy().reshape(-1,60,13)      #separating data and information array and reshaping by (n_sample , -1)
        seq=(X.iloc[:,0].to_numpy().reshape(-1,60))[:,0]  #seq is 1d array 
        for i in range(self.n_channels):
            self.MACHINES.append(BOP(
                window_size=self.window_size, word_size=self.word_size,
            n_bins=self.n_bins, strategy=self.strategy, sparse=self.sparse,
            numerosity_reduction=self.numerosity_reduction,
            window_step=self.window_step, overlapping=self.overlapping))
            self.MACHINES[i].fit(data_3d[:,:,i])
            print("{}-th machine fitted".format(i))
        print("reducing")
        self.reducer(data_3d,save_trans_X=save_trans_X) #makes object variable for transform (collecting index of nontrivial columns)
        if save_trans_X:
            self.ft_X=self.recycle(seq)
            del self.X_of_fit_list
        print("all fitted")
        return self
    
    def recycle(self,seq=None): 
        """part of fitting
           activates when parameter of fit is True
        """
        print("fit_transform result has been saved as instance variable ft_X")
        return pd.DataFrame(np.concatenate(self.X_of_fit_list,axis=1),index=seq)
    
    def gods_sake(self):
        print("help me")
        return self
        
    
    def transform(self, X,y=None,train_transform=False):
        """
        Transforms last n_channels-columns of X to (n_smaple, n_feature) DataFrame,
        with index from first column of X.
        If train_transform=True, method will try to find previously calculated
        result while fitting.  
        X : dataframe with first column holding index of X_new(reurn of transform) and
            last-n_channels-columns holding data to transform.
            Need to make sure first and last n_channel columns are correct
            
        y : ignored

        train_transform : If is True and save_trans_X was True when fitting, 
                          retrieves transform result (Default  = False )
                          deletes ft_X
        Returns
        ------
        X_new : dataframe indexed with first column of X (n_samples, n_features)
        """
        if train_transform:
            if type(self.ft_X)!=type(None):
                transform_X=self.ft_X.copy()
                del self.ft_X
                self.ft_X=None
                print("previous calculation ft_X deleted")
                return transform_X

        temp_col_list=[]
        data_3d=X.iloc[:,-self.n_channels :].to_numpy().reshape(-1,60,13)
        seq=(X.iloc[:,0].to_numpy().reshape(-1,60))[:,0]  #seq is 1d array 
        for i in range(self.n_channels):
            temp_col=self.MACHINES[i].transform(data_3d[:,:,i])
            print("{}-th channel finished".format(i))
            print("number of pure features of {} BOP={}".format(i,self.col_list[i].shape))
            temp_col_list.append(temp_col[:,self.col_list[i]])  #not temp_col list but some indexing because we are reducing by dropping trivial patterns
            del temp_col
        transform_X=np.concatenate(temp_col_list,axis=1)
        print("shape={}".format(transform_X.shape))
        del temp_col_list
        return pd.DataFrame(transform_X,index=seq)
        
    
    
    def fit_transform(self,X,y=None):
        """Faster than fitting and transforming"""
        self.fit(X,save_trans_X=True)
        transform_X=self.ft_X.copy()
        del self.ft_X
        return transform_X
    
    
    
    
    
    def refinement(self,trans_train_X,new_m_occur=0.011): 
        """
        method defined for search of better m_occur (larger than m_occur)
        used to find larger m_occur parameter i.e. larger refinement , smaller dimension of feature.
        mostly 1percent works fine
        
        trans_train_X : The transform of X used to fit BOPs.i.e. fit_transform (X).
                        Need to input transform of exactly same dataframe that 
                        has been used for fitting
        new_m_occur  :  float larger than self.m_occur or list of such floats.
                        
        Creates
        -------
        idces : list of indices(Int64Index) corresponding to inputed list or even single new_m_occur (pandas.core.indexes.numeric.Int64Index)
        
        
        Example
        --------
        Being Int64Index and being a subindex of columns of fit_transform(X), can input directly.
        >>train_X=fit_transform(train_X)
        >>test_X=transform(test_data)
        >>for i in self.idces:
        >>    clf.fit(train_X[i],train_y)
        >>    clf.score(test_X[i],test_y)
        """
        self.idces=[]
        try:
            for i in new_m_occur:
                if i<self.m_occur:
                    print("new minimum occurrence has to be larger than previous one")
                    pass
                else:
                    self.idces.append((trans_train_X.mean()[(trans_train_X.mean()>i)
                                                  ]).index)
            print("instance variable created: idces list of new minimum mean occurence")
        except TypeError:
            if new_m_occur<self.m_occur:
                print("new minimum occurrence has to be larger than previous one")
                return None
            else:
                print("instance variable created: list with single new minimum mean occurence")
                self.idces.append((trans_train_X.mean()[(trans_train_X.mean()>new_m_occur)
                                                                              ]).index)

Collecting pyts
  Downloading pyts-0.12.0-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: pyts
Successfully installed pyts-0.12.0
[0m

In [5]:
df = load_raw_data('train')
y = load_label('train')

In [None]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_union
from sklearn.metrics import classification_report
cv_scores = []

extractors = [CorrExtractor(), ElementaryExtractor(), TsfreshExtractor(), MBOP()]
extractor = make_union(*extractors)

for mask_df_val, mask_y_val in group_splitter(df, nfold=5, random_state=42):
    df_train, y_train = df[~mask_df_val], y[~mask_y_val]
    df_val, y_val = df[mask_df_val], y[mask_y_val]
    
    X_train = extractor.fit_transform(df_train)
    X_val = extractor.transform(df_val)
    print(X_train.shape, X_val.shape)
    
    clf = LGBMClassifier(num_leaves=31, max_depth=-1, n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    print(evaluate(clf, X_train, y_train))
    print(evaluate(clf, X_val, y_val))
    print(classification_report(y_val, (clf.predict(X_val) >= 0.5).astype(int), digits=4 ))
    
    cv_scores.append(evaluate(clf, X_val, y_val))
print(f'5-fold CV score: {np.mean(cv_scores):.4f}')

Feature Extraction: 100%|██████████| 10/10 [02:09<00:00, 12.91s/it]
Feature Extraction: 100%|██████████| 10/10 [00:59<00:00,  5.98s/it]
Feature Extraction: 100%|██████████| 10/10 [01:35<00:00,  9.52s/it]
Feature Extraction: 100%|██████████| 10/10 [01:54<00:00, 11.43s/it]
Feature Extraction: 100%|██████████| 10/10 [04:34<00:00, 27.41s/it]
Feature Extraction: 100%|██████████| 10/10 [01:08<00:00,  6.87s/it]
Feature Extraction: 100%|██████████| 10/10 [00:58<00:00,  5.85s/it]
Feature Extraction: 100%|██████████| 10/10 [02:11<00:00, 13.17s/it]
Feature Extraction: 100%|██████████| 10/10 [00:11<00:00,  1.14s/it]
Feature Extraction: 100%|██████████| 10/10 [02:08<00:00, 12.87s/it]
Feature Extraction: 100%|██████████| 10/10 [04:28<00:00, 26.86s/it]
Feature Extraction: 100%|██████████| 10/10 [02:01<00:00, 12.16s/it]
Feature Extraction: 100%|██████████| 10/10 [02:39<00:00, 15.93s/it]


0-th machine fitted
1-th machine fitted
2-th machine fitted
3-th machine fitted
4-th machine fitted
5-th machine fitted
6-th machine fitted
7-th machine fitted
8-th machine fitted
9-th machine fitted
10-th machine fitted
11-th machine fitted
12-th machine fitted
reducing
fit_transform result has been saved as instance variable ft_X
all fitted


Feature Extraction: 100%|██████████| 10/10 [00:33<00:00,  3.31s/it]
Feature Extraction: 100%|██████████| 10/10 [00:14<00:00,  1.47s/it]
Feature Extraction: 100%|██████████| 10/10 [00:24<00:00,  2.40s/it]
Feature Extraction: 100%|██████████| 10/10 [00:30<00:00,  3.02s/it]
Feature Extraction: 100%|██████████| 10/10 [01:09<00:00,  6.92s/it]
Feature Extraction: 100%|██████████| 10/10 [00:18<00:00,  1.84s/it]
Feature Extraction: 100%|██████████| 10/10 [00:15<00:00,  1.57s/it]
Feature Extraction: 100%|██████████| 10/10 [00:34<00:00,  3.49s/it]
Feature Extraction: 100%|██████████| 10/10 [00:03<00:00,  3.12it/s]
Feature Extraction: 100%|██████████| 10/10 [00:34<00:00,  3.41s/it]
Feature Extraction: 100%|██████████| 10/10 [01:06<00:00,  6.66s/it]
Feature Extraction: 100%|██████████| 10/10 [00:31<00:00,  3.11s/it]
Feature Extraction: 100%|██████████| 10/10 [00:40<00:00,  4.08s/it]


0-th channel finished
number of pure features of 0 BOP=(74,)
1-th channel finished
number of pure features of 1 BOP=(76,)
2-th channel finished
number of pure features of 2 BOP=(130,)
3-th channel finished
number of pure features of 3 BOP=(75,)
4-th channel finished
number of pure features of 4 BOP=(972,)
5-th channel finished
number of pure features of 5 BOP=(310,)
6-th channel finished
number of pure features of 6 BOP=(74,)
7-th channel finished
number of pure features of 7 BOP=(74,)
8-th channel finished
number of pure features of 8 BOP=(79,)
9-th channel finished
number of pure features of 9 BOP=(77,)
10-th channel finished
number of pure features of 10 BOP=(1102,)
11-th channel finished
number of pure features of 11 BOP=(72,)
12-th channel finished
number of pure features of 12 BOP=(1287,)
shape=(5151, 406)
(20817, 909) (5151, 909)
0.997048055973935
0.9679046719638748
              precision    recall  f1-score   support

           0       0.93      0.88      0.91      2592
     

Feature Extraction: 100%|██████████| 10/10 [02:18<00:00, 13.87s/it]
Feature Extraction: 100%|██████████| 10/10 [01:02<00:00,  6.25s/it]
Feature Extraction: 100%|██████████| 10/10 [01:39<00:00,  9.98s/it]
Feature Extraction: 100%|██████████| 10/10 [02:04<00:00, 12.47s/it]
Feature Extraction: 100%|██████████| 10/10 [04:52<00:00, 29.23s/it]
Feature Extraction: 100%|██████████| 10/10 [01:15<00:00,  7.52s/it]
Feature Extraction: 100%|██████████| 10/10 [01:02<00:00,  6.25s/it]
Feature Extraction: 100%|██████████| 10/10 [02:20<00:00, 14.07s/it]
Feature Extraction: 100%|██████████| 10/10 [00:12<00:00,  1.20s/it]
Feature Extraction: 100%|██████████| 10/10 [02:18<00:00, 13.82s/it]
Feature Extraction: 100%|██████████| 10/10 [04:39<00:00, 27.95s/it]
Feature Extraction: 100%|██████████| 10/10 [02:10<00:00, 13.07s/it]
Feature Extraction: 100%|██████████| 10/10 [02:46<00:00, 16.69s/it]


0-th machine fitted
1-th machine fitted
2-th machine fitted
3-th machine fitted
4-th machine fitted
5-th machine fitted
6-th machine fitted
7-th machine fitted
8-th machine fitted
9-th machine fitted
10-th machine fitted
11-th machine fitted
12-th machine fitted
reducing
fit_transform result has been saved as instance variable ft_X
all fitted


Feature Extraction: 100%|██████████| 10/10 [00:28<00:00,  2.87s/it]
Feature Extraction: 100%|██████████| 10/10 [00:12<00:00,  1.28s/it]
Feature Extraction: 100%|██████████| 10/10 [00:21<00:00,  2.12s/it]
Feature Extraction: 100%|██████████| 10/10 [00:26<00:00,  2.65s/it]
Feature Extraction: 100%|██████████| 10/10 [01:00<00:00,  6.09s/it]
Feature Extraction: 100%|██████████| 10/10 [00:15<00:00,  1.52s/it]
Feature Extraction: 100%|██████████| 10/10 [00:12<00:00,  1.27s/it]
Feature Extraction: 100%|██████████| 10/10 [00:29<00:00,  2.90s/it]
Feature Extraction: 100%|██████████| 10/10 [00:02<00:00,  3.50it/s]
Feature Extraction: 100%|██████████| 10/10 [00:28<00:00,  2.84s/it]
Feature Extraction: 100%|██████████| 10/10 [00:57<00:00,  5.76s/it]
Feature Extraction: 100%|██████████| 10/10 [00:27<00:00,  2.73s/it]
Feature Extraction: 100%|██████████| 10/10 [00:35<00:00,  3.51s/it]


0-th channel finished
number of pure features of 0 BOP=(76,)
1-th channel finished
number of pure features of 1 BOP=(81,)
2-th channel finished
number of pure features of 2 BOP=(132,)
3-th channel finished
number of pure features of 3 BOP=(85,)
4-th channel finished
number of pure features of 4 BOP=(775,)
5-th channel finished
number of pure features of 5 BOP=(164,)
6-th channel finished
number of pure features of 6 BOP=(75,)
7-th channel finished
number of pure features of 7 BOP=(81,)
8-th channel finished
number of pure features of 8 BOP=(79,)
9-th channel finished
number of pure features of 9 BOP=(81,)
10-th channel finished
number of pure features of 10 BOP=(1065,)
11-th channel finished
number of pure features of 11 BOP=(73,)
12-th channel finished
number of pure features of 12 BOP=(1290,)
shape=(4599, 406)
(21369, 909) (4599, 909)
0.9967701620475397
0.9665739281037276
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      2412
    

Feature Extraction: 100%|██████████| 10/10 [02:05<00:00, 12.53s/it]
Feature Extraction: 100%|██████████| 10/10 [00:54<00:00,  5.48s/it]
Feature Extraction: 100%|██████████| 10/10 [01:31<00:00,  9.11s/it]
Feature Extraction: 100%|██████████| 10/10 [01:51<00:00, 11.15s/it]
Feature Extraction: 100%|██████████| 10/10 [04:19<00:00, 25.96s/it]
Feature Extraction: 100%|██████████| 10/10 [01:07<00:00,  6.71s/it]
Feature Extraction: 100%|██████████| 10/10 [00:56<00:00,  5.70s/it]
Feature Extraction: 100%|██████████| 10/10 [02:06<00:00, 12.69s/it]
Feature Extraction: 100%|██████████| 10/10 [00:10<00:00,  1.09s/it]
Feature Extraction: 100%|██████████| 10/10 [02:02<00:00, 12.21s/it]
Feature Extraction: 100%|██████████| 10/10 [04:05<00:00, 24.56s/it]
Feature Extraction: 100%|██████████| 10/10 [02:02<00:00, 12.21s/it]
Feature Extraction: 100%|██████████| 10/10 [02:34<00:00, 15.46s/it]


0-th machine fitted
1-th machine fitted
2-th machine fitted
3-th machine fitted
4-th machine fitted
5-th machine fitted
6-th machine fitted
7-th machine fitted
8-th machine fitted
9-th machine fitted
10-th machine fitted
11-th machine fitted
12-th machine fitted
reducing
fit_transform result has been saved as instance variable ft_X
all fitted


Feature Extraction: 100%|██████████| 10/10 [00:37<00:00,  3.76s/it]
Feature Extraction: 100%|██████████| 10/10 [00:16<00:00,  1.64s/it]
Feature Extraction: 100%|██████████| 10/10 [00:27<00:00,  2.79s/it]
Feature Extraction: 100%|██████████| 10/10 [00:32<00:00,  3.30s/it]
Feature Extraction: 100%|██████████| 10/10 [01:17<00:00,  7.79s/it]
Feature Extraction: 100%|██████████| 10/10 [00:20<00:00,  2.03s/it]
Feature Extraction: 100%|██████████| 10/10 [00:17<00:00,  1.75s/it]
Feature Extraction: 100%|██████████| 10/10 [00:37<00:00,  3.77s/it]
Feature Extraction: 100%|██████████| 10/10 [00:03<00:00,  2.87it/s]
Feature Extraction: 100%|██████████| 10/10 [00:36<00:00,  3.65s/it]
Feature Extraction: 100%|██████████| 10/10 [01:14<00:00,  7.50s/it]
Feature Extraction: 100%|██████████| 10/10 [00:36<00:00,  3.66s/it]
Feature Extraction: 100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


0-th channel finished
number of pure features of 0 BOP=(75,)
1-th channel finished
number of pure features of 1 BOP=(81,)
2-th channel finished
number of pure features of 2 BOP=(127,)
3-th channel finished
number of pure features of 3 BOP=(84,)
4-th channel finished
number of pure features of 4 BOP=(994,)
5-th channel finished
number of pure features of 5 BOP=(325,)
6-th channel finished
number of pure features of 6 BOP=(74,)
7-th channel finished
number of pure features of 7 BOP=(81,)
8-th channel finished
number of pure features of 8 BOP=(79,)
9-th channel finished
number of pure features of 9 BOP=(81,)
10-th channel finished
number of pure features of 10 BOP=(1115,)
11-th channel finished
number of pure features of 11 BOP=(71,)
12-th channel finished
number of pure features of 12 BOP=(1289,)
shape=(6004, 406)
(19964, 909) (6004, 909)
0.9977424039223814
0.9510285631120259
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      2789
    

In [None]:
clf = LGBMClassifier(num_leaves=31, max_depth=4, n_estimators=100)

df_train_final = df
y_train_final = y
X_train_final = extractor.fit_transform(df_train_final)
clf.fit(X_train_final, y_train_final)

df_test_final = load_raw_data('test')
X_test_final = extractor.transform(df_test_final)
y_pred = clf.predict_proba(X_test_final)[:, 1]
submit(y_pred)