In [None]:
# Directory containing the competition CSV files read by the loader helpers
# (train.csv / test.csv, train_labels.csv, sample_submission.csv).
input_path = '../input/tabular-playground-series-apr-2022/'
# Output directory; not referenced by any of the cells visible here
# (presumably where submissions are written -- confirm against later cells).
output_path = './'

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

def load_raw_data(train_or_test='train'):
    """Read one of the raw competition CSV files into a DataFrame.

    Parameters
    ----------
    train_or_test : str, default 'train'
        Stem of the CSV file to read; the file ``<train_or_test>.csv`` is
        looked up inside the module-level ``input_path`` directory.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, exactly as returned by ``pd.read_csv``.
    """
    # os.path.join copes with ``input_path`` both with and without a trailing
    # separator (the previous f-string produced ".../dir//train.csv").
    file_name = os.path.join(input_path, f'{train_or_test}.csv')
    return pd.read_csv(file_name)

def load_label(train_or_test='train'):
    """Load the ``state`` column of the label file as a NumPy array.

    Parameters
    ----------
    train_or_test : str, default 'train'
        ``'train'`` reads ``train_labels.csv``; any other value reads
        ``sample_submission.csv`` (so for the test set the returned values
        are whatever placeholder ``state`` values that file contains).

    Returns
    -------
    numpy.ndarray
        Values of the ``state`` column, in file order.
    """
    label_file = 'train_labels.csv' if train_or_test == 'train' else 'sample_submission.csv'
    # Join instead of concatenating so a missing trailing slash on
    # ``input_path`` does not silently produce a wrong path.
    df = pd.read_csv(os.path.join(input_path, label_file))
    return df['state'].values

def competition_metric(y_true, y_score):
    """Return the ROC AUC of ``y_score`` against ``y_true``.

    Single place where the scoring function used throughout the notebook is
    chosen; it delegates directly to ``sklearn.metrics.roc_auc_score``.
    """
    auc = roc_auc_score(y_true, y_score)
    return auc

def evaluate(model, X, y):
    """Score a fitted classifier on ``(X, y)`` with the competition metric.

    ``model`` must expose ``predict_proba``; the second column of its output
    (index 1) is used as the score passed to ``competition_metric``.
    """
    second_column_scores = model.predict_proba(X)[:, 1]
    return competition_metric(y, second_column_scores)

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator
from scipy.stats import kurtosis
from tsfresh.feature_extraction.extraction import extract_features

class ElementaryExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer producing per-sequence summary statistics.

    The input frame is reshaped into blocks of ``SEQ_LEN`` consecutive rows
    over the columns ``sensor_00`` .. ``sensor_12``. For every block and every
    sensor a set of elementary statistics (of the raw signal and of its first
    and second differences) is computed, plus a few up/down-movement
    statistics for sensor 02. Only the columns named in ``features_to_use``
    are returned.

    Assumes the rows of each sequence are contiguous and that every sequence
    has exactly ``SEQ_LEN`` rows -- the reshape relies on it.
    """

    # Rows per sequence and number of sensor columns expected in the input.
    SEQ_LEN = 60
    N_SENSORS = 13

    # Subset (and order) of the computed features returned by ``transform``.
    features_to_use = ['med_abs_val_00',
        'max_abs_val_00',
        'sum_abs_diff_00',
        'l2_sum_00',
        'l2_sum_diff_00',
        'l2_sum_diff2_00',
        'kurt_00',
        'sm_00',
        'kurt_diff_00',
        'mean_01',
        'med_abs_val_01',
        'l2_sum_diff2_01',
        'sm_01',
        'iqr_diff_01',
        'mean_02',
        'med_abs_val_02',
        'max_abs_val_02',
        'med_abs_diff_02',
        'max_abs_diff_02',
        'l2_sum_diff_02',
        'l2_sum_diff2_02',
        'std_02',
        'kurt_02',
        'std_diff_02',
        'iqr_diff_02',
        'kurt_diff_02',
        'med_abs_val_03',
        'med_abs_diff_03',
        'max_abs_diff_03',
        'sum_abs_diff_03',
        'sm_03',
        'iqr_diff_03',
        'mean_04',
        'med_abs_val_04',
        'max_abs_val_04',
        'med_abs_diff_04',
        'max_abs_diff_04',
        'l2_sum_04',
        'l2_sum_diff2_04',
        'iqr_04',
        'kurt_04',
        'sm_04',
        'kurt_diff_04',
        'mean_05',
        'med_abs_diff_05',
        'sum_abs_diff_05',
        'sm_05',
        'mean_06',
        'med_abs_val_06',
        'med_abs_diff_06',
        'max_abs_diff_06',
        'l2_sum_diff2_06',
        'kurt_06',
        'iqr_diff_06',
        'kurt_diff_06',
        'med_abs_val_07',
        'sum_abs_diff_07',
        'l2_sum_07',
        'l2_sum_diff_07',
        'l2_sum_diff2_07',
        'iqr_07',
        'sm_07',
        'iqr_diff_07',
        'kurt_diff_07',
        'max_abs_diff_08',
        'sum_abs_diff_08',
        'l2_sum_08',
        'l2_sum_diff_08',
        'l2_sum_diff2_08',
        'iqr_08',
        'kurt_08',
        'iqr_diff_08',
        'kurt_diff_08',
        'mean_09',
        'max_abs_diff_09',
        'sum_abs_diff_09',
        'l2_sum_09',
        'l2_sum_diff2_09',
        'sm_09',
        'iqr_diff_09',
        'kurt_diff_09',
        'mean_10',
        'med_abs_val_10',
        'max_abs_diff_10',
        'l2_sum_diff2_10',
        'std_10',
        'kurt_10',
        'sm_10',
        'std_diff_10',
        'kurt_diff_10',
        'mean_11',
        'sum_abs_diff_11',
        'l2_sum_diff_11',
        'sm_11',
        'iqr_diff_11',
        'kurt_diff_11',
        'max_abs_diff_12',
        'sum_abs_diff_12',
        'l2_sum_12',
        'l2_sum_diff2_12',
        'iqr_12',
        'kurt_12',
        'sm_12',
        'kurt_diff_12',
        'up_sum_02',
        'up_max_02',
        'up_mean_02',
        'down_count_02']

    def fit(self, X, y=None):
        """No-op fit; nothing is learned from the data.

        ``y`` is accepted (and ignored) so the transformer also works when
        ``fit``/``fit_transform`` is called with labels, e.g. in a Pipeline.
        """
        return self

    def transform(self, X, y=None):
        """Compute the selected per-sequence features.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``sequence`` column and the contiguous column
            range ``sensor_00`` .. ``sensor_12``; its row count must be a
            multiple of ``SEQ_LEN``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            One row per block of ``SEQ_LEN`` input rows, indexed by the
            ``sequence`` value of the block's first row, with the columns
            listed in ``features_to_use``.
        """
        seq_no = X['sequence'].iloc[::self.SEQ_LEN]
        x = X.loc[:, 'sensor_00':'sensor_12'].values.reshape(-1, self.SEQ_LEN, self.N_SENSORS)
        features = dict()
        for i in range(self.N_SENSORS):
            tag = f'{i:0>2}'
            channel = x[:, :, i]
            # First and second differences along time; computed once and reused.
            diff = np.diff(channel, axis=1)
            diff2 = np.diff(diff, axis=1)
            abs_channel = np.abs(channel)
            abs_diff = np.abs(diff)
            mean = np.mean(channel, axis=1)
            std = np.std(channel, axis=1)

            # mean
            features[f'mean_{tag}'] = mean
            # median / maximum of absolute values
            features[f'med_abs_val_{tag}'] = np.median(abs_channel, axis=1)
            features[f'max_abs_val_{tag}'] = np.max(abs_channel, axis=1)
            # median / maximum / sum of absolute first differences
            features[f'med_abs_diff_{tag}'] = np.median(abs_diff, axis=1)
            features[f'max_abs_diff_{tag}'] = np.max(abs_diff, axis=1)
            features[f'sum_abs_diff_{tag}'] = np.sum(abs_diff, axis=1)
            # Euclidean (L2) norm of the signal, its first and second difference
            features[f'l2_sum_{tag}'] = np.linalg.norm(channel, axis=1)
            features[f'l2_sum_diff_{tag}'] = np.linalg.norm(diff, axis=1)
            features[f'l2_sum_diff2_{tag}'] = np.linalg.norm(diff2, axis=1)
            # spread of the raw signal: std, inter-quartile range, kurtosis
            features[f'std_{tag}'] = std
            features[f'iqr_{tag}'] = np.quantile(channel, 0.75, axis=1) - np.quantile(channel, 0.25, axis=1)
            features[f'kurt_{tag}'] = kurtosis(channel, axis=1)
            # std / |mean|; a zero mean yields nan/inf, which nan_to_num and
            # clip map to finite values, so the division warnings are silenced.
            with np.errstate(divide='ignore', invalid='ignore'):
                features[f'sm_{tag}'] = np.nan_to_num(std / np.abs(mean)).clip(-1e30, 1e30)

            # spread of the first difference
            features[f'std_diff_{tag}'] = np.std(diff, axis=1)
            features[f'iqr_diff_{tag}'] = np.quantile(diff, 0.75, axis=1) - np.quantile(diff, 0.25, axis=1)
            features[f'kurt_diff_{tag}'] = kurtosis(diff, axis=1)

        # Up/down movement statistics for sensor 02 only.
        diff_02 = np.diff(x[:, :, 2], axis=1)
        ups = np.clip(diff_02, 0, None)      # negative steps zeroed out
        downs = np.clip(diff_02, None, 0)    # positive steps zeroed out

        features['up_count_02'] = np.sum(diff_02 >= 0, axis=1)
        features['up_sum_02'] = np.sum(ups, axis=1)
        features['up_max_02'] = np.max(ups, axis=1)
        # Mean upward step = sum / count (previously divided the max by the count).
        with np.errstate(divide='ignore', invalid='ignore'):
            features['up_mean_02'] = np.nan_to_num(features['up_sum_02'] / features['up_count_02'], posinf=40)

        features['down_count_02'] = np.sum(diff_02 < 0, axis=1)
        features['down_sum_02'] = np.sum(downs, axis=1)
        # Largest downward step (previously a copy of down_sum_02).
        features['down_min_02'] = np.min(downs, axis=1)
        with np.errstate(divide='ignore', invalid='ignore'):
            features['down_mean_02'] = np.nan_to_num(features['down_sum_02'] / features['down_count_02'], neginf=-40)

        return pd.DataFrame(features, index=seq_no)[self.features_to_use]

In [None]:
from lightgbm import LGBMClassifier
from MyFeatureExtractor import MyFeatureExtractor

# Training sensor rows and the matching label array for cross-validation.
df = load_raw_data(train_or_test='train')
y = load_label(train_or_test='train')

def group_splitter(df, y, nfold=5, random_state=None, seq_len=60):
    """Yield subject-grouped train/validation splits.

    Every distinct value of ``df['subject']`` is randomly assigned to one of
    ``nfold`` folds, so all rows of a subject land on the same side of a
    split. Fold sizes are not balanced and a fold may even be empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data with a ``subject`` column, ``seq_len`` consecutive
        rows per label. Assumes all rows of one sequence share a subject
        (the label mask is taken from each sequence's first row).
    y : array-like
        One label per sequence, i.e. ``len(df) == len(y) * seq_len``.
    nfold : int, default 5
        Number of folds to generate.
    random_state : int or None, default None
        Seed passed to ``np.random.default_rng``.
    seq_len : int, default 60
        Number of rows in ``df`` that belong to each entry of ``y``.

    Yields
    ------
    tuple
        ``(df_train, df_val, y_train, y_val)`` for each fold.

    Raises
    ------
    ValueError
        If ``len(df)`` is not ``len(y) * seq_len``.
    """
    if len(df) != len(y) * seq_len:
        raise ValueError(
            f'expected len(df) == len(y) * seq_len, got {len(df)} rows '
            f'for {len(y)} labels with seq_len={seq_len}'
        )
    subject_nums = df['subject'].unique()
    rng = np.random.default_rng(random_state)
    subject_to_setnum = rng.integers(0, nfold, subject_nums.shape[0])
    for i in range(nfold):
        val_subjects = subject_nums[subject_to_setnum == i]
        mask_df_val = df['subject'].isin(val_subjects)
        # Plain ndarray mask: one flag per sequence, independent of df's index,
        # so it selects positionally from y whether y is an ndarray or a Series.
        mask_y_val = mask_df_val.values[::seq_len]
        yield df[~mask_df_val], df[mask_df_val], y[~mask_y_val], y[mask_y_val]
    
# Subject-grouped 5-fold cross-validation: build features for each split, fit a
# LightGBM classifier and print the train score followed by the validation score.
for df_train, df_val, y_train, y_val in group_splitter(df, y, nfold=5, random_state=42):
    extractors = [ElementaryExtractor(), MyFeatureExtractor()]
    X_train = pd.concat([extractor.fit_transform(df_train) for extractor in extractors], axis=1)
    X_val = pd.concat([extractor.transform(df_val) for extractor in extractors], axis=1)

    clf = LGBMClassifier(num_leaves=31, max_depth=-1, n_estimators=100, random_state=42)
    clf.fit(X_train.values, y_train)
    # The model is fitted on a bare ndarray, so score on ndarrays as well;
    # mixing ndarray fit with DataFrame predict gives the estimator
    # inconsistent input types (no feature names at fit, names at predict).
    print(evaluate(clf, X_train.values, y_train))
    print(evaluate(clf, X_val.values, y_val))