# Non-NN models

In [1]:
# Directory the input CSVs are read from. Note that some helpers build paths
# as f'{input_path}/...' while others use input_path + name, so the trailing
# slash here yields a doubled '/' in the former.
input_path = './input/'
# Directory every result/submission CSV is written into.
# NOTE(review): nothing in this notebook creates it — presumably it must exist
# beforehand; confirm.
output_path = './output/pure_non_nn_results/'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def load_raw_data(train_or_test='train'):
    """Read the raw CSV of one split from the input directory.

    Parameters
    ----------
    train_or_test : str
        Base name of the CSV (without extension) located under ``input_path``;
        defaults to ``'train'``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<input_path>/<train_or_test>.csv``.
    """
    return pd.read_csv(f'{input_path}/{train_or_test}.csv')

def load_label(train_or_test='train'):
    """Return the ``state`` column of the label file for one split.

    ``'train'`` reads ``train_labels.csv``; any other value reads
    ``sample_submission.csv`` (so the values returned for the test split are
    whatever the sample submission contains).

    Returns
    -------
    numpy.ndarray
        The ``state`` column as a plain array.
    """
    if train_or_test == 'train':
        csv_name = 'train_labels.csv'
    else:
        csv_name = 'sample_submission.csv'
    labels = pd.read_csv(input_path + csv_name)
    return labels['state'].values

def competition_metric(y_true, y_score):
    """Score predictions with ROC AUC.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_score : array-like
        Predicted scores for the same samples.

    Returns
    -------
    float
        The value returned by ``roc_auc_score(y_true, y_score)``.
    """
    auc = roc_auc_score(y_true, y_score)
    return auc

def evaluate(model, X, y):
    """Compute the competition metric of a fitted classifier on ``(X, y)``.

    The score passed to the metric is the second column of
    ``model.predict_proba(X)``.
    """
    second_column_scores = model.predict_proba(X)[:, 1]
    return competition_metric(y, second_column_scores)

def to_csv(arr,train_or_test='train',name=None):
    """Dump an array of predictions to a ``*_v0.csv`` file under ``output_path``.

    Parameters
    ----------
    arr : array-like
        Data wrapped in a DataFrame and written without the index.
    train_or_test : str
        Split tag embedded in the file name.
    name : str, optional
        When it is exactly a ``str``, it is prepended to the file name as
        ``<name>_<train_or_test>_v0.csv``; otherwise the file is
        ``<train_or_test>_v0.csv``.
    """
    frame = pd.DataFrame(arr)
    if type(name) != str:
        # No usable prefix: fall back to the split-only file name.
        frame.to_csv(f'{output_path}/{train_or_test}_v0.csv', index = False )
        return
    frame.to_csv(f'{output_path}/{name}_{train_or_test}_v0.csv', index = False )

def ans_to_csv(arr,train_or_test='train',name=None):
    """Write ``arr`` as the ``state`` column of the split's label template.

    The template is ``train_labels.csv`` for ``'train'`` and
    ``sample_submission.csv`` otherwise; its other columns are kept as read.
    The result goes to ``<output_path>/<name>_<train_or_test>.csv`` when
    ``name`` is exactly a ``str``, else to ``<output_path>/<train_or_test>.csv``.
    """
    template_csv = 'train_labels.csv' if train_or_test=='train' else 'sample_submission.csv'
    answers = pd.read_csv(input_path + template_csv).assign(state=arr)
    if type(name) != str:
        # No usable prefix: fall back to the split-only file name.
        answers.to_csv(f'{output_path}/{train_or_test}.csv', index = False )
        return
    answers.to_csv(f'{output_path}/{name}_{train_or_test}.csv', index = False )

def submit(arr):
    """Write a submission file with ``arr`` as the ``state`` column.

    Reads ``sample_submission.csv`` from ``input_path`` as the template,
    replaces its ``state`` column with ``arr`` and saves the result as
    ``submission.csv`` under ``output_path`` (index not written).
    """
    submission = pd.read_csv(f'{input_path}/sample_submission.csv').assign(state=arr)
    submission.to_csv(f'{output_path}/submission.csv', index=False)

In [3]:


def group_splitter(df, nfold=5, random_state=None, seq_len=60):
    """Yield subject-grouped validation masks for ``nfold`` folds.

    Every unique value of ``df['subject']`` is assigned to one fold by an
    independent uniform draw, so all rows of a subject land in the same fold.
    Because the draws are independent, fold sizes are not balanced and a fold
    can end up with no subjects at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data with a ``subject`` column.
    nfold : int
        Number of folds to generate.
    random_state : int or None
        Seed passed to ``np.random.default_rng`` for reproducible assignment.
    seq_len : int
        Stride used to derive the per-sequence mask from the per-row mask
        (previously hard-coded to 60). Assumes each sequence occupies
        ``seq_len`` consecutive rows of ``df`` and that labels follow the same
        order — TODO confirm against the data.

    Yields
    ------
    (pandas.Series, pandas.Series)
        ``mask_df_val``: boolean mask over the rows of ``df`` selecting the
        validation subjects of the fold; ``mask_y_val``: every ``seq_len``-th
        entry of that mask, i.e. one flag per sequence.
    """
    subject_col = df['subject']  # looked up once instead of once per fold
    subject_nums = subject_col.unique()
    rng = np.random.default_rng(random_state)
    # One fold index in [0, nfold) per unique subject.
    subject_to_setnum = rng.integers(0, nfold, subject_nums.shape[0])
    for i in range(nfold):
        val_subjects = subject_nums[subject_to_setnum == i]
        mask_df_val = subject_col.isin(val_subjects)
        # Sample the row mask at the first row of every sequence.
        mask_y_val = mask_df_val.iloc[::seq_len]
        yield mask_df_val, mask_y_val

In [4]:
from tbr_apr_mds import CorrExtractor ,ElementaryExtractor, TsfreshExtractor,MBOP

In [5]:
def short_test(x,y,n):
    """Return a reduced data set for quick experiments.

    Parameters
    ----------
    x : pandas.DataFrame
        Row-level data with a ``sequence`` column.
    y : sequence
        Labels; only the first ``n`` are kept.
    n : int
        Rows whose ``sequence`` value is below ``n`` are kept.

    Returns
    -------
    (pandas.DataFrame, sequence)
        The filtered rows of ``x`` and ``y[:n]``.
        NOTE(review): the two only line up if sequence ids start at 0 and
        ``y`` is ordered by sequence id — confirm.
    """
    # Filter the frame that was passed in; the previous version ignored ``x``
    # and silently read the notebook-global ``df`` instead.
    return x.loc[x.sequence < n], y[:n]

In [6]:
# Raw row-level data for both splits.
df, df_test = (load_raw_data(split) for split in ('train', 'test'))
# Labels for both splits. For 'test', load_label reads the 'state' column of
# sample_submission.csv, so y_test holds that file's values.
y, y_test = (load_label(split) for split in ('train', 'test'))

In [7]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_union
from sklearn.metrics import classification_report
# Five LightGBM variants differing in seed, depth limit and boosting type;
# each one fills one column of the prediction arrays below.
clfs = [
    LGBMClassifier(n_estimators=200, random_state=55, max_depth=10, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=62, max_depth=10, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=1, max_depth=-1, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=42, max_depth=-1, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=42, max_depth=-1, boosting_type="goss", metric="auc"),
]

# Feature pipeline: the outputs of all extractors are concatenated column-wise.
extractors = [
    ElementaryExtractor(),
    TsfreshExtractor(),
    MBOP(window_size=4, word_size=4, n_bins=10, m_occur=0.006),
]
extractor = make_union(*extractors)

# One row per label, one column per classifier.
train_preds_array = np.zeros((len(y), len(clfs)))
test_preds_array = np.zeros((len(y_test), len(clfs)))

# Only the first fold of the 5-fold subject split is used as hold-out, so take
# it directly instead of iterating over (and skipping) the remaining folds.
mask_df_val, mask_y_val = next(group_splitter(df, nfold=5, random_state=21))
df_train, y_train = df[~mask_df_val], y[~mask_y_val]
df_val, y_val = df[mask_df_val], y[mask_y_val]

X_train = extractor.fit_transform(df_train)
X_val = extractor.transform(df_val)
print(X_train.shape, X_val.shape)

# Features of the complete train and test sets do not depend on the classifier,
# so they are extracted once here rather than once per classifier.
# NOTE: X_train_F covers every row of df, including the rows the classifiers
# are fitted on, so train_preds_array mixes in-sample and hold-out predictions.
X_train_F = extractor.transform(df)
X_test_F = extractor.transform(df_test)

for i, clf in enumerate(clfs):
    # NOTE(review): the `verbose` argument of fit() is version-dependent in
    # LightGBM — confirm against the installed version.
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)
    train_preds_array[:, i] = clf.predict_proba(X_train_F)[:, 1]
    test_preds_array[:, i] = clf.predict_proba(X_test_F)[:, 1]
    print(clf)
    print(evaluate(clf, X_train, y_train))
    print(evaluate(clf, X_val, y_val))
    # predict() already returns class labels, so no extra thresholding is needed.
    print(classification_report(y_val, clf.predict(X_val), digits=4))

to_csv(train_preds_array, "train")
to_csv(test_preds_array, "test")

  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)


(21201, 738) (4767, 738)




[100]	valid_0's auc: 0.959125
[200]	valid_0's auc: 0.963063


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)


LGBMClassifier(boosting_type='dart', max_depth=10, metric='auc',
               n_estimators=200, random_state=55)
0.99522983892498
0.9630627620778018
              precision    recall  f1-score   support

           0     0.9247    0.8785    0.9010      2378
           1     0.8848    0.9288    0.9063      2389

    accuracy                         0.9037      4767
   macro avg     0.9048    0.9037    0.9036      4767
weighted avg     0.9047    0.9037    0.9036      4767





[100]	valid_0's auc: 0.955988
[200]	valid_0's auc: 0.961855


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)


LGBMClassifier(boosting_type='dart', max_depth=10, metric='auc',
               n_estimators=200, random_state=62)
0.9930998576132419
0.9618547090480936
              precision    recall  f1-score   support

           0     0.9267    0.8772    0.9013      2378
           1     0.8839    0.9309    0.9068      2389

    accuracy                         0.9041      4767
   macro avg     0.9053    0.9041    0.9041      4767
weighted avg     0.9053    0.9041    0.9041      4767





[100]	valid_0's auc: 0.956171
[200]	valid_0's auc: 0.961604


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)


LGBMClassifier(boosting_type='dart', metric='auc', n_estimators=200,
               random_state=1)
0.9915446382486429
0.9616042268302188
              precision    recall  f1-score   support

           0     0.9224    0.8747    0.8979      2378
           1     0.8814    0.9267    0.9035      2389

    accuracy                         0.9008      4767
   macro avg     0.9019    0.9007    0.9007      4767
weighted avg     0.9018    0.9008    0.9007      4767





[100]	valid_0's auc: 0.953933
[200]	valid_0's auc: 0.958852


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)


LGBMClassifier(boosting_type='dart', metric='auc', n_estimators=200,
               random_state=42)
0.9869028032392988
0.9588520908664291
              precision    recall  f1-score   support

           0     0.9209    0.8667    0.8930      2378
           1     0.8747    0.9259    0.8996      2389

    accuracy                         0.8964      4767
   macro avg     0.8978    0.8963    0.8963      4767
weighted avg     0.8977    0.8964    0.8963      4767





[100]	valid_0's auc: 0.964879
[200]	valid_0's auc: 0.968095


  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(-1e30, 1e30)
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)


LGBMClassifier(boosting_type='goss', metric='auc', n_estimators=200,
               random_state=42)
0.9999716828334965
0.9680949375132237
              precision    recall  f1-score   support

           0     0.9199    0.8982    0.9089      2378
           1     0.9010    0.9221    0.9115      2389

    accuracy                         0.9102      4767
   macro avg     0.9105    0.9102    0.9102      4767
weighted avg     0.9104    0.9102    0.9102      4767



In [8]:
# clf = LGBMClassifier(num_leaves=31, max_depth=4, n_estimators=100)

# df_train_final = df
# y_train_final = y
# X_train_final = extractor.fit_transform(df_train_final)
# clf.fit(X_train_final, y_train_final)

# df_test_final = load_raw_data('test')
# X_test_final = extractor.transform(df_test_final)
# y_pred = clf.predict_proba(X_test_final)[:, 1]
# submit(y_pred)