# Non-NN models

We should study these notebooks:

https://www.kaggle.com/code/jeroenvdd/tpsapr22-best-non-dl-model-tsflex-powershap?scriptVersionId=94240450

https://www.kaggle.com/code/ambrosm/tpsapr22-best-model-without-nn

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../'))

input_path = '../../input/tabular-playground-series-apr-2022'
output_path = '../../output'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def load_raw_data(train_or_test='train'):
    file_name = f'{input_path}/{train_or_test}.csv'
    df = pd.read_csv(file_name)
    return df

def load_label(train_or_test='train'):
    file_name = input_path + '/' + ('train_labels.csv' if train_or_test=='train' else 'sample_submission.csv')
    df = pd.read_csv(file_name)
    return df['state'].values

def competition_metric(y_true, y_score):
    return roc_auc_score(y_true, y_score)

def evaluate(model, X, y):
    return competition_metric(y, model.predict_proba(X)[:, 1])

def submit(arr, tag=''):
    if tag:
        tag = '_' + tag
    df = pd.read_csv(f'{input_path}/sample_submission.csv')
    df['state'] = arr
    df.to_csv(f'{output_path}/submission{tag}.csv', index=False)

def group_splitter(df, nfold=5, random_state=None):
    subject_nums = df['subject'].unique()
    rng = np.random.default_rng(random_state)
    subject_to_setnum = rng.integers(0, nfold, subject_nums.shape[0])
    for i in range(nfold):
        val_subjects = subject_nums[subject_to_setnum == i]
        mask_df_val = df['subject'].isin(val_subjects)
        mask_y_val = mask_df_val.iloc[::60]
        yield mask_df_val, mask_y_val

In [3]:
from DataAugmentation import Reverter, MultPerturb

class MySoftVoter():
    def __init__(self, models, weights=None):
        self.models = models
        if weights is None:
            weights = np.ones((len(models), ))
        weights /= np.sum(weights)
        self.weights = weights
    
    def predict(self, X):
        result = np.zeros((X.shape[0], ), dtype=X.dtype)
        for model, weight in zip(self.models, self.weights):
            add = model.predict(X)
            if len(add.shape) > 1:
                add = add[:, 0]
            result += add * weight
        return result
    
    def predict_proba(self, X):
        result = np.zeros((X.shape[0], 2), dtype=X.dtype)
        for model, weight in zip(self.models, self.weights):
            add = model.predict_proba(X)
            result += add * weight
        return result
        
df = load_raw_data('train')
y = load_label('train')

df, y = Reverter(random_state=42).transform(df, y)
df, y = MultPerturb(random_state=42).transform(df, y)

In [5]:
from ElementaryExtractor import ElementaryExtractor, TsfreshExtractor
from SWK.MBOP import MBOP
# from JHLee.CorrExtractor (2) import CorrExtractor

from lightgbm import LGBMClassifier
from sklearn.pipeline import make_union
from sklearn.metrics import classification_report
cv_scores = []

# extractors = [CorrExtractor(), ElementaryExtractor(), TsfreshExtractor(), MBOP()]
extractors = [ElementaryExtractor(), TsfreshExtractor(), MBOP()]
extractor = make_union(*extractors)

for mask_df_val, mask_y_val in group_splitter(df, nfold=5, random_state=42):
    df_train, y_train = df[~mask_df_val], y[~mask_y_val]
    df_val, y_val = df[mask_df_val], y[mask_y_val]
    break
    X_train = extractor.fit_transform(df_train)
    X_val = extractor.transform(df_val)
    print(X_train.shape, X_val.shape)
    
    clf = LGBMClassifier(num_leaves=31, max_depth=-1, n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    print(evaluate(clf, X_train, y_train))
    print(evaluate(clf, X_val, y_val))
    print(classification_report(y_val, (clf.predict(X_val) >= 0.5).astype(int), digits=4 ))
    
    cv_scores.append(evaluate(clf, X_val, y_val))
print(f'5-fold CV score: {np.mean(cv_scores):.4f}')

5-fold CV score: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [6]:
clf = LGBMClassifier(num_leaves=31, max_depth=4, n_estimators=100, random_state=random_state)

df_train_final = df
y_train_final = y
X_train_final = extractor.fit_transform(df_train_final)
clf.fit(X_train_final, y_train_final)

df_test_final = load_raw_data('test')
X_test_final = extractor.transform(df_test_final)
y_pred = clf.predict_proba(X_test_final)[:, 1]
submit(y_pred, tag='non_NN')

  features[f'kurt_{i:0>2}'] = kurtosis(channel, axis=1)
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(
  features[f'sm_{i:0>2}'] = np.nan_to_num(features[f'std_{i:0>2}'] / np.abs(np.mean(channel, axis=1))).clip(
  features[f'down_mean_02'] = np.nan_to_num(features[f'down_min_02'] / features[f'down_count_02'], neginf=-40)
Feature Extraction: 100%|██████████| 80/80 [02:03<00:00,  1.54s/it]
Feature Extraction: 100%|██████████| 80/80 [01:28<00:00,  1.11s/it]
Feature Extraction: 100%|██████████| 80/80 [01:52<00:00,  1.40s/it]
Feature Extraction: 100%|██████████| 80/80 [01:55<00:00,  1.44s/it]
Feature Extraction: 100%|██████████| 80/80 [03:55<00:00,  2.94s/it]
Feature Extraction: 100%|██████████| 80/80 [01:31<00:00,  1.14s/it]
Feature Extraction: 100%|██████████| 80/80 [01:27<00:00,  1.10s/it]
Feature Extraction: 100%|██████████| 80/80 [02:11<00:00,  1.64s/it]
Feature Extraction: 100%|██████████| 80/80 [01:13<00:00,  1.08it/s]
Fea

In [15]:
clf = LGBMClassifier(num_leaves=31, max_depth=-1, n_estimators=1000, random_state=42)

clf.fit(X_train_final, y_train_final)
y_pred = clf.predict_proba(X_test_final)[:, 1]
submit(y_pred, tag='non_NN_param')

In [11]:
models = [
    LGBMClassifier(num_leaves=31, max_depth=4, n_estimators=100, random_state=random_state)
    for random_state in range(42, 142)
    ]

for clf in models:
    clf.fit(X_train_final, y_train_final)

model = MySoftVoter(models)
y_pred = model.predict_proba(X_test_final)[:, 1]
submit(y_pred, tag='non_NN_softvote100')