In [1]:
# Directory the competition CSV files are read from. The value already ends
# with '/', so helpers that build paths as f'{input_path}/name.csv' produce a
# doubled slash, while load_label() concatenates the file name directly.
input_path = '../input/tabular-playground-series-apr-2022/'
# Directory submit() writes submission.csv into.
output_path = './'

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def load_raw_data(train_or_test='train'):
    """Read one of the raw CSV files under ``input_path`` into a DataFrame.

    Parameters
    ----------
    train_or_test : str, default 'train'
        Stem of the CSV file to read; the file opened is
        ``{input_path}/{train_or_test}.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, exactly as returned by ``pd.read_csv``.
    """
    return pd.read_csv(f'{input_path}/{train_or_test}.csv')

def load_label(train_or_test='train'):
    """Return the ``state`` column of the label file as a NumPy array.

    Parameters
    ----------
    train_or_test : str, default 'train'
        ``'train'`` reads ``train_labels.csv``; any other value reads
        ``sample_submission.csv``.

    Returns
    -------
    numpy.ndarray
        Values of the ``state`` column of the selected file.
    """
    if train_or_test == 'train':
        csv_name = 'train_labels.csv'
    else:
        csv_name = 'sample_submission.csv'
    labels = pd.read_csv(input_path + csv_name)
    return labels['state'].values

def competition_metric(y_true, y_score):
    """Score predictions with ROC AUC.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded to ``roc_auc_score``.
    y_score : array-like
        Predicted scores, forwarded to ``roc_auc_score``.

    Returns
    -------
    float
        The value returned by ``sklearn.metrics.roc_auc_score``.
    """
    auc = roc_auc_score(y_true, y_score)
    return auc

def evaluate(model, X, y):
    """Compute the competition metric of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : object
        Must provide ``predict_proba(X)`` returning a 2-D array; column 1 is
        used as the score.
    X : array-like
        Inputs passed to ``model.predict_proba``.
    y : array-like
        True labels.

    Returns
    -------
    float
        ``competition_metric(y, scores)``.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    return competition_metric(y, positive_scores)

def submit(arr):
    """Write ``arr`` as the ``state`` column of a submission file.

    The sample submission under ``input_path`` is used as the template, its
    ``state`` column is overwritten with ``arr``, and the result is saved to
    ``{output_path}/submission.csv`` without the index.

    Parameters
    ----------
    arr : array-like
        Predictions to store; assigned directly to the ``state`` column.
    """
    submission = pd.read_csv(f'{input_path}/sample_submission.csv')
    submission['state'] = arr
    submission.to_csv(f'{output_path}/submission.csv', index=False)

In [3]:
import tensorflow as tf
from tensorflow import keras
from sklearn.base import BaseEstimator, TransformerMixin

class RNNThickModel(keras.Model):
    """Stacked-LSTM binary classifier.

    Layers, applied in order by ``call``:

    1. ``LSTM(256, return_sequences=True)`` with L2(2e-3) kernel regularisation
    2. ``LSTM(128)`` with L2(2e-3) kernel regularisation
    3. ``Dense(32, activation='elu')``
    4. ``Dense(1, activation='sigmoid')``

    ``predict_proba`` is provided so that the model can be scored by helpers
    that read the positive-class score from column 1 of a two-column array.
    """

    def __init__(self):
        super().__init__()
        # recurrent_regularizer, dropout and recurrent_dropout are not set,
        # i.e. both LSTM layers use the Keras defaults for them.
        self.fns = [
            keras.layers.LSTM(
                units=256,
                kernel_regularizer=keras.regularizers.L2(2e-3),
                return_sequences=True
            ),
            keras.layers.LSTM(
                units=128,
                kernel_regularizer=keras.regularizers.L2(2e-3),
            ),
            keras.layers.Dense(units=32, activation='elu'),
            keras.layers.Dense(units=1, activation='sigmoid')
        ]

    def call(self, inputs):
        """Feed ``inputs`` through every layer in ``self.fns`` in order."""
        outputs = inputs
        for layer in self.fns:
            outputs = layer(outputs)
        return outputs

    def predict_proba(self, X):
        """Return ``[1 - p, p]`` column-wise, where ``p = self.predict(X)``.

        The forward pass is executed once and its result reused for both
        columns, instead of predicting twice.

        Returns
        -------
        numpy.ndarray
            ``self.predict(X)`` and its complement concatenated along axis 1
            (complement first).
        """
        positive = self.predict(X)
        return np.concatenate([1 - positive, positive], axis=1)

def random_sensor_swap(x, y, random_state=None, p_swap=0.5):
    """Augment a dataset by appending copies of samples with two channels swapped.

    A fraction ``p_swap`` of the samples is drawn without replacement. For
    each drawn sample two channel indices along axis 2 are drawn uniformly
    (they may coincide, in which case that copy is unchanged) and the two
    channels are exchanged. The modified copies are appended to the inputs.

    The number of channels is taken from ``x.shape[2]`` rather than assumed to
    be 13, so arrays with any channel count are handled. For 13-channel input
    and the default ``p_swap`` the random draws are identical to the previous
    hard-coded implementation.

    Parameters
    ----------
    x : numpy.ndarray
        3-D array indexed as ``x[sample, :, channel]``. Not modified.
    y : numpy.ndarray
        Labels, indexable by the same sample indices as ``x``.
    random_state : optional
        Seed (or anything accepted by ``np.random.default_rng``).
    p_swap : float, default 0.5
        Fraction of samples to duplicate; ``int(p_swap * len(x))`` copies are
        appended.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``x`` and ``y`` with the augmented samples / labels concatenated
        after the originals along axis 0.
    """
    rng = np.random.default_rng(random_state)
    n_samples, n_sensors = x.shape[0], x.shape[2]

    indices = rng.choice(np.arange(n_samples), int(p_swap * n_samples), replace=False)
    # Fancy indexing copies, so the caller's arrays are never modified.
    x_aug, y_aug = x[indices], y[indices]

    swap_codes = rng.integers(0, n_sensors, (x_aug.shape[0], 2))
    rows = np.arange(x_aug.shape[0])
    first, second = swap_codes[:, 0], swap_codes[:, 1]

    # Advanced indexing on the right-hand side yields a copy, so ``held``
    # still contains the original ``first`` channels after the overwrite.
    held = x_aug[rows, :, first]
    x_aug[rows, :, first] = x_aug[rows, :, second]
    x_aug[rows, :, second] = held

    x = np.concatenate([x, x_aug], axis=0)
    y = np.concatenate([y, y_aug], axis=0)
    return x, y

def group_splitter(df, nfold=5, random_state=None, seq_len=60):
    """Yield validation masks for a subject-grouped K-fold split.

    Every distinct value of ``df['subject']`` is assigned independently and
    uniformly at random to one of ``nfold`` folds, so all rows of a subject
    land in the same fold. Fold sizes are therefore not balanced, and a fold
    can be empty when there are few subjects.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with a ``'subject'`` column. Rows are expected to be
        grouped into consecutive blocks of ``seq_len`` rows, one block per
        labelled sequence.
    nfold : int, default 5
        Number of folds to generate.
    random_state : optional
        Seed (or anything accepted by ``np.random.default_rng``).
    seq_len : int, default 60
        Number of consecutive rows that make up one sequence; the per-label
        mask is obtained by taking every ``seq_len``-th row of the row mask.

    Yields
    ------
    (pandas.Series, pandas.Series)
        ``mask_df_val``: boolean mask over the rows of ``df`` (True means
        validation). ``mask_y_val``: the same mask sampled at the first row of
        every sequence, i.e. one entry per label.
    """
    subject_nums = df['subject'].unique()
    rng = np.random.default_rng(random_state)
    subject_to_setnum = rng.integers(0, nfold, subject_nums.shape[0])
    for fold in range(nfold):
        val_subjects = subject_nums[subject_to_setnum == fold]
        mask_df_val = df['subject'].isin(val_subjects)
        # One row per sequence is enough: every row in a sequence shares the
        # subject, hence the same mask value.
        mask_y_val = mask_df_val.iloc[::seq_len]
        yield mask_df_val, mask_y_val


class DF2arr(BaseEstimator, TransformerMixin):
    """Stateless transformer turning the long-format sensor table into a 3-D array."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Select columns ``'sensor_00'`` through ``'sensor_12'`` and reshape.

        The selected values are reshaped to ``(-1, 60, 13)``, presumably
        (sequence, time step, sensor); confirm against the data layout.
        """
        sensor_columns = X.loc[:, 'sensor_00':'sensor_12']
        return sensor_columns.values.reshape(-1, 60, 13)
    
    
class MyPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``normalize`` to its input."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``normalize(X)``."""
        return normalize(X)
    
def normalize(x):
    """Scale ``x`` by its norm along axis 1.

    Each slice along axis 1 is divided by ``norm + 1e-10``; the small
    constant keeps all-zero slices from dividing by zero.

    Parameters
    ----------
    x : numpy.ndarray
        Array with at least two dimensions.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x``.
    """
    scale = np.linalg.norm(x, axis=1, keepdims=True)
    return x / (scale + 1e-10)

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
# Subject-grouped 5-fold cross-validation of RNNThickModel.
#
# For every outer fold:
#   * df_val / y_val is the held-out fold used only for the reported score;
#   * the remaining data (df_train / y_train) is split once more by subject
#     into df_t (fit) and df_v (early-stopping validation).
# The names df_train, df_val, y_train, y_val and model intentionally stay
# bound to the last fold after the loop because the next cell reads them.
cv_scores = []

df = load_raw_data('train')
X = DF2arr().transform(df)
y = load_label('train')
subj_nums = df['subject']

preprocessor = make_pipeline(DF2arr(), MyPreprocessor())

keras.backend.clear_session()
tf.random.set_seed(42)

callbacks = [
    keras.callbacks.EarlyStopping(patience=200, restore_best_weights=True)
]
for mask_df_val, mask_y_val in group_splitter(df, nfold=5, random_state=42):
    df_train, df_val = df[~mask_df_val], df[mask_df_val]
    y_train, y_val = y[~mask_y_val], y[mask_y_val]

    # Only one inner split is needed for early stopping. Take the last one
    # the generator yields (the split the former inner loop ended up with)
    # without materialising DataFrame copies for the four discarded splits.
    *_, (mask_df_v, mask_y_v) = group_splitter(df_train, nfold=5, random_state=42)
    df_t, df_v = df_train[~mask_df_v], df_train[mask_df_v]
    y_t, y_v = y_train[~mask_y_v], y_train[mask_y_v]

    X_t = preprocessor.fit_transform(df_t)
    X_v = preprocessor.transform(df_v)
    X_val = preprocessor.transform(df_val)

    with tf.device('gpu:0'):
        model = RNNThickModel()
        model.compile(
            loss='binary_crossentropy',
            metrics=['AUC'],
            optimizer=keras.optimizers.Adam(1e-3))
        model.fit(
            X_t, y_t,
            batch_size=1024,
            epochs=500,
            callbacks=callbacks,
            validation_data=(X_v, y_v),
            verbose=0
        )

    # Predict the held-out fold once and reuse the scores for both the AUC
    # and the classification report. model.predict returns one column, which
    # is the same positive-class score evaluate() reads from predict_proba.
    val_scores = model.predict(X_val)
    fold_score = competition_metric(y_val, val_scores[:, 0])
    print(fold_score)
    print(classification_report(y_val, (val_scores >= 0.5).astype(int), digits=4 ))

    cv_scores.append(fold_score)
print(f'5-fold CV score: {np.mean(cv_scores):.4f}')

0.9550240858938919
              precision    recall  f1-score   support

           0     0.8668    0.9035    0.8848      2592
           1     0.8979    0.8593    0.8782      2559

    accuracy                         0.8816      5151
   macro avg     0.8823    0.8814    0.8815      5151
weighted avg     0.8822    0.8816    0.8815      5151

0.9521588824661936
              precision    recall  f1-score   support

           0     0.8763    0.9075    0.8916      2412
           1     0.8939    0.8587    0.8759      2187

    accuracy                         0.8843      4599
   macro avg     0.8851    0.8831    0.8838      4599
weighted avg     0.8847    0.8843    0.8842      4599

0.6153964112512665
              precision    recall  f1-score   support

           0     0.6738    0.3385    0.4506      2789
           1     0.5992    0.8579    0.7056      3215

    accuracy                         0.6166      6004
   macro avg     0.6365    0.5982    0.5781      6004
weighted avg     

In [None]:
# Final training pass and submission.
#
# NOTE(review): this cell relies on state left behind by the cross-validation
# cell: `model`, `df_train`, `y_train`, `df_val`, `y_val`, `preprocessor` and
# `callbacks` are the objects bound during the LAST CV fold. `model` is not
# re-created here, so fit() continues training that fold's already-trained
# network rather than starting from fresh weights.
df_test_final = load_raw_data('test')

# The pipeline steps learn nothing in fit(), so fit_transform/transform only
# reshape and normalise each frame.
X_train = preprocessor.fit_transform(df_train)
X_val = preprocessor.transform(df_val)
X_test_final = preprocessor.transform(df_test_final)

with tf.device('gpu:0'):
    # df_val (the last fold's held-out subjects) serves as the validation set
    # monitored by the EarlyStopping callback.
    model.fit(X_train, y_train, 
              epochs=500, 
              batch_size=1024,
              callbacks=callbacks,
              validation_data=(X_val, y_val)
             )
# The network's final layer has a single unit, so y_pred has one column; it
# is written as-is into the 'state' column by submit().
y_pred = model.predict(X_test_final)
submit(y_pred)