# Tabular Playground Series April 2022

Classifying multi-sensor time-series sequences with 1D convolutional layers.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import tensorflow as tf
import scipy.fft

import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

In [None]:
# Read the four competition CSVs in a fixed order:
# training data, training labels, test data, submission template.
train, labels, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2022/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/test.csv',
        "/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv",
    )
)

In [None]:
# Sensor column names: 'sensor_00' through 'sensor_12'.
SENSORS = [f'sensor_{idx:02d}' for idx in range(13)]
# Number of time steps in each sequence.
TIMESTEPS = 60
# Time difference between consecutive values of a sequence.
DELTAT = 1

# Data Analysis

In [None]:
# Preview the training frame as loaded (before any feature engineering).
train

In [None]:
# Summary statistics per column, transposed so each column becomes a row.
train.describe().T

In [None]:
# Count missing values in each column of the training frame.
train.isna().sum()

In [None]:
# First rows of the label table.
labels.head()

In [None]:
# Number of label rows per `state` value (class balance).
labels.state.value_counts()

In [None]:
# Preview the test frame.
test

Check if train and test subjects overlap:

In [None]:
# Distinct subject ids on each side; a non-empty intersection would mean
# that some subject appears in both train and test.
train_subjects = set(train["subject"].unique())
test_subjects = set(test["subject"].unique())

train_subjects & test_subjects

In [None]:
def plot_sequence(seq, state):
    """Plot each sensor of one sequence together with its amplitude spectrum.

    One subplot is drawn per entry of SENSORS on a grid with 5 columns. The raw
    signal is drawn against ``seq.step``; the magnitude of its real FFT is drawn
    in orange on a twin y-axis of the same subplot.

    Args:
        seq: DataFrame holding the rows of a single sequence; must provide the
            ``sequence`` and ``step`` columns and one column per sensor.
        state: Label of the sequence, shown in the figure title.
    """
    n_cols = 5
    n_rows = len(SENSORS) // n_cols + 1
    steps = seq.step
    sequence_id = seq.sequence.unique()[0]

    # The frequency axis is the same for every sensor, so compute it once.
    # It is scaled by 2 * TIMESTEPS before plotting; with DELTAT == 1 this
    # maps the rfft frequencies onto the range 0..TIMESTEPS.
    spectrum_x = scipy.fft.rfftfreq(TIMESTEPS, DELTAT) * 2 * TIMESTEPS

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10))
    fig.suptitle(f"Sequence: {str(sequence_id)}, State: {str(state)}")
    for sensor, ax in zip(SENSORS, axes.ravel()):
        signal = seq[sensor]
        ax.plot(steps, signal)

        # Amplitude spectrum, normalised by the sequence length constant.
        amplitude = np.abs(scipy.fft.rfft(signal.to_numpy()) / TIMESTEPS)
        spectrum_ax = ax.twinx()
        spectrum_ax.plot(spectrum_x, amplitude, c='orange')

        ax.grid()
        ax.set_title(sensor)

    plt.tight_layout()
    plt.show()

# Pick one example sequence and look up its label for the figure title.
IDX = 3
seq = train.loc[train.sequence == IDX]
state = labels.loc[labels.sequence == IDX, "state"].iloc[0]

plot_sequence(seq, state)

In [None]:
# Pairwise correlation between the raw sensor columns, shown as a heatmap
# with the sensor names on both axes.
corr = train.loc[:, SENSORS].corr()

sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

# Augment

In [None]:
# Column names of the derived features; every list is parallel to SENSORS.
SHIFTED_FEATURES = [f'{sensor}_shift' for sensor in SENSORS]
DIFF_FEATURES = [f'{sensor}_diff' for sensor in SENSORS]
FFT_FEATURES = [f'{sensor}_fft' for sensor in SENSORS]

ROLL_MEAN_FEATURES = [f'{sensor}_roll_mean' for sensor in SENSORS]

def augment(df, sensors=None, timesteps=None):
    """Add shift, diff, FFT-magnitude and rolling-mean columns per sensor, in place.

    For every sensor column the following columns are written to ``df``:

    * shift      - previous value within the same sequence (0 at the first step)
    * diff       - sensor value minus the shifted value
    * fft        - magnitude of the per-sequence FFT of ``sensor / timesteps``
    * roll_mean  - per-sequence rolling mean over a window of 6 (min_periods=1)

    Args:
        df: DataFrame with a ``sequence`` column and one column per sensor.
            It is modified in place.
        sensors: Optional iterable of sensor column names. When omitted, the
            module-level SENSORS and the matching feature-name lists are used.
            When given, output columns are named ``<sensor>_shift``,
            ``<sensor>_diff``, ``<sensor>_fft`` and ``<sensor>_roll_mean``.
        timesteps: Divisor applied before the FFT; defaults to the module-level
            TIMESTEPS.

    Returns:
        None.
    """
    if timesteps is None:
        timesteps = TIMESTEPS

    if sensors is None:
        columns = zip(SENSORS, SHIFTED_FEATURES, DIFF_FEATURES, FFT_FEATURES, ROLL_MEAN_FEATURES)
    else:
        columns = (
            (s, f'{s}_shift', f'{s}_diff', f'{s}_fft', f'{s}_roll_mean')
            for s in sensors
        )

    for sensor, shifted, diff, fft_col, roll_mean in columns:
        # Assign the filled result back rather than calling
        # `df[shifted].fillna(0, inplace=True)`: that chained in-place call
        # works on a temporary under pandas Copy-on-Write and would leave the
        # NaN of each sequence's first step in the frame.
        df[shifted] = df.groupby('sequence')[sensor].shift(1).fillna(0)
        df[diff] = df[sensor] - df[shifted]
        # TODO: Consider using rfft for speed and to avoid duplicate data
        df[fft_col] = df.groupby('sequence')[sensor].transform(
            lambda x: np.abs(scipy.fft.fft(x.to_numpy() / timesteps))
        )
        # The rolling result is indexed by (sequence, original index); dropping
        # the sequence level restores alignment with df's own index.
        df[roll_mean] = (
            df.groupby('sequence')[sensor]
            .rolling(window=6, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )
# Add the engineered columns to both frames (augment modifies them in place).
augment(train)
augment(test)

# Model inputs: the raw sensors followed by each family of derived columns.
FEATURES = [
    *SENSORS,
    *SHIFTED_FEATURES,
    *DIFF_FEATURES,
    *FFT_FEATURES,
    *ROLL_MEAN_FEATURES,
]

In [None]:
# Re-check for missing values now that the engineered columns were added.
train.isna().sum()

In [None]:
# First rows of the training frame, including the engineered columns.
train.head()

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Fit the quantile transform on the training features only, then apply the
# same fitted mapping to both the training and the test features.
normalizer = QuantileTransformer(
    n_quantiles=1000,
    output_distribution='normal',
    random_state=1337,
)
normalizer.fit(train[FEATURES])
train_normalized, test_normalized = (
    normalizer.transform(frame[FEATURES]) for frame in (train, test)
)

In [None]:
num_features = len(FEATURES)

# Fold the flat (rows, features) matrices into (sequences, TIMESTEPS, features).
# The -1 lets NumPy infer the number of sequences from the row count.
train_data = train_normalized.reshape(-1, TIMESTEPS, num_features)
test_data = test_normalized.reshape(-1, TIMESTEPS, num_features)

print("train_data:", train_data.shape)
print("test_data:", test_data.shape)

Sanity-check the normalization: the selected feature should have a mean close to 0 and a standard deviation close to 1.

In [None]:
IDX = 2
# Flatten feature number IDX across all sequences and time steps into one vector.
one = tf.reshape(train_data[..., IDX], [-1])

print(one.shape)
print("min:", tf.reduce_min(one))
print("max:", tf.reduce_max(one))
print("mean should be close to 0 and std close to 1:")
print("mean:", tf.reduce_mean(one))
print("std:", tf.math.reduce_std(one))

# Training

The model is based on the general idea of InceptionNet and ResNet, but adjusted for time-series data (1D temporal).

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, Flatten, GlobalMaxPooling1D, \
    BatchNormalization, MaxPooling1D, GlobalAveragePooling1D, Concatenate

def get_model(input_shape):
    """Build and compile the Inception-style 1D convolutional classifier.

    The network is a stem convolution, five inception blocks (with max pooling
    between the later ones), global max pooling and a small dense head ending
    in a single sigmoid unit.

    Args:
        input_shape: Shape of one sample without the batch axis, passed to
            ``Input``.

    Returns:
        A compiled ``tf.keras`` model trained with binary cross-entropy and
        reporting the metrics ``accuracy`` and ``auc``.
    """

    def inception_block(inp, filters):
        """Apply parallel convolutions of different widths and concatenate them.

        A 1-wide bottleneck convolution feeds three convolutions with kernel
        sizes 3, 7 and 15. A max-pooled copy of the input goes through its own
        1-wide convolution. All four results are concatenated with the
        unmodified block input and batch-normalised.
        """
        reg = "l2"
        bottleneck = Conv1D(filters=filters, kernel_size=1, activation="relu", padding="same", kernel_regularizer=reg)(inp)

        feat1 = Conv1D(filters=filters, kernel_size=3, activation="relu", padding="same", kernel_regularizer=reg)(bottleneck)
        feat2 = Conv1D(filters=filters, kernel_size=7, activation="relu", padding="same", kernel_regularizer=reg)(bottleneck)
        feat3 = Conv1D(filters=filters, kernel_size=15, activation="relu", padding="same", kernel_regularizer=reg)(bottleneck)

        bypass = MaxPooling1D(pool_size=3, strides=1, padding="same")(inp)
        bypass = Conv1D(filters=filters, kernel_size=1, activation="relu", padding="same", kernel_regularizer=reg)(bypass)

        # Including `inp` keeps the block input available to later layers.
        out = Concatenate()([feat1, feat2, feat3, bypass, inp])
        out = BatchNormalization()(out)

        return out

    inp = Input(input_shape)
    x = inp

    x = Conv1D(filters=128, kernel_size=7, activation="relu", padding="same")(x)

    x = inception_block(x, filters=32)
    x = inception_block(x, filters=64)
    x = MaxPooling1D()(x)
    x = inception_block(x, filters=128)
    x = MaxPooling1D()(x)
    x = inception_block(x, filters=256)
    x = MaxPooling1D()(x)
    x = inception_block(x, filters=512)

    x = GlobalMaxPooling1D()(x)

    dnn = Dropout(0.3)(x)
    dnn = Dense(256, activation="relu", kernel_regularizer="l2")(dnn)
    dnn = Dropout(0.2)(dnn)
    dnn = Dense(64, activation="relu", kernel_regularizer="l2")(dnn)

    output_layer = Dense(1, activation="sigmoid")(dnn)

    model = tf.keras.models.Model(inputs=inp, outputs=output_layer)
    # Name the AUC metric explicitly. With the bare string 'AUC', Keras picks an
    # automatic name that can receive a numeric suffix ('auc_1', 'auc_2', ...)
    # when this function is called more than once in a session. Callbacks that
    # monitor 'val_auc' would then not find their metric.
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    return model

# Shape of one sample: everything after the leading (sequence) axis.
input_shape = train_data.shape[1:]
model = get_model(input_shape)

model.summary()

In [None]:
# Draw the layer graph with tensor shapes; layer names are hidden.
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=False)

In [None]:
from sklearn.model_selection import GroupKFold

FOLDS = 5
BATCH = 64
EPOCHS = 50

kf = GroupKFold(n_splits=FOLDS)
# One subject id per sequence, in the same row order that produced train_data
# (every block of TIMESTEPS consecutive rows is one sequence). Grouping by
# subject keeps all sequences of a subject on the same side of each split.
# Grouping by sequence id would put every sample in its own group and not
# constrain the split at all.
# NOTE(review): assumes each sequence belongs to exactly one subject - confirm.
groups = train.subject.to_numpy()[::TIMESTEPS]
predictions = list()   # one array of test-set probabilities per fold
histories = list()     # one Keras history dict per fold

for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(train_data, labels.state, groups)):

    print("#" * 15, f"Fold {fold_idx} Training", "#" * 15)

    # NOTE(review): rows of `labels` are assumed to be in the same order as the
    # sequences in `train_data` - confirm.
    train_X, valid_X = train_data[train_idx], train_data[valid_idx]
    train_y, valid_y = labels.iloc[train_idx].state, labels.iloc[valid_idx].state

    # Halve the learning rate after 5 epochs without a better validation AUC
    # and stop after 8, restoring the best weights seen.
    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(monitor="val_auc", factor=0.5, patience=5,
                                             mode="max", verbose=1),
        tf.keras.callbacks.EarlyStopping(monitor="val_auc", patience=8, mode="max", verbose=1,
                                         restore_best_weights=True)
    ]

    # A fresh model per fold so no weights carry over between folds.
    model = get_model(input_shape)

    history = model.fit(train_X, train_y, batch_size=BATCH, epochs=EPOCHS, callbacks=callbacks,
                        validation_data=(valid_X, valid_y))
    histories.append(history.history)

    print("#" * 15, f"Fold {fold_idx} Prediction", "#" * 15)
    prediction = model.predict(test_data).squeeze()
    predictions.append(prediction)

print("#" * 15, "Done", "#" * 15)

In [None]:
# Substrings used to pick the history keys drawn in each panel
# (e.g. "loss" matches both "loss" and "val_loss").
PLOTS = ["loss", "accuracy", "auc", "lr"]

for fold_id, history in enumerate(histories):
    fig, axis = plt.subplots(1, len(PLOTS), figsize=(20, 4))
    fig.suptitle(f"Fold {fold_id}")
    for plot, ax in zip(PLOTS, axis.flatten()):
        matching = {label: data for label, data in history.items() if plot in label}
        for label, data in matching.items():
            ax.plot(data, label=label)
        # Configure each panel once, after its curves are drawn. A bare
        # ax.grid() toggles the grid on every call, and ax.legend() warns
        # when no labelled line exists yet.
        if matching:
            ax.legend()
        ax.grid(True)
    plt.tight_layout()
    plt.show()


## Confusion Matrix

In [None]:
# NOTE(review): `model` is the model left over from the last training fold and
# `train_data` includes the rows it was fitted on, so this is an in-sample
# confusion matrix rather than an out-of-fold one.
# Threshold the predicted probabilities at 0.5 to obtain class labels.
train_predictions = (model.predict(train_data) > 0.5).astype(np.int32)

# Normalise the counts so that all cells sum to 1.
confusion = tf.math.confusion_matrix(labels.state, train_predictions)
confusion = confusion / tf.math.reduce_sum(confusion)

sns.heatmap(confusion, annot=True)

# Submission

In [None]:
# Blend the folds: element-wise average of the per-fold test predictions.
submission["state"] = np.asarray(predictions).mean(axis=0)

submission.to_csv("submission.csv", index=False)

submission.head()