## Sensor Data: Time-Series Classification

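- Augment the data by randomly applying one of four transforms (scaling, shifting, vertical flipping, or band-pass filtering) to each sensor time series
- Train a bidirectional LSTM/GRU network with a dense classification head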
- **LS: 0.965**

In [None]:
# packages
import os
import numpy as np 
import pandas as pd 
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
from typing import Tuple, List, Dict
import itertools
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from scipy import signal

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedGroupKFold

# tensorflow
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, LSTM, GRU, Bidirectional

In [None]:
class Config:
    """Notebook-wide settings: input locations, sequence layout and training schedule."""

    # --- training schedule ---
    NFOLDS: int = 3
    EPOCHS: int = 2
    BATCH_SIZE: int = 128

    # --- layout of one sequence: rows (time steps) x sensor columns ---
    seq_length: int = 60
    num_sensor: int = 13

    # --- competition input files ---
    sample_path: str = "/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv"
    train_labels_path: str = "/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv"
    test_path: str = "/kaggle/input/tabular-playground-series-apr-2022/test.csv"
    train_path: str = "/kaggle/input/tabular-playground-series-apr-2022/train.csv"
    
# Report every GPU that TensorFlow can see (prints nothing when there is none).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

# Read the three input tables in one pass: train rows, test rows, train labels.
train, test, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (Config.train_path, Config.test_path, Config.train_labels_path)
)

# The sample submission is loaded separately; its "state" column is filled in later.
submission = pd.read_csv(Config.sample_path)
submission.head(2)

In [None]:
from scipy.signal import butter, sosfilt

def scaling(signal, scale=0.05):
    """Multiply every column of ``signal`` by its own random factor.

    One factor per column is drawn from a normal distribution with mean 1.0
    and standard deviation ``scale``; that factor is applied to every row of
    the column. The input array is left untouched and a new array is returned.
    """
    row_count, column_count = signal.shape[0], signal.shape[1]
    column_factors = np.random.normal(loc=1.0, scale=scale, size=(1, column_count))
    # Stretch the single row of factors to one identical row per input row.
    factor_grid = np.repeat(column_factors, row_count, axis=0)
    return signal * factor_grid

def vertical_flip(signal):
    """Return ``signal`` with its rows in reverse order (last row first).

    Columns keep their positions. The result is produced by slicing, so for a
    NumPy array it is a view on the input rather than a copy.
    """
    rows_backwards = slice(None, None, -1)
    all_columns = slice(None)
    return signal[rows_backwards, all_columns]

def shift(signal, interval=10):
    """Return a copy of ``signal`` with a random constant offset added to each column.

    For every column an integer is drawn uniformly from
    ``range(-interval, interval)`` and divided by 100, so ``interval=10``
    gives offsets from -0.10 to 0.09 (the upper bound ``interval`` itself is
    never drawn).

    The caller's array is never modified: offsets are applied to a fresh copy.
    Floating-point inputs keep their dtype; any other dtype is promoted to
    float64 so the fractional offsets can be stored.

    Args:
        signal: 2-D array, one column per series.
        interval: half-width of the offset range, in hundredths.

    Returns:
        A new array of the same shape as ``signal``.
    """
    values = np.asarray(signal)
    if np.issubdtype(values.dtype, np.floating):
        target_dtype = values.dtype
    else:
        # An in-place float add on an integer array would raise a casting error.
        target_dtype = np.float64
    shifted = values.astype(target_dtype, copy=True)

    for col in range(shifted.shape[1]):
        offset = np.random.choice(range(-interval, interval)) / 100
        shifted[:, col] += offset
    return shifted

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter as second-order sections.

    ``lowcut`` and ``highcut`` are given in the same units as the sampling
    rate ``fs``; both are normalised by the Nyquist frequency (``fs / 2``)
    before being passed to ``scipy.signal.butter``.

    Returns:
        The filter coefficients in ``sos`` form.
    """
    nyquist = fs * 0.5
    normalised_band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, normalised_band, analog=False, btype="band", output="sos")

def butter_bandpass_filter(signal, lowcut, highcut, fs, order=5):
    """Band-pass filter each column of a 2-D ``signal`` independently.

    The filter is designed once with ``butter_bandpass`` and then applied to
    every column with ``sosfilt``. The result is a new float array with the
    same two dimensions as the input.
    """
    sos = butter_bandpass(lowcut, highcut, fs, order=order)
    n_samples, n_channels = signal.shape[0], signal.shape[1]
    filtered = np.zeros((n_samples, n_channels))
    # Iterating over the transpose yields one column of the input at a time.
    for channel, series in enumerate(signal.T):
        filtered[:, channel] = sosfilt(sos, series)
    return filtered

def transform(signal, train=True):
    """Apply one randomly chosen augmentation to a training signal.

    When ``train`` is true, a single uniform draw from [0, 1) selects one of
    four augmentations, each with probability 0.25: scaling, vertical flip,
    shift, or band-pass filtering. When ``train`` is false the signal is
    returned unchanged (the very same object).

    The draw uses ``np.random.rand`` (uniform). A standard-normal draw
    (``np.random.randn``) would put well over half of its mass below 0.25,
    so the 0.25 / 0.50 / 0.75 thresholds would not give equal shares.

    Args:
        signal: 2-D array, one column per series.
        train: whether to augment.

    Returns:
        The augmented signal, or ``signal`` itself when ``train`` is false.
    """
    if not train:
        return signal

    draw = np.random.rand()

    # Quartiles of the uniform draw pick the augmentation.
    if draw < 0.25:
        return scaling(signal, scale=0.10)
    if draw < 0.50:
        return vertical_flip(signal)
    if draw < 0.75:
        return shift(signal, interval=10)
    return butter_bandpass_filter(signal, 0.05, 48, 256)

In [None]:
# Reference plot: the first 60 readings of sensor_01, before any augmentation.
plt.title("original")
plt.plot(train["sensor_01"].values[:60]);

In [None]:
# Plot the first 60 readings of sensor_01 under each of the four augmentations.

# Column vector of the series. This may be a view on the data held by `train`,
# so anything that could write to it is handed a copy instead.
sensor_01_first = train[:60].sensor_01.values.reshape(-1, 1)

fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(24, 4))

axs[0].set_title("scaling")
axs[0].plot(scaling(sensor_01_first.copy(), scale=0.50).ravel());
axs[1].set_title("shift")
# `shift` adds its offset in place: without the copy, the training data (and the
# two plots that follow) would carry the shifted values.
axs[1].plot(shift(sensor_01_first.copy(), interval=50).ravel()); # vertical shift
axs[2].set_title("verticle flip")
axs[2].plot(vertical_flip(sensor_01_first.copy()).ravel());
axs[3].set_title("bandpass filter")
axs[3].plot(butter_bandpass_filter(sensor_01_first.copy(), 0.05, 48, 256).ravel());

In [None]:
# Sensor columns: everything after the first three columns of the train table.
features = list(train.columns)[3:]

# Feature Normalization: statistics come from the train table only and are
# then applied unchanged to both tables.
sc = StandardScaler()
sc.fit(train[features])
train[features] = sc.transform(train[features])
test[features] = sc.transform(test[features])

def transformed_data(train):
    """Build an augmented copy of a long-format sensor table.

    The ``sequence``, ``subject`` and ``step`` columns are set aside, the
    remaining columns are reshaped into windows of
    ``Config.seq_length`` rows by ``Config.num_sensor`` columns, and
    ``transform`` is applied separately to every sensor column of every
    window. The augmented values are flattened back to one row per step,
    labelled with the module-level ``features`` names, re-joined with the id
    columns and sorted by (sequence, subject, step).

    Assumes the rows of ``train`` come in consecutive blocks of
    ``Config.seq_length`` rows per sequence -- the reshape relies on it.
    """
    id_columns = ["sequence", "subject", "step"]
    sequence_ids = train.sequence.values
    subject_ids = train.subject.values
    step_ids = train.step.values

    sensor_values = train.drop(id_columns, axis=1).values
    windows = sensor_values.reshape(-1, Config.seq_length, Config.num_sensor)

    augmented_windows = []
    for window in tqdm(windows):
        # window.T iterates sensor by sensor; each sensor series is handed to
        # transform() as a column vector and flattened again afterwards.
        per_sensor = [transform(series.reshape(-1, 1)).ravel() for series in window.T]
        augmented_windows.append(np.array(per_sensor).T)

    stacked = np.array(augmented_windows)
    flat = stacked.reshape(len(stacked) * Config.seq_length, Config.num_sensor)

    df = pd.DataFrame(data=flat)
    df.columns = features
    # Each insert goes to position 0, so the final order is sequence, subject, step.
    df.insert(0, "step", step_ids)
    df.insert(0, "subject", subject_ids)
    df.insert(0, "sequence", sequence_ids)
    return df.sort_values(by=id_columns)

In [None]:
# Augment only the first 500*60 rows of the train table; 60 equals
# Config.seq_length, so this is 500 sequences if rows are stored as consecutive
# 60-step sequences (presumably they are -- confirm against the data).
train_aug = transformed_data(train[:500*60])

In [None]:
# Preview the first rows of the augmented table.
train_aug.head()

In [None]:
# Preview the first rows of the (already normalised) train table.
train.head()

In [None]:
# Data Preparation
# Sensor column names: every column of the train table after the first three.
features = list(train.columns)[3:]

def process_data(df, feature_names=None):
    """Add one-step lag and difference columns for every feature, in place.

    For each feature ``f`` two columns are appended, in this order:
    ``f_lag1`` (the previous value of ``f`` within the same ``sequence``,
    0 at the first row of each sequence) and ``f_diff1`` (``f - f_lag1``).
    Missing values anywhere in ``df`` are replaced by 0 before the new
    columns are computed.

    The whole-frame ``fillna`` and the ``groupby`` are done once up front
    rather than once per feature; the resulting columns are the same.

    Args:
        df: table with a ``sequence`` column and the feature columns.
            Modified in place.
        feature_names: columns to process. Defaults to the module-level
            ``features`` list.

    Returns:
        None.
    """
    if feature_names is None:
        feature_names = features
    feature_names = list(feature_names)
    if not feature_names:
        return

    df.fillna(0, inplace=True)
    by_sequence = df.groupby('sequence')
    for feature in feature_names:
        # The first row of every sequence has no predecessor -> lag of 0.
        lagged = by_sequence[feature].shift(1).fillna(0)
        df[feature + '_lag1'] = lagged
        df[feature + '_diff1'] = df[feature] - lagged

# Add the lag/diff columns to every table (each call modifies its table in place).
process_data(train)
process_data(train_aug)
process_data(test)

id_columns = ["sequence", "subject", "step"]
n_augmented = train_aug.sequence.nunique()

# One group id and one label per original sequence.
groups = list(train["sequence"].unique())
labels = list(train_labels["state"].values)

# Stack the augmented rows below the originals, drop the id columns and cut the
# table into (sequence, step, feature) windows.
train = pd.concat([train, train_aug], sort=False)
train = train.drop(id_columns, axis=1).values
train = train.reshape(-1, Config.seq_length, train.shape[-1])

# The augmented windows reuse the group ids and labels of the leading
# n_augmented original sequences, so a sequence and its augmented copy always
# share a group id.
groups = np.array(groups + groups[:n_augmented])
labels = np.array(labels + labels[:n_augmented])

# Same windowing for the test table.
test = test.drop(id_columns, axis=1).values
test = test.reshape(-1, Config.seq_length, test.shape[-1])

In [None]:
def DeepResidualModel(seq_length=60, depth=13, n_class=2):
    """Build a 1-D convolutional network with four residual blocks.

    Ref: https://arxiv.org/pdf/1805.00794.pdf

    An initial Conv1D feeds four identical blocks; each block is
    conv -> relu -> conv, a skip connection adding the block input, relu, and
    max pooling. The pooled features are flattened and passed through two
    16-unit dense layers to an ``n_class``-unit sigmoid output.

    Args:
        seq_length: number of time steps per input sequence.
        depth: number of channels (features) per time step.
        n_class: number of output units.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    inputs = tf.keras.layers.Input(shape=(seq_length, depth))
    out1 = tf.keras.layers.Conv1D(filters=16, kernel_size=3, strides=1)(inputs)

    for _ in range(4):
        # 'same' padding keeps the length so the skip connection can be added.
        out = tf.keras.layers.Conv1D(filters=16, kernel_size=3, strides=1, padding='same')(out1)
        out = tf.keras.layers.Activation("relu")(out)
        out = tf.keras.layers.Conv1D(filters=16, kernel_size=3, strides=1, padding='same')(out)
        out = tf.keras.layers.Add()([out, out1])
        out = tf.keras.layers.Activation("relu")(out)
        out1 = tf.keras.layers.MaxPooling1D(pool_size=3, strides=2)(out)

    out = tf.keras.layers.Flatten()(out1)
    out = tf.keras.layers.Dense(16)(out)
    out = tf.keras.layers.Activation("relu")(out)
    out = tf.keras.layers.Dense(16)(out)
    out = tf.keras.layers.Dense(n_class)(out)
    # Keras has no `Sigmoid` layer class; the sigmoid goes through `Activation`.
    out = tf.keras.layers.Activation("sigmoid")(out)

    return tf.keras.Model(inputs=inputs, outputs=out)


def DNNModel():
    """Build the recurrent classifier used for training.

    Ref: https://www.kaggle.com/code/dmitryuarov/tps-sensors-auc-0-964

    The input shape is taken from the last two dimensions of the module-level
    ``train`` array. A bidirectional LSTM feeds two parallel bidirectional
    branches (LSTM and GRU) whose outputs are concatenated on axis 2, followed
    by another bidirectional LSTM, global max pooling, a 128-unit selu dense
    layer and a single sigmoid output unit.

    Returns:
        An uncompiled Keras ``Model`` named ``DNNModel``.
    """
    sequence_in = Input(shape=(train.shape[-2:]))
    hidden = Bidirectional(LSTM(units=512, return_sequences=True))(sequence_in)

    # Two recurrent branches over the same hidden sequence.
    lstm_branch = Bidirectional(LSTM(units=256, return_sequences=True))(hidden)
    gru_branch = Bidirectional(GRU(units=256, return_sequences=True))(hidden)
    hidden = Concatenate(axis=2)([lstm_branch, gru_branch])

    hidden = Bidirectional(LSTM(units=128, return_sequences=True))(hidden)

    pooled = GlobalMaxPooling1D()(hidden)
    dense = Dense(units=128, activation='selu')(pooled)
    probability = Dense(1, activation='sigmoid')(dense)

    return Model(inputs=sequence_in, outputs=probability, name='DNNModel')

In [None]:
# Build one model instance; the next cell uses it to draw the architecture.
model = DNNModel()

In [None]:
# Draw the layer graph, annotated with tensor shapes.
plot_model(model, show_shapes=True)

In [None]:
# StratifiedGroupKFold Cross-Validation Training and OOF predictions
#
# For every fold: build a fresh model, train it on the training split, score
# the held-out split with ROC AUC and store the fold's test-set predictions.

# Example run params
Config.EPOCHS = 2 # run for 20 for best results
Config.NFOLDS = 3 # run for 10 for best results

predictions, scores = [], []
sgkfolds = StratifiedGroupKFold(n_splits = Config.NFOLDS, shuffle=True)
for fold, (train_idx, valid_idx) in enumerate(sgkfolds.split(train, labels, groups)):
    print('*'*20, f'FOLD: {fold+1}', '*'*20)

    # Drop the previous fold's model state before building a new one.
    tf.keras.backend.clear_session()

    X_train, X_valid = train[train_idx], train[valid_idx]
    y_train, y_valid = labels[train_idx], labels[valid_idx]

    model = DNNModel()
    # Name the metric explicitly: an unnamed AUC can be auto-renamed to
    # "auc_1", "auc_2", ... for later models, and the callbacks below would
    # then no longer find "val_auc".
    model.compile(optimizer="adam", loss="binary_crossentropy",
                  metrics=[tf.keras.metrics.AUC(name="auc")])

    # AUC is better when higher; without mode="max" ReduceLROnPlateau treats
    # any monitor whose name lacks "acc" as a quantity to minimise.
    # NOTE(review): both callbacks use patience=5, so the learning-rate drop
    # and the early stop fire after the same number of stagnant epochs.
    learning_rate = ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=5, verbose=True)
    early_stopping = EarlyStopping(monitor="val_auc", patience=5, verbose=True, mode="max", restore_best_weights=True)

    # Model fitting
    model.fit(X_train, y_train,
              validation_data=(X_valid, y_valid),
              epochs=Config.EPOCHS,
              verbose=True,
              batch_size=Config.BATCH_SIZE,
              callbacks=[learning_rate, early_stopping],
             )

    # Score the held-out split, then predict the test set with this fold's model.
    y_proba = model.predict(X_valid, batch_size=Config.BATCH_SIZE).squeeze()
    score = metrics.roc_auc_score(y_valid, y_proba)  # AUC score
    scores.append(score)
    predictions.append(model.predict(test, batch_size=Config.BATCH_SIZE).squeeze())
    print(f"Fold={fold+1}: OOF Validation AUC Score: {score}")
    print("")

print("="*60)
print(f'Mean AUC score on {sgkfolds.n_splits} folds = {np.mean(scores)}')
print("="*60)

In [None]:
## Train for 10 stratified group k folds and 20 epochs for the best results

In [None]:
# Average the per-fold test predictions element-wise. The mean is taken over
# the prediction arrays actually collected, so it stays correct even if that
# number differs from Config.NFOLDS (e.g. after an interrupted training run).
submission["state"] = np.mean(predictions, axis=0)
submission.head()

In [None]:
# Write the submission table to submission.csv, without the index column.
submission.to_csv("submission.csv", index=False)

In [None]:
# Print a completion message once the cells above have run.
print("Success!")