![](https://storage.googleapis.com/kaggle-competitions/kaggle/28009/logos/header.png?)

___
# Introduction

## Overview

In this competition, you'll classify 60-second sequences of sensor data, indicating whether a subject was in either of two activity states for the duration of the sequence.





## File descriptions


* `train.csv` - the training set, comprising ~26,000 60-second recordings of thirteen biological sensors for almost one thousand experimental participants

* `train_labels.csv` - the class label for each sequence.

* `test.csv` - the test set. For each of the ~12,000 sequences, you should predict a value for that sequence's state.

* `sample_submission.csv` - a sample submission file in the correct format.

## Field descriptions


* `sequence` - a unique id for each sequence
* `subject` - a unique id for the subject in the experiment
* `step` - time step of the recording, in one second intervals
* `sensor_00` - `sensor_12` - the value for each of the thirteen sensors at that time step
* `state` - the state associated with each sequence. This is the target which you are trying to predict.

## Setup

In [None]:
import os
import warnings

# Set TensorFlow's native log level via the environment. This happens here,
# before tensorflow is imported in the next cell.
# NOTE(review): '3' is presumably the most quiet setting — confirm against the TF docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Suppress UserWarning and FutureWarning messages for the rest of the notebook.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model

from IPython.display import display
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error

In [None]:
# matplotlib: default sizes for general text, axes titles and tick labels
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)  
plt.rc('xtick', labelsize=10)  
plt.rc('ytick', labelsize=10)

# seaborn: font scale, white-grid style and an 8-colour "rocket" palette
# NOTE(review): sns.set() applies seaborn's own rc parameters and may override
# the matplotlib font sizes configured above — confirm the intended order.
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")
sns.set_palette("rocket", 8, .75)

In [None]:
# --- Input / output files ---------------------------------------------------
TRAIN_DATA_PATH = '../input/tabular-playground-series-apr-2022/train.csv'
TRAIN_LABELS_PATH = '../input/tabular-playground-series-apr-2022/train_labels.csv'
TEST_DATA_PATH = '../input/tabular-playground-series-apr-2022/test.csv'
SAMPLE_SUBMISSION = '../input/tabular-playground-series-apr-2022/sample_submission.csv'
SUBMISSION_FILE = 'submission.csv'

# --- Training configuration -------------------------------------------------
NUM_FOLDS = 3       # number of cross-validation folds
NUM_EPOCHS = 30     # maximum epochs per fold
BATCH_SIZE = 256
VERBOSE = 1         # verbosity passed to Keras fit / callbacks

# --- Column names -----------------------------------------------------------
INDEX = 'sequence'
TARGET = 'state'
# 'sensor_00' ... 'sensor_12'
SENSORS = [f'sensor_{sensor_id:02d}' for sensor_id in range(13)]

___
# Explore data

## Read data

In [None]:
def read_data():
    """Load the competition CSV files, each indexed by the INDEX column.

    Returns:
        Tuple ``(train, test, labels, submission)`` of DataFrames read from
        TRAIN_DATA_PATH, TEST_DATA_PATH, TRAIN_LABELS_PATH and
        SAMPLE_SUBMISSION, in that order.
    """
    sources = (
        TRAIN_DATA_PATH,
        TEST_DATA_PATH,
        TRAIN_LABELS_PATH,
        SAMPLE_SUBMISSION,
    )
    train, test, labels, submission = (
        pd.read_csv(path, index_col=INDEX) for path in sources
    )

    return train, test, labels, submission

In [None]:
# Load the training rows, test rows, training labels and sample submission.
train, test, labels, submission = read_data()

In [None]:
# First rows of each data set
display(train.head())
display(test.head())
display(labels.head())

# Summary statistics of the training columns (one row per column)
display(train.describe().T)

## Missing values

In [None]:
def total_missing_values(train, test):
    """Summarise the missing cells of the train and test data sets.

    Args:
        train: training DataFrame.
        test: test DataFrame.

    Returns:
        DataFrame indexed by ``data_set`` ('train', 'test') with columns
        ``missing`` (number of NaN cells) and ``missing_%`` (NaN cells as a
        percentage of all cells of that same data set; 0.0 for an empty
        frame).
    """
    frames = {'train': train, 'test': test}

    missing = [int(frame.isna().sum().sum()) for frame in frames.values()]
    # `missing` counts cells, so the denominator is the number of cells of
    # the matching frame, not the number of training rows.
    cells = [frame.size for frame in frames.values()]

    df = pd.DataFrame({
        'data_set': list(frames),
        'missing': missing,
    }).set_index('data_set')

    df['missing_%'] = [
        100.0 * n_missing / n_cells if n_cells else 0.0
        for n_missing, n_cells in zip(missing, cells)
    ]
    return df
    
# Last expression of the cell: the notebook renders the returned summary table.
total_missing_values(train, test) 

## Target `state`

In [None]:
def plot_count(data, feature, target=TARGET, ax=None, percent=True):
    """Draw a count plot of ``feature``, optionally split by ``target``.

    Args:
        data: DataFrame holding the columns to plot.
        feature: column shown on the x axis.
        target: column used as hue; ``None`` disables the split.
        ax: matplotlib axes to draw on; the current axes are used if omitted.
        percent: accepted for compatibility; not used by the function.
    """
    axes = plt.gca() if ax is None else ax

    countplot_options = dict(
        data=data,
        x=feature,
        hue=target,
        palette='rocket',
        alpha=0.75,
        ax=axes,
    )
    sns.countplot(**countplot_options)

    axes.set_title(f'Count {feature}')

In [None]:
# Count plot of the target column in the labels table (no hue split).
fig, ax = plt.subplots(1, 1, figsize=(5, 4))
plot_count(labels, TARGET, target=None, ax=ax)

plt.tight_layout()
plt.show()

## Feature `subject`

In [None]:
def count_sequences(data):
    """Count the distinct sequences recorded for each subject.

    The previous implementation counted rows (time steps) per subject, which
    overstates the number of sequences by the sequence length.

    Args:
        data: DataFrame with a ``subject`` column and the sequence id either
            as a ``sequence`` column or as an index named ``sequence``.

    Returns:
        DataFrame indexed by ``subject`` with a single ``count`` column,
        sorted by ``count`` in descending order.

    Raises:
        KeyError: if ``data`` has neither a ``sequence`` column nor an index
            named ``sequence``.
    """
    # The notebook loads the data with `sequence` as the index; expose it as a
    # column so it can be aggregated.
    frame = data if 'sequence' in data.columns else data.reset_index()

    df = frame.groupby(by='subject')['sequence'].nunique().to_frame('count')

    return df.sort_values(by='count', ascending=False)

In [None]:
# Plot the per-subject counts returned by count_sequences for train and test
# on one axis. The x axis is the rank of the subject after sorting by count
# (descending), not the subject id itself.
fig, ax = plt.subplots(1, 1, figsize=(20, 5))

cnt_train = count_sequences(train)
# NOTE(review): `palette` is passed without `hue`, so it presumably has no
# effect on the line colour — confirm.
sns.lineplot(
    x=range(0, len(cnt_train)), 
    y=cnt_train['count'], 
    ax=ax, 
    palette='rocket', 
    color='r', 
    label='train')

cnt_test = count_sequences(test)
sns.lineplot(
    x=range(0, len(cnt_test)), 
    y=cnt_test['count'], 
    ax=ax, 
    palette='rocket', 
    label='test')

ax.set_title('Number of sequences per subject')
ax.set_xlabel('subject')
ax.set_ylabel('# of sequences')

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the per-subject counts for train and test.
display(cnt_train.describe().T)
display(cnt_test.describe().T)

## Feature `sensor_xx`

In [None]:
def plot_sequence(data, sensor, ax=None, **kwargs):
    """Plot one sensor's values over the time steps, coloured by subject.

    Args:
        data: DataFrame with ``step``, ``subject`` and the ``sensor`` column.
        sensor: name of the sensor column drawn on the y axis.
        ax: matplotlib axes to draw on; the current axes are used if omitted.
        **kwargs: forwarded unchanged to ``sns.lineplot``.
    """
    axes = plt.gca() if ax is None else ax

    line_options = dict(
        x='step',
        y=sensor,
        data=data,
        palette='rocket',
        hue='subject',
        marker='o',
        alpha=0.75,
        ax=axes,
    )
    sns.lineplot(**line_options, **kwargs)

    axes.set_title(sensor)
    axes.set_xlabel('Time step')
    axes.set_ylabel('Value')

In [None]:
def plot_dist(data, sensor, ax=None):
    """Draw a 35-bin histogram with a KDE curve for one sensor column.

    Args:
        data: DataFrame holding the ``sensor`` column.
        sensor: name of the column whose distribution is shown.
        ax: matplotlib axes to draw on; the current axes are used if omitted.
    """
    axes = plt.gca() if ax is None else ax

    hist_options = dict(
        data=data,
        x=sensor,
        bins=35,
        legend=True,
        palette='rocket',
        alpha=0.75,
        kde=True,
        ax=axes,
    )
    sns.histplot(**hist_options)

    axes.set_title(sensor)

### Distribution of `sensor_xx`

In [None]:
# Random 1% sample of the training rows (not seeded, so it differs per run).
data = train.sample(frac=0.01).reset_index()

# 3x5 grid = 15 axes; zip stops after the 13 sensors, so the last two axes
# stay empty.
fig, axis = plt.subplots(3, 5, figsize=(20, 10))
for sensor, ax in zip(SENSORS, axis.flatten()):
    plot_dist(data, sensor=sensor, ax=ax)

plt.tight_layout()
plt.show()

### Outlier detection

In [None]:
# Random 60% sample of the training rows (not seeded, so it differs per run).
data = train.sample(frac=0.6).reset_index()

# 4x4 grid = 16 axes; only the first 13 receive a box plot (one per sensor).
fig, axis = plt.subplots(4, 4, figsize=(18, 7))
for sensor, ax in zip(SENSORS, axis.flatten()):
    sns.boxplot(
        data=data, 
        x=sensor,
        ax=ax,
        boxprops=dict(alpha=.75),
        palette='rocket')
    
plt.tight_layout()
plt.show()

## Plot example sequence 

In [None]:
# Index label (sequence id) of the example sequence plotted in the next cell.
sequence_id = 70

In [None]:
# Select the rows whose index label equals sequence_id and turn the index
# back into a regular column.
data = train.loc[sequence_id].reset_index()

# 7x2 grid = 14 axes; one line plot per sensor, the last axis stays empty.
fig, axis = plt.subplots(7, 2, figsize=(20, 20))
for sensor, ax in zip(SENSORS, axis.flatten()):
    plot_sequence(data, sensor=sensor, ax=ax)

plt.tight_layout()
plt.show()

___
# Feature Engineering

## Add Lag and Diff Features

In [None]:
# Names of the engineered columns, in the same order as SENSORS.
LAG_1_FEATURES = [sensor + '_lag_1' for sensor in SENSORS]
DIFF_1_FEATURES = [sensor + '_diff_1' for sensor in SENSORS]

def feature_engneering(data):
    """Add one-step lag and difference features for every sensor, in place.

    For each sensor ``s`` in SENSORS two columns are added to ``data``:

    * ``{s}_lag_1``  - value of ``s`` in the previous row of the same
      ``sequence`` group (0 for the first row of each sequence).
    * ``{s}_diff_1`` - current value of ``s`` minus ``{s}_lag_1``.

    As before, every missing value in ``data`` is replaced by 0.

    Args:
        data: DataFrame grouped by ``sequence`` (index level or column) with
            one column per entry of SENSORS. Modified in place.

    Returns:
        None.
    """
    # Fill once up front instead of re-scanning the whole, growing frame for
    # every sensor; the resulting values are identical.
    data.fillna(0, inplace=True)

    # One grouped shift over all sensor columns replaces a groupby per sensor.
    # The first row of each sequence has no predecessor and becomes 0.
    lagged = data.groupby('sequence')[SENSORS].shift(1).fillna(0)

    for f in SENSORS:
        # Positional assignment: the frame's index contains repeated labels.
        data[f'{f}_lag_1'] = lagged[f].to_numpy()
        data[f'{f}_diff_1'] = data[f] - data[f'{f}_lag_1']


In [None]:
# Add the lag/diff columns to both frames (the function modifies them in place).
feature_engneering(train)
feature_engneering(test)

In [None]:
# Model inputs: raw sensors followed by their lag-1 and diff-1 columns.
FEATURES = SENSORS + LAG_1_FEATURES + DIFF_1_FEATURES
num_features = len(FEATURES)

print(f'Number of features: {num_features}')

# LSTM Model

In [None]:
class Metrics(keras.callbacks.Callback):
    """Keras callback that records the logs reported at the end of each epoch.

    The history is cleared whenever a training run begins, so after several
    consecutive ``fit`` calls with the same instance only the last run's
    epochs are available.
    """

    def __init__(self):
        super().__init__()
        # Initialised here so get_metrics() also works before any training.
        self.logs = []

    def on_train_begin(self, logs=None):
        """Reset the recorded history when a training run begins."""
        self.logs = []

    def on_epoch_end(self, epoch, logs=None):
        """Store a snapshot of the logs of the epoch that just finished."""
        # Copy so later changes to the dict by the caller cannot alter the
        # stored history.
        self.logs.append(dict(logs or {}))

    def get_metrics(self):
        """Return the recorded logs as a DataFrame.

        Returns:
            DataFrame with one row per recorded epoch and one column per
            logged value, indexed by a 1-based ``epoch`` number.
        """
        df = pd.DataFrame(self.logs)
        df['epoch'] = range(1, len(self.logs)+1)

        return df.set_index('epoch')

In [None]:
def build_model():
    """Create and compile the sequence classifier.

    The network takes inputs of shape ``(60, num_features)`` and stacks batch
    normalisation, four bidirectional LSTM layers, three Conv1D layers with
    pooling, and a dense head ending in a single sigmoid unit.

    Returns:
        A compiled ``keras.Sequential`` model named ``lstm_model`` using the
        Adam optimizer, binary cross-entropy loss and the ``mse`` and ``auc``
        metrics.
    """
    stack = [
        layers.Input(shape=(60, num_features), name='input_layer'),
        layers.BatchNormalization(),
    ]

    # Recurrent part: every layer returns the full sequence.
    for units in (128, 512, 512, 256):
        stack.append(layers.Bidirectional(layers.LSTM(units, return_sequences=True)))

    # Convolutional part.
    stack.extend([
        layers.Conv1D(32, 7),
        layers.MaxPooling1D(),
        layers.Conv1D(64, 3),
        layers.MaxPooling1D(),
        layers.Conv1D(128, 3),
        layers.GlobalMaxPooling1D(),
    ])

    # Dense head.
    for width in (150, 50):
        stack.append(layers.Dense(width, activation="swish"))
    stack.append(layers.Dense(1, activation="sigmoid"))

    model = keras.Sequential(stack, name='lstm_model')

    tracked_metrics = [
        keras.metrics.MeanSquaredError(name='mse'),
        keras.metrics.AUC(name='auc'),
    ]
    model.compile(
        optimizer="adam",
        loss='binary_crossentropy',
        metrics=tracked_metrics)

    return model

# Build one model instance only to render its architecture diagram.
model = build_model()
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
num_features = len(FEATURES)

# Each sequence occupies 60 consecutive rows, so the flat tables reshape to
# (number of sequences, 60 time steps, number of features).
train_data = train[FEATURES].values.reshape(-1, 60, num_features)
test_data = test[FEATURES].values.reshape(-1, 60, num_features)

# One group label per sequence for GroupKFold: the subject of the sequence,
# taken from the first row of each 60-row chunk. Using the sequence id itself
# would put every sample in its own group, which cannot keep a subject's
# sequences together in one fold.
groups = train['subject'].values.reshape(-1, 60)[:, 0]

display(train_data.shape)
display(test_data.shape)

In [None]:
# Grouped cross-validation: one model is trained per fold and its test-set
# predictions are collected for averaging later.
kf = GroupKFold(n_splits=NUM_FOLDS)

y_preds = []
# The same callback instance is reused for every fold; it clears its history
# when a training run begins, so afterwards it holds the last fold's epochs.
metrics = Metrics()

for fold_idx, (train_idx, valid_idx) in enumerate(kf.split(train_data, labels['state'], groups)):
    print("="*5, f"Fold {fold_idx+1}", "="*5)
    
    X_train, X_valid = train_data[train_idx], train_data[valid_idx]
    y_train, y_valid = labels.iloc[train_idx].values, labels.iloc[valid_idx].values
    
    # Fresh, untrained model for every fold.
    model = build_model()
    
    # AUC is better when higher. With the default mode='auto' this callback
    # treats a monitor whose name does not contain 'acc' as a quantity to
    # minimise, so the direction must be given explicitly.
    lr = keras.callbacks.ReduceLROnPlateau(
        monitor="val_auc", 
        factor=0.6, 
        patience=4, 
        mode="max",
        verbose=VERBOSE)

    # Stop after 7 epochs without a better val_auc and restore the best weights.
    es = keras.callbacks.EarlyStopping(
        monitor="val_auc", 
        patience=7, 
        verbose=VERBOSE, 
        mode="max", 
        restore_best_weights=True)
    
    model.fit(
        X_train, 
        y_train, 
        validation_data=(X_valid, y_valid), 
        epochs=NUM_EPOCHS, 
        batch_size=BATCH_SIZE,
        verbose=VERBOSE,
        callbacks=[
            metrics,
            lr, 
            es
        ])

    # Test-set probabilities of this fold, flattened to one value per sequence.
    y_preds.append(model.predict(test_data).squeeze())

In [None]:
from matplotlib.ticker import MaxNLocator 

# Per-epoch history recorded by the Metrics callback.
df = metrics.get_metrics()
display(df)

fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))

# One panel per metric: (axes, logged metric name, panel title).
panels = [
    (ax1, 'loss', 'Binary Crossentropy'),
    (ax2, 'mse', 'Mean Squared Error (MSE)'),
    (ax3, 'auc', 'Accuracy (AUC)'),
]

for panel, metric, title in panels:
    # Training curve
    sns.lineplot(
        x=df.index, y=metric, data=df,
        marker='o', ax=panel, label=metric)

    # Validation curve, drawn in red
    sns.lineplot(
        x=df.index, y=f'val_{metric}', data=df,
        marker='o', ax=panel, color='r', label=f'val_{metric}')

    # Epoch numbers are integers, so only place ticks on whole numbers.
    panel.get_xaxis().set_major_locator(MaxNLocator(integer=True))
    panel.set_title(title)

plt.tight_layout()
plt.show()

___
# Submission

In [None]:
# Average the per-fold test predictions element-wise and write them to the
# submission file, keeping the index as the first CSV column.
submission[TARGET] = np.mean(y_preds, axis=0)
submission.to_csv(SUBMISSION_FILE, index=True)

# Last expression of the cell: the notebook renders the submission table.
submission

Thanks for reading. If this notebook was useful, please vote for it.