In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Setup

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Competition dataset directory. Every CSV path below is derived from it, so the
# location only has to be changed in one place.
DATA_DIR = '../input/tabular-playground-series-apr-2022'

path_train = f'{DATA_DIR}/train.csv'
path_train_labels = f'{DATA_DIR}/train_labels.csv'
path_test = f'{DATA_DIR}/test.csv'
path_submission = f'{DATA_DIR}/sample_submission.csv'

# Preprocessing

## Data loading

In [None]:
# Read the training table, the training labels and the test table, in that order.
df_train, df_train_labels, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (path_train, path_train_labels, path_test)
)

In [None]:
# Render the train, test and label tables one after another.
display(df_train, df_test, df_train_labels)

## Data cleaning

### Missing data

In [None]:
# Per-column count of missing values for each of the three tables.
for frame in (df_train, df_test, df_train_labels):
    print(frame.isnull().sum())

There is no missing data.

### 'step' integrity

For each sequence, there should be 60 steps (from 0 to 59).

In [None]:
# Number of 'step' entries per training sequence, summarised with describe().
steps_per_train_sequence = df_train.groupby('sequence')['step'].count()
steps_per_train_sequence.describe()

In [None]:
# Number of 'step' entries per test sequence, summarised with describe().
steps_per_test_sequence = df_test.groupby('sequence')['step'].count()
steps_per_test_sequence.describe()

Every sequence has exactly 60 steps.

### Possible data cleaning

Although the data here is perfect, in other cases we may need to handle imperfect data. For example:
* Missing data for target ('state').
* Missing data for features ('sensor_xx').
* Missing 'step' rows for some sequences.

Let's delete some values to show some possible solutions.

In [None]:
# Work on copies so the loaded frames are not modified by this demonstration.
df_train_labels_dc = df_train_labels.copy()

# Remove three whole rows from the training copy.
df_train_dc = df_train.copy().drop(index=[0, 3, 100])

# Blank out a few sensor readings and a few labels.
df_train_dc.loc[[1, 5, 7], ['sensor_00']] = None
df_train_labels_dc.loc[[1, 60, 100], ['state']] = None

display(df_train_dc, df_train_labels_dc)


In [None]:
# Rows whose 'state' label is missing are simply discarded.
df_train_labels_dc = df_train_labels_dc.dropna(subset=['state'])
df_train_labels_dc

In [None]:
# Keep only training rows whose sequence still has a 'state' label.
has_label = df_train_dc['sequence'].isin(df_train_labels_dc['sequence'])
df_train_dc = df_train_dc.loc[has_label]
df_train_dc

In [None]:
# To repair missing rows we rebuild the complete (sequence, step) grid:
# every sequence still present must end up with all 60 steps, and each
# 'sensor_xx' feature must have a value for every step.

df_sequence = pd.DataFrame(df_train_dc['sequence'].unique(), columns=['sequence'])
df_step = pd.DataFrame(range(60), columns=['step'])

# Cartesian product of sequences and steps. how='cross' (pandas >= 1.2) builds it
# directly, without adding a throwaway join key to either helper frame.
df = pd.merge(df_sequence, df_step, how='cross')
df


In [None]:
# Left-join the observed rows onto the full grid; (sequence, step) pairs with no
# observed row appear with NaN in the remaining columns.
df_train_dc = df.merge(df_train_dc, how='left', on=['sequence', 'step'])
df_train_dc

In [None]:
# Fill the gaps in each sensor column with that column's mean within the same
# sequence. Forward fill ('ffill') or backward fill ('bfill') are alternatives.

sensor_columns = [f'sensor_{idx:02}' for idx in range(13)]
for sensor in sensor_columns:
    per_sequence_mean = df_train_dc.groupby('sequence')[sensor].transform('mean')
    df_train_dc[sensor] = df_train_dc[sensor].fillna(per_sequence_mean)

df_train_dc

## Data wrangling

In [None]:
# Work on copies so the loaded frames stay unmodified.
x, x_test = df_train.copy(), df_test.copy()

In [None]:
# Sequence id of every training row.
# Read from df_train (x is a copy of it) so this cell still works when re-run
# after x has been converted to a NumPy array further down.
groups = df_train['sequence']

In [None]:
# Remove the identifier columns and keep the rest as plain NumPy arrays.
non_feature_columns = ['sequence', 'subject', 'step']
x = x.drop(columns=non_feature_columns).values
x_test = x_test.drop(columns=non_feature_columns).values

In [None]:
# Training target: the 'state' column of the labels table.
y = df_train_labels.loc[:, 'state']

In [None]:
# Standardize the features: the scaler is fitted on the training rows only and
# then applied unchanged to both the training and the test rows.
standardscaler = StandardScaler().fit(x)

x = standardscaler.transform(x)
x_test = standardscaler.transform(x_test)

In [None]:
# Fold the flat row matrix into one block per sequence:
# 60 steps per sequence, 13 sensor_xx values per step -> (-1, 60, 13).
sequence_shape = (-1, 60, 13)
x = np.reshape(x, sequence_shape)
x_test = np.reshape(x_test, sequence_shape)

# Model

In [None]:
def NN():
    """Build and compile the sequence classifier inside the TPU strategy scope.

    The network takes inputs of shape (60, 13) and stacks, in order:
    four LSTM layers (500, 400, 300 and 200 units, all returning full
    sequences), a Conv1D layer (32 filters, kernel size 3), global max
    pooling, two swish Dense layers (128 and 64 units) and a single
    sigmoid output unit.

    Relies on the module-level ``tpu_strategy``, which must exist by the
    time this function is called.

    Returns:
        A keras Sequential model compiled with the 'adam' optimizer,
        binary cross-entropy loss and an AUC metric.
    """
    lstm_units = (500, 400, 300, 200)

    with tpu_strategy.scope():
        # Layers are created in network order, input first.
        layer_stack = [keras.layers.Input(shape=(60, 13))]
        layer_stack.extend(
            keras.layers.LSTM(units, return_sequences=True) for units in lstm_units
        )
        layer_stack.extend([
            keras.layers.Conv1D(32, 3),
            keras.layers.GlobalMaxPooling1D(),
            keras.layers.Dense(128, activation='swish'),
            keras.layers.Dense(64, activation='swish'),
            keras.layers.Dense(1, activation='sigmoid'),
        ])

        model = keras.models.Sequential(layer_stack)
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=[keras.metrics.AUC()],
        )

    return model

In [None]:
# Connect to the TPU cluster and build the distribution strategy used by NN().
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
# tf.distribute.experimental.TPUStrategy is deprecated; tf.distribute.TPUStrategy
# is its stable replacement and takes the same resolver argument.
tpu_strategy = tf.distribute.TPUStrategy(tpu)

In [None]:
# Cross-validation state: a 5-fold GroupKFold splitter, a running sum of the
# fold AUC scores, and the list of per-fold test predictions.
kf = GroupKFold(n_splits=5)
cv_score, y_test_preds = 0, []

In [None]:
# Start from clean accumulators so re-running this cell does not stack a second
# set of fold scores / predictions on top of a previous run.
cv_score = 0
y_test_preds = []
n_folds = kf.get_n_splits()

# NOTE(review): after the reshape every sample is one whole sequence, so passing
# the unique sequence ids as groups puts each sample in its own group. If the
# intent is to keep related sequences (e.g. the same 'subject') in one fold,
# the groups should be built from that column instead — confirm.
for fold_idx, (train_idx, cv_idx) in enumerate(kf.split(x, y, groups.unique())):

    print('*'*15, f'Fold {fold_idx+1}', '*'*15)

    x_train, x_cv = x[train_idx], x[cv_idx]
    y_train, y_cv = y.iloc[train_idx].values, y.iloc[cv_idx].values

    # A fresh model per fold; early stopping restores the best weights it saw.
    model = NN()
    model.fit(x_train, y_train, validation_data=(x_cv, y_cv), epochs=100, batch_size=2048,
              callbacks=[keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)])

    # Accumulate the validation AUC of this fold.
    cv_score += roc_auc_score(y_cv, model.predict(x_cv).squeeze())

    # Keep this fold's test predictions for later averaging.
    y_test_preds.append(model.predict(x_test).squeeze())

print('*'*30)
# Mean AUC over the configured number of folds rather than a hard-coded 5.
print(cv_score/n_folds)

# Submission

In [None]:
# One row per distinct test sequence, in order of first appearance in df_test.
submission = pd.DataFrame(df_test['sequence'].unique(), columns=['sequence'])
# Average the per-fold predictions over however many folds were collected,
# rather than assuming there are exactly 5.
submission['state'] = sum(y_test_preds) / len(y_test_preds)
submission

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="LSTM_V1.4.csv", index=False)