In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
from tqdm.auto import tqdm

import scipy

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# All three competition files are read from the same input directory; keeping
# the directory in one constant means it only has to be edited in one place.
DATA_DIR = "../input/tabular-playground-series-apr-2022"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
labels_df = pd.read_csv(os.path.join(DATA_DIR, "train_labels.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [None]:
# Preview the first five rows of the training table
train_df.head(n=5)

In [None]:
# Preview the first five rows of the labels table
labels_df.head(n=5)

In [None]:
# The training and label tables have different row counts, so compare them on
# sequence ids rather than on raw rows: for every table report the id range,
# the number of rows and the number of distinct sequence ids.
# NOTE(review): presumably train/test hold several rows (time steps) per
# sequence while the labels hold one row per sequence - confirm on the output.
print(f"Train shape: {train_df.shape} | Labels shape: {labels_df.shape}")
for frame_name, frame in (("Train", train_df), ("Label", labels_df), ("Test", test_df)):
    sequence_ids = frame['sequence']
    print(
        f"{frame_name} sequence range: [{sequence_ids.min()}, {sequence_ids.max()}]; "
        f"\tRows: {len(sequence_ids)}; \tUnique sequences: {sequence_ids.nunique()}"
    )
# Ratio of training rows to label rows (the quantity is rows, not features).
print(f"\nThere are {train_df.shape[0]/labels_df.shape[0]} training rows per label")

**VISUALIZING FEATURES**

In [None]:
# Histogram of every sensor column (sensor_00 ... sensor_12), one panel per
# sensor laid out on a 4x4 grid.
# Ideas borrowed from https://www.kaggle.com/code/ambrosm/tpsapr22-eda-which-makes-sense
figure = plt.figure(figsize=(16,8))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    # Panels are numbered from 1, sensors from 0.
    axis = figure.add_subplot(4, 4, sensor + 1)
    axis.hist(train_df[sensor_name], bins=25)
    axis.set_title(f"{sensor_name} Histogram")

figure.tight_layout(h_pad=1.0, w_pad=0.5)
# y > 1 places the overall title above the top row of panels.
figure.suptitle('Sensor Histogram before outlier removal', y=1.02)
plt.show()

**REMOVING OUTLIERS FROM THE PLOTS (2ND–98TH PERCENTILE RANGE)**

In [None]:
# Same per-sensor histograms, but each panel's x-range is limited to that
# sensor's 2nd-98th percentile so the bulk of the distribution becomes visible.
# Only the plotted range is clipped here; train_df itself is not modified.
figure = plt.figure(figsize=(16,8))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    readings = train_df[sensor_name]
    lower_bound = readings.quantile(0.02)
    upper_bound = readings.quantile(0.98)
    axis = figure.add_subplot(4, 4, sensor + 1)
    axis.hist(readings, bins=100, range=[lower_bound, upper_bound])
    axis.set_title(f'{sensor_name} Histogram')

figure.tight_layout(h_pad=1.0, w_pad=0.5)
figure.suptitle('Sensor histogram after outlier removal', y=1.02);

**DEFINE FEATURES, TARGET AND GROUPS (LABELS STAY IN `labels_df`)**

In [None]:
# Model inputs: every column of train_df after the first three.
# NOTE(review): assumes the first three columns are identifier columns rather
# than measurements - confirm against train_df.head().
features = train_df.columns[3:].tolist()

# Name of the label column. The labels are not merged into train_df in this
# notebook; they are read from labels_df when the model is fitted.
target = 'state'

# Sequence id of every training row.
groups = train_df['sequence']

In [None]:
def feature_engineer(df, features=features):
    """Add per-sequence lag and first-difference columns for each feature.

    For every column ``col`` in ``features`` two columns are written to ``df``,
    in this order:

    * ``col + '_lag'``   - the value of ``col`` on the previous row of the same
      ``sequence`` group, with 0 where there is no previous row.
    * ``col + '_diff1'`` - ``df[col] - df[col + '_lag']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``sequence`` column and every column in ``features``.
        It is modified in place: missing values anywhere in the frame are
        replaced by 0 and the new columns are appended.
    features : list-like of str
        Columns to derive from. The default is the module-level ``features``
        list as it was when this function was defined; rebinding that name
        later does not change the default.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    feature_cols = list(features)
    if not feature_cols:
        # Nothing to derive; leave the frame untouched.
        return df

    # Fill missing values once for the whole frame instead of once per feature.
    df.fillna(0, inplace=True)

    # One grouped shift for all features; the first row of every sequence has
    # no predecessor, so its lag is NaN and is set to 0.
    lagged = df.groupby(['sequence'])[feature_cols].shift(1).fillna(0)

    # Assign column by column to keep the lag/diff columns interleaved per feature.
    for col in feature_cols:
        df[col+'_lag'] = lagged[col]
        df[col+'_diff1'] = df[col] - df[col+'_lag']
    return df

# Derive the lag / first-difference columns for both splits, using the
# function's default feature list.
train_df, test_df = map(feature_engineer, (train_df, test_df))

In [None]:
# Rebuild the feature list so that it also covers the engineered columns
# (again: everything after the first three columns of train_df).
features = list(train_df.columns[3:])
print(features)

**VISUALIZING THE LABELS**

In [None]:
# checking distribution of the labels
# train_df has no label column in this notebook, so the class balance is read
# from labels_df instead.
labels_df[target].value_counts().plot(kind='bar')
plt.title("Distribution of the labels", fontsize=18);
print(f"Label 0: {(labels_df[target]==0).sum()}")
print(f"Label 1: {(labels_df[target]==1).sum()}")

In [None]:
# preprocessing the data
# Fit the scaler on the training features only, then apply those same
# statistics to the test features. Re-fitting on the test set would scale the
# two splits differently, so the model would see test inputs on a different
# scale from the one it was trained on.
scaler = StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])

**MODEL BUILDING**

In [None]:
# Turn the flat (rows, features) table into a 3-D array of shape
# (n_sequences, 60, n_features) for the recurrent model.
# NOTE(review): assumes every sequence has exactly 60 consecutive rows and that
# rows are ordered by sequence and then by time - confirm.
train_arr = train_df[features].to_numpy()
train_arr = np.reshape(train_arr, (-1, 60, train_arr.shape[-1]))

In [None]:
# GRU MODEL
def gru_model():
    """Build (but do not compile) the multi-branch bidirectional GRU classifier.

    The input shape is read from the module-level ``train_arr`` at call time:
    ``(train_arr.shape[1], train_arr.shape[-1])``. Every recurrent layer is a
    ``Bidirectional(GRU(..., return_sequences=True))``, so all branches keep
    the time axis and can be concatenated along the feature axis (``axis=2``).
    The concatenated sequence is max-pooled over time and passed through a
    small dense head ending in a single sigmoid unit.

    Returns
    -------
    keras.Model
        An uncompiled functional model named ``'GRU_model'``.
    """
    inputs = keras.Input(shape=(train_arr.shape[1], train_arr.shape[-1]))
    # Stage 1: a single wide recurrent layer on the raw input.
    x1 = layers.Bidirectional(layers.GRU(768, return_sequences=True))(inputs)
    
    # Stage 2: one branch stacked on x1, one branch straight from the input.
    # Note that l2 joins x1 with x22; x21 is not part of l2 and instead feeds x32.
    x21 = layers.Bidirectional(layers.GRU(512, return_sequences=True))(x1)
    x22 = layers.Bidirectional(layers.GRU(512, return_sequences=True))(inputs)
    l2 = layers.Concatenate(axis=2)([x1, x22])
    
    # Stage 3: one branch on the stage-2 concatenation, one on x21.
    x31 = layers.Bidirectional(layers.GRU(384, return_sequences=True))(l2)
    x32 = layers.Bidirectional(layers.GRU(384, return_sequences=True))(x21)
    l3 = layers.Concatenate(axis=2)([x31, x32])
        
    # Stage 4: two two-layer stacks, one fed by l3 and one fed by x32.
    x41 = layers.Bidirectional(layers.GRU(256, return_sequences=True))(l3)
    x42 = layers.Bidirectional(layers.GRU(256, return_sequences=True))(x41)
    x43 = layers.Bidirectional(layers.GRU(128, return_sequences=True))(x32)
    x44 = layers.Bidirectional(layers.GRU(128, return_sequences=True))(x43)
    l4 = layers.Concatenate(axis=2)([x42, x44])
        
    # Join the outputs of all stages; x1 appears both directly and inside l2.
    l5 = layers.Concatenate(axis=2)([x1, l2, l3, l4])
    # Collapse the time axis by taking the per-feature maximum.
    g = layers.GlobalMaxPooling1D()(l5)
    # Dense head: 128 -> dropout -> 64 -> 1 sigmoid output.
    x7 = layers.Dense(128, activation='selu')(g)
    x8 = layers.Dropout(0.2)(x7)
    x9 = layers.Dense(64, activation='selu')(x8)
    outputs = layers.Dense(units=1, activation="sigmoid")(x9)
    
    model = keras.Model(inputs=inputs, outputs=outputs, name='GRU_model')
    
    return model

model = gru_model()

# Give the AUC metric an explicit name. The callbacks below look the metric up
# as "val_auc"; an auto-generated name may pick up a numeric suffix when this
# cell is re-run in the same kernel, which would leave the callbacks without a
# metric to watch.
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=[keras.metrics.AUC(name='auc')])

# All three callbacks watch the same quantity (validation AUC, higher is
# better). The checkpoint in particular must use it too: without an explicit
# monitor it would keep the epoch with the lowest validation loss, which is
# not necessarily the epoch early stopping considers best.
lr = keras.callbacks.ReduceLROnPlateau(monitor="val_auc", factor=0.5, patience=5, verbose=True, mode="max")
es = keras.callbacks.EarlyStopping(monitor="val_auc", patience=10, verbose=True, mode="max", restore_best_weights=True)
ckpt = keras.callbacks.ModelCheckpoint(filepath='gru_model.keras', monitor="val_auc", mode="max", save_best_only=True)
# NOTE(review): assumes labels_df rows are in the same order as the sequences
# in train_arr - confirm, since the two are never joined on 'sequence'.
model.fit(x=train_arr, 
         y=labels_df['state'],
         validation_split=0.2,
         batch_size=128,
         epochs=40,
         callbacks=[lr, es, ckpt])
# keras.utils.plot_model(model, show_shapes=False, show_layer_names=True)

In [None]:
# Reload the checkpoint written during training and score it on the complete
# training array. This includes the part held out by validation_split during
# fitting, so the result is not a pure held-out score.
model = keras.models.load_model(filepath='gru_model.keras')
model.evaluate(x=train_arr, y=labels_df['state'])

In [None]:
# Same layout as the training array: (n_sequences, 60, n_features).
# NOTE(review): assumes 60 consecutive rows per test sequence - confirm.
test_arr = test_df[features].to_numpy()
test_arr = np.reshape(test_arr, (-1, 60, test_arr.shape[-1]))
test_arr.shape

In [None]:
# Predicted probability for every test sequence
predictions = model.predict(x=test_arr)

In [None]:
# Display the raw model outputs (one sigmoid value per test sequence)
predictions

In [None]:
# Load the submission template and check its dimensions
sample = pd.read_csv(filepath_or_buffer="../input/tabular-playground-series-apr-2022/sample_submission.csv")
sample.shape

In [None]:
# Write the predictions into the template's 'state' column.
# NOTE(review): assumes the template rows are in the same order as the
# sequences in test_arr, and that predictions has one column - confirm.
sample['state'] = predictions

In [None]:
# Save the submission file without the DataFrame index column
sample.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Read the file back to verify what was written
pd.read_csv(filepath_or_buffer='submission.csv')