# TPS April 2022 

- Hello Kagglers, in this notebook I use a DNN/LSTM-based architecture to build a model that performs well on the public leaderboard.

- This notebook is inspired by the work of DMITRY UAROV:
https://www.kaggle.com/code/dmitryuarov/tps-sensors-auc-0-964

- The architecture proposed by DMITRY UAROV was overfitting, so in this notebook I try several measures to reduce that overfitting.

- Please `upvote` this notebook if you find it useful

### Data Descriptions
In this competition, you'll classify 60-second sequences of sensor data, indicating whether a subject was in either of two activity states for the duration of the sequence.

### Files and Field Descriptions
train.csv - the training set, comprising ~26,000 60-second recordings of thirteen biological sensors for almost one thousand experimental participants
* sequence - a unique id for each sequence
* subject - a unique id for the subject in the experiment
* step - time step of the recording, in one second intervals
* sensor_00 - sensor_12 - the value for each of the thirteen sensors at that time step

train_labels.csv - the class label for each sequence.
* sequence - the unique id for each sequence.
* state - the state associated to each sequence. This is the target which you are trying to predict.

test.csv - the test set. For each of the ~12,000 sequences, you should predict a value for that sequence's state.

sample_submission.csv - a sample submission file in the correct format.

# Install Libraries

In [None]:
! pip install pydot
! pip install graphviz

In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import KFold, GroupKFold

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, LSTM, GRU
from tensorflow.keras.layers import Bidirectional, Multiply

# Seed the global NumPy and TensorFlow random generators with the same value.
SEED = 2022
np.random.seed(SEED)
tf.random.set_seed(SEED)

pd.set_option('display.max_columns', None)
#########################################################

# Single place to edit when the competition files are mounted somewhere else.
DATA_DIR = '/kaggle/input/tabular-playground-series-apr-2022'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
t_lbls = pd.read_csv(f'{DATA_DIR}/train_labels.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Submission template; its "state" column is overwritten with predictions at the end.
ss = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')


# EDA

In [None]:
# Peek at the first three rows of the freshly loaded training frame.
train.head(3)

In [None]:
def _show_feature_summary(frame):
    """Display mean/min/max for every column from the fourth onward, shaded in blue."""
    summary = frame.iloc[:, 3:].describe().transpose()[['mean', 'min', 'max']]
    display(summary.style.background_gradient(cmap='Blues'))


separator = '-'*39

# Class balance of the target, as percentages of all labelled sequences.
state_counts = t_lbls["state"].value_counts()
pct_one = round(state_counts[1]/len(t_lbls)*100, 2)
pct_zero = round(state_counts[0]/len(t_lbls)*100, 2)

# The sequence counts divide the row counts by 60 rows per sequence.
header_lines = [
    'DATA INFORMATION',
    '',
    'Count of sequences:',
    f'train - {int(len(train)/60)} | test - {int(len(test)/60)}',
    '',
    'Missing values:',
    f'train - {train.isna().sum().sum()} | test - {test.isna().sum().sum()}',
    '',
    'Distribution of target:',
    f'"1" - {pct_one}% | "0" - {pct_zero}%',
    '',
    separator,
    '',
    'Train features',
]
print('\n'.join(header_lines))
_show_feature_summary(train)

print('\n'.join(['', separator, '', 'Test features']))
_show_feature_summary(test)

# Preprocessing

In [None]:
# Feature columns: everything after the first three columns of the training frame
# (presumably sequence / subject / step, which this cell later drops by name).
features = train.columns.tolist()[3:]
def prep(df, feature_cols=None):
    """Add lag-1 and first-difference columns for each feature, in place.

    For every feature ``f`` two columns are appended, in this order:

    * ``f_lag1``  - value of ``f`` at the previous row of the same ``sequence``
      (0 for the first row of each sequence);
    * ``f_diff1`` - ``f - f_lag1``, so the first row of each sequence keeps the
      raw value of ``f``.

    Any NaN already present in ``df`` is replaced by 0 before the features are
    derived, which is what the per-iteration ``fillna`` used to do.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sequence`` column and the feature columns. Mutated in place.
    feature_cols : list of str, optional
        Columns to derive lag/diff features from. Defaults to the module-level
        ``features`` list, looked up at call time.

    Returns
    -------
    None
    """
    if feature_cols is None:
        feature_cols = features
    feature_cols = list(feature_cols)
    if not feature_cols:
        # Nothing to derive; leave the frame untouched.
        return

    # One whole-frame fill and one grouped shift instead of one of each per feature.
    df.fillna(0, inplace=True)
    lagged = df.groupby('sequence')[feature_cols].shift(1).fillna(0)

    # Insert column by column to keep the lag1/diff1 interleaved column order.
    for feature in feature_cols:
        df[feature + '_lag1'] = lagged[feature]
        df[feature + '_diff1'] = df[feature] - df[feature + '_lag1']

prep(train)
prep(test)

# Re-derive the feature list so it now also contains the lag/diff columns.
features = train.columns.tolist()[3:]

# Fit the scaler on train only and reuse its statistics for test.
sc = StandardScaler()
train[features] = sc.fit_transform(train[features])
test[features] = sc.transform(test[features])

groups = train["sequence"]
labels = t_lbls["state"]

# The reshape below turns every run of 60 consecutive rows into one sample and
# pairs sample i with labels[i]. Make those implicit assumptions explicit.
STEPS_PER_SEQUENCE = 60


def _sequence_ids(df, steps):
    """Return one sequence id per block of ``steps`` rows, checking each block is homogeneous."""
    assert len(df) % steps == 0, \
        f"row count {len(df)} is not a multiple of {steps}"
    ids = df["sequence"].values.reshape(-1, steps)
    assert (ids == ids[:, :1]).all(), \
        f"rows are not grouped into contiguous blocks of {steps} per sequence"
    return ids[:, 0]


train_sequence_ids = _sequence_ids(train, STEPS_PER_SEQUENCE)
_sequence_ids(test, STEPS_PER_SEQUENCE)
assert np.array_equal(train_sequence_ids, t_lbls["sequence"].values), \
    "train sequences and train_labels are not in the same order"

# From here on `train` / `test` are 3-D arrays: (sequences, 60 steps, features).
train = train.drop(["sequence", "subject", "step"], axis=1).values
train = train.reshape(-1, STEPS_PER_SEQUENCE, train.shape[-1])

test = test.drop(["sequence", "subject", "step"], axis=1).values
test = test.reshape(-1, STEPS_PER_SEQUENCE, test.shape[-1])

# DNN model

In [None]:
# Pick a distribution strategy: TPU when one can be resolved, otherwise the
# default strategy. BATCH_SIZE is set in both branches.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    # 64 samples per replica. The original read `tpu_strategy`, a name that is
    # never defined, so the TPU path died with NameError (not caught below).
    BATCH_SIZE = strategy.num_replicas_in_sync * 64
    print("Running on TPU:", tpu.master())
    print(f"Batch Size: {BATCH_SIZE}")

except ValueError:
    # Presumably raised by TPUClusterResolver when no TPU is attached - fall back.
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE = 256
    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    print(f"Batch Size: {BATCH_SIZE}")

In [None]:
def dnn_model(input_shape=None):
    """Build the (uncompiled) recurrent binary classifier.

    Architecture, in order:

    1. Bidirectional LSTM(512) over the input sequence, then dropout 0.2.
    2. Two parallel branches on that output - Bidirectional LSTM(256) and
       Bidirectional GRU(256) - concatenated along the feature axis.
    3. Dropout 0.2, batch normalisation, Bidirectional LSTM(128), dropout 0.2.
    4. Global max pooling over the time axis.
    5. Dense(128, selu), dropout 0.2, and a single sigmoid output unit.

    Parameters
    ----------
    input_shape : tuple, optional
        ``(timesteps, features)`` of one sample. Defaults to the last two
        dimensions of the module-level ``train`` array.

    Returns
    -------
    tensorflow.keras.Model
        Functional model named ``'lstm_model'``.
    """
    if input_shape is None:
        input_shape = train.shape[-2:]

    x_input = Input(shape=input_shape)

    x1 = Bidirectional(LSTM(units=512, return_sequences=True))(x_input)
    x1_dr = Dropout(rate = 0.2)(x1)
    # Both branches read the same dropped-out output of the first layer.
    x2 = Bidirectional(LSTM(units=256, return_sequences=True))(x1_dr)
    z1 = Bidirectional(GRU(units=256, return_sequences=True))(x1_dr)

    # axis=2 is the feature axis of the (batch, time, features) outputs.
    c = Concatenate(axis=2)([x2, z1])
    c_dr = Dropout(rate = 0.2)(c)
    b = BatchNormalization()(c_dr)

    x3 = Bidirectional(LSTM(units=128, return_sequences=True))(b)
    x3_dr = Dropout(rate = 0.2)(x3)

    # Collapse the time axis by keeping each feature's maximum.
    x4 = GlobalMaxPooling1D()(x3_dr)
    x5 = Dense(units=128, activation='selu')(x4)
    x6 = Dropout(rate = 0.2)(x5)
    x_output = Dense(1, activation='sigmoid')(x6)

    model = Model(inputs=x_input, outputs=x_output, name='lstm_model')

    return model

# Instance used only for the architecture plot; each CV fold builds its own.
model = dnn_model()

In [None]:
# Render the architecture diagram (with tensor shapes and layer names) to a PNG;
# as the last expression of the cell the returned image is also shown inline.
plot_model(model, to_file='Super_Model.png', show_shapes=True, show_layer_names=True)

In [None]:
def plotHist(hist, metric="auc"):
    """Plot the training and validation curves of one metric across epochs.

    Parameters
    ----------
    hist : object with a ``history`` dict
        Typically the value returned by ``model.fit``; ``hist.history`` must
        contain the keys ``metric`` and ``"val_" + metric``.
    metric : str, default "auc"
        Name of the logged metric to plot.

    Raises
    ------
    KeyError
        If either series is missing from ``hist.history``.
    """
    history = hist.history
    val_key = f"val_{metric}"
    missing = [key for key in (metric, val_key) if key not in history]
    if missing:
        # Same exception type as a plain dict lookup, but says what *is* available.
        raise KeyError(f"{missing} not found in history; available keys: {sorted(history)}")

    plt.plot(history[metric])
    plt.plot(history[val_key])
    plt.title("model performance")
    plt.ylabel("area_under_curve")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

In [None]:
# 10-fold cross-validation: train one model per fold, score it on the held-out
# fold with ROC AUC, and collect its predictions on the test set for blending.
with strategy.scope():
    VERBOSE = True
    predictions, scores = [], []
    k = GroupKFold(n_splits = 10)

    # Loop-invariant I/O options; '/job:localhost' looks intended to keep
    # checkpoint reads/writes on the local host when running on a TPU.
    save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')

    # NOTE(review): `groups.unique()` yields one distinct id per sequence, so
    # GroupKFold cannot keep anything together here. If the intent is to keep a
    # subject's sequences in the same fold, pass per-sequence subject ids instead.
    for fold, (train_idx, val_idx) in enumerate(k.split(train, labels, groups.unique())):
        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)

        X_train, X_val = train[train_idx], train[val_idx]
        y_train, y_val = labels.iloc[train_idx].values, labels.iloc[val_idx].values

        model = dnn_model()
        # Explicitly named metric so the history keys are always "auc"/"val_auc",
        # which the callbacks and plotHist rely on.
        model.compile(optimizer="adam", loss="binary_crossentropy",
                      metrics=[tf.keras.metrics.AUC(name="auc")])

        # mode="max" is required: in the default "auto" mode Keras only treats
        # metrics whose name contains "acc" as higher-is-better, so "val_auc"
        # would be handled as a quantity to minimise.
        lr = ReduceLROnPlateau(monitor="val_auc", factor=0.6,
                               patience=4, verbose=VERBOSE, mode="max")

        es = EarlyStopping(monitor="val_auc", patience=7,
                           verbose=VERBOSE, mode="max",
                           restore_best_weights=True)

        checkpoint_path = f'./TPS_model_2022_{fold+1}C.h5'
        chk_point = ModelCheckpoint(checkpoint_path, options=save_locally,
                                    monitor='val_auc', verbose=VERBOSE,
                                    save_best_only=True, mode='max')

        history = model.fit(X_train, y_train,
                  validation_data=(X_val, y_val),
                  epochs=15,
                  verbose=VERBOSE,
                  batch_size=BATCH_SIZE,
                  callbacks=[lr, chk_point, es])

        # Score and predict with the best checkpoint of this fold.
        model = load_model(checkpoint_path, options=load_locally)

        y_pred = model.predict(X_val, batch_size=BATCH_SIZE).squeeze()
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        predictions.append(model.predict(test, batch_size=BATCH_SIZE).squeeze())

        print(f"Fold-{fold+1} | OOF Score: {score}")
        plotHist(history)

    # The scores are ROC AUC values, not accuracies.
    print(f'Mean AUC on {k.n_splits} folds - {np.mean(scores)}')

In [None]:
# Blend the folds: element-wise sum of the per-fold test predictions divided by
# the number of folds, written into the submission template and saved to disk.
fold_total = sum(predictions)
ss["state"] = fold_total / k.n_splits
ss.to_csv('submission.csv', index=False)
ss