In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

* train.csv - 약 1,000명의 실험 참가자로부터 13개의 생체 센서로 측정한 약 26,000개의 60초 기록으로 구성된 훈련 세트
    * sequence - 각 시퀀스의 고유 ID
    * subject - 실험 참가자의 고유 ID
    * step - 기록의 시간 단계(1초 간격)
    * sensor_00 ~ sensor_12 - 해당 시간 단계에서 13개 센서 각각의 측정값

* train_labels.csv - 각 시퀀스의 클래스 레이블입니다.
    * sequence- 각 시퀀스의 고유 ID입니다.
    * state- 각 시퀀스와 관련된 상태. 이것은 당신이 예측하려는 목표입니다.

* test.csv - 테스트 세트. ~12,000개 시퀀스 각각에 대해 해당 시퀀스의 값(state)을 예측해야 합니다.

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:30px;">
<b> Tabular April using RNN </b>
</span></p>
</div>

### RNN (Recurrent Neural Network, 순환 신경망)

* 자연어(NL, Natural Language)나 음성신호, 주식과 같은 연속적인(sequential) 시계열(time series) 데이터에 적합한 모델.
* 시계열 또는 자연어와 같은 시퀀스 데이터를 모델링하는 데 강력한 신경망 클래스입니다.
* keras.layers.LSTM: 내장된 RNN 레이어 중 하나

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> Data preprocessing </b>
</span></p>
</div>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import matplotlib.gridspec as gridspec
import tensorflow as tf
from tensorflow import keras
from IPython.display import display
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> Checking data distribution </b>
</span></p>
</div>

In [None]:
# Read the four competition CSV files in one pass: the sensor readings for
# training, the per-sequence labels, the test readings and the sample
# submission template.
train_df, train_label, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-apr-2022/train.csv",
        "../input/tabular-playground-series-apr-2022/train_labels.csv",
        "../input/tabular-playground-series-apr-2022/test.csv",
        "../input/tabular-playground-series-apr-2022/sample_submission.csv",
    )
)

In [None]:
# Show the first rows of the training data, the labels and the test data,
# printing a blank separator between consecutive previews.
head_previews = (train_df.head(), train_label.head(), test_df.head())
for preview_position, head_preview in enumerate(head_previews):
    if preview_position > 0:
        print("\n")
    display(head_preview)

In [None]:
# Render the three frames one after another (pandas truncates long frames when
# displaying them), with a blank separator printed after the first two.
display(train_df)
for remaining_frame in (train_label, test_df):
    print("\n")
    display(remaining_frame)

In [None]:
# Summary statistics for the training and test frames.
# `describe` has to be *called*: passing the bound method itself to `display`
# only shows the method's repr instead of the statistics table.
display(train_df.describe())
print("\n")
display(test_df.describe())

In [None]:
# Column dtypes, non-null counts and memory usage for both frames.
# `DataFrame.info()` writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() added a stray "None" line.
train_df.info()
print("\n")
test_df.info()

In [None]:
# Number of distinct values in each column of the training frame.
train_df.nunique(axis=0)

In [None]:
# Distinct-value counts of the sequence, subject and step columns, as a tuple.
tuple(train_df[id_column].nunique() for id_column in ("sequence", "subject", "step"))

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> NULL, NaN value checking </b>
</span></p>
</div>

In [None]:
# Missing values per training column; report only the columns that have at
# least one, ordered from most to fewest missing entries.
isnull_series = train_df.isna().sum()
train_null_columns = isnull_series.loc[isnull_series.gt(0)].sort_values(ascending=False)

print('\n NULL column and number of them \n', train_null_columns)

In [None]:
# Same missing-value report as for the training frame, for the test frame.
isnull_series_2 = test_df.isna().sum()
test_null_columns = isnull_series_2.loc[isnull_series_2.gt(0)].sort_values(ascending=False)

print('\n NULL column and number of them \n', test_null_columns)

* train data set, test data set 모두 Null 값 없음.

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> Check the correlation between features </b>
</span></p>
</div>

In [None]:
# Pairwise correlation of all columns of the training frame.
# The matrix is computed once and reused for the plot; previously it was
# stored in `corr_matrix` and then recomputed from scratch inside the
# heatmap call.
corr_matrix = train_df.corr()

fig, ax = plt.subplots(1, 1, figsize=(15, 15))

sns.heatmap(corr_matrix,
            ax=ax,
            vmax=0.9,           # colour scale is capped at 0.9
            fmt='.2f',          # annotate each cell with two decimals
            annot=True,
            linewidths=0.01,
            linecolor='white',
            cmap="coolwarm")

plt.title("Correlation Heatmap for Train data")
plt.show()

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> Simple plotting </b>
</span></p>
</div>

In [None]:
# Overlay the train (green) and test (red) distribution of every column after
# the first three columns of the training frame, one panel per column.
features = train_df.iloc[:, 3:]

# A single 3x5 grid of panels. The previous extra `plt.figure()` call only
# produced an additional empty figure, so it is gone, and every panel is now
# addressed explicitly instead of through pyplot's "current axes" state.
fig, ax = plt.subplots(3, 5, figsize=(20, 15))
panels = ax.ravel()

# Iterating a DataFrame yields its column names.
for panel, feature in zip(panels, features):
    sns.distplot(train_df[feature],
                 color="green",
                 kde=True,
                 bins=30,
                 label='train data',
                 ax=panel)
    sns.distplot(test_df[feature],
                 color="red",
                 kde=True,
                 bins=30,
                 label='test data',
                 ax=panel)
    panel.set_ylabel("")
    panel.set_xlabel(feature, fontsize=9)
    panel.legend()

# The grid has more cells than plotted columns; hide the unused ones so they
# do not show up as empty frames.
for unused_panel in panels[features.shape[1]:]:
    unused_panel.set_visible(False)

plt.show()

In [None]:
import random

# Draw a random integer below the number of distinct subjects and show every
# training row whose subject equals it.
# NOTE(review): this treats subject ids as exactly 0..n-1 — confirm; if the ids
# had gaps, the drawn value could match no rows at all.
sample = random.randrange(train_df["subject"].nunique())
train_df_mask = train_df["subject"].eq(sample)
train_df.loc[train_df_mask]

In [None]:
# Histogram of the subject column (200 bins) together with the largest subject
# id; the cell result is the (axes, max id) pair.
subject_hist_axes = train_df["subject"].hist(bins=200)
largest_subject_id = train_df["subject"].unique().max()
(subject_hist_axes, largest_subject_id)

In [None]:
# Plain-text dump of the label frame followed by its dtype / non-null report.
print(train_label)
print("\n")
# `info()` prints its report itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
train_label.info()


In [None]:
# Histogram of the `state` column of the label frame.
train_label["state"].hist()

In [None]:
# subject, sequence plot
# Distribution of every sensor column within one randomly chosen recording.

# Pick a random row, then keep all rows that share its subject and sequence.
sample = random.randint(0, len(train_df)-1)

sample_1 = [train_df.iloc[sample].subject]
sample_2 = [train_df.iloc[sample].sequence]

# Despite its name this holds the selected rows, not a boolean mask.
train_df_mask = train_df[train_df.subject.isin(sample_1) & train_df.sequence.isin(sample_2)]

# Columns after the first three are the ones plotted.
features = train_df_mask.iloc[:,3:]

# A single 3x5 grid of panels. The previous extra `plt.figure()` call only
# produced an additional empty figure, so it is gone, and every panel is now
# addressed explicitly instead of through pyplot's "current axes" state.
fig, ax = plt.subplots(3, 5, figsize=(20, 20))
panels = ax.ravel()

# Iterating a DataFrame yields its column names.
for panel, feature in zip(panels, features):
    sns.distplot(train_df_mask[feature],
                 color="green",
                 kde=True,
                 bins=30,
                 label='train data',
                 ax=panel)
    panel.set_ylabel("")
    panel.set_xlabel(feature, fontsize=9)
    panel.legend()

# Hide the grid cells that received no plot.
for unused_panel in panels[features.shape[1]:]:
    unused_panel.set_visible(False)

plt.show()

In [None]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
output_notebook()

# Pick a random row and keep all rows that share its subject and sequence,
# i.e. one recording.
sample = random.randint(0, len(train_df) - 1)
sample_1 = [train_df.iloc[sample].subject]
sample_2 = [train_df.iloc[sample].sequence]
data = train_df[train_df.subject.isin(sample_1) & train_df.sequence.isin(sample_2)]
features = data.iloc[:,3:]

# The x values are the integer `step` column, so the default numeric axis is
# used. Declaring the axis as "datetime" made the step numbers be formatted as
# timestamps, which mislabelled the ticks.
plot = figure(title = "Sensor의 랜덤 분포",
              sizing_mode = "stretch_width")
plot.grid.grid_line_alpha=0.6
plot.xaxis.axis_label = 'step'
plot.yaxis.axis_label = 'sensor'

# One colour per sensor, in order sensor_00 .. sensor_12.
sensor_colors = [
    '#69d84f', '#FF4500', '#663399', '#191970', '#800000',
    '#FFD700', '#FF00FF', '#483D8B', '#DC143C', '#6495ED',
    '#8A2BE2', '#bcbd22', '#d62728',
]

# Draw one line per sensor column against the step column; the legend entry
# is the column name.
for sensor_index, sensor_color in enumerate(sensor_colors):
    sensor_column = f"sensor_{sensor_index:02d}"
    plot.line(data['step'], data[sensor_column],
              color=sensor_color, legend_label=sensor_column)

plot.legend.location = "center_left"
plot.legend.background_fill_alpha = 0.6

show(plot)

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> Data scaling </b>
</span></p>
</div>

In [None]:
# Remember the per-row sequence ids before the id columns are removed.
groups = train_df["sequence"]

# From here on `train_df` / `test_df` are plain NumPy arrays holding only the
# columns that remain after dropping sequence, subject and step.
id_columns = ["sequence", "subject", "step"]
train_df = train_df.drop(columns=id_columns).values
test_df = test_df.drop(columns=id_columns).values

# Keep only the target column of the label frame.
train_label = train_label["state"]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column with statistics learned from the training rows
# only, then apply the same transformation to the test rows.
sc = StandardScaler()
train_df = sc.fit_transform(train_df)
test_df = sc.transform(test_df)

# Convert to the layout expected by the recurrent layers:
# (number of sequences, 60 steps, 13 sensors).
# NOTE(review): assumes each run of 60 consecutive rows is one sequence in step
# order — confirm against the CSV ordering.
train_df = train_df.reshape(len(train_df) // 60, 60, 13)
test_df = test_df.reshape(len(test_df) // 60, 60, 13)


<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> Predicting using RNN </b>
</span></p>
</div>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, LSTM, GRU
from tensorflow.keras.layers import Bidirectional, Multiply


# Network input: one recording, shaped like the last two axes of the training
# array.
sequence_input = Input(shape=train_df.shape[-2:])

# First bidirectional LSTM; its full output sequence feeds two parallel
# branches, a bidirectional LSTM and a bidirectional GRU.
lstm_wide = Bidirectional(LSTM(512, return_sequences=True))(sequence_input)
lstm_branch = Bidirectional(LSTM(256, return_sequences=True))(lstm_wide)
gru_branch = Bidirectional(GRU(256, return_sequences=True))(lstm_wide)

# Join both branches along axis 2 and run one more bidirectional LSTM.
merged_branches = Concatenate(axis=2)([lstm_branch, gru_branch])
lstm_top = Bidirectional(LSTM(128, return_sequences=True))(merged_branches)

# Max-pool over the sequence, then a dense head ending in a single sigmoid
# unit.
pooled = GlobalMaxPooling1D()(lstm_top)
dense_hidden = Dense(128, activation='selu')(pooled)
state_probability = Dense(1, activation='sigmoid')(dense_hidden)

model = Model(inputs=sequence_input, outputs=state_probability, name='lstm_model')
model.summary()

In [None]:
# Binary classification set-up: Adam optimiser, binary cross-entropy loss and
# AUC as the tracked metric. The metric is named explicitly so its history key
# does not change when the cell is re-run.
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=[keras.metrics.AUC(name="auc")])

# Early stopping needs a patience smaller than the epoch budget to be able to
# act: with patience=10 and epochs=10 it could never fire before training
# ended.
# NOTE(review): on Keras versions that restore the best weights only when the
# callback itself stops training, the old setting also meant they were never
# restored — confirm for the installed version.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss",
                                               patience=3,
                                               restore_best_weights=True)

# Keep the returned History object so the training curves can be inspected
# later; it also stops the object's repr from being echoed as cell output.
# NOTE(review): `validation_split` holds out the trailing 30% of the sequences
# and does not group by subject — consider a subject-grouped split.
history = model.fit(train_df,
                    train_label,
                    validation_split=0.3,
                    epochs=10,
                    batch_size=512,
                    callbacks=[early_stopping])

In [None]:
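# NOTE: this snippet is kept commented out. The per-epoch curves live in the
# History object that Keras attaches to the model (model.history.history), not
# on the model itself, and the model is compiled with an AUC metric rather
# than accuracy. The 'auc' / 'val_auc' key names are an assumption — check
# hist.keys() before enabling.
#hist = model.history.history

# plot 1: Loss
#fig, ax = plt.subplots(2,1, figsize=(15, 10))
#ax[0].plot(hist['loss'], color='b', label="Training loss")
#ax[0].plot(hist['val_loss'], color='r', label="validation loss")
#legend = ax[0].legend(loc='best', shadow=True)

# plot 2: AUC
#ax[1].plot(hist['auc'], color='b', label="Training AUC")
#ax[1].plot(hist['val_auc'], color='r',label="Validation AUC")
#legend = ax[1].legend(loc='best', shadow=True)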

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> Submission </b>
</span></p>
</div>

In [None]:
# Run the trained model over the reshaped test array to get the predictions.
test_pred = model.predict(x=test_df)

In [None]:
# Store the model predictions in the "state" column of the sample-submission
# frame.
# NOTE(review): assumes the prediction rows are in the same order as the rows
# of the submission frame — confirm.
submission["state"] = test_pred
# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)
# Display the final frame as the cell result.
submission

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#D0F0C0;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="text-align:center;">
<span style="font-size:25px;">
<b> References </b>
</span></p>
</div>

* RNN
    1. https://ebbnflow.tistory.com/135
    2. https://www.tensorflow.org/guide/keras/rnn
    3. https://www.tensorflow.org/api_docs/python/tf/distribute/cluster_resolver/TPUClusterResolver
    4. https://excelsior-cjh.tistory.com/183
    5. https://sykflyinginthesky.tistory.com/31

* Markdown
    1. https://www.kaggle.com/code/shubhamksingh/create-beautiful-notebooks-formatting-tutorial/notebook#basics