# Analysis by LSTM - Using Pytorch
This notebook trains an LSTM classifier using PyTorch.

In [None]:
# Import the required modules
import numpy as np
import pandas as pd
import warnings
import time
warnings.simplefilter('ignore')
import math
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Devices to be used：：', device)

# Preprocessing
## Loading Data
First, load the data with pandas and check its contents.

In [None]:
# Read the three competition CSV files: training sensor rows, test sensor rows,
# and the per-sequence training labels.
train_df, test_df, train_label_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/test.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
    )
)

In [None]:
# Inspect the training frame: print its dimensions, then display the first 70 rows
# (more than one 60-step recording, so a sequence boundary is visible).
print(train_df.shape)
train_df.head(n=70)

In [None]:
# Inspect the label frame: print its dimensions, then display the first 10 rows.
print(train_label_df.shape)
train_label_df.head(n=10)

In [None]:
# Inspect the test frame: print its dimensions, then display the first 10 rows.
print(test_df.shape)
test_df.head(n=10)

**Data Summary**
***
**train.csv**...the training set, comprising ~26,000 60-second recordings of thirteen biological sensors for almost one thousand experimental participants  
*   sequence - a unique id for each sequence
*   subject - a unique id for the subject in the experiment
*   step - time step of the recording, in one second intervals
*   sensor_00 - sensor_12 - the value for each of the thirteen sensors at that time step  

**train_labels.csv**...the class label for each sequence.  
*   sequence - the unique id for each sequence.
*   state - the state associated to each sequence. This is the target which you are trying to predict.  

**test.csv**...the test set. For each of the ~12,000 sequences, you should predict a value for that sequence's state.  
***
Each sequence contains 60 seconds of sensor data, and the state (0 or 1) of that sequence is predicted from those values.

## Data Analysis
Let's look at the information on the parameters of each sensor and see how they relate to STATE.  
First, check the summary statistics of each sensor using `describe()`.
*   Four sensors have a median of zero (sensor_02, sensor_03, sensor_08, sensor_12)
*   Some of the quartile ranges and maximum-minimum ranges are very different (are there skipped values?)
*   The average value is near zero overall.

In [None]:
# Summary statistics restricted to the sensor columns (sensor_00 through sensor_12).
sensor_readings = train_df.loc[:, 'sensor_00':'sensor_12']
sensor_readings.describe()

Use Seaborn to check the distribution of the sensor values for the first 100 sequences (6,000 rows).
*   There are quite a few outliers.
*   Histograms are nicely distributed.

In [None]:
sns.set()
# Skip the first three columns (presumably the sequence / subject / step identifiers)
# and plot only the remaining sensor columns.
cols = train_df.columns[3:]
# Only the first 60*100 rows are plotted to keep the pair plot tractable.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
sns.pairplot(train_df[:60*100][cols], height=3)
plt.show()

## Preprocessing
Combine the training and test data so that both can be checked in a single pass.

In [None]:
# Remember how many rows belong to the training set so the frames can be separated again.
ntrain = len(train_df)
# Stack training rows on top of test rows; the original row labels are kept (no index reset).
all_data = pd.concat([train_df, test_df])
print(all_data.shape)
all_data.head()

Check for missing values.
*   No missing values

In [None]:
# Percentage of missing entries per column of the combined frame.
missing_counts = all_data.isnull().sum()
all_data_na = (missing_counts / len(all_data)) * 100
# Keep only columns that actually have missing values, worst first, at most 30 of them.
all_data_na = all_data_na[all_data_na != 0].sort_values(ascending=False).head(30)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(22)

Split the combined frame back into training and test data.

In [None]:
# The first `ntrain` rows (by position) are the training rows; everything after is test data.
train_df = all_data.iloc[:ntrain]
test_df = all_data.iloc[ntrain:]
print(train_df.shape, test_df.shape)
train_df.head()

# Training
## Splitting data
Split data for training models and for validation of model accuracy  
In this case, training data for model: validation data for accuracy = 8 : 2

In [None]:
# Row-level split: take 80% of the rows, rounded down to a multiple of 60 so that
# no 60-row sequence is cut in half.
length = len(train_df)
rows_at_80_percent = length * 0.8
train_size = int(rows_at_80_percent) - int(rows_at_80_percent % 60)
test_size = length - train_size

# Label-level split: 80% of the per-sequence labels.
length_y = len(train_label_df)
train_size_y = int(length_y * 0.8)
test_size_y = length_y - train_size_y

# Leading part is used for model training, trailing part is the hold-out set.
X_train = train_df[:train_size]
X_test = train_df[train_size:]
y_train = train_label_df[:train_size_y]
y_test = train_label_df[train_size_y:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
X_train.head()

## Model Creation
The model is an LSTM (long short-term memory) network.
First, create a class that creates a dataset that can be read by pytorch

In [None]:
from torch.utils.data.sampler import SubsetRandomSampler
class MyDataset(torch.utils.data.Dataset):
    """Dataset that serves whole sequences out of a flat, row-per-time-step array.

    Items are addressed by an array of row indices (one entry per time step of a
    sequence), as produced by the sampler built in ``create_dataset``.

    Args:
        X: Array-like indexed by an array of row positions; one row per time step.
        sequence_num: Number of consecutive rows that make up one sequence.
        y: Optional labels, looked up by sequence number. Required when
            ``mode == 'train'``.
        mode: ``'train'`` returns ``(data, label)`` pairs; any other value
            returns only the data.
    """

    def __init__(self, X, sequence_num, y=None, mode='train'):
        self.data = X
        self.teacher = y
        self.sequence_num = sequence_num
        self.mode = mode

    def __len__(self):
        """Return the number of labels, or the number of whole sequences when unlabeled."""
        if self.teacher is not None:
            return len(self.teacher)
        # Without labels (prediction mode) there is nothing to take len() of,
        # so fall back to the count of complete sequences in the data.
        return len(self.data) // self.sequence_num

    def __getitem__(self, idx):
        """Return the rows selected by ``idx`` and, in train mode, their label.

        ``idx`` is expected to be an array of row indices belonging to a single
        sequence; the label is looked up from the sequence number of its first row.
        """
        out_data = self.data[idx]
        if self.mode == 'train':
            out_label = self.teacher[idx[0] // self.sequence_num]
            return out_data, out_label
        return out_data
def create_dataset(dataset, dataset_num, sequence_num, input_size, batch_size, shuffle=False):
    """Build a DataLoader whose items are whole sequences of consecutive rows.

    Args:
        dataset: Object indexable by an array of row indices (e.g. ``MyDataset``).
        dataset_num: Total number of rows available.
        sequence_num: Rows per sequence; trailing rows that do not fill a whole
            sequence are ignored.
        input_size: Unused; kept for call compatibility.
        batch_size: Number of sequences per batch.
        shuffle: When equal to ``True``, the order of the sequences is shuffled
            once with ``np.random.shuffle`` before the loader is built.

    Returns:
        A ``DataLoader`` that yields batches of sequences.
    """
    n_sequences = dataset_num // sequence_num
    # Row i of the sampler holds the row indices of the i-th sequence.
    sampler = np.arange(n_sequences * sequence_num).reshape(n_sequences, sequence_num)
    if shuffle == True:
        np.random.shuffle(sampler)
    return DataLoader(dataset, batch_size, sampler=sampler)

###########  operation check  ###############
# Smoke test on random data: 1000 sequences of 60 steps with 13 features each.
# `sequence_num` defined here is also reused by later cells.
sequence_num = 60
X = np.random.rand(60*1000, 13)
y = np.random.rand(60*1000, 1)

dataset = MyDataset(X, sequence_num, y=y)
n_rows, n_features = X.shape
dataloader = create_dataset(dataset, n_rows, sequence_num, n_features, 32)
# Pull just the first batch and show the shapes of its data and label tensors.
first_batch = next(iter(dataloader))
print('---------')
print(first_batch[0].shape, first_batch[1].shape)
print(X[-2], y[-1])
############################################

In [None]:
class LSTM(nn.Module):
    """LSTM-based binary classifier for fixed-length sequences.

    The network is a stacked LSTM whose outputs at every time step are
    flattened and passed through three linear layers (with leaky-ReLU,
    batch-norm and dropout in between), ending in a sigmoid.

    Args:
        input_size: Number of features per time step.
        sequence_num: Number of time steps per sequence; fixes the input width
            of the first linear layer.
        lstm_dim1: Hidden size of the LSTM.
        lstm_dim2: Unused; kept for call compatibility.
        lstm_dim3: Unused; kept for call compatibility.
        num_layers: Number of stacked LSTM layers.
        output_size: Unused; the final layer always has one output.
        batch_size: Default batch size used by ``predict`` until ``fit`` is
            called with a different one.
    """

    def __init__(self, input_size=5, sequence_num=60, lstm_dim1=48, lstm_dim2=128, lstm_dim3=256,
                 num_layers=2, output_size=1, batch_size = 32):
        super().__init__()
        self.batch_size = batch_size

        self.lstm1 = nn.LSTM(input_size, lstm_dim1, num_layers, batch_first=True)

        # The head consumes the LSTM output of *all* time steps, flattened.
        self.linear1 = nn.Linear(lstm_dim1*sequence_num, 96)
        self.bn1 = nn.BatchNorm1d(96)
        self.linear2 = nn.Linear(96, 16)
        self.bn2 = nn.BatchNorm1d(16)
        self.linear3 = nn.Linear(16, 1)
        self.dropout = nn.Dropout(0.3899990603626676)
        # logs_*[0] holds epoch numbers, logs_*[1] the per-epoch loss; the leading
        # inf makes the "best loss so far" comparison in fit() work on the first epoch.
        self.logs_train = [[], [np.inf]]
        self.logs_valid = [[], [np.inf]]
        # Feature scaler fitted in fit() and reused in predict().
        self.stdsc = StandardScaler()
        self.stdsc_y = StandardScaler()

    def forward(self, x):
        """Map a batch of shape (batch, sequence_num, input_size) to (batch, 1) scores in [0, 1]."""
        lstm_out, _ = self.lstm1(x)
        x = lstm_out.reshape(lstm_out.shape[0], -1)
        x = self.bn1(F.leaky_relu(self.linear1(x)))
        x = self.dropout(x)
        x = self.bn2(F.leaky_relu(self.linear2(x)))
        x = self.dropout(x)
        x = torch.sigmoid(self.linear3(x))
        return x

    def fit(self, X, y, num_epochs=50, sequence_num=60, batch_size=32):
        """Train the network and checkpoint the weights with the lowest training loss.

        Args:
            X: 2-D array of shape (n_rows, n_features), rows grouped into
                consecutive sequences of ``sequence_num`` rows.
            y: Labels indexed by sequence number.
            num_epochs: Number of passes over the data.
            sequence_num: Rows per sequence.
            batch_size: Sequences per mini-batch; also remembered as the default
                batch size for ``predict``.

        Side effects:
            Fits ``self.stdsc`` on ``X``, appends to ``self.logs_train`` and
            writes the best state dict to ``./models``.
        """
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # Standardize the features; the fitted scaler is reused at prediction time.
        X = self.stdsc.fit_transform(X)
        num_train = len(X)
        # Remember the batch size so predict() does not depend on a notebook global.
        self.batch_size = batch_size
        dataset = MyDataset(X, y=y, sequence_num=sequence_num, mode='train')
        dataloader = create_dataset(dataset, X.shape[0], sequence_num, X.shape[1], batch_size)

        # Optimizer and loss.
        lr = 0.028333303850396258
        weight_decay = 1.0078357791694276e-05
        optimizer = torch.optim.Adagrad(self.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.MSELoss()

        self.to(device)
        self.train()
        # Let cuDNN auto-tune its kernels: input shapes are fixed across batches.
        torch.backends.cudnn.benchmark = True

        for epoch in tqdm(range(num_epochs)):
            epoch_loss = 0.0
            for data, targets in dataloader:
                data = data.to(device, dtype=torch.float32)
                # Flatten so (batch, 1)-shaped labels cannot broadcast against the
                # flat output inside the loss.
                targets = targets.to(device, dtype=torch.float32).reshape(-1)
                optimizer.zero_grad()
                output = self(data).reshape(-1)
                loss = criterion(output, targets)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            # Summed batch losses scaled by the number of rows (a relative measure
            # used only to compare epochs with each other).
            scaled_loss = epoch_loss / num_train
            # Checkpoint whenever this epoch beats every previous one.
            if scaled_loss < min(self.logs_train[1]):
                torch.save(self.state_dict(), './models')
            self.logs_train[0].append(epoch + 1)
            self.logs_train[1].append(scaled_loss)

    def predict(self, X, sequence_num=60, batch_size=None):
        """Return one score per sequence for the rows in ``X``.

        Args:
            X: 2-D array of shape (n_rows, n_features); scaled with the scaler
                fitted in ``fit``.
            sequence_num: Rows per sequence.
            batch_size: Sequences per batch; defaults to ``self.batch_size``.

        Returns:
            1-D float64 NumPy array with one prediction per complete sequence.
        """
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        if batch_size is None:
            batch_size = self.batch_size
        X = self.stdsc.transform(X)
        dataset = MyDataset(X, sequence_num=sequence_num, mode='valid')
        valid_loader = create_dataset(dataset, X.shape[0], sequence_num, X.shape[1], batch_size)

        self.to(device)
        # Inference must run in eval mode (dropout off, batch-norm using running
        # statistics) and without autograd; the previous mode is restored afterwards.
        was_training = self.training
        self.eval()
        batch_preds = []
        try:
            with torch.no_grad():
                for data in valid_loader:
                    data = data.to(device, dtype=torch.float32)
                    output = self(data)
                    batch_preds.append(output.reshape(-1).cpu().numpy())
        finally:
            self.train(was_training)
        if not batch_preds:
            return np.array([])
        return np.concatenate(batch_preds).astype(np.float64)

###########  operation check  ###############
# Smoke test: push one random batch (32 sequences x 60 steps x 7 features)
# through a freshly constructed model and display the output.
model = LSTM(input_size=7)
data = torch.from_numpy(np.random.rand(32, 60, 7).astype(np.float32)).clone()
model(data)
#############################################

In [None]:
k_fold = 10
valid_per = 1/k_fold
epoch = 200
batch_size = 256
# Every column after the first three (presumably the sequence / subject / step
# identifiers) is fed to the model.
categorical_columns = X_train.columns[3:]

# Fold sizes are the same for every fold, so compute them once.
# The row count is rounded down to a multiple of sequence_num so that no
# sequence is split between the training and validation part of a fold.
length = len(X_train)
valid_size = int(length * valid_per) - int(length * valid_per % sequence_num)
train_size = length - valid_size
length_y = len(y_train)
valid_size_y = int(length_y * valid_per)
train_size_y = length_y - valid_size_y
# Rows and labels are sliced independently below; they only stay aligned if a
# fold covers exactly sequence_num rows per label.
assert valid_size == valid_size_y * sequence_num, 'row folds and label folds are misaligned'

scores = []
models = []
for k in range(k_fold):
    ### Hold out the k-th contiguous block of rows / labels and train on the rest
    row_start, row_stop = k*valid_size, (k+1)*valid_size
    label_start, label_stop = k*valid_size_y, (k+1)*valid_size_y
    X_train_k, X_test_k = X_train.drop(X_train.index[row_start:row_stop]), X_train[row_start:row_stop]
    y_train_k, y_test_k = y_train.drop(y_train.index[label_start:label_stop]), y_train[label_start:label_stop]
    #### Convert to plain arrays
    X_train_np = X_train_k[categorical_columns].values
    y_train_np = y_train_k['state'].values
    X_test_np = X_test_k[categorical_columns].values
    y_test_np = y_test_k['state'].values
    print(X_train_np.shape, y_train_np.shape)
    print(X_test_np.shape, y_test_np.shape)
    ### Fresh model for this fold
    model_k = LSTM(input_size=len(categorical_columns))
    model_k.to(device)
    ### Train
    model_k.fit(X_train_np, y_train_np, num_epochs=epoch, sequence_num=sequence_num, batch_size=batch_size)
    ## Reload the checkpoint that fit() wrote for its best epoch, then score the held-out fold
    model_k.load_state_dict(torch.load('models'))
    pred_k = model_k.predict(X_test_np, sequence_num=sequence_num)
    score_k = roc_auc_score(y_test_np, pred_k)
    print('Fold: %2d, AUC: %.3f' % (k+1, score_k))
    roc = roc_curve(y_test_np, pred_k)
    fpr, tpr, thresholds = roc
    # Training-loss curve (the first log entry is the inf placeholder, so skip it)
    plt.plot(model_k.logs_train[0], model_k.logs_train[1][1:])
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.show()

    # ROC curve of this fold
    plt.plot(fpr, tpr, marker='o')
    plt.xlabel('FPR: False positive rate')
    plt.ylabel('TPR: True positive rate')
    plt.grid()
    plt.show()
    scores.append(score_k)
    models.append(model_k)
# The collected scores are ROC AUC values, so label the summary accordingly.
print('\nCV AUC: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
def predict(models, X_test, sequence_num=60):
    """Average the per-sequence predictions of several models.

    Args:
        models: Non-empty sequence of objects exposing
            ``predict(X, sequence_num=...)`` that return one value per sequence.
        X_test: 2-D array with one row per time step, grouped into consecutive
            sequences of ``sequence_num`` rows.
        sequence_num: Rows per sequence (defaults to 60).

    Returns:
        1-D NumPy array with the mean prediction for each sequence.

    Raises:
        ValueError: If ``models`` is empty (the mean would otherwise be NaN).
    """
    if len(models) == 0:
        raise ValueError('predict() needs at least one model')
    n_sequences = len(X_test) // sequence_num
    # One column per model, one row per sequence.
    y_pred = np.zeros((n_sequences, len(models)))
    for fold_, model in enumerate(models):
        pred_ = model.predict(X_test, sequence_num=sequence_num)
        y_pred[:, fold_] = np.asarray(pred_).reshape(-1)
    return y_pred.mean(axis=1)
# Hold-out split as plain arrays: model input columns and the `state` labels.
X_test_np = X_test.loc[:, categorical_columns].values
y_test_np = y_test.loc[:, 'state'].values
# Ensemble prediction of all fold models on the hold-out sequences.
y_pred = predict(models, X_test_np)
print(y_pred[:10])

In [None]:
# Clamp the predictions to the [0, 1] interval in place.
np.clip(y_pred, 0, 1, out=y_pred)
# ROC curve and AUC of the ensemble on the hold-out split.
roc = roc_curve(y_test_np, y_pred)
holdout_auc = roc_auc_score(y_test_np, y_pred)
print("roc", holdout_auc)
fpr, tpr, thresholds = roc
plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

In [None]:
# Build the inputs from the training split itself rather than relying on
# X_train_np / y_train_np, which at this point only hold whatever the last
# cross-validation fold left behind.
X_train_full_np = X_train[categorical_columns].values
y_train_full_np = y_train['state'].values
y_pred_train = predict(models, X_train_full_np)
# ROC curve and AUC of the ensemble on the data it was trained on.
roc = roc_curve(y_train_full_np, y_pred_train)
print("roc", roc_auc_score(y_train_full_np, y_pred_train))
fpr, tpr, thresholds = roc
plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

# Submission Data Preparation

In [None]:
# Model input columns of the competition test set, as a plain array.
X_submit = test_df.loc[:, categorical_columns].values
# Ensemble prediction: one averaged score per test sequence.
y_submit = predict(models, X_submit)
print(y_submit.shape)
# Normalised histogram of the predicted scores.
plt.hist(y_submit, density=True, bins=30)
plt.show()

In [None]:
# Load the sample submission, which provides the row layout for the final file.
submission_df = pd.read_csv(
    '../input/tabular-playground-series-apr-2022/sample_submission.csv'
)
print(submission_df.shape)
submission_df.head(n=5)

In [None]:
# Assign the raw prediction array instead of wrapping it in a DataFrame: a
# DataFrame is aligned on the index, so a row-count mismatch would silently
# produce NaN values rather than an error.
# NOTE(review): assumes the sequences in test_df appear in the same order as
# the rows of the sample submission — confirm.
if len(y_submit) != len(submission_df):
    raise ValueError(
        'Prediction count (%d) does not match submission rows (%d)'
        % (len(y_submit), len(submission_df))
    )
submission_df['state'] = y_submit
submission_df.head()

In [None]:
# Write the submission file with a header row and without the DataFrame index.
submission_df.to_csv("submission.csv", header=True, index=False)