# Analysis using Neural Networks
## Introduction
Thanks for stopping by.
This notebook shows PyTorch beginners how to build and train a simple neural network.  

In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
from tqdm import tqdm
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Active device：', device)

## Preprocessing
### Data loading
Load data by using pandas.

In [None]:
# Read the three competition CSV files in a fixed order:
# training sensor rows, test sensor rows, then the training labels.
train_df, test_df, train_label_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/test.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
    )
)

Check the shape and content of the data.

In [None]:
# Report (rows, columns), then preview the first 70 rows of the raw training frame.
train_shape = train_df.shape
print(train_shape)
train_df.head(n=70)

In [None]:
# Report (rows, columns), then preview the first 10 rows of the label frame.
label_shape = train_label_df.shape
print(label_shape)
train_label_df.head(n=10)

In [None]:
# Report (rows, columns), then preview the first 10 rows of the raw test frame.
test_shape = test_df.shape
print(test_shape)
test_df.head(n=10)

### Exploratory Data Analysis


Use pandas' describe function to check information on training data  
*   sensor_02, 03, 08, 12 have a median value of 0
*   Most sensors have first and third quartiles and min, max values far apart

In [None]:
# Summary statistics for the sensor columns; the label-based slice includes
# both 'sensor_00' and 'sensor_12'.
sensor_columns = train_df.loc[:, 'sensor_00':'sensor_12']
sensor_columns.describe()

Combine the training and test data so that both can be preprocessed in one pass.

In [None]:
# Remember how many raw training rows there are so the combined frame can be
# separated into train and test parts again later.
ntrain = len(train_df)
# Stack train on top of test. The original row labels are kept (no index reset).
all_data = pd.concat([train_df, test_df])
print(all_data.shape)
all_data.head()

Check data type

In [None]:
# Print the column names, non-null counts and dtypes of the combined frame.
all_data.info()

Calculate the first-order difference of each sensor between consecutive steps within a sequence.

In [None]:
# Add a first-order difference column (`<name>_diff1`) for every feature column.
#
# Everything after the first three columns is treated as a feature
# (presumably the first three are sequence / subject / step -- TODO confirm).
# Columns that already are differences are skipped, so re-running this cell
# recomputes the same `_diff1` columns instead of stacking `_diff1_diff1` ones.
features = [col for col in all_data.columns.tolist()[3:] if not col.endswith('_diff1')]

# Fill any pre-existing gaps with 0 once, up front, instead of re-filling the
# whole frame on every loop iteration.
all_data = all_data.fillna(0)

for feature in features:
    # Value of the previous step inside the same sequence. The first step of a
    # sequence has no predecessor; it is differenced against 0, so its diff
    # equals the raw value.
    previous_step = all_data.groupby('sequence')[feature].shift(1).fillna(0)
    all_data[feature + '_diff1'] = all_data[feature] - previous_step
all_data.head()

Aggregate data by sequence.
Obtain the mean, standard deviation, maximum, minimum, sum, median, first quartile and third quartile for each sequence.

In [None]:
# Collapse every sequence into a single row of summary statistics.
# Each statistic contributes one column per feature, named `<feature><suffix>`.
features = all_data.columns[3:]
print(features)

# Group once and reuse the grouping for every statistic.
by_sequence = all_data.groupby('sequence')

# The mean block keeps `subject` and `step` (un-suffixed) as its leading
# columns; every other statistic drops them so they appear only once.
summary_parts = [by_sequence.mean().rename(columns={s: s + '_mean' for s in features})]

# (column suffix, callable producing the per-sequence statistic), in output order.
other_statistics = [
    ('_std', by_sequence.std),
    ('_max', by_sequence.max),
    ('_min', by_sequence.min),
    ('_sum', by_sequence.sum),
    ('_medi', by_sequence.median),
    ('_quantile1', lambda: by_sequence.quantile(0.25)),   # first quartile
    ('_quantile3', lambda: by_sequence.quantile(0.75)),   # third quartile
]
for suffix, compute in other_statistics:
    stat_seq = compute().drop(['step', 'subject'], axis=1)
    summary_parts.append(stat_seq.rename(columns={s: s + suffix for s in features}))

# All parts share the same `sequence` index, so a single column-wise concat
# lines them up row by row.
all_data_summ = pd.concat(summary_parts, axis=1)
print(all_data_summ.columns.tolist())
all_data_summ.head()

In [None]:
# Print the column names, non-null counts and dtypes of the per-sequence summary frame.
all_data_summ.info()

Check missing values

In [None]:
# Share of missing values per aggregated column, in percent.
# The denominator is the row count of the aggregated frame itself; dividing by
# len(all_data) (the un-aggregated frame) would understate the ratio.
all_data_na = (all_data_summ.isnull().sum() / len(all_data_summ)) * 100
# Keep only columns that actually have gaps, worst first, capped at 30 columns.
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False).iloc[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(22)

Split the aggregated data back into training and test sets.

In [None]:
# Split the per-sequence summary back into its training and test parts.
# `ntrain` counts raw training rows, and every sequence is taken to contribute
# exactly STEPS_PER_SEQUENCE raw rows, so ntrain // STEPS_PER_SEQUENCE is the
# number of training sequences (= summary rows).
# NOTE(review): this also assumes all training sequence ids sort before the
# test ids in the grouped frame -- confirm against the data.
STEPS_PER_SEQUENCE = 60
assert ntrain % STEPS_PER_SEQUENCE == 0, 'training rows are not a whole number of sequences'
n_train_sequences = ntrain // STEPS_PER_SEQUENCE

train_df = all_data_summ.iloc[:n_train_sequences]
test_df = all_data_summ.iloc[n_train_sequences:]
# The labels are paired with these rows by position later on, so the counts must agree.
assert len(train_df) == len(train_label_df), 'summary rows and labels are misaligned'
print(train_df.shape, test_df.shape)
train_df.head()

## Training
### Split data
Split the training data into a training set and a hold-out test set using `train_test_split` from scikit-learn.

In [None]:
# Hold out 20% of the labelled sequences for a final check of the ensemble.
test_size = 0.20
# Skip the first two summary columns (presumably `subject` and `step`, which
# the aggregation keeps un-suffixed -- verify) and use the rest as features.
features = train_df.columns[2:]
X, y = train_df[features].values, train_label_df['state'].values
# Stratify on the label so both parts keep the same class balance, and fix the
# seed so the hold-out set is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=y, random_state=1
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

### Define Model
This section defines the model.
The three hidden layers of the network have 128, 64, and 32 nodes, respectively.  
Also, `fit` and `predict` methods were added to this class.

In [None]:
# Widths of the three hidden layers
layer_2 = 128
layer_3 = 64
layer_4 = 32


class Net(nn.Module):
    """Fully connected binary classifier with a scikit-learn-like fit/predict API.

    Architecture: three hidden blocks (Linear -> ReLU -> BatchNorm1d -> Dropout)
    of widths ``layer_2``, ``layer_3`` and ``layer_4``, followed by a single
    sigmoid output unit.

    The instance owns a ``StandardScaler`` that is fitted in :meth:`fit` and
    re-applied in :meth:`predict`, so callers pass unscaled feature arrays.

    Attributes:
        stdsc: scaler fitted on the training features.
        logs_train: ``[epochs, losses]``; ``losses`` starts with ``np.inf`` as a
            sentinel, so ``losses[1:]`` lines up with ``epochs``.
        logs_valid: same layout as ``logs_train``; not written by this class.
    """

    def __init__(self, input_size):
        """Build the layers for inputs with ``input_size`` features."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, layer_2)
        self.bn1 = nn.BatchNorm1d(layer_2)
        self.fc2 = nn.Linear(layer_2, layer_3)
        self.bn2 = nn.BatchNorm1d(layer_3)
        self.fc3 = nn.Linear(layer_3, layer_4)
        self.bn3 = nn.BatchNorm1d(layer_4)
        self.fc4 = nn.Linear(layer_4, 1)
        self.dropout = nn.Dropout(0.4)
        self.stdsc = StandardScaler()
        self.logs_train = [[], [np.inf]]
        self.logs_valid = [[], [np.inf]]

    def forward(self, x):
        """Map a ``(batch, input_size)`` float tensor to ``(batch, 1)`` values in (0, 1)."""
        x = self.bn1(F.relu(self.fc1(x)))
        x = self.dropout(x)
        x = self.bn2(F.relu(self.fc2(x)))
        x = self.dropout(x)
        x = self.bn3(F.relu(self.fc3(x)))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc4(x))
        return x

    def fit(self, X, y, num_epochs=100, batch_size=1080, checkpoint_path='./models'):
        """Train the network and checkpoint the best epoch.

        Args:
            X: 2-D array-like of unscaled features, one row per sample.
            y: array-like of targets, one per row of ``X`` (flattened to 1-D).
            num_epochs: number of passes over the data.
            batch_size: mini-batch size.
            checkpoint_path: file that receives ``state_dict()`` whenever an
                epoch's mean training loss beats every earlier epoch.

        Side effects:
            Fits ``self.stdsc``, moves the model to the GPU when one is
            available, leaves the model in training mode, appends to
            ``self.logs_train`` and writes ``checkpoint_path``.
        """
        # check whether GPU is available
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        print('Active dvice：', device)

        # Standardise the features; the fitted scaler is reused by predict().
        X = self.stdsc.fit_transform(X)
        dataset = torch.utils.data.TensorDataset(
            torch.as_tensor(X, dtype=torch.float32),
            # Flatten so targets always match the flattened network output.
            torch.as_tensor(y, dtype=torch.float32).reshape(-1),
        )
        # shuffle=True draws a fresh random permutation of all rows each epoch.
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

        optimizer = torch.optim.Adagrad(self.parameters(), lr=0.01)
        # NOTE(review): MSE on sigmoid outputs is kept from the original;
        # nn.BCELoss is the usual choice for binary targets -- consider switching.
        criterion = nn.MSELoss()

        self.to(device)
        self.train()
        # Let cuDNN auto-tune its kernels; input shapes are fixed during training.
        torch.backends.cudnn.benchmark = True

        for epoch in tqdm(range(num_epochs)):
            loss_sum = 0.0   # sum of per-sample losses seen this epoch
            n_seen = 0       # number of samples that contributed
            for data, targets in dataloader:
                # BatchNorm1d cannot compute batch statistics from a single
                # row in training mode, so a trailing 1-row batch is skipped.
                if data.size(0) < 2:
                    continue
                data = data.to(device)
                targets = targets.to(device)
                optimizer.zero_grad()
                output = self(data).view(-1)
                loss = criterion(output, targets)
                loss.backward()
                optimizer.step()
                # criterion returns the batch mean; weight it by the batch
                # size so the epoch value is a true per-sample mean.
                loss_sum += loss.item() * data.size(0)
                n_seen += data.size(0)

            epoch_loss = loss_sum / max(n_seen, 1)
            # Checkpoint whenever this epoch is the best so far (the np.inf
            # sentinel guarantees the first epoch is always saved).
            if epoch_loss < min(self.logs_train[1]):
                torch.save(self.state_dict(), checkpoint_path)
            self.logs_train[0].append(epoch + 1)
            self.logs_train[1].append(epoch_loss)

    def predict(self, X, batch_size=1000):
        """Return predicted scores for ``X`` as a float64 array of shape ``(n, 1)``.

        ``X`` is scaled with the scaler fitted in :meth:`fit`. The model is
        switched to evaluation mode (and left there), and inference runs on
        whatever device the model's parameters currently live on.
        """
        X = self.stdsc.transform(X)
        inputs = torch.as_tensor(X, dtype=torch.float32)
        model_device = next(self.parameters()).device
        # Evaluation mode: dropout off, BatchNorm uses its running statistics.
        self.eval()
        chunks = []
        with torch.no_grad():
            for start in range(0, len(inputs), batch_size):
                batch = inputs[start:start + batch_size].to(model_device)
                chunks.append(self(batch).reshape(-1).cpu().numpy())
        if not chunks:
            return np.empty((0, 1))
        return np.concatenate(chunks).astype(np.float64).reshape(-1, 1)

    def loss_prot(self):
        """Plot the training loss recorded by :meth:`fit` against the epoch number."""
        # Skip the leading np.inf sentinel so losses line up with the epochs.
        plt.plot(self.logs_train[0], self.logs_train[1][1:], '-b')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.show()

    # Correctly spelled alias; `loss_prot` is kept for existing callers.
    loss_plot = loss_prot
        
###########  check operation　###############
def check_operation(n_samples=1000, n_features=13, batch_size=32):
    """Smoke test: push one random mini-batch through a fresh Net and print the shapes.

    Everything lives in local variables so the notebook-level `X`, `y`,
    `dataloader` and `model` are not overwritten by throw-away random data.
    """
    demo_X = np.random.rand(n_samples, n_features)
    demo_y = np.random.rand(n_samples, 1)
    demo_dataset = torch.utils.data.TensorDataset(torch.Tensor(demo_X), torch.tensor(demo_y))
    demo_loader = torch.utils.data.DataLoader(demo_dataset, batch_size=batch_size)
    demo_model = Net(input_size=n_features)
    # Only the first batch is needed to confirm the forward pass works.
    batch_X, batch_y = next(iter(demo_loader))
    print('---------')
    print(batch_X.shape, batch_y.shape)
    print(demo_model(batch_X.to(torch.float32)).shape)


check_operation()
######################################

In [None]:
### parameters
k_split = 10
num_epochs = 200
batch_size = 512
# Stratified k-fold cross-validation; the number of splits and the seed are fixed.
splitter = StratifiedKFold(n_splits=k_split, shuffle=True, random_state=1)
scores = []   # validation AUC of every fold
models = []   # trained model of every fold
for k, (train_idx, valid_idx) in enumerate(splitter.split(X_train, y_train), start=1):
    # Fresh model per fold; fit() moves it to the active device itself.
    model = Net(X_train.shape[1])
    model.fit(X_train[train_idx], y_train[train_idx], num_epochs=num_epochs, batch_size=batch_size)
    # Restore the checkpoint fit() wrote for its lowest-training-loss epoch.
    model.load_state_dict(torch.load('models'))
    # Score the fold's validation part on the CPU.
    pred_y_k = model.to('cpu').predict(X_train[valid_idx])
    score = roc_auc_score(y_train[valid_idx], pred_y_k)
    print('Fold: %2d, AUC: %.3f' % (k, score))
    scores.append(score)
    models.append(model)
    model.loss_prot()
# The reported metric is ROC AUC, so label it as such.
print('\nCV AUC: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

### Evaluated with test data
Evaluate the model you created using the test data you just created.

Now that we have created k models, we need to create a predict function for them.

In [None]:
# predict by k-fold
def predict(models, X_test):
    """Average the predictions of several fitted models.

    Each model's ``predict(X_test)`` result is flattened into one column of a
    ``(n_samples, n_models)`` matrix; the row-wise mean is returned as a 1-D
    array of length ``n_samples``.
    """
    n_samples, n_models = len(X_test), len(models)
    fold_matrix = np.zeros((n_samples, n_models))
    for column, estimator in enumerate(models):
        # One column per model, rows in the same order as X_test.
        fold_matrix[:, column] = estimator.predict(X_test).reshape(-1)
    return fold_matrix.mean(axis=1)
# Ensemble prediction for the hold-out split; show the first ten scores.
y_pred = predict(models=models, X_test=X_test)
print(y_pred[0:10])

Evaluate the predictions by checking the AUC and the ROC curve.

In [None]:
# ROC curve and AUC of the ensemble on the hold-out split.
holdout_auc = roc_auc_score(y_test, y_pred)
roc = roc_curve(y_test, y_pred)
print("roc", holdout_auc)
# roc_curve returns (false-positive rates, true-positive rates, thresholds).
fpr, tpr, thresholds = roc
plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

In [None]:
# Same check on the data the folds were trained on, for comparison with the hold-out curve.
y_pred_train = predict(models, X_train)
train_auc = roc_auc_score(y_train, y_pred_train)
roc = roc_curve(y_train, y_pred_train)
print("roc", train_auc)
# roc_curve returns (false-positive rates, true-positive rates, thresholds).
fpr, tpr, thresholds = roc
plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

# Submission

In [None]:
# Creation of files for submission
X_submit = test_df[features].values
y_submit = predict(models, X_submit)
print(y_submit[:10])
plt.hist(y_submit, bins=40, density=True)
plt.show()

Load sample_submission.csv.

In [None]:
# Use the competition's sample submission as the template for the output file.
sample_submission_path = '../input/tabular-playground-series-apr-2022/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)
submission_df.head()

In [None]:
# Write the ensemble scores into the template by position.
# Assigning the array directly avoids index alignment, which would silently
# insert NaN if the two row counts ever disagreed; check the counts explicitly.
assert len(y_submit) == len(submission_df), 'prediction count does not match the submission template'
submission_df['state'] = y_submit
submission_df.head()

In [None]:
# Write the submission file: header row kept, index column omitted.
submission_path = "submission.csv"
submission_df.to_csv(submission_path, header=True, index=False)