# Tabular Playground Series

This notebook introduces a new neural network structure.

# Preprocessing

## Import modules

In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from tqdm import tqdm
# Use the first CUDA GPU when PyTorch reports one is available; otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Active device：', device)

## Load data

In [None]:
# Read the competition CSV files from the Kaggle input directory.
train_df = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
# The training labels live in a separate file from the training features.
train_label_df = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')

In [None]:
# Print the (rows, columns) shape, then display the first 70 rows as the cell output.
print(train_df.shape) 
train_df.head(70)

In [None]:
# Print the (rows, columns) shape of the label table, then display its first 10 rows.
print(train_label_df.shape) 
train_label_df.head(10)

## Check the data

In [None]:
# Summary statistics for every column from 'sensor_00' through 'sensor_12'
# (label-based slices with .loc include both end points).
train_df.loc[:, 'sensor_00': 'sensor_12'].describe()

## Process data

In [None]:
# Remember how many rows belong to the training frame so the combined frame
# can be cut back into its train/test parts later.
ntrain = train_df.shape[0]
# Stack train and test rows so that feature engineering is applied to both at once.
# NOTE: the index is not reset here (reset_index is commented out), so the index
# labels of the two frames are kept as they were.
all_data = pd.concat((train_df, test_df))#.reset_index(drop=True)
print(all_data.shape)
all_data.head()

In [None]:
# Every column after the first three is treated as a feature to differentiate.
features = all_data.columns.tolist()[3:]
# Fill missing values once up front instead of rescanning the whole frame for
# every feature inside the loop.
all_data.fillna(0, inplace=True)
for feature in features:
    # Previous value within the same sequence; the first row of each sequence
    # has no predecessor and uses 0, so its diff equals the raw value.
    lag1 = all_data.groupby('sequence')[feature].shift(1).fillna(0)
    all_data[feature + '_diff1'] = all_data[feature] - lag1
all_data.head()

In [None]:
# Cut the combined frame back into its two parts: the first ntrain rows came
# from the training frame, the remaining rows from the test frame.
train_df = all_data[:ntrain]
test_df = all_data[ntrain:]
print(train_df.shape, test_df.shape)
train_df.head()

In [None]:
# Number of rows per 'sequence' value, counted as the non-null 'subject' entries
# in each group; the result is a NumPy array ordered by sequence id.
sequence_num = train_df.groupby('sequence')['subject'].count().values
print(sequence_num)

# Training
## Splitting data

In [None]:
test_rate = 0.2


def train_test_split(X, y, sequence_num, test_rate=0.2):
    """Split sequence-structured rows into two parts without breaking sequences.

    NOTE: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook.

    Args:
        X: 2-D array of rows; the rows of sequence ``i`` must occupy the
            contiguous slice ``[i * L, (i + 1) * L)`` where ``L`` is the common
            sequence length.
        y: array with one entry per sequence, in the same order as the
            sequences appear in ``X``.
        sequence_num: 1-D array holding the number of rows of each sequence.
            All entries must be equal.
        test_rate: fraction of the sequences placed in the second part.

    Returns:
        ``(X_train, X_test, y_train, y_test, sequence_num_train,
        sequence_num_test)``. The ``X_*`` arrays are 2-D with the rows of each
        selected sequence kept together and in order.

    Raises:
        ValueError: if ``sequence_num`` is empty or the lengths are not all
            equal (the previous implementation silently ignored the length of
            the last sequence).
    """
    sequence_num = np.asarray(sequence_num)
    if sequence_num.size == 0:
        raise ValueError('sequence_num must contain at least one sequence')
    seq_len = sequence_num[0]
    if np.any(sequence_num != seq_len):
        raise ValueError('all sequences must have the same number of rows')

    feature_size = X.shape[1]
    n_sequences = len(sequence_num)
    # Row indices of every sequence, shape (n_sequences, seq_len). Built in one
    # step instead of growing an array with np.vstack inside a loop.
    id_seq = np.arange(n_sequences * seq_len).reshape(n_sequences, seq_len)

    # Shuffle sequence ids with a fixed seed so the split is reproducible.
    # The global NumPy seed is set on purpose, exactly as before.
    id_data = np.arange(n_sequences)
    np.random.seed(2022)
    np.random.shuffle(id_data)

    train_size = int(len(id_data) * (1 - test_rate))
    train_ids, test_ids = id_data[:train_size], id_data[train_size:]

    X_train = X[id_seq[train_ids]].reshape(-1, feature_size)
    X_test = X[id_seq[test_ids]].reshape(-1, feature_size)
    y_train, y_test = y[train_ids], y[test_ids]
    sequence_num_train = sequence_num[train_ids]
    sequence_num_test = sequence_num[test_ids]
    return X_train, X_test, y_train, y_test, sequence_num_train, sequence_num_test
# Convert to NumPy arrays: X holds every row of the engineered training frame,
# y holds the rows of the label table.
X = train_df.values
y = train_label_df.values
# Hold out a fraction test_rate of the sequences as a local test set.
X_train, X_test, y_train, y_test, sequence_num_train, sequence_num_test = train_test_split(X, y, sequence_num, test_rate)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
# Split the remaining training sequences again to obtain a validation set.
# The training variables are rebound to the smaller training part.
valid_size = 0.2
X_train, X_valid, y_train, y_valid, sequence_num_train, sequence_num_valid = train_test_split(X_train, y_train, sequence_num_train, valid_size)
print(X_train.shape, X_valid.shape)
print(y_train.shape, y_valid.shape)

## Standard scaler

In [None]:
# Standardise the columns from index 3 onward with statistics fitted on the
# training split only. Column 0 is carried through unscaled as the first column
# of each result; columns 1 and 2 are dropped.
# (The earlier debug print that refitted the scaler on columns 1: was removed:
# it fitted a different column set and was immediately overwritten.)
stdsc = StandardScaler()
X_train_std = np.hstack([X_train[:, 0].reshape(-1, 1), stdsc.fit_transform(X_train[:, 3:])])
X_valid_std = np.hstack([X_valid[:, 0].reshape(-1, 1), stdsc.transform(X_valid[:, 3:])])
X_test_std = np.hstack([X_test[:, 0].reshape(-1, 1), stdsc.transform(X_test[:, 3:])])
# Report the training features next to the training labels they belong to.
print(X_train_std.shape, y_train.shape)

## Create data sets and data loaders

In [None]:
# 
from torch.utils.data.sampler import SubsetRandomSampler
class MyDataset(torch.utils.data.Dataset):
    """Dataset whose items are whole sequences addressed by an array of row indices.

    ``__getitem__`` expects ``idx`` to be an indexable collection of row
    indices (for example one row of the 2-D index array built by
    ``create_dataset``); ``idx[0]`` is used to locate the label.

    Args:
        X: array of rows; indexed directly with ``idx``.
        sequence_num: number of rows per sequence, used to turn a row index
            into a label index.
        y: optional labels, one entry per sequence. May be omitted when the
            dataset is only used for inference.
        mode: ``'train'`` makes ``__getitem__`` return ``(data, label)``; any
            other value makes it return the data only.
    """

    def __init__(self, X, sequence_num, y=None, mode='train'):
        self.data = X
        self.teacher = y
        self.sequence_num = sequence_num
        self.mode = mode

    def __len__(self):
        """Return the number of sequences in the dataset."""
        if self.teacher is not None:
            return len(self.teacher)
        # Label-less datasets: count the complete sequences in the data instead
        # of failing on len(None).
        return int(len(self.data) // self.sequence_num)

    def __getitem__(self, idx):
        out_data = self.data[idx]
        if self.mode == 'train':
            # The first row index of the sequence identifies which label to use.
            out_label = self.teacher[idx[0] // self.sequence_num]
            return out_data, out_label
        return out_data
def create_dataset(dataset, dataset_num, sequence_num, input_size, batch_size, shuffle=False):
    """Build a DataLoader whose samples are whole sequences.

    The loader is given a 2-D index array as its sampler: each row lists the
    row indices of one sequence, so every sample requested from ``dataset`` is
    addressed by an array of ``sequence_num`` consecutive indices.

    Args:
        dataset: object indexed with an array of row indices.
        dataset_num: total number of rows available in ``dataset``.
        sequence_num: number of rows per sequence.
        input_size: unused; kept so existing callers keep working.
        batch_size: number of sequences per batch.
        shuffle: if true, the order of the sequences is shuffled once, when
            the loader is created (not on every pass over the loader).

    Returns:
        A ``DataLoader`` iterating over ``dataset_num // sequence_num`` sequences.
    """
    n_sequences = dataset_num // sequence_num
    # Row i holds the indices i * sequence_num ... (i + 1) * sequence_num - 1.
    sampler = np.arange(n_sequences * sequence_num).reshape(n_sequences, sequence_num)
    if shuffle:
        # np.random.shuffle permutes the rows (first axis) in place.
        np.random.shuffle(sampler)
    return DataLoader(dataset, batch_size, sampler=sampler)
# create dataloader
batch_size = 256
# The train and validation datasets use mode='train', so their items are
# (data, label) pairs. The test dataset uses mode='val', so its items are the
# data only even though y_test is passed in.
dataset_train = MyDataset(X_train_std, sequence_num_train[0], y=y_train, mode='train')
dataset_valid = MyDataset(X_valid_std, sequence_num_valid[0], y=y_valid, mode='train')
dataset_test = MyDataset(X_test_std, sequence_num_test[0], y=y_test, mode='val')
# One loader per split; shuffle is left at its default (False), so sequences are
# served in stored order.
dataloader_train = create_dataset(dataset_train, X_train_std.shape[0], sequence_num_train[0], X_train_std.shape[1], batch_size=batch_size)
dataloader_valid = create_dataset(dataset_valid, X_valid_std.shape[0], sequence_num_valid[0], X_valid_std.shape[1], batch_size=batch_size)
dataloader_test = create_dataset(dataset_test, X_test_std.shape[0], sequence_num_test[0], X_test_std.shape[1], batch_size=batch_size)

# Hyperparameter tuning

In [None]:
class Net1(nn.Module):
    """Fully connected network with a single output, configured by a tuning trial.

    Args:
        input_size: number of input features of the first linear layer.
        trial: object handed to ``get_activation`` to pick the activation.
        num_layer: number of hidden linear layers to build.
        num_nodes: hidden layer widths; entries ``0 .. num_layer - 1`` are used.
        dropout_rate: probability passed to ``nn.Dropout``; dropout is applied
            after every hidden layer.
    """

    def __init__(self, input_size, trial, num_layer, num_nodes, dropout_rate):
        super().__init__()
        self.activation = get_activation(trial)
        # First hidden layer
        self.linears = nn.ModuleList([nn.Linear(input_size, num_nodes[0])])
        # Remaining hidden layers
        for i in range(1, num_layer):
            self.linears.append(nn.Linear(num_nodes[i - 1], num_nodes[i]))
        # Size the output layer from the last hidden layer actually built, so it
        # stays consistent even when num_nodes has more entries than num_layer.
        self.fc_last = nn.Linear(self.linears[-1].out_features, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        for linear in self.linears:
            x = self.dropout(self.activation(linear(x)))
        return self.fc_last(x)

class Net2(nn.Module):
    """Fully connected network with a single output, configured by a tuning trial.

    Args:
        input_size: number of input features of the first linear layer.
        trial: object handed to ``get_activation`` to pick the activation.
        num_layer: number of hidden linear layers to build.
        num_nodes: hidden layer widths; entries ``0 .. num_layer - 1`` are used.
        dropout_rate: probability passed to ``nn.Dropout``; dropout is applied
            after every hidden layer.
    """

    def __init__(self, input_size, trial, num_layer, num_nodes, dropout_rate):
        super().__init__()
        self.activation = get_activation(trial)
        # First hidden layer
        self.linears = nn.ModuleList([nn.Linear(input_size, num_nodes[0])])
        # Remaining hidden layers
        for i in range(1, num_layer):
            self.linears.append(nn.Linear(num_nodes[i - 1], num_nodes[i]))
        # Size the output layer from the last hidden layer actually built, so it
        # stays consistent even when num_nodes has more entries than num_layer.
        self.fc_last = nn.Linear(self.linears[-1].out_features, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        for linear in self.linears:
            x = self.dropout(self.activation(linear(x)))
        return self.fc_last(x)

In [None]:
def train(model1, model2, dataloader, optimizer1, optimizer2):
    """Run one pass over ``dataloader``, updating both models on a joint MSE loss.

    ``model1`` is applied to every batch with the first feature column removed;
    its output is reshaped to one row per sample (``data.shape[1]`` columns) and
    fed to ``model2``. Column 1 of the target is the regression target.

    Args:
        model1: module applied to the sliced batch.
        model2: module applied to the reshaped output of ``model1``.
        dataloader: iterable of ``(data, target)`` pairs.
        optimizer1: optimizer stepping the parameters of ``model1``.
        optimizer2: optimizer stepping the parameters of ``model2``.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    for model in (model1, model2):
        model.train()
    loss_fn = nn.MSELoss()
    for batch, labels in dataloader:
        # Cast to float32, drop feature column 0 / keep label column 1, then
        # move both tensors to the compute device.
        batch = batch.to(torch.float32)[:, :, 1:].to(device)
        labels = labels.to(torch.float32)[:, 1].to(device)

        for optimizer in (optimizer1, optimizer2):
            optimizer.zero_grad()

        rows_per_sample = batch.shape[1]
        stage_one = model1(batch).view(-1, rows_per_sample)
        prediction = model2(stage_one)

        batch_loss = loss_fn(prediction.view(-1, 1), labels.view(-1, 1))
        batch_loss.backward()
        for optimizer in (optimizer1, optimizer2):
            optimizer.step()
def test(model1, model2, dataloader):
    """Return the mean per-batch MSE of the stacked models over ``dataloader``.

    Both models are put in evaluation mode and no gradients are recorded.

    Args:
        model1: module applied to each batch with feature column 0 removed.
        model2: module applied to the reshaped output of ``model1``.
        dataloader: iterable of ``(data, target)`` pairs; column 1 of the
            target is compared with the prediction.

    Returns:
        The batch losses summed and divided by the number of batches (a tensor,
        since the per-batch losses are tensors).

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model1.eval()
    model2.eval()
    criterion = nn.MSELoss()
    loss = 0
    count = 0
    with torch.no_grad():
        for data, target in dataloader:
            # Cast the target to float32 as well, matching what train() does, so
            # the loss is computed on the same dtype in both functions.
            data, target = data.to(torch.float32)[:, :, 1:], target.to(torch.float32)[:, 1]
            data, target = data.to(device), target.to(device)
            output1 = model1(data).view(-1, data.shape[1])
            output2 = model2(output1)
            loss += criterion(output2.view(-1, 1), target.view(-1, 1))
            count += 1
    loss /= count
    return loss

In [None]:
def get_optimizer(trial, model1, model2):
    """Let the trial pick an optimizer and its settings, and build one per model.

    The optimizer family, the weight decay and the learning rate are sampled
    from ``trial``; the learning rate is stored under a family-specific
    parameter name ('Momentum_SGD_lr', 'Adam_lr' or 'Adagrad_lr').

    Args:
        trial: tuning trial providing ``suggest_categorical`` and ``suggest_float``.
        model1: first model whose parameters are optimized.
        model2: second model whose parameters are optimized.

    Returns:
        ``(optimizer1, optimizer2)`` built with identical settings.

    Raises:
        ValueError: if the trial returns an optimizer name that is not supported.
    """
    optimizer_names = ['MomentumSGD', 'Adam', 'Adagrad']
    optimizer_name = trial.suggest_categorical('optimizer', optimizer_names)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    weight_decay = trial.suggest_float('weight_decay', 1e-10, 1e-3, log=True)
    if optimizer_name == optimizer_names[0]:
        lr = trial.suggest_float('Momentum_SGD_lr', 1e-5, 1e-2, log=True)
        optimizer_cls, extra = torch.optim.SGD, {'momentum': 0.9}
    elif optimizer_name == optimizer_names[1]:
        lr = trial.suggest_float('Adam_lr', 1e-5, 1e-2, log=True)
        optimizer_cls, extra = torch.optim.Adam, {}
    elif optimizer_name == optimizer_names[2]:
        lr = trial.suggest_float('Adagrad_lr', 1e-5, 1e-2, log=True)
        optimizer_cls, extra = torch.optim.Adagrad, {}
    else:
        raise ValueError('unsupported optimizer: {!r}'.format(optimizer_name))
    optimizer1 = optimizer_cls(model1.parameters(), lr=lr, weight_decay=weight_decay, **extra)
    optimizer2 = optimizer_cls(model2.parameters(), lr=lr, weight_decay=weight_decay, **extra)
    return optimizer1, optimizer2

def loss_plot(logs_train, logs_test):
    """Plot the train (blue) and test (red) loss curves against the epoch number.

    Each log is a pair ``[epochs, losses]``; the first entry of both lists is
    skipped when plotting.
    """
    curves = (
        (logs_train, '-b', 'train'),
        (logs_test, '-r', 'test'),
    )
    for log, line_format, name in curves:
        plt.plot(log[0][1:], log[1][1:], line_format, label=name)
    plt.xlabel('epoch', fontsize=15)
    plt.ylabel('loss', fontsize=15)
    plt.legend()
    plt.show()

In [None]:
def get_activation(trial):
    """Let the trial choose an activation and return the matching function.

    The choice is sampled under the parameter name 'activation' from
    'ReLU', 'ELU' and 'leaky_relu'.
    """
    choice = trial.suggest_categorical('activation', ['ReLU', 'ELU', 'leaky_relu'])
    if choice == 'ReLU':
        return F.relu
    if choice == 'ELU':
        return F.elu
    # Every other value falls through to leaky ReLU.
    return F.leaky_relu

In [None]:
epochs = 100
batch_size = 256


def objective(trial):
    """Objective for the hyperparameter search: build, train and score one candidate.

    The trial samples the depth, layer widths and dropout rate of both
    networks plus the activation and optimizer settings. The candidate is
    trained for ``epochs`` passes and its loss is returned.

    Args:
        trial: tuning trial used to sample every hyperparameter.

    Returns:
        The loss reported by ``test`` as a Python float.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # Number of hidden layers
    num_layer1 = trial.suggest_int('num_layer1', 2, 7)
    num_layer2 = trial.suggest_int('num_layer2', 2, 7)
    # Number of nodes in each hidden layer
    num_nodes1 = [int(trial.suggest_discrete_uniform('num_nodes_1_'+str(i), 16, 128, 16)) for i in range(num_layer1)]
    num_nodes2 = [int(trial.suggest_discrete_uniform('num_nodes_2_'+str(i), 16, 128, 16)) for i in range(num_layer2)]
    # Dropout rates
    dropout_rate1 = trial.suggest_float('dropout_rate1', 0.0, 1.0)
    dropout_rate2 = trial.suggest_float('dropout_rate2', 0.0, 1.0)
    # Instantiate the models; the first data column is dropped before model1,
    # hence the "- 1" on the input width.
    model1 = Net1(dataloader_valid.dataset.data.shape[1]-1, trial, num_layer1, num_nodes1, dropout_rate1).to(device)
    model2 = Net2(sequence_num[0], trial, num_layer2, num_nodes2, dropout_rate2).to(device)
    optimizer1, optimizer2 = get_optimizer(trial, model1, model2)
    # NOTE(review): the candidate is trained and scored on the same loader
    # (dataloader_valid), so the returned value is a training loss. Confirm
    # whether this was intended as a speed shortcut.
    for _ in range(epochs):
        train(model1, model2, dataloader_valid, optimizer1, optimizer2)
    error_rate = test(model1, model2, dataloader_valid)
    # Hand the search a plain float rather than a tensor.
    return float(error_rate)

In [None]:
# import optuna
# TRIAL_SIZE = 100
# study = optuna.create_study(study_name='aaa')
# study.optimize(objective, n_trials=TRIAL_SIZE)
# best_params = study.best_params
# print(best_params)

In [None]:
# from optuna.visualization import plot_optimization_history
# plot_optimization_history(study)

In [None]:
# Hard-coded hyperparameters used by the training cells below. They are used in
# place of `best_params` because the search cells above are commented out.
# params = best_params
params = {'num_layer1': 7, 'num_layer2': 7, 'num_nodes_1_0': 128.0, 'num_nodes_1_1': 112.0, 'num_nodes_1_2': 128.0, 'num_nodes_1_3': 48.0, 'num_nodes_1_4': 64.0, 'num_nodes_1_5': 48.0, 'num_nodes_1_6': 128.0, 'num_nodes_2_0': 112.0, 'num_nodes_2_1': 128.0, 'num_nodes_2_2': 80.0, 'num_nodes_2_3': 128.0, 'num_nodes_2_4': 128.0, 'num_nodes_2_5': 128.0, 'num_nodes_2_6': 128.0, 'dropout_rate1': 0.2437830034858727, 'dropout_rate2': 0.017851001001991003, 'activation': 'ELU', 'optimizer': 'Adam', 'weight_decay': 3.66056306659787e-09, 'Adam_lr': 0.000670442196296899}

## Training with tuned models

In [None]:
class Net1(nn.Module):
    """Fully connected network with a single output.

    Args:
        input_size: number of input features of the first linear layer.
        num_layer: number of hidden linear layers to build.
        num_nodes: hidden layer widths; entries ``0 .. num_layer - 1`` are used.
        dropout_rate: probability passed to ``nn.Dropout``; dropout is applied
            after every hidden layer.
        activation_name: name resolved to an activation by ``get_activation``.
    """

    def __init__(self, input_size, num_layer, num_nodes, dropout_rate, activation_name):
        super().__init__()
        self.activation = get_activation(activation_name)
        # First hidden layer
        self.linears = nn.ModuleList([nn.Linear(input_size, num_nodes[0])])
        # Remaining hidden layers
        for i in range(1, num_layer):
            self.linears.append(nn.Linear(num_nodes[i - 1], num_nodes[i]))
        # Size the output layer from the last hidden layer actually built, so it
        # stays consistent even when num_nodes has more entries than num_layer.
        self.fc_last = nn.Linear(self.linears[-1].out_features, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        for linear in self.linears:
            x = self.dropout(self.activation(linear(x)))
        return self.fc_last(x)

class Net2(nn.Module):
    """Fully connected network with a single output.

    Args:
        input_size: number of input features of the first linear layer.
        num_layer: number of hidden linear layers to build.
        num_nodes: hidden layer widths; entries ``0 .. num_layer - 1`` are used.
        dropout_rate: probability passed to ``nn.Dropout``; dropout is applied
            after every hidden layer.
        activation_name: name resolved to an activation by ``get_activation``.
    """

    def __init__(self, input_size, num_layer, num_nodes, dropout_rate, activation_name):
        super().__init__()
        self.activation = get_activation(activation_name)
        # First hidden layer
        self.linears = nn.ModuleList([nn.Linear(input_size, num_nodes[0])])
        # Remaining hidden layers
        for i in range(1, num_layer):
            self.linears.append(nn.Linear(num_nodes[i - 1], num_nodes[i]))
        # Size the output layer from the last hidden layer actually built, so it
        # stays consistent even when num_nodes has more entries than num_layer.
        self.fc_last = nn.Linear(self.linears[-1].out_features, 1)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        for linear in self.linears:
            x = self.dropout(self.activation(linear(x)))
        return self.fc_last(x)
def get_activation(activation_name):
    """Return the activation function registered under ``activation_name``.

    'ReLU' maps to ``F.relu`` and 'ELU' to ``F.elu``; every other value
    (including 'leaky_relu') maps to ``F.leaky_relu``.
    """
    if activation_name == 'ReLU':
        return F.relu
    if activation_name == 'ELU':
        return F.elu
    return F.leaky_relu

In [None]:
def train(model1, model2, dataloader, optimizer1, optimizer2):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    ``model1`` is applied to every batch with the first feature column removed;
    its output is reshaped to one row per sample (``data.shape[1]`` columns) and
    fed to ``model2``. Column 1 of the target is the regression target.

    Args:
        model1: module applied to the sliced batch.
        model2: module applied to the reshaped output of ``model1``.
        dataloader: iterable of ``(data, target)`` pairs.
        optimizer1: optimizer stepping the parameters of ``model1``.
        optimizer2: optimizer stepping the parameters of ``model2``.

    Returns:
        The average of the per-batch MSE losses as a Python float.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    for model in (model1, model2):
        model.train()
    loss_fn = nn.MSELoss()
    batch_losses = []
    for batch, labels in dataloader:
        # Cast to float32, drop feature column 0 / keep label column 1, then
        # move both tensors to the compute device.
        batch = batch.to(torch.float32)[:, :, 1:].to(device)
        labels = labels.to(torch.float32)[:, 1].to(device)

        for optimizer in (optimizer1, optimizer2):
            optimizer.zero_grad()

        rows_per_sample = batch.shape[1]
        stage_one = model1(batch).view(-1, rows_per_sample)
        prediction = model2(stage_one)

        batch_loss = loss_fn(prediction.view(-1, 1), labels.view(-1, 1))
        batch_losses.append(batch_loss.item())
        batch_loss.backward()
        for optimizer in (optimizer1, optimizer2):
            optimizer.step()
    return sum(batch_losses) / len(batch_losses)

def predict(model1, model2, dataloader):
    """Return the stacked models' predictions for every sample in ``dataloader``.

    Both models are put in evaluation mode and no gradients are recorded.

    Args:
        model1: module applied to each batch with feature column 0 removed.
        model2: module applied to the reshaped output of ``model1``.
        dataloader: iterable yielding data batches only (no labels).

    Returns:
        A flat float64 NumPy array with the predictions in loader order; an
        empty array if the loader yields no batches.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model1.eval()
    model2.eval()
    # Collect per-batch results and join them once at the end; appending to a
    # NumPy array inside the loop copies the whole array on every batch.
    chunks = []
    with torch.no_grad():
        for data in dataloader:
            data = data.to(torch.float32)[:, :, 1:]
            data = data.to(device)
            output1 = model1(data).view(-1, data.shape[1])
            output2 = model2(output1)
            chunks.append(output2.cpu().numpy().ravel())
    if not chunks:
        return np.array([])
    # Keep the float64 result dtype callers received before.
    return np.concatenate(chunks).astype(np.float64)

def get_optimizer(model1, model2, optimizer_name, lr, weight_decay):
    """Build one optimizer per model from an optimizer name and shared settings.

    Args:
        model1: first model whose parameters are optimized.
        model2: second model whose parameters are optimized.
        optimizer_name: 'MomentumSGD' (SGD with momentum 0.9), 'Adam' or 'Adagrad'.
        lr: learning rate used for both optimizers.
        weight_decay: weight decay used for both optimizers.

    Returns:
        ``(optimizer1, optimizer2)`` built with identical settings.

    Raises:
        ValueError: if ``optimizer_name`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if optimizer_name == 'MomentumSGD':
        optimizer_cls, extra = torch.optim.SGD, {'momentum': 0.9}
    elif optimizer_name == 'Adam':
        optimizer_cls, extra = torch.optim.Adam, {}
    elif optimizer_name == 'Adagrad':
        optimizer_cls, extra = torch.optim.Adagrad, {}
    else:
        raise ValueError('unsupported optimizer: {!r}'.format(optimizer_name))
    optimizer1 = optimizer_cls(model1.parameters(), lr=lr, weight_decay=weight_decay, **extra)
    optimizer2 = optimizer_cls(model2.parameters(), lr=lr, weight_decay=weight_decay, **extra)
    return optimizer1, optimizer2

def loss_plot(logs_train, logs_test):
    """Plot the train (blue) and test (red) loss curves against the epoch number.

    Each log is a pair ``[epochs, losses]``; the first entry of both lists is
    skipped when plotting.
    """
    curves = (
        (logs_train, '-b', 'train'),
        (logs_test, '-r', 'test'),
    )
    for log, line_format, name in curves:
        plt.plot(log[0][1:], log[1][1:], line_format, label=name)
    plt.xlabel('epoch', fontsize=15)
    plt.ylabel('loss', fontsize=15)
    plt.legend()
    plt.show()

## Training

In [None]:
from tqdm import tqdm
num_epochs = 100
# Read the tuned hyperparameters
num_layer1 = params['num_layer1']
num_layer2 = params['num_layer2']
# Look the layer widths up by their index so the result does not depend on the
# order of the keys in `params` and always has exactly num_layer entries.
num_nodes1 = [int(params['num_nodes_1_' + str(i)]) for i in range(num_layer1)]
num_nodes2 = [int(params['num_nodes_2_' + str(i)]) for i in range(num_layer2)]
dropout_rate1 = params['dropout_rate1']
dropout_rate2 = params['dropout_rate2']
activation_name = params['activation']
# Model initialization; the first data column is dropped before model1, hence
# the "- 1" on the input width.
model1 = Net1(dataloader_train.dataset.data.shape[1]-1, num_layer1, num_nodes1, dropout_rate1, activation_name)
model2 = Net2(sequence_num[0], num_layer2, num_nodes2, dropout_rate2, activation_name)
model1, model2 = model1.to(device), model2.to(device)
# Define update method
optimizer_name = params['optimizer']
# The tuning step stores the SGD learning rate under 'Momentum_SGD_lr', which
# does not follow the '<optimizer>_lr' pattern used for the other optimizers.
lr_key = 'Momentum_SGD_lr' if optimizer_name == 'MomentumSGD' else optimizer_name + '_lr'
lr = params[lr_key]
weight_decay = params['weight_decay']
optimizer1, optimizer2 = get_optimizer(model1, model2, optimizer_name, lr, weight_decay)
# Training loop. Each log is [epochs, losses], seeded with an infinite loss at
# epoch 0 so the first real epoch always counts as an improvement.
logs_train = [[0], [np.inf]]
logs_test = [[0], [np.inf]]
for epoch in tqdm(range(num_epochs)):
    epoch_loss = train(model1, model2, dataloader_train, optimizer1, optimizer2)
    test_pred = predict(model1, model2, dataloader_test)
    test_loss = mean_squared_error(y_test[:, 1], test_pred)
    # Checkpoint both models whenever the held-out loss reaches a new minimum.
    # NOTE(review): the checkpoint is selected on the same test split that is
    # scored afterwards, while the validation loader is not used here; confirm
    # whether selection should use the validation split instead.
    if test_loss < min(logs_test[1]):
        torch.save(model1.state_dict(), './model1')
        torch.save(model2.state_dict(), './model2')
    logs_train[0].append(epoch+1)
    logs_train[1].append(epoch_loss)
    logs_test[0].append(epoch+1)
    logs_test[1].append(test_loss)

## Check learning curve

In [None]:
# Draw the per-epoch train and test losses recorded by the training loop.
loss_plot(logs_train, logs_test)

In [None]:
# Restore the weights saved to './model1' and './model2' during training
# (the checkpoint with the lowest recorded test loss).
model1.load_state_dict(torch.load('./model1'))
model2.load_state_dict(torch.load('./model2'))

In [None]:
# Score the restored models on the local test split: print the ROC AUC and plot
# the ROC curve. Column 1 of y_test is used as the true label.
y_pred = predict(model1, model2, dataloader_test)
roc = roc_curve(y_test[:, 1], y_pred)
print("roc", roc_auc_score(y_test[:, 1], y_pred))
fpr, tpr, thresholds = roc
plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

In [None]:
# Rebuild the training dataset without labels (mode='valid' makes the dataset
# return data only) so that predict() can iterate over it. This rebinds
# dataset_train and dataloader_train.
# NOTE(review): the sequence length is taken from the global sequence_num here
# rather than sequence_num_train; confirm both hold the same value.
dataset_train = MyDataset(X_train_std, sequence_num=sequence_num[0], mode='valid')
dataloader_train = create_dataset(dataset_train, X_train_std.shape[0], sequence_num[0], X_train_std.shape[1], batch_size)
# ROC AUC and ROC curve on the training split, for comparison with the test split.
y_pred_train = predict(model1, model2, dataloader_train)
roc = roc_curve(y_train[:, 1], y_pred_train)
print("roc", roc_auc_score(y_train[:, 1], y_pred_train))
fpr, tpr, thresholds = roc
plt.plot(fpr, tpr, marker='o')
plt.xlabel('FPR: False positive rate')
plt.ylabel('TPR: True positive rate')
plt.grid()

In [None]:
# Prepare the competition test rows the same way as the training data: keep
# column 0 unscaled and apply the already-fitted scaler to columns 3 onward.
X_submit = test_df.values
X_submit_std = np.hstack([X_submit[:, 0].reshape(-1, 1), stdsc.transform(X_submit[:, 3:])])
# Label-less dataset (mode='valid' returns data only) and its loader.
dataset_submit = MyDataset(X_submit_std, sequence_num=sequence_num[0], mode='valid')
dataloader_submit = create_dataset(dataset_submit, X_submit_std.shape[0], sequence_num[0], X_submit_std.shape[1], batch_size)
y_submit = predict(model1, model2, dataloader_submit)
print(y_submit.shape)
# Inspect the distribution of the predicted values.
plt.hist(y_submit, bins=30, density=True)
plt.show()

In [None]:
# Load the sample submission file to use as the template for the output.
submission_df = pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv')
submission_df.head()

In [None]:
# Write the predictions into the 'state' column positionally. Assigning the
# array directly (instead of wrapping it in a DataFrame) raises on a length
# mismatch rather than silently aligning on the index and leaving NaN.
submission_df['state'] = y_submit
submission_df.head()

In [None]:
# Save the submission with a header row and without the index column.
submission_df.to_csv("submission.csv", index=False, header=True)