# Import

In [53]:
import pandas as pd
import numpy as np
import cupy as cp
import os
import gc
import time
import torch
import torchvision
from torch import nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
print(torch.__version__)

1.7.1+cu110


In [2]:
#%%writefile test.py
#print('hello world!')

## Parameter setting

In [49]:
# Run configuration. TRAINING, USE_FINETUNE and FOLDS are not referenced by
# the cells visible in this notebook section.
TRAINING = True
USE_FINETUNE = True
FOLDS = 5
SEED = 66
INPUTPATH = '../../input'      # directory holding train.parquet, f_mean.npy and the example csv files
NUM_EPOCH = 100
BATCH_SIZE = 16384
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MDL_PATH  = '../models'        # parent directory for saved checkpoints

# SEED was declared but never applied: seed numpy and torch here so that
# weight initialisation and dropout masks are repeatable across kernel restarts.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [57]:
#%%time
#train = pd.read_csv(f'{INPUTPATH}/train.csv')

In [58]:
#%%time
#train.to_parquet(f'{INPUTPATH}/train.parquet')

In [4]:
%%time
# Load the training frame from the parquet file under INPUTPATH
# (the commented-out cells above show it being written from train.csv).
train = pd.read_parquet(f'{INPUTPATH}/train.parquet')

CPU times: user 8.35 s, sys: 3.92 s, total: 12.3 s
Wall time: 1.09 s


# Features

In [103]:
%%time
train = train.query('date > 85').reset_index(drop = True) 
print(train.shape)
train.fillna(train.mean(),inplace=True)
train = train.query('weight > 0').reset_index(drop = True)
train['action'] =  \
(  (train['resp_1'] > 0.00001 ) & \
   (train['resp_2'] > 0.00001 ) & \
   (train['resp_3'] > 0.00001 ) & \
   (train['resp_4'] > 0.00001 ) & \
   (train['resp'] > 0.00001 )   ).astype('int')

features = [c for c in train.columns if 'feature' in c]

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

X = train[features].values
y = np.stack([(train[c] > 0.000001).astype('int') for c in resp_cols]).T
#X = cp.array(train[features].values)
#y = cp.array(np.stack([(train[c] > 0.000001).astype('int') for c in resp_cols]).T) #Multitarget

#f_mean = np.mean(train[features[1:]].values,axis=0)

f_mean = np.load( f'{INPUTPATH}/f_mean.npy')

(1571415, 139)
CPU times: user 1.89 s, sys: 741 ms, total: 2.64 s
Wall time: 2.63 s


In [131]:
#%%time
#np.save( f'{INPUTPATH}/f_mean.npy',f_mean)

CPU times: user 1.15 ms, sys: 15 µs, total: 1.17 ms
Wall time: 805 µs


In [74]:
# Example test rows and the example submission frame, both read from INPUTPATH.
# The prediction cell reads features from test_df and writes `action` into pred_df.
test_df = pd.read_csv(f'{INPUTPATH}/example_test.csv')
pred_df  = pd.read_csv(f'{INPUTPATH}/example_sample_submission.csv')

In [6]:
# Width of the feature matrix and of the target matrix; these are the values
# passed to the model as input_size / output_size.
print(X.shape[-1])
print(y.shape[-1])

130
5


# AutoEncoder MLP

In [7]:
class GaussianNoise(nn.Module):
    """Gaussian noise regularizer.

    Noise is only added while the module is in training mode; in eval mode
    (or when ``sigma == 0``) the input is returned unchanged.

    Args:
        sigma (float, optional): relative standard deviation used to generate the
            noise. Relative means that it will be multiplied by the magnitude of
            the value you are adding the noise to. This means that sigma can be
            the same regardless of the scale of the vector.
        is_relative_detach (bool, optional): whether to detach the variable before
            computing the scale of the noise. If `False` then the scale of the noise
            won't be seen as a constant but something to optimize: this will bias the
            network to generate vectors with smaller values.
        **kwargs: accepted for call-site compatibility and ignored.
    """

    def __init__(self, sigma=0.1, is_relative_detach=True, **kwargs):
        super().__init__()
        self.sigma = sigma
        self.is_relative_detach = is_relative_detach

    def forward(self, x):
        """Return ``x`` plus zero-mean noise with per-element std ``sigma * x``."""
        if self.training and self.sigma != 0:
            scale = self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
            # Sample directly with x's shape, dtype and device. The previous
            # implementation drew from an integer scalar tensor (normal_() is
            # not defined for integer dtypes) pinned to a device chosen at
            # construction time.
            x = x + torch.randn_like(x) * scale
        return x

In [8]:
#https://github.com/L1aoXingyu/pytorch-beginner/blob/master/08-AutoEncoder/simple_autoencoder.py
#https://discuss.pytorch.org/t/pytorch-equivalent-of-keras/29412/2
class autoencoder(nn.Module):
    '''
    Encoder/decoder pair followed by a small classification head.

    Keyword arguments (all required):
        input_size:  width of the input rows.
        output_size: number of sigmoid outputs.
        noise:       read from kwargs but currently unused, because the
                     GaussianNoise layer in the encoder is disabled.

    >> model =
        autoencoder(input_size = X.shape[-1], output_size = y.shape[-1],\
        noise = 0.1).to(DEVICE)
    '''
    def __init__(self, **kwargs):
        super().__init__()
        n_in = kwargs['input_size']
        n_out = kwargs['output_size']
        noise = kwargs['noise']

        # input -> 640 (the optional noise layer is switched off)
        encoder_layers = [
            nn.BatchNorm1d(n_in),
            #GaussianNoise(noise),
            nn.Linear(n_in, 640),
            nn.ReLU(True),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # 640 -> input
        decoder_layers = [
            nn.Dropout(0.2),
            nn.Linear(640, n_in),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

        # head: input -> 320 -> output, squashed to (0, 1)
        self.hidden = nn.Linear(n_in, 320)
        self.bat = nn.BatchNorm1d(320)
        self.drop = nn.Dropout(0.2)
        self.hidden2 = nn.Linear(320, n_out)
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Run encoder, decoder and head; returns one sigmoid value per output."""
        decoded = self.decoder(self.encoder(x))
        head = self.drop(self.bat(self.hidden(decoded)))
        return self.act(self.hidden2(head))

In [9]:
class MLPNet (nn.Module):
    '''
    Three-layer perceptron (input -> 512 -> 512 -> output) with ReLU
    activations, dropout after each hidden layer and a sigmoid on the output.

    Keyword arguments (both required):
        input_size:  width of the input rows.
        output_size: number of sigmoid outputs.

    >> model =
        MLPNet(input_size = X.shape[-1], output_size = y.shape[-1]).to(DEVICE)
    '''
    def __init__(self,  **kwargs):
        super(MLPNet, self).__init__()
        input_size = kwargs['input_size']
        output_size = kwargs['output_size']
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, output_size)
        # Element-wise dropout: the activations here are 2-D (batch, features).
        # nn.Dropout2d is meant for channel-wise dropout on image-like inputs
        # and is deprecated for 2-D inputs in newer PyTorch releases.
        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.2)
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid value per output for each input row."""
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.act(self.fc3(x))
        return x

In [10]:
class CustomDataset:
    """Map-style dataset that pairs row ``i`` of ``dataset`` with row ``i`` of ``target``.

    Both arrays are indexed as ``arr[i, :]``; each item is a dict with float
    tensors under the keys ``'x'`` (features) and ``'y'`` (targets).
    """

    def __init__(self, dataset, target):
        self.dataset, self.target = dataset, target

    def __len__(self):
        # Number of rows in the feature array.
        n_rows = self.dataset.shape[0]
        return n_rows

    def __getitem__(self, item):
        feature_row = self.dataset[item, :]
        target_row = self.target[item, :]
        return dict(
            x=torch.tensor(feature_row, dtype=torch.float),
            y=torch.tensor(target_row, dtype=torch.float),
        )

# Training

## AutoEncoder

In [11]:
# Build the network sized to the feature/target matrices and move it to DEVICE.
# The `noise` argument is required by the constructor but not used by it.
model = autoencoder(input_size = X.shape[-1], output_size = y.shape[-1], noise=0.1).to(DEVICE)

In [12]:
# Mean-squared error between the sigmoid outputs and the 0/1 targets.
# NOTE(review): for sigmoid outputs with binary targets nn.BCELoss is the usual
# choice — confirm that MSE is intended.
criterion = nn.MSELoss()
# Adam over all model parameters, with a small L2 penalty (weight_decay).
optimizer = torch.optim.Adam(
    model.parameters(), lr=LR, weight_decay=1e-5)

In [13]:
# Wrap the feature/target arrays and batch them.
# shuffle=False: every epoch visits the rows in their stored order.
# NOTE(review): confirm that training without shuffling is intended.
dataset = CustomDataset(X, y)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

In [38]:
# Make sure Dropout/BatchNorm are in training mode, even if the model was
# switched to eval mode by an earlier cell run.
model.train()
for epoch in tqdm(range(NUM_EPOCH)):
    for batch in dataloader:
        # Batch-local names on purpose: binding these to `x` / `y` would
        # overwrite the notebook-level target array `y`, so re-running the
        # dataset cell afterwards would wrap a single batch tensor instead.
        x_batch = batch['x'].to(DEVICE)
        y_batch = batch['y'].to(DEVICE)
        # ===================forward=====================
        output = model(x_batch)
        loss = criterion(output, y_batch)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    # Reports the loss of the last batch of the epoch.
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, NUM_EPOCH, loss.item()))

  0%|          | 0/100 [00:00<?, ?it/s]

epoch [1/100], loss:0.2452
epoch [2/100], loss:0.2452
epoch [3/100], loss:0.2453
epoch [4/100], loss:0.2453
epoch [5/100], loss:0.2477
epoch [6/100], loss:0.2457
epoch [7/100], loss:0.2453
epoch [8/100], loss:0.2451
epoch [9/100], loss:0.2450
epoch [10/100], loss:0.2454
epoch [11/100], loss:0.2453
epoch [12/100], loss:0.2452
epoch [13/100], loss:0.2450
epoch [14/100], loss:0.2452
epoch [15/100], loss:0.2451
epoch [16/100], loss:0.2452
epoch [17/100], loss:0.2450
epoch [18/100], loss:0.2449
epoch [19/100], loss:0.2450
epoch [20/100], loss:0.2448
epoch [21/100], loss:0.2447
epoch [22/100], loss:0.2447
epoch [23/100], loss:0.2446
epoch [24/100], loss:0.2445
epoch [25/100], loss:0.2444
epoch [26/100], loss:0.2444
epoch [27/100], loss:0.2443
epoch [28/100], loss:0.2442
epoch [29/100], loss:0.2442
epoch [30/100], loss:0.2442
epoch [31/100], loss:0.2440
epoch [32/100], loss:0.2441
epoch [33/100], loss:0.2440
epoch [34/100], loss:0.2442
epoch [35/100], loss:0.2439
epoch [36/100], loss:0.2437
e

In [44]:
# Name and version tag; together they form the checkpoint directory
# `{MDL_PATH}/{MDL_NAME}_{VER}` and the checkpoint file prefix.
MDL_NAME = 'autoencoder'
VER = 'test'

In [52]:
# Directory for this model name/version.
save_dir = f'{MDL_PATH}/{MDL_NAME}_{VER}'
# makedirs(exist_ok=True) also creates a missing MDL_PATH parent and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(save_dir, exist_ok=True)
# `epoch` is the last index left behind by the training loop, so the file name
# records how many epochs (minus one) the saved weights have seen.
save_path = f'{save_dir}/{MDL_NAME}_'+str(epoch)+'.pth'
torch.save(model.state_dict(),save_path)

In [57]:
# Force a garbage-collection pass; the cell output is the number of
# unreachable objects the collector found.
gc.collect()

20

## Predict Test 

In [105]:
th = 0.5  # decision threshold on the averaged probability
x_tt = test_df.loc[:, features].values
# Impute missing values in every feature column except the first with the
# precomputed feature means (f_mean).
if np.isnan(x_tt[:, 1:].sum()):
    x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean
X_test = torch.FloatTensor(x_tt).to(DEVICE)

# Inference must run in eval mode: otherwise Dropout stays active and
# BatchNorm normalises with the statistics of the test batch itself.
model.eval()
with torch.no_grad():  # no autograd graph needed for prediction
    pred = model(X_test).cpu().numpy()

# The model emits one probability per resp target (5 columns) while `action`
# is a single column: average across targets, then threshold.
# (Original note, translated: "what should be done with the 5 predicted values?")
pred_df['action'] = np.where(pred.mean(axis=1) >= th, 1, 0).astype(int)

In [62]:
# Show the checkpoint directory that is passed to the kaggle CLI below.
print(f'{MDL_PATH}/{MDL_NAME}_{VER}')

../models/autoencoder_test


In [68]:
# Write a dataset-metadata.json template into the checkpoint directory.
# The path is hard-coded and must match {MDL_PATH}/{MDL_NAME}_{VER}.
!kaggle datasets init -p ../models/autoencoder_test

Data package template written to: ../models/autoencoder_test/dataset-metadata.json


In [None]:

# Reference notes only: this cell is a bare string literal and has no effect.
# It appears to record the dataset-metadata.json contents used for the upload,
# plus the CLI message listing the accepted license names — TODO confirm.
"""
{
    "title": "Jane-Street",
    "id": "shinsei66/Jane-Street",
    "subtitle": "",
    "description": "",
    "isPrivate": true,
    "licenses": [
        {
            "name": "unknown" ##Invalid value for `license_name` (MIT), must be one of ['CC0-1.0', 'CC-BY-SA-4.0', 'GPL-2.0', 'ODbL-1.0', 'CC-BY-NC-SA-4.0', 'unknown', 'DbCL-1.0', 'CC-BY-SA-3.0', 'copyright-authors', 'other', 'reddit-api', 'world-bank']
1

        }
    ],
    "keywords": [],
    "collaborators": [],
    "data": [
        {
            "description": null,
            "name": "autoencoder_99.pth",
            "totalBytes": 848,
            "columns": []
        }
    ]
}
"""

In [72]:
# Upload the checkpoint directory as a new Kaggle dataset, using the
# dataset-metadata.json found in that directory.
!kaggle datasets create -p  ../models/autoencoder_test

Starting upload for file autoencoder_99.pth
100%|████████████████████████████████████████| 836k/836k [08:47<00:00, 1.62kB/s]
Upload successful: autoencoder_99.pth (836KB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/shinsei66/Jane-Street
