# Import

In [1]:
!nvidia-smi

Mon Jan 11 18:10:21 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 460.27.04    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3090    On   | 00000000:01:00.0 Off |                  N/A |
|  0%   57C    P8    32W / 350W |    325MiB / 24268MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [1]:
import glob
#import cupy as cp
import os
import gc
import sys
import time
import yaml
import argparse
import logging
import pandas as pd
import numpy as np
import torch
import torchvision
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch import nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
#from tqdm import tqdm
from torch.utils.data import DataLoader
print(torch.__version__)
import matplotlib.pyplot as plt
from numba import njit
%matplotlib inline
from janest_model import MLPNet , CustomDataset, train_model, autoencoder2
from utils import PurgedGroupTimeSeriesSplit, get_args

1.7.1+cu110


In [None]:
#%%writefile test.py
#print('hello world!')

## Parameter setting

In [2]:
# Notebook-wide configuration. Everything a reader might tune lives in this cell.
TRAINING = False        # when False, the "Data Split" cell builds the CV splitter and keeps the last fold
USE_FINETUNE = True     # NOTE(review): not referenced in the visible cells — confirm it is still needed
FOLDS = 5               # passed as n_splits to PurgedGroupTimeSeriesSplit
GROUP_GAP = 20          # passed as group_gap to PurgedGroupTimeSeriesSplit
SEED = 66               # NOTE(review): no seeding call is visible in this notebook — confirm where it is applied
INPUTPATH = '../../input'   # directory holding train.parquet, the cached .npy arrays and the example CSVs
NUM_EPOCH = 500         # presumably the training epoch budget; not used in the visible cells
BATCH_SIZE = 16384      # only used to derive `loop` in the evaluation cells
PATIANCE = 15           # presumably early-stopping patience; misspelled name kept because other cells may reference it
LR = 0.0001             # presumably the optimizer learning rate; not used in the visible cells
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)
# Checkpoints are looked up under f'{MDL_PATH}/{MDL_NAME}_{VER}/*.pth'.
MDL_PATH  = '../models'
MDL_NAME = 'autoencoder'
VER = 'early_stopping'

cuda


## Import Data 

In [23]:
%%time
# Read the training table and keep only the rows with date > 85.
train = pd.read_parquet(f'{INPUTPATH}/train.parquet')
train = train.query('date > 85').reset_index(drop=True)
print(train.shape)

# Fill missing values with the column means (taken before zero-weight rows are
# removed), then drop the rows whose weight is not positive.
train = train.fillna(train.mean())
train = train.query('weight > 0').reset_index(drop=True)

# `action` is 1 only when every response column exceeds the threshold.
train['action'] = (
    (train[['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']] > 0.00001)
    .all(axis=1)
    .astype('int')
)

features = [col for col in train.columns if 'feature' in col]

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

# Feature matrix, one binary target column per entry of `resp_cols`, and the
# per-feature means of every feature column except the first.
X = train[features].values
y = np.stack([(train[col] > 0.000001).astype('int') for col in resp_cols], axis=1)
f_mean = train[features[1:]].values.mean(axis=0)

#f_mean = np.load( f'{INPUTPATH}/f_mean.npy')
# Cached per-row arrays and the example test / submission files.
date = np.load(f'{INPUTPATH}/date.npy')
weight = np.load(f'{INPUTPATH}/weight.npy')
resp = np.load(f'{INPUTPATH}/resp.npy')
test_df = pd.read_csv(f'{INPUTPATH}/example_test.csv')
pred_df = pd.read_csv(f'{INPUTPATH}/example_sample_submission.csv')

(1571415, 139)
CPU times: user 9.97 s, sys: 5.06 s, total: 15 s
Wall time: 3.98 s


# Features

In [6]:
print(X.shape[-1])
print(y.shape[-1])

158
5


In [25]:
class autoencoder(nn.Module):
    '''
    Encoder/decoder stack followed by a small classification head.

    The forward pass chains, in order:
      encoder : BatchNorm1d -> Linear(input_size, 640) -> ReLU
      decoder : Dropout(0.2) -> Linear(640, input_size)
      head    : Linear(input_size, 320) -> BatchNorm1d -> Dropout(0.2)
                -> Linear(320, output_size) -> Sigmoid
    Only the head output is returned; the decoder output is an intermediate.

    Keyword arguments:
        input_size  (int): number of input features (required).
        output_size (int): number of sigmoid outputs (required).
        noise     (float): optional and currently unused, because the
                           GaussianNoise layer is disabled. It is still
                           accepted so existing calls keep working.

    Raises:
        KeyError: if `input_size` or `output_size` is not supplied.

    Example:
        model = autoencoder(input_size=X.shape[-1], output_size=y.shape[-1],
                            noise=0.1).to(DEVICE)
    '''
    def __init__(self, **kwargs):
        super().__init__()
        input_size = kwargs['input_size']
        output_size = kwargs['output_size']
        # `noise` is intentionally not read: no layer consumes it while
        # GaussianNoise stays commented out, so requiring it only caused a
        # KeyError for callers that omitted it.

        # Submodule names and their order are kept unchanged so that saved
        # state_dict checkpoints continue to load.
        self.encoder = nn.Sequential(
            nn.BatchNorm1d(input_size),
            #GaussianNoise(noise),
            nn.Linear(input_size, 640),
            nn.ReLU(True)
        )
        self.decoder = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(640, input_size)
        )
        self.hidden = nn.Linear(input_size, 320)
        self.bat = nn.BatchNorm1d(320)
        self.drop = nn.Dropout(0.2)
        self.hidden2 = nn.Linear(320, output_size)
        self.act = nn.Sigmoid()

    def forward(self, x):
        '''Return the sigmoid head output for a 2-D batch `x` with `input_size` columns.'''
        x = self.encoder(x)
        x = self.decoder(x)
        x = self.hidden(x)
        x = self.bat(x)
        x = self.drop(x)
        x = self.hidden2(x)
        x = self.act(x)
        return x

# Training

## Data Split

In [9]:
# Build the PurgedGroupTimeSeriesSplit splitter and iterate over every fold it yields.
# The loop body is intentionally empty: when the loop finishes, `tr` and `vl`
# still hold the index sets of the LAST fold (and `fold` its number). The
# evaluation cells below index X, y, date, weight and resp with `vl`.
# NOTE(review): nothing is assigned when TRAINING is True, so `vl` must then
# come from somewhere else — confirm.
if not TRAINING:   
    gkf =  PurgedGroupTimeSeriesSplit(n_splits = FOLDS,  group_gap = GROUP_GAP)
    for fold, (tr, vl) in enumerate(gkf.split(y, y, date)):
        pass

## Autoencoder

### CV 

In [26]:
model = autoencoder(input_size = X.shape[-1], output_size = y.shape[-1], noise=0.1).to(DEVICE)

In [27]:
model

autoencoder(
  (encoder): Sequential(
    (0): BatchNorm1d(130, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=130, out_features=640, bias=True)
    (2): ReLU(inplace=True)
  )
  (decoder): Sequential(
    (0): Dropout(p=0.2, inplace=False)
    (1): Linear(in_features=640, out_features=130, bias=True)
  )
  (hidden): Linear(in_features=130, out_features=320, bias=True)
  (bat): BatchNorm1d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop): Dropout(p=0.2, inplace=False)
  (hidden2): Linear(in_features=320, out_features=5, bias=True)
  (act): Sigmoid()
)

In [12]:
# Collect every checkpoint of the current run. glob returns paths in arbitrary
# order, so sort them: the load order (and therefore which weights are left in
# `model` after the evaluation loop) becomes deterministic.
model_list  = sorted(glob.glob(f'{MDL_PATH}/{MDL_NAME}_{VER}/*.pth'))
print(model_list)

['../models/autoencoder_early_stopping/autoencoder_254.pth']


In [29]:
THRESHOLD=0.5

In [11]:
@njit(fastmath = True)
def utility_score_numba(date, weight, resp, action):
    """Compute the utility score u = min(max(t, 0), 6) * sum(Pi).

    Pi is the per-date sum of weight * resp * action (via np.bincount on
    `date`), and t = sum(Pi) / sqrt(sum(Pi**2)) * sqrt(250 / len(Pi)).

    Args:
        date:   1-D array of non-negative integer date ids (bincount bins).
        weight: 1-D array of per-row weights, same length as `date`.
        resp:   1-D array of per-row returns, same length as `date`.
        action: 1-D array of 0/1 decisions, same length as `date`.

    Returns:
        The utility score as a float. Returns 0.0 when every daily P&L is
        zero (for example no action taken) or when the input is empty; the
        unguarded formula would divide by zero in those cases.
    """
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    sq_sum = np.sum(Pi ** 2)
    # sq_sum == 0 covers both "all daily P&L are zero" and "no days at all";
    # sum(Pi) is 0 there too, so the score is 0 by definition.
    if sq_sum == 0.0:
        return 0.0
    t = total / np.sqrt(sq_sum) * np.sqrt(250 / len(Pi))
    u = min(max(t, 0), 6) * total
    return u

#https://www.kaggle.com/gogo827jz/jane-street-super-fast-utility-score-function

In [40]:
len(weight_vl)

312613

In [36]:
len(vl)

312613

In [38]:
len(pred_all)

311296

In [37]:
len(action)

311296

In [41]:
# Score the hold-out fold `vl` with every checkpoint in `model_list`.
# (The former `loop` batch counter and the `pred_all` vstack branch were
# leftovers from a batched version and had no effect, so they are gone.)
x_tt = X[vl].copy()
# Column 0 is left untouched; NaNs in the remaining columns are replaced by the
# matching entry of `f_mean`.
nan_mask = np.isnan(x_tt[:, 1:])
if nan_mask.any():
    x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + nan_mask * f_mean

if len(model_list) == 0:
    raise RuntimeError('model_list is empty: no checkpoint files to evaluate')

X_test = torch.FloatTensor(x_tt).to(DEVICE)
pred = 0.0
with torch.no_grad():  # inference only: do not build the autograd graph
    for mdl in model_list:
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        load_weights = torch.load(mdl, map_location=DEVICE)
        model.load_state_dict(load_weights)
        model.eval()
        pred += model(X_test).cpu().numpy()
# Average instead of summing so THRESHOLD keeps its meaning for any number of
# checkpoints (identical to the old result when there is exactly one).
pred = pred / len(model_list)
pred_all = pred.copy()

action = np.where(pred_all[:,0] >= THRESHOLD, 1, 0).astype(int).copy()
if np.sum(action)>0:
    date_vl = date[vl].copy()
    weight_vl = weight[vl].copy()
    resp_vl = resp[vl].copy()
    # Reference score: act exactly on the rows whose first target column is positive.
    action_ans_vl = np.where(y[vl,0]> THRESHOLD, 1, 0).astype(int).copy()
    cv_score = utility_score_numba(date_vl , weight_vl , resp_vl , action)
    max_score = utility_score_numba(date_vl , weight_vl , resp_vl , action_ans_vl )
    print('CV score is {}, Max score is {}, return ration is {:.1f} '.format(cv_score, max_score, 100*(cv_score/max_score)))
else:
    # Previously this case printed nothing at all.
    print('No positive action predicted; utility score not computed.')

CV score is 2858.543108529276, Max score is 12894.874005197811, return ration is 22.2 


In [4]:
# Reload the cached arrays from INPUTPATH. This overwrites any f_mean, X, y,
# date, weight and resp already in memory (e.g. the ones derived from
# train.parquet in the "Import Data" cell).
# NOTE(review): presumably these files were saved with matching row order and
# length so that the `vl` indices stay valid — confirm.
f_mean = np.load( f'{INPUTPATH}/f_mean.npy')
X = np.load( f'{INPUTPATH}/X.npy')
y = np.load( f'{INPUTPATH}/y.npy')
date = np.load( f'{INPUTPATH}/date.npy')
weight = np.load( f'{INPUTPATH}/weight.npy' )
resp = np.load( f'{INPUTPATH}/resp.npy')

In [5]:
model = autoencoder2(input_size = X.shape[-1], output_size = y.shape[-1], noise=0.1).to(DEVICE)

In [6]:
model

autoencoder2(
  (hidden): Linear(in_features=316, out_features=640, bias=True)
  (bat): BatchNorm1d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop): Dropout(p=0.2, inplace=False)
  (hidden2): Linear(in_features=640, out_features=5, bias=True)
  (act): Sigmoid()
  (encoder): Sequential(
    (0): BatchNorm1d(158, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=158, out_features=640, bias=True)
    (2): ReLU(inplace=True)
  )
  (decoder): Sequential(
    (0): Dropout(p=0.2, inplace=False)
    (1): Linear(in_features=640, out_features=158, bias=True)
  )
  (layer): Sequential(
    (0): Linear(in_features=316, out_features=640, bias=True)
    (1): BatchNorm1d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=640, out_features=320, bias=True)
    (4): Dropout(p=0.2, inplace=False)
    (5): Linear(in_features=320, out_features=640, bia

In [7]:
# Re-point the checkpoint lookup at the 'ho_base_007' run (overrides the
# MDL_NAME / VER set in the configuration cell).
MDL_NAME = 'autoencoder'
VER = 'ho_base_007'
# glob returns paths in arbitrary order; sorting makes the load order, and the
# weights left in `model` after evaluation, deterministic.
model_list  = sorted(glob.glob(f'{MDL_PATH}/{MDL_NAME}_{VER}/*.pth'))
print(model_list)

['../models/autoencoder_ho_base_007/autoencoder_fold_ho_19.pth']


In [12]:

# NOTE(review): the printed module list of this model includes `(act): Sigmoid()`.
# If forward() applies it, outputs are never negative and `>= 0` selects every
# row — confirm that 0 is the intended threshold.
THRESHOLD = 0
# Score the hold-out fold `vl` with every checkpoint in `model_list`.
# (The former `loop` batch counter and the `pred_all` vstack branch were
# leftovers from a batched version and had no effect, so they are gone.)
x_tt = X[vl].copy()
# Column 0 is left untouched; NaNs in the remaining columns are replaced by the
# matching entry of `f_mean`.
nan_mask = np.isnan(x_tt[:, 1:])
if nan_mask.any():
    x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + nan_mask * f_mean

if len(model_list) == 0:
    raise RuntimeError('model_list is empty: no checkpoint files to evaluate')

X_test = torch.FloatTensor(x_tt).to(DEVICE)
pred = 0.0
with torch.no_grad():  # inference only: do not build the autograd graph
    for mdl in model_list:
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        load_weights = torch.load(mdl, map_location=DEVICE)
        model.load_state_dict(load_weights)
        model.eval()
        pred += model(X_test).cpu().numpy()
# Average instead of summing so THRESHOLD keeps its meaning for any number of
# checkpoints (identical to the old result when there is exactly one).
pred = pred / len(model_list)
pred_all = pred.copy()

action = np.where(pred_all[:,0] >= THRESHOLD, 1, 0).astype(int).copy()
if np.sum(action)>0:
    date_vl = date[vl].copy()
    weight_vl = weight[vl].copy()
    resp_vl = resp[vl].copy()
    # Reference score: act exactly on the rows whose first target column is positive.
    action_ans_vl = np.where(y[vl,0]> THRESHOLD, 1, 0).astype(int).copy()
    cv_score = utility_score_numba(date_vl , weight_vl , resp_vl , action)
    max_score = utility_score_numba(date_vl , weight_vl , resp_vl , action_ans_vl )
    print('CV score is {}, Max score is {}, return ration is {:.1f} '.format(cv_score, max_score, 100*(cv_score/max_score)))
else:
    # Previously this case printed nothing at all.
    print('No positive action predicted; utility score not computed.')

CV score is 504.43170307177337, Max score is 12893.32054803533, return ration is 3.9 


In [34]:
torch.cuda.empty_cache()

In [37]:
# Rebuild the per-row arrays straight from `train`, replacing whatever
# date / weight / resp were previously loaded from the .npy files.
date = train['date'].values
weight = train['weight'].values
resp = train['resp'].values
# Overwrite the `action` column with a simpler label: 1 wherever resp > 0.
train['action'] = (train['resp'] > 0).astype('int')
action_ans = train['action'].values

In [39]:
# Threshold the first prediction column at `th` and score the resulting actions.
# NOTE(review): `pred_all` must hold one row per element of date / weight / resp
# for these arrays to line up — confirm which cell produced it.
th=0.5
action = np.where(pred_all[:,0] >= th, 1, 0).astype(int).copy()
utility_score_numba(date, weight, resp, action)

3666.890810146739

In [40]:
utility_score_numba(date, weight, resp, action_ans)

173797.76047460194

In [41]:
gc.collect()

17884

## Predict Test 

In [3]:
print(f'{MDL_PATH}/{MDL_NAME}_{VER}')

../models/ae_cv_base


In [None]:
#!kaggle datasets init -p ../models/autoencoder_test

In [9]:
%%writefile ../models/ae_cv_base/dataset-metadata.json
{
    "title": "Jane-Street",
    "id": "shinsei66/Jane-Street",
    "subtitle": "",
    "description": "",
    "isPrivate": true,
    "licenses": [
        {
            "name": "unknown" 
        }
    ],
    "keywords": [],
    "collaborators": [],
    "data": [
        {
            "description": null,
            "name": "autoencoder_99.pth",
            "totalBytes": 848,
            "columns": []
        },
        {
            "description": null,
            "name": "autoencoder_254.pth",
            "totalBytes": 856,
            "columns": []
        },
        {
            "description": null,
            "name": "mlp_base_984.pth",
            "totalBytes": 1316,
            "columns": []
        },
         {
            "description": null,
            "name": "ae_fold_1_18.pth",
            "totalBytes": 840,
            "columns": []
        },
        {
            "description": null,
            "name": "ae_fold_2_428.pth",
            "totalBytes": 840,
            "columns": []
        },
        {
            "description": null,
            "name": "ae_fold_3_500.pth",
            "totalBytes": 840,
            "columns": []
        },
        {
            "description": null,
            "name": "ae_fold_4_199.pth",
            "totalBytes": 840,
            "columns": []
        },
        {
            "description": null,
            "name": "ae_fold_5_497.pth",
            "totalBytes": 840,
            "columns": []
        }
    ]
}

Writing ../models/ae_cv_base/dataset-metadata.json


In [None]:
#!kaggle datasets create -p  ../models/autoencoder_early_stopping

In [7]:
!du ../models/ae_cv_base/ -a

840	../models/ae_cv_base/ae_fold_5_497.pth
76	../models/ae_cv_base/ae_learning_history.csv
840	../models/ae_cv_base/ae_fold_2_428.pth
4	../models/ae_cv_base/.ipynb_checkpoints
840	../models/ae_cv_base/ae_fold_3_500.pth
840	../models/ae_cv_base/ae_fold_4_199.pth
840	../models/ae_cv_base/ae_fold_1_18.pth
4284	../models/ae_cv_base/


In [10]:
!kaggle datasets version -p  ../models/ae_cv_base -m "auto encoder 5 fold cv baseline"

Starting upload for file ae_fold_5_497.pth
100%|████████████████████████████████████████| 836k/836k [08:47<00:00, 1.62kB/s]
Upload successful: ae_fold_5_497.pth (836KB)
Starting upload for file ae_learning_history.csv
100%|████████████████████████████████████████| 73.9k/73.9k [08:46<00:00, 144B/s]
Upload successful: ae_learning_history.csv (74KB)
Starting upload for file ae_fold_2_428.pth
100%|████████████████████████████████████████| 836k/836k [08:47<00:00, 1.62kB/s]
Upload successful: ae_fold_2_428.pth (836KB)
Skipping folder: .ipynb_checkpoints; use '--dir-mode' to upload folders
Starting upload for file ae_fold_3_500.pth
100%|████████████████████████████████████████| 836k/836k [08:46<00:00, 1.63kB/s]
Upload successful: ae_fold_3_500.pth (836KB)
Starting upload for file ae_fold_4_199.pth
100%|████████████████████████████████████████| 836k/836k [08:45<00:00, 1.63kB/s]
Upload successful: ae_fold_4_199.pth (836KB)
Starting upload for file ae_fold_1_18.pth
100%|█████████████████████████