# Summary

### First Submission 
- 3×2 ensemble model (3 architectures × 2 training variants).
- SimpleNN, DenseNet, and CNN, each with feature neutralization.
- Fillna with 0 => FeatureNeutralization => NN.
- Fillna with 0 => FeatureNeutralization => DenseNet. 
- Fillna with mean => FeatureNeutralization => CNN.
- For each of these 3 models, I used the model trained on the last fold and the model trained on all data (excluding rows with weight = 0 and the first 85 days).

### Second Submission 
- 4×2 ensemble model 
- First submission + PyTorch model 
- The PyTorch model's architecture is the same as [this notebook](https://www.kaggle.com/a763337092/pytorch-resnet-starter-training), but it was trained with my CV strategy. 

### Scores 
I forgot the exact Public LB scores for these submissions...

|        | FirstSub | SecondSub |
| :---:  |   :---:  |   :---:   |
|PublicLB| 8000~9000| 8000~9000 |
|FirstRun| 4717.338 | 5100.367  |
|SecondRun|4950.418|5417.607|


### Training 
The training code is [here](). Only the DenseNet was trained in that notebook, but the other models were trained in almost the same way. 

In [None]:
import os,gc,pickle,requests,json
import numpy as np
from numba import njit 
import pandas as pd 
import matplotlib.pyplot as plt 

from hyperopt import hp,fmin,tpe,Trials 
from hyperopt.pyll.base import scope 

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score,roc_curve 
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler,QuantileTransformer 

from matplotlib.colors import ListedColormap 
from tqdm.notebook import tqdm 
from joblib import dump,load

import tensorflow_addons as tfa
import tensorflow as tf 
import tensorflow.keras.layers as L 
import tensorflow.keras.backend as B  
import tensorflow.keras.optimizers as O
import tensorflow.keras.activations as A
import tensorflow.keras.metrics as M 
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback,ReduceLROnPlateau,ModelCheckpoint,EarlyStopping

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Column names of the 130 input features: feature_0 ... feature_129.
FEATURES = ["feature_" + str(idx) for idx in range(130)]

# For Preprocess

In [None]:
class NeutralizeTransform:
    """Remove the linear effect of a set of exposures from feature columns.

    ``fit`` regresses every column of ``X`` on the exposure matrix ``y``
    (plus one constant column holding that column's mean) by least squares.
    ``transform`` evaluates each fitted regression at the *mean* training
    exposure and subtracts ``proportion`` times that value from the column,
    so no exposures are required at transform time.

    The attribute names (``proportion``, ``lms``, ``mean_exposure``,
    ``y_shape``) are kept unchanged because pickled instances rely on them.
    """

    def __init__(self,proportion=1.0):
        # Fraction of the fitted correction that ``transform`` subtracts.
        self.proportion = proportion

    def fit(self,X,y):
        """Fit one least-squares model per column of ``X``.

        Args:
            X: 2-D array, one column per feature to neutralise.
            y: 2-D array of exposures with the same number of rows as ``X``.
        """
        self.lms = []
        self.mean_exposure = np.mean(y,axis=0)
        self.y_shape = y.shape[-1]
        n_rows = len(y)
        for x in X.T:
            scores = x.reshape((-1,1))
            # Constant regressor whose value is the mean of this column.
            const = np.repeat(np.mean(scores), n_rows).reshape((-1,1))
            exposures = np.hstack((y, const))
            self.lms.append(np.linalg.lstsq(exposures, scores, rcond=None)[0])

    def transform(self,X,y=None):
        """Return ``X`` with the fitted correction removed from each column.

        Args:
            X: 2-D array with one column per fitted model, any number of rows.
            y: Ignored; accepted for API symmetry with ``fit``.

        Returns:
            Array of shape ``(n_rows, n_columns)``.
        """
        n_rows = len(X)
        # Repeat the mean exposure as whole rows.  Repeating the flat vector
        # element-wise and reshaping (as done previously) interleaves the
        # values and is only correct when there is a single row.
        mean_row = np.asarray(self.mean_exposure).reshape((1,self.y_shape))
        mean_exposures = np.repeat(mean_row, n_rows, axis=0)

        out = []
        for i,transform in enumerate(self.lms):
            x = X[:,i]
            const = np.repeat(np.mean(x), n_rows).reshape((-1,1))
            exposures = np.concatenate([mean_exposures, const],axis=1)
            correction = self.proportion * exposures.dot(transform)
            out.append(x - correction.ravel())
        return np.asarray(out).T

    def fit_transform(self,X,y):
        """Fit on ``(X, y)`` and return the transformed ``X``."""
        self.fit(X,y)
        return self.transform(X,y)

def create_autoencoder(input_dim,output_dim,hidden_units,dropout_rate,learning_rate,noise_ratio = 0.05):
    """Build the denoising autoencoder with an auxiliary label head.

    Args:
        input_dim: Number of input features.
        output_dim: Number of sigmoid outputs of the label head.
        hidden_units: ``[bottleneck_units, label_head_units]``.
        dropout_rate: ``[dropout_before_decoder, dropout_in_label_head]``.
        learning_rate: Adam learning rate.
        noise_ratio: Standard deviation of the Gaussian input noise.

    Returns:
        ``(autoencoder, encoder)``.  ``autoencoder`` outputs both the
        reconstruction and the label prediction; ``encoder`` maps the input
        to the ``"decoded"`` reconstruction layer (not to the bottleneck).
    """
    source = L.Input(input_dim)

    # Normalise, corrupt and squeeze the input through the bottleneck.
    hidden = L.BatchNormalization()(source)
    hidden = L.GaussianNoise(noise_ratio)(hidden)
    hidden = L.Dense(hidden_units[0], activation="relu")(hidden)

    # Reconstruct the input from the bottleneck.
    reconstruction = L.Dropout(dropout_rate[0])(hidden)
    reconstruction = L.Dense(input_dim, name="decoded")(reconstruction)

    # Auxiliary classifier on top of the reconstruction.
    label = L.Dense(hidden_units[1], activation="relu")(reconstruction)
    label = L.BatchNormalization()(label)
    label = L.Dropout(dropout_rate[1])(label)
    label = L.Dense(output_dim, activation="sigmoid", name="label_output")(label)

    encoder = Model(inputs=source, outputs=reconstruction)
    autoencoder = Model(inputs=source, outputs=[reconstruction, label])

    losses = {"decoded" : "mse","label_output":"binary_crossentropy"}
    autoencoder.compile(optimizer=O.Adam(learning_rate=learning_rate), loss=losses)
    return autoencoder, encoder

# Models

## Simple NN

In [None]:
def create_model(input_dim,output_dim,n_layers,hidden_units,dropout_rate,learning_rate,label_smoothing,encoder):
    """Build the plain feed-forward classifier.

    The encoder output is concatenated with the raw input, batch-normalised
    and passed through ``n_layers`` blocks of Dense -> BatchNorm -> ReLU ->
    Dropout, followed by a sigmoid output layer.

    Args:
        input_dim: Number of input features.
        output_dim: Number of sigmoid outputs.
        n_layers: Number of hidden blocks.
        hidden_units: Width of each hidden block (``n_layers`` entries used).
        dropout_rate: ``n_layers + 1`` rates; entry 0 is applied to the
            concatenated input, entry ``k + 1`` to hidden block ``k``.
        learning_rate: Adam learning rate.
        label_smoothing: Label smoothing of the binary cross-entropy loss.
        encoder: Keras model applied to the input tensor.

    Returns:
        The compiled Keras model.
    """
    source = L.Input(input_dim)

    encoded = encoder(source)
    net = L.Concatenate()([encoded, source])
    net = L.BatchNormalization()(net)
    net = L.Dropout(dropout_rate[0])(net)

    for depth in range(n_layers):
        net = L.Dense(hidden_units[depth])(net)
        net = L.BatchNormalization()(net)
        net = L.Lambda(A.relu)(net)
        net = L.Dropout(dropout_rate[depth + 1])(net)

    prediction = L.Dense(output_dim, activation='sigmoid')(net)
    classifier = Model(inputs=source, outputs=prediction)
    classifier.compile(
        optimizer=O.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[M.AUC(name='auc')],
    )
    return classifier

## DenseNet

In [None]:
def mish(x):
    """Apply the Mish activation, ``x * tanh(softplus(x))``, via a Lambda layer."""
    return tf.keras.layers.Lambda(lambda x: x*B.tanh(B.softplus(x)))(x)
# Register the activation under the name 'mish' so that layers can request it
# with ``activation='mish'`` (as create_densenet does).
tf.keras.utils.get_custom_objects().update({'mish': tf.keras.layers.Activation(mish)})

def create_densenet(input_dim,output_dim,n_layers,hidden_units,dropout_rate,learning_rate,label_smoothing,encoder):
    """Build the densely connected classifier.

    Every hidden block receives the concatenation of the stem and of all
    previous block outputs; the sigmoid head sees the concatenation of
    everything.

    Args:
        input_dim: Number of input features.
        output_dim: Number of sigmoid outputs.
        n_layers: Number of Dense('mish') -> BatchNorm -> Dropout blocks.
        hidden_units: Width of each block (``n_layers`` entries used).
        dropout_rate: Dropout rate of each block (``n_layers`` entries used).
        learning_rate: RectifiedAdam learning rate.
        label_smoothing: Label smoothing of the binary cross-entropy loss.
        encoder: Keras model applied to the input tensor.

    Returns:
        The compiled Keras model.
    """
    source = L.Input(input_dim)
    encoded = encoder(source)
    stem = L.Concatenate()([source, encoded])
    stem = L.BatchNormalization()(stem)

    collected = [stem]
    for depth in range(n_layers):
        # A single tensor cannot be concatenated, so the first block uses the stem directly.
        if len(collected) == 1:
            block = collected[0]
        else:
            block = L.Concatenate(axis=-1)(collected)
        block = L.Dense(hidden_units[depth], activation='mish')(block)
        block = L.BatchNormalization()(block)
        block = L.Dropout(dropout_rate[depth])(block)
        collected.append(block)

    head = L.Dense(output_dim, activation='sigmoid')
    prediction = head(L.Concatenate()(collected))
    network = Model(source, prediction)
    network.compile(
        tfa.optimizers.RectifiedAdam(learning_rate = learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[M.AUC(name='auc')],
    )
    return network

## CNN

In [None]:
def create_1dcnn(input_dim, output_dim,n_layers,hidden_units,dropout_rate,learning_rate,label_smoothing,encoder):
    """Build the 1-D convolutional classifier.

    The concatenated encoder output and raw input are projected to 1024
    units, reshaped to ``(32, 32)``, passed through one Conv1D + max-pooling
    stage, flattened, and fed to ``n_layers - 1`` dense blocks before the
    sigmoid head.

    Args:
        input_dim: Number of input features.
        output_dim: Number of sigmoid outputs.
        n_layers: One more than the number of dense blocks after the conv stage.
        hidden_units: Width of each dense block (``n_layers - 1`` entries used).
        dropout_rate: Dropout rate of each dense block (``n_layers - 1`` entries used).
        learning_rate: RectifiedAdam learning rate.
        label_smoothing: Label smoothing of the binary cross-entropy loss.
        encoder: Keras model applied to the input tensor.

    Returns:
        The compiled Keras model (RectifiedAdam wrapped in SWA).
    """
    source = L.Input(input_dim)

    encoded = encoder(source)
    net = L.Concatenate()([encoded, source])
    net = L.BatchNormalization()(net)

    # Project to 1024 values and view them as a 32-step, 32-channel sequence.
    net = L.Dense(1024, activation="relu")(net)
    net = L.Reshape((32, 32))(net)
    net = L.Conv1D(filters=16, kernel_size=7, strides=1, activation='relu')(net)
    net = L.MaxPooling1D(pool_size=2)(net)
    net = L.Flatten()(net)

    for depth in range(n_layers - 1):
        net = L.Dense(hidden_units[depth], activation='relu')(net)
        net = L.BatchNormalization()(net)
        net = L.Dropout(dropout_rate[depth])(net)

    prediction = L.Dense(output_dim, activation='sigmoid')(net)
    network = Model(inputs=source, outputs=prediction)

    # Rectified Adam with stochastic weight averaging on top.
    averaged_opt = tfa.optimizers.SWA(
        tfa.optimizers.RectifiedAdam(learning_rate=learning_rate)
    )
    network.compile(
        optimizer=averaged_opt,
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=[M.AUC(name = 'auc')],
    )
    return network

## Torch Model 

In [None]:
# Inputs of the PyTorch model: the 130 raw features plus two hand-made crosses.
all_feat_cols = [f"feature_{i}" for i in range(130)]
all_feat_cols += ["cross_41_42_43", "cross_1_2"]
# One logit is produced per target column.
target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4']


class torch_Model(nn.Module):
    """Four-stage MLP in which each stage sees the two preceding activations.

    Stage 1 reads the normalised input, stage 2 reads ``[input, stage 1]``,
    and stages 3-4 and the output layer read the two latest stage outputs.
    ``forward`` returns raw logits (no sigmoid).

    ``Relu``, ``PReLU`` and ``RReLU`` are not used by ``forward``; they are
    kept because ``PReLU`` owns a parameter that is part of the state dict.
    """

    def __init__(self):
        super().__init__()
        n_in = len(all_feat_cols)
        n_out = len(target_cols)
        width = 256
        p_drop = 0.2

        # Input normalisation.
        self.batch_norm0 = nn.BatchNorm1d(n_in)
        self.dropout0 = nn.Dropout(0.2)

        # Stage 1: input -> width.
        self.dense1 = nn.Linear(n_in, width)
        self.batch_norm1 = nn.BatchNorm1d(width)
        self.dropout1 = nn.Dropout(p_drop)

        # Stage 2: [input, stage 1] -> width.
        self.dense2 = nn.Linear(width + n_in, width)
        self.batch_norm2 = nn.BatchNorm1d(width)
        self.dropout2 = nn.Dropout(p_drop)

        # Stage 3: [stage 1, stage 2] -> width.
        self.dense3 = nn.Linear(2 * width, width)
        self.batch_norm3 = nn.BatchNorm1d(width)
        self.dropout3 = nn.Dropout(p_drop)

        # Stage 4: [stage 2, stage 3] -> width.
        self.dense4 = nn.Linear(2 * width, width)
        self.batch_norm4 = nn.BatchNorm1d(width)
        self.dropout4 = nn.Dropout(p_drop)

        # Output: [stage 3, stage 4] -> logits.
        self.dense5 = nn.Linear(2 * width, n_out)

        self.Relu = nn.ReLU(inplace=True)
        self.PReLU = nn.PReLU()
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.RReLU = nn.RReLU()

    def _stage(self, inp, dense, norm, drop):
        """Run one Linear -> BatchNorm -> LeakyReLU -> Dropout stage."""
        return drop(self.LeakyReLU(norm(dense(inp))))

    def forward(self, x):
        base = self.dropout0(self.batch_norm0(x))

        h1 = self._stage(base, self.dense1, self.batch_norm1, self.dropout1)
        h2 = self._stage(torch.cat([base, h1], 1),
                         self.dense2, self.batch_norm2, self.dropout2)
        h3 = self._stage(torch.cat([h1, h2], 1),
                         self.dense3, self.batch_norm3, self.dropout3)
        h4 = self._stage(torch.cat([h2, h3], 1),
                         self.dense4, self.batch_norm4, self.dropout4)

        return self.dense5(torch.cat([h3, h4], 1))

# For Load weights

In [None]:
def load_encoder(params,path):
    """Rebuild the autoencoder from ``params`` and load encoder weights from ``path``.

    Args:
        params: Mapping with ``hidden_units0/1``, ``dropout_rate0/1``,
            ``learning_rate`` and ``noise_ratio``.
        path: Weight file passed to ``encoder.load_weights``.

    Returns:
        The encoder model (130 inputs) with the weights loaded.
    """
    units = [params[f"hidden_units{k}"] for k in range(2)]
    rates = [params[f"dropout_rate{k}"] for k in range(2)]

    # Only the encoder half is needed; the full autoencoder is discarded.
    _, encoder = create_autoencoder(
        130,
        5,
        units,
        rates,
        params["learning_rate"],
        noise_ratio=params["noise_ratio"],
    )
    encoder.load_weights(path)
    return encoder

def load_nn_weights(params,encoder,path):
    """Rebuild the simple NN from ``params`` and load its weights from ``path``.

    Args:
        params: Mapping with ``n_layers``, ``hidden_units{k}``,
            ``dropout_rate{k}`` (``n_layers + 1`` entries),
            ``learning_rate`` and ``label_smoothing``.
        encoder: Encoder model embedded in the network.
        path: Weight file passed to ``model.load_weights``.

    Returns:
        The model (130 inputs, 5 outputs) with the weights loaded.
    """
    depth = params["n_layers"]
    units = [params[f"hidden_units{k}"] for k in range(depth)]
    rates = [params[f"dropout_rate{k}"] for k in range(depth + 1)]

    network = create_model(
        130,
        5,
        depth,
        units,
        rates,
        params["learning_rate"],
        params["label_smoothing"],
        encoder,
    )
    network.load_weights(path)
    return network

def load_densenet_weights(params,encoder,path):
    """Rebuild the DenseNet from ``params`` and load its weights from ``path``.

    Args:
        params: Mapping with ``n_layers``, ``hidden_units{k}``,
            ``dropout_rate{k}`` (``n_layers + 1`` entries are read),
            ``learning_rate`` and ``label_smoothing``.
        encoder: Encoder model embedded in the network.
        path: Weight file passed to ``model.load_weights``.

    Returns:
        The model (130 inputs, 5 outputs) with the weights loaded.
    """
    depth = params["n_layers"]
    units = [params[f"hidden_units{k}"] for k in range(depth)]
    rates = [params[f"dropout_rate{k}"] for k in range(depth + 1)]

    network = create_densenet(
        130,
        5,
        depth,
        units,
        rates,
        params["learning_rate"],
        params["label_smoothing"],
        encoder,
    )
    network.load_weights(path)
    return network

def load_cnn_weights(params,encoder,path):
    """Rebuild the 1-D CNN from ``params`` and load its weights from ``path``.

    Args:
        params: Mapping with ``n_layers``, ``hidden_units{k}``,
            ``dropout_rate{k}`` (``n_layers + 1`` entries are read),
            ``learning_rate`` and ``label_smoothing``.
        encoder: Encoder model embedded in the network.
        path: Weight file passed to ``model.load_weights``.

    Returns:
        The model (130 inputs, 5 outputs) with the weights loaded.
    """
    depth = params["n_layers"]
    units = [params[f"hidden_units{k}"] for k in range(depth)]
    rates = [params[f"dropout_rate{k}"] for k in range(depth + 1)]

    network = create_1dcnn(
        130,
        5,
        depth,
        units,
        rates,
        params["learning_rate"],
        params["label_smoothing"],
        encoder,
    )
    network.load_weights(path)
    return network

# Params

In [None]:
# Hyper-parameters of the autoencoder (consumed by load_encoder).
param_encode = dict(
    output_dim=5,
    seed=42,
    noise_ratio=0.05,
    epochs=200,
    batch_size=4096,
    hidden_units0=64,
    hidden_units1=32,
    learning_rate=0.001,
)
# Both dropout layers of the autoencoder share the same rate.
param_encode.update({f"dropout_rate{k}": 0.2 for k in range(2)})

# Hyper-parameters of the simple NN (consumed by load_nn_weights).
params_predict_nn = dict(
    output_dim=5,
    n_layers=5,
    learning_rate=0.001,
    label_smoothing=0.056961346402280545,
    seed=1983,
    epochs=500,
    batch_size=16382,
    tune_epochs=3,
    nfold=5,
)

# One width per hidden layer; one dropout rate for the input plus one per layer.
hidden = [256, 64, 64, 64, 64]
drop = [0.03228315981516344, 0.34046464288383715, 0.2476448175592981, 0.2, 0.2, 0.2]
params_predict_nn.update(
    {f"hidden_units{k}": hidden[k] for k in range(params_predict_nn["n_layers"])}
)
params_predict_nn.update(
    {f"dropout_rate{k}": drop[k] for k in range(params_predict_nn["n_layers"] + 1)}
)
    
# Hyper-parameters of the DenseNet (consumed by load_densenet_weights).
params_predict_best = dict(
    output_dim=5,
    n_layers=10,
    learning_rate=0.001,
    label_smoothing=0.056961346402280545,
    seed=20,
    epochs=1000,
    batch_size=4096,
    tune_epochs=3,
    nfold=5,
)

# The lists are longer than needed; only the first n_layers (+1) entries are used.
hidden = [64] * 20
drop = [0.1] + [0.2] * 20
params_predict_best.update(
    {f"hidden_units{k}": hidden[k] for k in range(params_predict_best["n_layers"])}
)
params_predict_best.update(
    {f"dropout_rate{k}": drop[k] for k in range(params_predict_best["n_layers"] + 1)}
)

# Hyper-parameters of the 1-D CNN (consumed by load_cnn_weights).
params_predict_cnn = dict(
    output_dim=5,
    n_layers=5,
    learning_rate=0.001,
    label_smoothing=0.056961346402280545,
    seed=2091,
    epochs=500,
    batch_size=4096,
    tune_epochs=3,
    nfold=5,
)

# Uniform width and dropout rate for every layer.
hidden = [64] * 5
drop = [0.2] * 6
params_predict_cnn.update(
    {f"hidden_units{k}": hidden[k] for k in range(params_predict_cnn["n_layers"])}
)
params_predict_cnn.update(
    {f"dropout_rate{k}": drop[k] for k in range(params_predict_cnn["n_layers"] + 1)}
)

# Prepare Submission

## Preprocess

In [None]:
# Fitted per-feature neutralisers (iterated by preprocess_f).
# NOTE(review): pickle executes arbitrary code on load -- only use this with
# the trusted competition dataset file.
# The context manager closes the file handle, which the bare open() leaked.
with open("../input/janestreetdata/NeutralizeTransform_025.pkl","rb") as nt_file:
    NTs = pickle.load(nt_file)

# Per-feature fill values for NaNs (used with fast_fillna).  A value of 0.5 is
# prepended for the first feature.
# NOTE(review): presumably the stored means cover feature_1..feature_129 -- confirm.
f_mean = np.load("../input/js-nn-models/f_mean.npy")
f_mean = np.concatenate(([0.5],f_mean))

In [None]:
def preprocess_f(x_tt):
    """Zero-fill NaNs and neutralise every feature column except the first.

    Column ``i + 1`` is passed through ``NTs[i].transform``; column 0 is only
    NaN-filled.  The input array is not modified (``np.nan_to_num`` copies).

    Args:
        x_tt: 2-D float array of shape ``(n_rows, 1 + len(NTs))``.

    Returns:
        A new array of the same shape.

    Raises:
        ValueError: If the number of columns does not match ``len(NTs) + 1``.
    """
    x_tt = np.nan_to_num(x_tt)
    if x_tt.shape[1] - 1 != len(NTs):
        raise ValueError(
            f"expected {len(NTs) + 1} columns, got {x_tt.shape[1]}"
        )
    # Write each transformed column back individually: stacking the ravelled
    # columns end-to-end (as before) only broadcasts correctly for one row.
    for i,nt in enumerate(NTs):
        x_tt[:,i+1] = nt.transform(x_tt[:,i+1].reshape(-1,1)).ravel()
    return x_tt
 
@njit
def fast_fillna(array, values):
    """Return ``array`` with its NaN entries replaced by the matching entries of ``values``.

    ``array`` is returned as-is when its sum is not NaN; otherwise a new
    array built with ``np.where`` is returned.
    """
    # The sum is NaN whenever any element is NaN, so this is a cheap presence check.
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array

## Models

In [None]:
# Load the four pre-trained encoders (same architecture, different weight files).
encoder5, encoder6, encoder3, encoder2 = (
    load_encoder(param_encode, weights_path)
    for weights_path in (
        "../input/nnmodels/encoder_weight5.hdf5",
        "../input/trial1/encoder6.hdf5",
        "../input/nnmodels/encoder_weight3.hdf5",
        "../input/nnmodels/encoder_weight2.hdf5",
    )
)
# The encoders are frozen.
for frozen in (encoder5, encoder6, encoder3, encoder2):
    frozen.trainable = False

In [None]:
# Models trained on all data (DenseNet, simple NN, CNN).
model1_1, model1_2, model2_3 = (
    load_densenet_weights(params_predict_best, encoder3, "../input/nnmodels/nn_trainall2.hdf5"),  # 8118.784
    load_nn_weights(params_predict_nn, encoder2, "../input/nnmodels/nn_trainall4.hdf5"),  # 5759.227
    load_cnn_weights(params_predict_cnn, encoder5, "../input/nnmodels/nn_trainall6.hdf5"),  # 6424.009
)

# Second checkpoint of each architecture, built on the same encoder instances.
model1_1_b, model1_2_b, model2_3_b = (
    load_densenet_weights(params_predict_best, encoder3, "../input/nnmodels/nn36ver2_4.hdf5"),  # 6834.318
    load_nn_weights(params_predict_nn, encoder2, "../input/nnmodels/nn50ver2_4.hdf5"),  # 6620.780
    load_cnn_weights(params_predict_cnn, encoder5, "../input/nnmodels/nn52_4.hdf5"),  # 7131.743
)

In [None]:
# Enable XLA JIT and wrap every Keras model's ``call`` in a tf.function with
# relaxed shapes.
tf.config.optimizer.set_jit(True)
(
    model_sub1_1,
    model_sub1_2,
    model_sub2_3,
    model_sub1_1_b,
    model_sub1_2_b,
    model_sub2_3_b,
) = (
    tf.function(keras_model.call, experimental_relax_shapes = True)
    for keras_model in (
        model1_1,
        model1_2,
        model2_3,
        model1_1_b,
        model1_2_b,
        model2_3_b,
    )
)

In [None]:
# Use the first GPU when one is present; otherwise fall back to the CPU so the
# cell does not crash on a CPU-only machine.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# PyTorch model trained on all data.
torch.cuda.empty_cache()
model_torch = torch_Model()
model_torch.to(device)
model_weights = "../input/pytorch-model/pytorch_trainall_42.pth" #6847.578
# map_location makes the checkpoint loadable regardless of the device it was saved on.
model_torch.load_state_dict(torch.load(model_weights, map_location=device))
model_torch.eval()

# Second PyTorch checkpoint.
torch.cuda.empty_cache()
model_torch_b = torch_Model()
model_torch_b.to(device)
model_weights = "../input/pytorch-model/online_model4.pth" #6549.154
model_torch_b.load_state_dict(torch.load(model_weights, map_location=device))
model_torch_b.eval()

# Submission

This is the second-submission version. 

In [None]:
import janestreet

env = janestreet.make_env()
th = 0.505  # decision threshold on the blended prediction
test_df_columns = ['weight'] + [f'feature_{i}' for i in range(130)] + ['date']
# Positions of the feature columns inside a raw test row.
index_features = [n for n,col in enumerate(test_df_columns) if col in FEATURES]
f = np.median  # collapses the outputs of one model into a single score
g = np.dot     # blends the per-model scores with ``weight``
# Blend weights in the order: DenseNet (all-data, second checkpoint),
# simple NN (x2), CNN (x2), PyTorch (x2).
# Per-architecture totals: 0.15 0.30 0.35 0.20
weight = np.array([0.10,0.05,0.17,0.13,0.20,0.15,0.13,0.07])
# weight = np.array([0.25,0.15,0.13,0.07,0.25,0.15]) first submission weight

for (test_df,pred_df) in tqdm(env.iter_test()):
    action = 0  # rows with zero weight are never traded
    if test_df['weight'].values[0] > 0:
        x_tt = test_df.values[0][index_features].reshape(1,-1)

        # Zero-filled + neutralised inputs for the DenseNet and the simple NN.
        x_tt_f = preprocess_f(x_tt)
        pred = [0]*8
        pred[0] = f(model_sub1_1(x_tt_f))
        pred[1] = f(model_sub1_1_b(x_tt_f))
        pred[2] = f(model_sub1_2(x_tt_f))
        pred[3] = f(model_sub1_2_b(x_tt_f))

        # Mean-filled raw inputs for the CNN and the PyTorch models.
        x_tt[0,:] = fast_fillna(x_tt[0,:],f_mean)
        pred[4] = f(model_sub2_3(x_tt))
        pred[5] = f(model_sub2_3_b(x_tt))

        # Extra cross features expected by the PyTorch models.
        cross_41_42_43 = x_tt[:, 41] + x_tt[:, 42] + x_tt[:, 43]
        cross_1_2 = x_tt[:, 1] / (x_tt[:, 2] + 1e-5)
        feature_inp = np.concatenate((
            x_tt,
            np.array(cross_41_42_43).reshape(x_tt.shape[0], 1),
            np.array(cross_1_2).reshape(x_tt.shape[0], 1),
        ), axis=1)

        # Inference only: skip autograd bookkeeping and build the input tensor once.
        with torch.no_grad():
            torch_inp = torch.tensor(feature_inp, dtype=torch.float).to(device)
            pred[6] = f(model_torch(torch_inp).sigmoid().cpu().numpy())
            pred[7] = f(model_torch_b(torch_inp).sigmoid().cpu().numpy())

        action = int(g(pred,weight) >= th)

    # Single assignment path for both branches.  Writing through
    # ``pred_df["action"].values[0]`` is not reliable when pandas
    # Copy-on-Write is active.
    pred_df.action = action
    env.predict(pred_df)