Over the last seven days, I have been reading notebooks from this competition and gradually built up my own ideas for solving the problem. To give myself more room to experiment, I translated the currently popular model code into PyTorch. For clarity, I have also added some explanations.

In these notebooks, some people inspired me a lot. Here are some of their notebooks:
* https://www.kaggle.com/snippsy/bottleneck-encoder-mlp-keras-tuner/log#Submission
* https://www.kaggle.com/gogo827jz/jane-street-super-fast-utility-score-function
* https://www.kaggle.com/aimind/bottleneck-encoder-mlp-keras-tuner-8601c5
* https://www.kaggle.com/marketneutral/purged-time-series-cv-xgboost-optuna#Optuna-Hyperparam-Search-for-XGBoost


The modifications are:

* combine slanted triangular learning rates, discriminative fine-tuning, gradient accumulation, gradual unfreezing and other methods to prevent catastrophic forgetting (after loading the trained encoder weights).
* save running memory through type conversion.
* use forward filling to deal with missing values, which is more in line with the characteristics of time series.
* fusion of Encoded MLP model and XGBoost model.
* calculate the optimal weights of the two models.
* assign an optimal threshold to the predicted probability of the fused model for classification tasks.
* the Optuna framework was used to optimize the hyperparameters of the model, such as the number of layers in the model and the number of neurons in the hidden layer.



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Environment configuration

In [None]:
import os
import sys
import numpy as np
import pandas as pd
# Lift pandas' display limits so printed frames show every row and every column.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import random
import math

import datatable as dtable
from numba import njit

import warnings
# Suppress every warning from here on (this also hides deprecation notices).
warnings.filterwarnings('ignore')


import torch as t
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AdamW,get_linear_schedule_with_warmup

from tqdm import tqdm_notebook as tqdm
from tqdm import trange

from  collections import OrderedDict
import optuna
import joblib

import scipy.stats

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import xgboost as xgb

# Report the device PyTorch can use: CUDA when available, otherwise the CPU.
print(t.device('cuda') if t.cuda.is_available() else t.device('cpu'))

# Class definition

## BottleNeck Classifier

With the help of supervised learning, we obtain an encoder that captures information useful for classification. Its role is somewhat similar to that of word embeddings in natural language processing. Because the encoder is also trained as part of a denoising autoencoder, it can be regarded as a feature dimensionality-reduction step. For more details about the model, please see [this article](https://www.semanticscholar.org/paper/Deep-Bottleneck-Classifiers-in-Supervised-Dimension-Parviainen/fb86483f7573f6430fe4597432b0cd3e34b16e43).

In [None]:
class BottleNeck_Classifier(nn.Module):
    """Supervised bottleneck autoencoder whose layout is sampled by Optuna.

    Three sub-networks are built from the trial's suggestions:

    * ``encoder``    -- a BatchNorm layer followed by 1-3 blocks of
      Linear / Dropout / PReLU / BatchNorm (64-128 units each).
    * ``decoder``    -- mirrors the encoder widths back to ``INPUT_SIZE``;
      its last Linear layer is left bare (no dropout, activation or norm).
    * ``classifier`` -- 1-3 blocks of 32-64 units fed with the reconstruction,
      ending in a Linear layer with ``OUTPUT_SIZE`` outputs.

    ``INPUT_SIZE`` and ``OUTPUT_SIZE`` are module-level constants that must be
    defined before the class is instantiated.

    Layer names (``bn``, ``linear_{i}``, ``drop_{i}``, ``ac_{i}``, ``nb_{i}``)
    end up in the state dict keys, and the Optuna parameter names are stored
    in the study, so both are kept stable.
    """

    def __init__(self,trial):
        """Sample the architecture from ``trial`` and build the three sub-networks.

        ``trial`` only needs to provide ``suggest_int`` and ``suggest_float``.
        """
        super(BottleNeck_Classifier,self).__init__()

        num_encoder_layers = trial.suggest_int('num_encoder_layers',1,3)
        num_classifier_layers = trial.suggest_int('num_classifier_layers',1,3)

        # ---- encoder -----------------------------------------------------
        en_dict = OrderedDict()
        en_dict['bn'] = nn.BatchNorm1d(INPUT_SIZE)

        # Widths and dropout rates are remembered so the decoder can mirror them.
        en_hidden_size = [INPUT_SIZE]
        en_dropout_rate = []
        hidden_size_0 = INPUT_SIZE
        for i in range(num_encoder_layers):

            hidden_size_1 = trial.suggest_int('hidden_size_en_{}'.format(i),64,128)
            dropout_1 = trial.suggest_float('dropout_en_{}'.format(i),0.2,0.5)

            en_hidden_size.append(hidden_size_1)
            en_dropout_rate.append(dropout_1)

            en_dict['linear_{}'.format(i)] = nn.Linear(hidden_size_0,hidden_size_1)
            en_dict['drop_{}'.format(i)] = nn.Dropout(dropout_1)
            en_dict['ac_{}'.format(i)] = nn.PReLU()
            en_dict['nb_{}'.format(i)] = nn.BatchNorm1d(hidden_size_1)

            hidden_size_0 = hidden_size_1

        # ---- decoder (encoder widths popped in reverse order) -------------
        de_dict = OrderedDict()

        hidden_size_0 = en_hidden_size.pop()
        for i in range(num_encoder_layers):

            hidden_size_1 = en_hidden_size.pop()
            dropout_1 = en_dropout_rate.pop()

            de_dict['linear_{}'.format(i)] = nn.Linear(hidden_size_0,hidden_size_1)
            # The final decoder layer outputs the reconstruction directly.
            if i != num_encoder_layers-1:
                de_dict['drop_{}'.format(i)] = nn.Dropout(dropout_1)
                de_dict['ac_{}'.format(i)] = nn.PReLU()
                de_dict['nb_{}'.format(i)] = nn.BatchNorm1d(hidden_size_1)

            hidden_size_0 = hidden_size_1

        # ---- classifier (consumes the INPUT_SIZE-wide reconstruction) -----
        cf_dict = OrderedDict()

        hidden_size_0 = INPUT_SIZE
        for i in range(num_classifier_layers):

            hidden_size_1 = trial.suggest_int('hidden_size_cf_{}'.format(i),32,64)
            dropout_1 = trial.suggest_float('dropout_cf_{}'.format(i),0.2,0.5)

            cf_dict['linear_{}'.format(i)] = nn.Linear(hidden_size_0,hidden_size_1)
            cf_dict['drop_{}'.format(i)] = nn.Dropout(dropout_1)
            cf_dict['ac_{}'.format(i)] = nn.PReLU()
            cf_dict['nb_{}'.format(i)] = nn.BatchNorm1d(hidden_size_1)

            hidden_size_0 = hidden_size_1

        cf_dict['output'] = nn.Linear(hidden_size_0,OUTPUT_SIZE)

        self.encoder = nn.Sequential(en_dict)

        self.decoder = nn.Sequential(de_dict)

        self.classifier = nn.Sequential(cf_dict)

    def forward(self,X,Y=None):
        """Run encoder -> decoder -> classifier.

        Args:
            X: float tensor of features; assumed to be (batch, INPUT_SIZE).
            Y: optional long tensor of targets in the layout expected by
               ``nn.MultiLabelMarginLoss``.

        Returns:
            The softmax output of the classifier when ``Y`` is None, otherwise
            the scalar training loss ``0.25 * MSE(decoded, X) + 10 * margin``.
        """
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)
        class_res = self.classifier(decoded)
        # dim is given explicitly: the implicit choice is deprecated and emits a
        # warning; dim=1 is what it resolved to for (batch, classes) input.
        logits = F.softmax(class_res, dim=1)

        # Identity test: comparing a tensor with `!=` is an element-wise operator.
        if Y is None:
            return logits

        loss_fct_1 = nn.MSELoss()
        loss_1 = loss_fct_1(decoded,X)

        loss_fct_2 = nn.MultiLabelMarginLoss()
        loss_2 = loss_fct_2(logits,Y)

        loss = 0.25 * loss_1 + 10 * loss_2

        return loss
    

## Encoded MultiLayer Perceptron

After the encoder above has been trained, it is attached to a classifier to complete the classification task. Here we choose a multilayer perceptron; the combined model is one of our two final models (the other is an XGBoost model).

In [None]:
class Encoded_MLP(nn.Module):
    """Pre-trained encoder followed by an Optuna-sampled MLP head.

    The encoder layout is rebuilt from the best trial stored in
    ``./study_encoder_seed_{SEED}.pkl`` and its weights are restored from
    ``./best_encoder_seed_{SEED}.bin``. The MLP receives the raw features
    concatenated with the encoder output.

    ``INPUT_SIZE``, ``OUTPUT_SIZE`` and ``SEED`` are module-level constants
    that must be defined before the class is instantiated.
    """

    def __init__(self,trial):
        """Rebuild the encoder from disk and sample the MLP head from ``trial``."""
        super(Encoded_MLP,self).__init__()

        num_mlp_layers = trial.suggest_int('num_mlp_layers',1,3)

        # NOTE(review): joblib.load unpickles the file -- only load studies you created yourself.
        # Alternative location when running from a Kaggle dataset:
        #   '../input/mlp-encoded/study_encoder_seed_{}.pkl'.format(SEED)
        en_study = joblib.load('./study_encoder_seed_{}.pkl'.format(SEED))
        en_params = en_study.best_trial.params

        # ---- encoder: same layer names as in the saved checkpoint ---------
        en_dict = OrderedDict()
        en_dict['bn'] = nn.BatchNorm1d(INPUT_SIZE)

        hidden_size_0 = INPUT_SIZE
        for i in range(en_params['num_encoder_layers']):

            hidden_size_1 = en_params['hidden_size_en_{}'.format(i)]
            dropout_1 = en_params['dropout_en_{}'.format(i)]

            en_dict['linear_{}'.format(i)] = nn.Linear(hidden_size_0,hidden_size_1)
            en_dict['drop_{}'.format(i)] = nn.Dropout(dropout_1)
            en_dict['ac_{}'.format(i)] = nn.PReLU()
            en_dict['nb_{}'.format(i)] = nn.BatchNorm1d(hidden_size_1)

            hidden_size_0 = hidden_size_1

        # ---- MLP head: input is [raw features, encoded features] ----------
        mlp_dict = OrderedDict()

        hidden_size_0 = INPUT_SIZE + hidden_size_0
        for i in range(num_mlp_layers):

            hidden_size_1 = trial.suggest_int('hidden_size_mlp_{}'.format(i),32,64)
            dropout_1 = trial.suggest_float('dropout_mlp_{}'.format(i),0.2,0.5)

            mlp_dict['linear_{}'.format(i)] = nn.Linear(hidden_size_0,hidden_size_1)
            mlp_dict['drop_{}'.format(i)] = nn.Dropout(dropout_1)
            mlp_dict['ac_{}'.format(i)] = nn.PReLU()
            mlp_dict['nb_{}'.format(i)] = nn.BatchNorm1d(hidden_size_1)

            hidden_size_0 = hidden_size_1

        mlp_dict['output'] = nn.Linear(hidden_size_0,OUTPUT_SIZE)

        self.encoder = nn.Sequential(en_dict)
        self.mlp = nn.Sequential(mlp_dict)

        # map_location keeps loading working on a CPU-only host even when the
        # checkpoint was written from a GPU; the model is moved to its device later.
        # Alternative location when running from a Kaggle dataset:
        #   '../input/mlp-encoded/best_encoder_fold_4.bin'
        checkpoint = t.load('./best_encoder_seed_{}.bin'.format(SEED), map_location='cpu')
        self.encoder.load_state_dict(checkpoint['state_dict'])

    def forward(self,X,Y=None,encoder=None):
        """Classify ``X``; also return the loss when targets are given.

        Args:
            X: float tensor of features; assumed to be (batch, INPUT_SIZE).
            Y: optional long tensor of targets in the layout expected by
               ``nn.MultiLabelMarginLoss``.
            encoder: unused; kept so existing callers keep working.

        Returns:
            The softmax output when ``Y`` is None, otherwise the tuple
            ``(softmax output, loss)``.
        """
        x = self.encoder(X)
        x = t.cat((X,x),1)
        x = self.mlp(x)
        # dim is given explicitly: the implicit choice is deprecated and emits a
        # warning; dim=1 is what it resolved to for (batch, classes) input.
        logits = F.softmax(x, dim=1)

        # Identity test: comparing a tensor with `!=` is an element-wise operator.
        if Y is not None:
            loss_fct = nn.MultiLabelMarginLoss()
            loss = loss_fct(logits,Y)
            return logits,loss

        return logits

## DataSet

Dataset classes that turn rows of the feature/target arrays into tensors, so that a `DataLoader` can batch them before they are moved to the chosen device (CPU/GPU) for training.

In [None]:
class Train_Val_Dataset:
    """Map-style dataset pairing each feature row with its target row.

    ``features`` and ``targets`` must support ``[idx, :]`` indexing and
    ``features`` must expose ``shape`` (e.g. 2-D NumPy arrays).
    """

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        # One sample per feature row.
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``{'X': float tensor, 'Y': long tensor}`` for row ``idx``."""
        feature_row = self.features[idx, :]
        target_row = self.targets[idx, :]
        return {
            'X': t.tensor(feature_row, dtype=t.float),
            'Y': t.tensor(target_row, dtype=t.long),
        }
    
class Test_Dataset:
    """Map-style dataset that serves feature rows only (no targets).

    ``features`` must support ``[idx, :]`` indexing and expose ``shape``
    (e.g. a 2-D NumPy array).
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        # One sample per feature row.
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return ``{'X': float tensor}`` for row ``idx``."""
        feature_row = self.features[idx, :]
        return {'X': t.tensor(feature_row, dtype=t.float)}

# Function implementation

In [None]:
# gets the optimizer for the BottleNeck Classifier Model.

def Get_BC_Optimizer(model):
    """Build the AdamW optimizer for a BottleNeck Classifier model.

    Reads the module-level settings ``LEARN_RATE``, ``DFT_RATE`` and
    ``DISCRIMINATE``.

    * ``DISCRIMINATE`` true: discriminative fine-tuning. Parameters are walked
      in definition order and every tensor gets its own group with learning
      rate ``lr / dft_rate ** (layer_num - depth)``; ``depth`` grows by one
      each time a (decayed tensor, non-decayed tensor) pair -- or a lone
      decayed tensor -- is closed, so later layers train faster.
    * otherwise: two groups sharing ``lr``, one with weight decay 0.01 and one
      without.

    Tensors whose name contains ``'bias'`` or ``'nb'`` get no weight decay
    (``'nb'`` matches layers named ``nb_{i}``).

    Args:
        model: module exposing ``encoder``, ``decoder`` and ``classifier``.

    Returns:
        The configured ``AdamW`` optimizer.
    """
    lr = LEARN_RATE
    dft_rate = DFT_RATE

    # Defined before the branch: it used to live inside the DISCRIMINATE branch
    # only, so the non-discriminative branch raised NameError.
    no_decay = ['bias', 'nb']

    if DISCRIMINATE:
        # Roughly one "layer" per weight/bias pair.
        encoder_layer_num = math.ceil(len(list(model.encoder.named_parameters()))/2)
        decoder_layer_num = math.ceil(len(list(model.decoder.named_parameters()))/2)
        classifier_layer_num = math.ceil(len(list(model.classifier.named_parameters()))/2)

        layer_num = encoder_layer_num + decoder_layer_num + classifier_layer_num

        optimizer_grouped_parameters = []
        depth = 0                               # number of closed layers so far
        pending_decay = None                    # decayed group waiting for its partner
        closed_lr = lr/(dft_rate**layer_num)    # lr of the most recently closed layer

        for n,p in model.named_parameters():
            is_no_decay = any(nd in n for nd in no_decay)

            if not is_no_decay:
                if pending_decay is not None:
                    # Two decayed tensors in a row: close the first one alone.
                    optimizer_grouped_parameters.append(pending_decay)
                    closed_lr = pending_decay['lr']
                    depth = min(depth + 1, layer_num)
                pending_decay = {
                    'params': [p],
                    'weight_decay': 0.01,
                    'lr': lr/(dft_rate**(layer_num-depth))
                }
            elif pending_decay is not None:
                # Usual case: the non-decayed partner closes the layer.
                nodecay = {
                    'params': [p],
                    'weight_decay': 0.0,
                    'lr': pending_decay['lr']
                }
                optimizer_grouped_parameters.append(pending_decay)
                optimizer_grouped_parameters.append(nodecay)
                closed_lr = pending_decay['lr']
                pending_decay = None
                depth = min(depth + 1, layer_num)
            else:
                # A second non-decayed tensor right after a closed layer (for
                # example the bias following a matched BatchNorm weight). It used
                # to be skipped and was therefore never optimised; attach it to
                # the layer that was just closed.
                optimizer_grouped_parameters.append({
                    'params': [p],
                    'weight_decay': 0.0,
                    'lr': closed_lr
                })

        # A trailing decayed tensor without a partner must not be lost either.
        if pending_decay is not None:
            optimizer_grouped_parameters.append(pending_decay)

    else:
        param_optimizer = list(model.named_parameters())

        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
          ]

    optimizer = AdamW(optimizer_grouped_parameters,lr=lr, eps=1e-8)
    return optimizer

In [None]:
# a method designed to optimize parameters using Optuna for the BottleNeck Classifier Model.

def Objective_BC(trial):
    """Optuna objective for the BottleNeck Classifier.

    Builds one model from ``trial`` and trains it fold after fold on the
    purged group time-series splits (the same model instance is carried over
    from one fold to the next). After every epoch the validation loss is
    measured; whenever it improves, the encoder weights are written to
    ``./best_encoder_fold_{fold}.bin``.

    Relies on module-level data and settings defined elsewhere: ``data_X``,
    ``data_Y``, ``groups``, ``FOLDS``, ``MAX_TRAIN_GROUP_SIZE``, ``GROUP_GAP``,
    ``MAX_TEST_GROUP_SIZE``, ``TRAIN_BATCH_SIZE``, ``EVAL_BATCH_SIZE``,
    ``NUM_TRAIN_EPOCHS``, ``LOCAL_RANK`` and ``NO_CUDA``.

    Returns:
        The mean over folds of the best validation loss (lower is better).
    """
    if LOCAL_RANK == -1 or NO_CUDA:
        device = t.device("cuda" if t.cuda.is_available() and not NO_CUDA else "cpu")
    else:
        t.cuda.set_device(LOCAL_RANK)
        device = t.device("cuda",LOCAL_RANK)
        # The objective runs once per trial; initialising the default process
        # group a second time raises, so only do it when it is not set up yet.
        if not t.distributed.is_initialized():
            t.distributed.init_process_group(backend='nccl')

    model = BottleNeck_Classifier(trial)

    print('BC:')
    for p in model.named_parameters():
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    valid_loss_avg_batchs_best_list = []

    gkf = PurgedGroupTimeSeriesSplit(
        n_splits=FOLDS,
        max_train_group_size=MAX_TRAIN_GROUP_SIZE,
        group_gap=GROUP_GAP,
        max_test_group_size=MAX_TEST_GROUP_SIZE
    )
    splits = list(gkf.split(data_Y, groups=groups))

    for fold,(train_ids,val_ids) in enumerate(splits):

        train_X, train_Y = data_X[train_ids], data_Y[train_ids]
        val_X, val_Y = data_X[val_ids], data_Y[val_ids]

        # shuffle=False keeps the time order of the samples.
        dataset_T = Train_Val_Dataset(train_X, train_Y)
        dataloader_T = DataLoader(dataset_T,batch_size=TRAIN_BATCH_SIZE,shuffle=False,drop_last=True)

        dataset_V = Train_Val_Dataset(val_X, val_Y)
        dataloader_V = DataLoader(dataset_V,batch_size=EVAL_BATCH_SIZE,shuffle=False,drop_last=True)

        optimizer = Get_BC_Optimizer(model)
        # The optimizer and scheduler step once per training batch here, so the
        # schedule length is the number of batches of THIS fold times the number
        # of epochs. (It used to be derived from the full data set and divided by
        # the accumulation factor, which this loop does not use, so the learning
        # rate did not decay over the intended horizon.)
        num_train_optimization_steps = len(dataloader_T) * NUM_TRAIN_EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = num_train_optimization_steps
                                                )
        model.to(device)

        best_encoder_path = './best_encoder_fold_{}.bin'.format(fold)
        valid_loss_avg_batchs_best = np.inf

        for epoch in trange(NUM_TRAIN_EPOCHS,desc='epoch'):

            # ---- training pass -------------------------------------------
            model.train()

            train_loss_avg_batchs = 0

            for batch in tqdm(dataloader_T,desc='Iteration'):

                X , Y = batch['X'].to(device) , batch['Y'].to(device)

                loss = model(X,Y)

                train_loss_avg_batchs = train_loss_avg_batchs + loss.item()

                loss.backward()

                optimizer.step()
                scheduler.step()

                model.zero_grad()

            # max(..., 1): drop_last=True can leave a small fold without any batch.
            train_loss_avg_batchs = train_loss_avg_batchs/max(len(dataloader_T), 1)
            print('BC -- epoch {} train_loss_avg_batchs: {}'.format(epoch,train_loss_avg_batchs))

            # ---- validation pass -----------------------------------------
            model.eval()

            val_loss_avg_batchs = 0

            for batch in tqdm(dataloader_V,desc='Iteration'):

                X , Y = batch['X'].to(device) , batch['Y'].to(device)

                with t.no_grad():
                    loss = model(X,Y)

                val_loss_avg_batchs = val_loss_avg_batchs + loss.item()

            val_loss_avg_batchs = val_loss_avg_batchs/max(len(dataloader_V), 1)
            print('BC -- epoch {} val_loss_avg_batchs: {}'.format(epoch,val_loss_avg_batchs))

            # ---- keep the encoder of the best epoch ----------------------
            if valid_loss_avg_batchs_best > val_loss_avg_batchs:
                try:
                    os.remove(best_encoder_path)
                except FileNotFoundError:
                    # Nothing saved yet for this fold.
                    pass
                t.save({'state_dict': model.encoder.state_dict()},best_encoder_path)
                valid_loss_avg_batchs_best = val_loss_avg_batchs

        print('BC -- valid_loss_avg_batchs_best_{} : {}'.format(fold,valid_loss_avg_batchs_best))

        valid_loss_avg_batchs_best_list.append(valid_loss_avg_batchs_best)

    return np.mean(valid_loss_avg_batchs_best_list)

In [None]:
# gets the optimizer for the Encoded MLP Model.

def Get_Encoded_MLP_Optimizer(model):
    """Build the AdamW optimizer for an Encoded MLP model.

    Reads the module-level settings ``LEARN_RATE``, ``DFT_RATE`` and
    ``DISCRIMINATE``.

    * ``DISCRIMINATE`` true: discriminative fine-tuning. Parameters are walked
      in definition order and every tensor gets its own group with learning
      rate ``lr / dft_rate ** (layer_num - depth)``; ``depth`` grows by one
      each time a (decayed tensor, non-decayed tensor) pair -- or a lone
      decayed tensor -- is closed, so the pre-trained encoder moves more
      slowly than the new MLP head.
    * otherwise: two groups sharing ``lr``, one with weight decay 0.01 and one
      without.

    Tensors whose name contains ``'bias'`` or ``'nb'`` get no weight decay
    (``'nb'`` matches layers named ``nb_{i}``).

    Args:
        model: module exposing ``encoder`` and ``mlp``.

    Returns:
        The configured ``AdamW`` optimizer.
    """
    lr = LEARN_RATE
    dft_rate = DFT_RATE

    # Defined before the branch: it used to live inside the DISCRIMINATE branch
    # only, so the non-discriminative branch raised NameError.
    no_decay = ['bias', 'nb']

    if DISCRIMINATE:
        # Roughly one "layer" per weight/bias pair.
        encoder_layer_num = math.ceil(len(list(model.encoder.named_parameters()))/2)
        mlp_layer_num = math.ceil(len(list(model.mlp.named_parameters()))/2)

        layer_num = encoder_layer_num + mlp_layer_num

        optimizer_grouped_parameters = []
        depth = 0                               # number of closed layers so far
        pending_decay = None                    # decayed group waiting for its partner
        closed_lr = lr/(dft_rate**layer_num)    # lr of the most recently closed layer

        for n,p in model.named_parameters():
            is_no_decay = any(nd in n for nd in no_decay)

            if not is_no_decay:
                if pending_decay is not None:
                    # Two decayed tensors in a row: close the first one alone.
                    optimizer_grouped_parameters.append(pending_decay)
                    closed_lr = pending_decay['lr']
                    depth = min(depth + 1, layer_num)
                pending_decay = {
                    'params': [p],
                    'weight_decay': 0.01,
                    'lr': lr/(dft_rate**(layer_num-depth))
                }
            elif pending_decay is not None:
                # Usual case: the non-decayed partner closes the layer.
                nodecay = {
                    'params': [p],
                    'weight_decay': 0.0,
                    'lr': pending_decay['lr']
                }
                optimizer_grouped_parameters.append(pending_decay)
                optimizer_grouped_parameters.append(nodecay)
                closed_lr = pending_decay['lr']
                pending_decay = None
                depth = min(depth + 1, layer_num)
            else:
                # A second non-decayed tensor right after a closed layer (for
                # example the bias following a matched BatchNorm weight). It used
                # to be skipped and was therefore never optimised; attach it to
                # the layer that was just closed.
                optimizer_grouped_parameters.append({
                    'params': [p],
                    'weight_decay': 0.0,
                    'lr': closed_lr
                })

        # A trailing decayed tensor without a partner must not be lost either.
        if pending_decay is not None:
            optimizer_grouped_parameters.append(pending_decay)

    else:
        param_optimizer = list(model.named_parameters())

        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
          ]

    optimizer = AdamW(optimizer_grouped_parameters,lr=lr, eps=1e-8)
    return optimizer

In [None]:
# a method designed to optimize parameters using Optuna for the Encoded MLP Model.

def Objective_Encoded_MLP(trial,is_oof=False):
    """Optuna objective for the Encoded MLP, optionally producing OOF predictions.

    Builds one model from ``trial`` and trains it fold after fold on the
    purged group time-series splits, using gradient accumulation and, when
    ``IS_GRADUAL_UNFREEZE`` is set, unfreezing one parameter tensor (last
    tensor first) roughly three times per epoch. Whenever the validation loss
    improves, the whole model is saved to ``./best_encoded_mlp_fold_{fold}.bin``.

    Relies on module-level data and settings defined elsewhere: ``data_X``,
    ``data_Y``, ``Labels``, ``groups``, ``FOLDS``, ``MAX_TRAIN_GROUP_SIZE``,
    ``GROUP_GAP``, ``MAX_TEST_GROUP_SIZE``, ``TRAIN_BATCH_SIZE``,
    ``EVAL_BATCH_SIZE``, ``NUM_TRAIN_EPOCHS``, ``GRADIENT_ACCMULATION_STEPS``,
    ``IS_GRADUAL_UNFREEZE``, ``LOCAL_RANK`` and ``NO_CUDA``.

    Args:
        trial: Optuna trial (or frozen best trial) describing the model.
        is_oof: when true, after each fold the best checkpoint is reloaded and
            used to predict that fold's validation rows.

    Returns:
        ``is_oof`` false: mean over folds of the best validation loss.
        ``is_oof`` true: array of length ``Labels.shape[0]`` holding, for every
        validation row, the median of the predicted probabilities across the
        outputs (rows never used for validation stay 0).
    """
    if is_oof:
        oof_encoded_mlp = np.zeros(Labels.shape[0])

    if LOCAL_RANK == -1 or NO_CUDA:
        device = t.device("cuda" if t.cuda.is_available() and not NO_CUDA else "cpu")
    else:
        t.cuda.set_device(LOCAL_RANK)
        device = t.device("cuda",LOCAL_RANK)
        # The objective runs once per trial; initialising the default process
        # group a second time raises, so only do it when it is not set up yet.
        if not t.distributed.is_initialized():
            t.distributed.init_process_group(backend='nccl')

    model = Encoded_MLP(trial)

    print('Encoded_MLP:')
    for p in model.named_parameters():
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    valid_loss_avg_batchs_best_list = []

    gkf = PurgedGroupTimeSeriesSplit(
        n_splits=FOLDS,
        max_train_group_size=MAX_TRAIN_GROUP_SIZE,
        group_gap=GROUP_GAP,
        max_test_group_size=MAX_TEST_GROUP_SIZE
    )
    splits = list(gkf.split(data_Y, groups=groups))

    for fold,(train_ids,val_ids) in enumerate(splits):

        train_X, train_Y = data_X[train_ids], data_Y[train_ids]
        val_X, val_Y = data_X[val_ids], data_Y[val_ids]

        # shuffle=False keeps the time order of the samples.
        dataset_T = Train_Val_Dataset(train_X, train_Y)
        dataloader_T = DataLoader(dataset_T,batch_size=TRAIN_BATCH_SIZE,shuffle=False,drop_last=True)

        dataset_V = Train_Val_Dataset(val_X, val_Y)
        dataloader_V = DataLoader(dataset_V,batch_size=EVAL_BATCH_SIZE,shuffle=False,drop_last=True)

        optimizer = Get_Encoded_MLP_Optimizer(model)
        # One optimizer/scheduler step per GRADIENT_ACCMULATION_STEPS batches.
        num_train_optimization_steps = int(train_X.shape[0] / TRAIN_BATCH_SIZE / GRADIENT_ACCMULATION_STEPS
                                          ) * NUM_TRAIN_EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = num_train_optimization_steps
                                                )
        model.to(device)

        best_model_path = './best_encoded_mlp_fold_{}.bin'.format(fold)
        valid_loss_avg_batchs_best = np.inf

        # Gradual unfreezing: start fully frozen, then release tensors from the
        # output end of the network towards the input.
        param_list = []
        if IS_GRADUAL_UNFREEZE:
            for param in model.parameters():
                param.requires_grad = False
                param_list.append(param)

        # max(..., 1): with fewer than three batches the integer division gives
        # 0, and `step % 0` raised ZeroDivisionError.
        unfreeze_every = max(len(dataloader_T)//3, 1)

        for epoch in trange(NUM_TRAIN_EPOCHS,desc='epoch'):

            # ---- training pass -------------------------------------------
            model.train()
            # Drop gradients left over from an incomplete accumulation window
            # at the end of the previous epoch (or fold).
            model.zero_grad()

            train_loss_avg_batchs = 0

            for step,batch in enumerate(tqdm(dataloader_T,desc='Iteration')):

                if IS_GRADUAL_UNFREEZE and step % unfreeze_every == 0 and param_list:
                    param = param_list.pop()
                    param.requires_grad = True

                X , Y = batch['X'].to(device) , batch['Y'].to(device)

                logits,loss = model(X,Y)

                # Scale so the accumulated gradient is an average over the window.
                loss = loss / GRADIENT_ACCMULATION_STEPS

                train_loss_avg_batchs = train_loss_avg_batchs + loss.item()

                loss.backward()

                if (step + 1) % GRADIENT_ACCMULATION_STEPS == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

            # max(..., 1): drop_last=True can leave a small fold without any batch.
            train_loss_avg_batchs = train_loss_avg_batchs/max(len(dataloader_T), 1)
            print('Encoded_MLP -- epoch {} train_loss_avg_batchs: {}'.format(epoch,train_loss_avg_batchs))

            # ---- validation pass -----------------------------------------
            model.eval()

            val_loss_avg_batchs = 0

            for batch in tqdm(dataloader_V,desc='Iteration'):

                X , Y = batch['X'].to(device) , batch['Y'].to(device)

                with t.no_grad():
                    logits,loss = model(X,Y)

                val_loss_avg_batchs = val_loss_avg_batchs + loss.item()

            val_loss_avg_batchs = val_loss_avg_batchs/max(len(dataloader_V), 1)
            print('Encoded_MLP -- epoch {} val_loss_avg_batchs: {}'.format(epoch,val_loss_avg_batchs))

            # ---- keep the model of the best epoch ------------------------
            if valid_loss_avg_batchs_best > val_loss_avg_batchs:
                try:
                    os.remove(best_model_path)
                except FileNotFoundError:
                    # Nothing saved yet for this fold.
                    pass
                t.save({'state_dict': model.state_dict()},best_model_path)
                valid_loss_avg_batchs_best = val_loss_avg_batchs

        print('Encoded_MLP -- valid_loss_avg_batchs_best_{} : {}'.format(fold,valid_loss_avg_batchs_best))

        valid_loss_avg_batchs_best_list.append(valid_loss_avg_batchs_best)

        if is_oof:

            # Reload the best epoch into a fresh CPU model. As before, this model
            # replaces `model`, so the next fold continues from the best checkpoint.
            checkpoint = t.load(best_model_path, map_location='cpu')
            model = Encoded_MLP(trial)
            model.load_state_dict(checkpoint['state_dict'])
            # Inference mode: without eval() dropout stays active and BatchNorm
            # normalises with batch statistics, which corrupts the predictions.
            model.eval()

            oof_val_X = t.tensor(val_X, dtype=t.float)

            with t.no_grad():
                logits = model(oof_val_X).numpy()

            # One value per validation row: the median across the model outputs.
            # (Without axis=1 a single global median was written to every row.)
            oof_encoded_mlp[val_ids] = np.median(logits, axis=1)

    if is_oof:
        return oof_encoded_mlp

    return np.mean(valid_loss_avg_batchs_best_list)

In [None]:
# a method designed to optimize parameters using Optuna for the XGBoost Model.

def Objective_XGBoost(trial):
    """Optuna objective for the XGBoost model: mean validation AUC over folds.

    Samples the booster hyper-parameters from ``trial``, then fits a
    StandardScaler + XGBClassifier pipeline on every purged group time-series
    split and scores it on the matching validation part.

    Relies on module-level data and settings defined elsewhere: ``data_X``,
    ``Labels``, ``groups``, ``FOLDS``, ``MAX_TRAIN_GROUP_SIZE``, ``GROUP_GAP``
    and ``MAX_TEST_GROUP_SIZE``.

    Returns:
        The mean ROC AUC over the folds (higher is better).
    """
    # suggest_float replaces the deprecated suggest_uniform and matches the
    # calls used by the neural-network models in this notebook.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.10),
        'subsample': trial.suggest_float('subsample', 0.50, 0.90),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 0.90),
        'gamma': trial.suggest_int('gamma', 0, 20),
        'tree_method': 'gpu_hist'
    }

    scaler = StandardScaler()
    clf = xgb.XGBClassifier(**params)

    pipe = Pipeline(steps=[('scaler', scaler),('xgb', clf)])

    gkf = PurgedGroupTimeSeriesSplit(
        n_splits=FOLDS,
        max_train_group_size=MAX_TRAIN_GROUP_SIZE,
        group_gap=GROUP_GAP,
        max_test_group_size=MAX_TEST_GROUP_SIZE
    )
    splits = list(gkf.split(Labels, groups=groups))

    aucs = []
    for fold, (train_ids, val_ids) in enumerate(splits):

        train_X, train_Y = data_X[train_ids], Labels[train_ids]
        val_X, val_Y = data_X[val_ids], Labels[val_ids]

        # Pipeline.fit refits both the scaler and the classifier for this fold.
        _ = pipe.fit(train_X, train_Y)
        # AUC ranks samples by score, so it needs the predicted probability of
        # the positive class; hard 0/1 labels from predict() discard the ranking.
        # NOTE(review): assumes Labels is binary, so column 1 is the positive class.
        pred = pipe.predict_proba(val_X)[:, 1]
        aucs.append(roc_auc_score(val_Y, pred))

    mean_auc = np.mean(aucs)

    print('XGB -- The Mean AUC : {}'.format(mean_auc))

    return mean_auc

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Columns of dtype ``object``, ``category`` or ``datetime64[ns, UTC]`` are
    left alone. Columns whose dtype name starts with ``int`` are cast to the
    first of int8/int16/int32/int64 whose range strictly contains the column's
    min and max. Every other column is treated as floating point and cast to
    float16, float32 or float64 by the same rule (note that float16 keeps only
    about three significant decimal digits).

    The frame is modified in place and also returned. Memory usage before and
    after is printed.

    Args:
        df: pandas DataFrame to shrink.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Ordered from smallest to largest so the first fit wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype.name

        if col_type in ['object', 'category', 'datetime64[ns, UTC]']:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    # The int8 case used to cast to int16, so the smallest
                    # integer type was never actually used.
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (this is also where all-NaN columns land).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Optimise speed of filling-nan function credit to https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function

@njit
def fillna_npwhere_njit(array, values):
    """Return ``array`` with its NaN entries replaced by the matching entries of ``values``.

    Compiled with numba's ``@njit``. When ``array`` contains no NaN it is
    returned unchanged; otherwise a new array built by ``np.where`` is returned.
    """
    # Cheap NaN probe: the sum is NaN as soon as any element is NaN, which avoids
    # building the mask for clean rows. (A mix of +inf and -inf also makes the
    # sum NaN; np.where then simply changes nothing.)
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array

In [None]:
# forward-filling function, credit to https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function

def for_loop_ffill(df,method):
    """Forward-fill missing values row by row.

    Each row is passed to ``method(row, previous_row)``, which must return the
    row with its gaps filled from ``previous_row``. The first row is filled
    from a row of zeros.

    Args:
        df: DataFrame whose values are filled. It is not modified.
        method: callable ``(row, fallback) -> filled row`` working on 1-D arrays.

    Returns:
        A new DataFrame with the same columns and a fresh default index.
    """
    # Work on a private copy: `df.values` can be a view of the frame's data, so
    # writing into it mutated the caller's frame, and under pandas Copy-on-Write
    # that array is read-only and the assignment fails.
    matrix = df.to_numpy(copy=True)
    # Same dtype as the data so `method` always sees matching argument types
    # (relevant when it is a numba-compiled function).
    previous = np.zeros(matrix.shape[1], dtype=matrix.dtype)
    for row in matrix:
        # `row` is a view into `matrix`, so this writes the filled values back.
        row[:] = method(row, previous)
        previous = row
    return pd.DataFrame(matrix,columns = df.columns)

In [None]:
# Seed setting function credit to https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function

def seed_everything(seed_value):
    """Seed every random number generator used here with ``seed_value``.

    Covers Python's ``random``, NumPy, PyTorch (CPU and, when available, all
    CUDA devices) and sets ``PYTHONHASHSEED``. With CUDA present, cuDNN is also
    switched to deterministic mode with benchmarking disabled.
    """
    for seeder in (random.seed, np.random.seed, t.manual_seed):
        seeder(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if not t.cuda.is_available():
        return

    # GPU side: seed the current device and all others, then pin cuDNN.
    t.cuda.manual_seed(seed_value)
    t.cuda.manual_seed_all(seed_value)
    cudnn = t.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Gaussian noise function, credit to https://discuss.pytorch.org/t/writing-a-simple-gaussian-noise-layer-in-pytorch/469

def gaussian(ins, is_training, mean, stddev):
    """Add Gaussian noise to ``ins`` while training.

    Args:
        ins: floating-point tensor to perturb.
        is_training: when false, ``ins`` is returned untouched.
        mean: mean of the noise.
        stddev: standard deviation of the noise.

    Returns:
        ``ins`` itself when ``is_training`` is false, otherwise a new tensor
        ``ins + noise`` with the noise drawn from N(mean, stddev**2).
    """
    if not is_training:
        return ins
    # `Variable` was never imported (NameError in training mode) and is
    # deprecated anyway: plain tensors are enough. empty_like gives a buffer
    # with the same shape, dtype and device, which normal_ fills in place.
    noise = t.empty_like(ins).normal_(mean, stddev)
    return ins + noise

In [None]:
# cross validation method based on time series credit to https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243

from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time-ordered, group-aware cross-validator with a purge gap.

    Groups are ordered by their first appearance in ``groups``. The last
    ``n_splits`` blocks of groups are used as test sets, one per split, in
    increasing order. The training set of a split is made of the groups that
    precede its test block, excluding the ``group_gap`` groups immediately
    before the block and keeping at most the ``max_train_group_size`` most
    recent of the remaining ones. A group never appears in both the train and
    the test side of the same split.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2. ``n_splits + 1`` distinct
        groups are required.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set.
    max_test_group_size : int, default=Inf
        Maximum number of groups in a single test set.
    group_gap : int, default=None
        Number of groups left out between the end of the training set and
        the start of the test set. ``None`` means no gap.
    verbose : bool, default=False
        Stored on the instance; it currently has no effect on splitting.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : list of int
            The training set indices for that split, sorted ascending.
        test : list of int
            The testing set indices for that split, sorted ascending.

        Raises
        ------
        ValueError
            If ``groups`` is None, or if there are fewer distinct groups
            than ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        # The constructor default is None; treat it as "no gap" so the
        # arithmetic below does not fail with a TypeError.
        group_gap = 0 if self.group_gap is None else self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        # Distinct groups in order of first appearance.
        group_values, first_positions = np.unique(groups, return_index=True)
        unique_groups = group_values[np.argsort(first_positions)]
        n_groups = _num_samples(unique_groups)

        # Map each group to the positions of its samples.
        group_dict = {}
        for idx in range(n_samples):
            group_dict.setdefault(groups[idx], []).append(idx)

        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            # Clamp at 0: a negative stop would wrap around and pull groups
            # from the end of the series (test/future data) into training.
            train_stop = max(0, group_test_start - group_gap)
            train_start = max(0, train_stop - max_train_group_size)
            test_stop = group_test_start + group_test_size

            # Each sample belongs to exactly one group, so flattening the
            # per-group lists yields unique indices; only sorting is needed.
            train_indices = sorted(
                idx
                for group in unique_groups[train_start:train_stop]
                for idx in group_dict[group])
            test_indices = sorted(
                idx
                for group in unique_groups[group_test_start:test_stop]
                for idx in group_dict[group])

            # NOTE(review): this drops the first ``group_gap`` *samples*
            # (not groups) of the test block; kept as in the original
            # implementation - confirm it is intended.
            test_indices = test_indices[group_gap:]

            yield train_indices, test_indices

In [None]:
# a faster method of calculating utility score credit to https://www.kaggle.com/gogo827jz/jane-street-super-fast-utility-score-function

@njit(fastmath = True)
def utility_score_numba(date, weight, resp, action):
    """Compute the utility score of a set of actions.

    ``weight * resp * action`` is summed per value of ``date`` with
    ``np.bincount``. The score is the total of those sums multiplied by a
    factor ``t`` clipped to ``[0, 6]``, where ``t`` is the total divided by
    the root of the sum of squares, scaled by ``sqrt(250 / number_of_bins)``.
    Returns 0 when every per-date sum is zero.
    """
    daily = np.bincount(date, weight * resp * action)
    sum_of_squares = np.sum(daily ** 2)
    if sum_of_squares == 0:
        return 0
    total = np.sum(daily)
    ratio = total / np.sqrt(sum_of_squares) * np.sqrt(250 / len(daily))
    return min(max(ratio, 0), 6) * total

In [None]:
# optimal threshold calculation method for classification model prediction credit to https://www.kaggle.com/gogo827jz/jane-street-super-fast-utility-score-function

#@njit(fastmath = True)
def decision_threshold_optimisation(preds, date, weight, resp, low = 0, high = 1, bins = 100, eps = 1):
    """Scan decision thresholds and keep the one with the best utility.

    Candidate thresholds run from ``low`` (inclusive) to ``high``
    (exclusive) in steps of ``(high - low) / bins``. For each candidate the
    action is 1 where ``preds >= threshold`` and 0 elsewhere, and the utility
    is computed with ``utility_score_numba``. A candidate replaces the
    current best only when it improves the utility by more than ``eps``.
    Every evaluated candidate is printed.

    Returns
    -------
    tuple
        ``(best_threshold, best_utility)``.
    """
    step = (high - low) / bins

    def utility_at(cutoff):
        # Act on every sample whose prediction reaches the cutoff.
        return utility_score_numba(date, weight, resp, np.where(preds >= cutoff, 1, 0))

    # Start from the lowest threshold as the baseline.
    best_cutoff = low
    best_utility = utility_at(low)

    for candidate in np.arange(low, high, step):
        candidate_utility = utility_at(candidate)
        print('threshold:{} utility:{}'.format(candidate,candidate_utility))
        if candidate_utility - best_utility > eps:
            best_cutoff, best_utility = candidate, candidate_utility
    return best_cutoff, best_utility

In [None]:
# calculate JS score credit to https://blog.csdn.net/blmoistawinde/article/details/84329103

def JS_divergence(p,q):
    """Return the Jensen-Shannon divergence between ``p`` and ``q``.

    Computed as the mean of the two relative entropies of ``p`` and ``q``
    against their midpoint ``(p + q) / 2``, using ``scipy.stats.entropy``.
    """
    midpoint = (p + q) / 2
    p_vs_mid = scipy.stats.entropy(p, midpoint)
    q_vs_mid = scipy.stats.entropy(q, midpoint)
    return 0.5 * p_vs_mid + 0.5 * q_vs_mid

In [None]:
# method of setting model weights (only two models) credit to https://www.kaggle.com/gogo827jz/blending-nn-and-lgbm-rf

def weight_opt(oof_encoded_mlp, oof_xgb, y_true):
    """Grid-search the blend weight of the two models.

    Tries weights 0.00, 0.01, ..., 1.00 for the Encoded MLP predictions
    (the XGBoost predictions get the complement) and keeps the weight whose
    blend has the smallest ``JS_divergence`` to ``y_true``. The score of
    every candidate and the final choice are printed.

    Returns
    -------
    tuple
        ``(weight_encoded_mlp, weight_xgb)``, both rounded to two decimals.
    """
    best_crps = np.inf
    weight_encoded_mlp = np.inf

    for mlp_share in np.arange(0, 1.01, 0.01):
        blended = mlp_share * oof_encoded_mlp + (1-mlp_share) * oof_xgb
        crps_blend = JS_divergence(blended, y_true)

        print('crps_blend: {}'.format(crps_blend))

        # Only a strictly better score replaces the current best.
        if not crps_blend < best_crps:
            continue
        best_crps = crps_blend
        weight_encoded_mlp = round(mlp_share, 2)

    weight_xgb = round(1-weight_encoded_mlp, 2)

    print('-'*36)
    print('Best weight for Encoded_MLP: ', weight_encoded_mlp)
    print('Best weight for XGBoost: ', weight_xgb)
    print('Best mean crps (Blend): ', round(best_crps, 6))

    return weight_encoded_mlp, weight_xgb

In [None]:
# faster missing value filling method (for test submission) credit to https://www.kaggle.com/gogo827jz/jane-street-ffill-xgboost-purgedtimeseriescv

@njit
def fast_fillna(array, values):
    """Fill NaN entries of ``array`` from ``values`` (used at submission time).

    A NaN sum signals that ``array`` needs filling; in that case the NaN
    positions take the corresponding entries of ``values``. Otherwise the
    input is returned as is.
    """
    needs_fill = np.isnan(array.sum())
    if needs_fill:
        nan_positions = np.isnan(array)
        array = np.where(nan_positions, values, array)
    return array

# Execute

## Data pre-processing

In [None]:
# Load the training set, keep rows with date > 85 and weight > 0,
# forward-fill missing values row by row, then shrink the dtypes.
train_data = reduce_mem_usage(
    for_loop_ffill(
        dtable.fread('../input/jane-street-market-prediction/train.csv')
              .to_pandas()
              .query('date > 85')
              .query('weight > 0'),
        fillna_npwhere_njit,
    )
)

In [None]:
# Columns whose name contains 'feature' are the model inputs.
feature_names = train_data.columns[train_data.columns.str.contains('feature')]

# Per-row metadata as plain arrays; `groups` (the date) and `weight` are cast
# explicitly to int64 / float64, `resp` keeps whatever dtype the frame has.
groups = train_data['date'].astype(np.int64).values
weight = train_data['weight'].astype(np.float64).values
resp = train_data['resp'].values

# Feature matrix and multi-target matrix: one binary column per response
# column (1 where the response is > 0), stacked then transposed to
# (n_samples, 5).
data_X = train_data[feature_names].values
data_Y = np.stack([(train_data[c]>0).astype('int') for c in ['resp_1','resp_2','resp_3','resp_4','resp']]).T

# Single binary label: 1 where weight * resp > 0.
Labels = ((train_data['weight'].values * train_data['resp'].values) > 0).astype('int')


In [None]:
# Drop the DataFrame reference now that the arrays have been extracted
# (presumably to release memory before training).
del train_data

## Macro definition

In [None]:
# Sizes taken from the prepared arrays: number of feature columns and number
# of binary target columns.
INPUT_SIZE = data_X.shape[-1]
OUTPUT_SIZE = data_Y.shape[-1]

# Network hyper-parameters. Not referenced in the cells that follow -
# presumably consumed by the model classes defined earlier in the notebook.
DROPOUT_RATE = 0.2
HIDDEN_SIZE_1 = 64
HIDDEN_SIZE_2 = 32

# Device selection: LOCAL_RANK == -1 (or NO_CUDA) takes the single-process
# branch of the training cells; any other rank takes the torch.distributed
# branch.
LOCAL_RANK = -1
NO_CUDA = False
# Optimizer settings. Not referenced in the cells that follow - presumably
# read by the optimizer factories (DFT_RATE looks like the discriminative
# fine-tuning factor; confirm against Get_Encoded_MLP_Optimizer).
LEARN_RATE = 5e-5
DFT_RATE = 1.2
TRAIN_BATCH_SIZE = 1200
EVAL_BATCH_SIZE = 1200
NUM_TRAIN_EPOCHS = 60 # number of passes over the training data

# NOTE(review): presumably toggles per-layer learning rates - confirm.
DISCRIMINATE = True

# When True, the Encoded MLP training cell freezes all parameters and
# re-enables them one at a time during training.
IS_GRADUAL_UNFREEZE = True

GRADIENT_ACCMULATION_STEPS = 3 # micro-batches accumulated per optimizer step
# The DataLoader batch becomes the micro-batch: 1200 // 3 = 400 samples.
TRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE // GRADIENT_ACCMULATION_STEPS

# Settings passed to PurgedGroupTimeSeriesSplit in the XGBoost cell.
FOLDS = 5
GROUP_GAP = 31
MAX_TRAIN_GROUP_SIZE = 150
MAX_TEST_GROUP_SIZE = 60

# Gates the training and submission cells.
IS_TRAIN = True

# Seed passed to seed_everything and embedded in the saved file names.
SEED = 123

# Passed as second argument to Objective_Encoded_MLP to obtain out-of-fold
# predictions.
IS_OOF = True


## Training of BottleNeck Classifier Model

In [None]:
if IS_TRAIN:

    # Seed all RNGs before the search and the final fit.
    seed_everything(SEED)

    # Hyper-parameter search: Optuna minimises the value returned by
    # Objective_BC; the finished study is saved next to the notebook.
    study=optuna.create_study(direction="minimize")
    study.optimize(Objective_BC,n_trials=30)

    joblib.dump(study,'./study_encoder_seed_{}.pkl'.format(SEED))

    trial = study.best_trial
    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Single-process setup uses CUDA when available (unless NO_CUDA);
    # otherwise bind to the given local rank and join the NCCL process group.
    if LOCAL_RANK == -1 or NO_CUDA:
        device = t.device("cuda" if t.cuda.is_available() and not NO_CUDA else "cpu")
        n_gpu = t.cuda.device_count()
    else:
        t.cuda.set_device(LOCAL_RANK)
        device = t.device("cuda",LOCAL_RANK)
        n_gpu = 1
        t.distributed.init_process_group(backend='nccl')

    # Final model built from the best trial, trained on the full data set.
    BCmodel = BottleNeck_Classifier(trial)

    # shuffle=False keeps the rows in their original (time) order;
    # drop_last discards the final incomplete batch.
    dataset_T = Train_Val_Dataset(data_X, data_Y)
    dataloader_T = DataLoader(dataset_T,batch_size=TRAIN_BATCH_SIZE,shuffle=False,drop_last=True)

    optimizer = Get_BC_Optimizer(BCmodel)
    # The loop below performs one optimizer/scheduler step per batch (no
    # gradient accumulation), so the schedule must span every batch of every
    # epoch. Dividing by GRADIENT_ACCMULATION_STEPS here would drive the
    # learning rate to zero after a third of the training run.
    num_train_optimization_steps = len(dataloader_T) * NUM_TRAIN_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = num_train_optimization_steps
                                                )
    BCmodel.to(device)

    train_loss_avg_batchs_best = np.inf

    encoder = None

    for epoch in trange(NUM_TRAIN_EPOCHS,desc='epoch'):

        BCmodel.train()

        # Running sum of the batch losses of this epoch.
        train_loss_avg_batchs = 0

        for batch in tqdm(dataloader_T,desc='Iteration'):

            X , Y = batch['X'].to(device) , batch['Y'].to(device)

            # The model returns the training loss directly.
            loss = BCmodel(X,Y)

            train_loss_avg_batchs = train_loss_avg_batchs + loss.item()

            loss.backward()

            optimizer.step()
            scheduler.step()

            BCmodel.zero_grad()

        train_loss_avg_batchs = train_loss_avg_batchs/len(dataloader_T)
        print('T - BC -- epoch {} train_loss_avg_batchs: {}'.format(epoch,train_loss_avg_batchs))

        # Keep only the encoder weights of the epoch with the lowest mean
        # training loss.
        if train_loss_avg_batchs_best > train_loss_avg_batchs:
            try:
                os.remove('./best_encoder_seed_{}.bin'.format(SEED))
            except FileNotFoundError:
                # No earlier checkpoint yet; any other OS error should surface.
                pass

            t.save({'state_dict': BCmodel.encoder.state_dict()},'./best_encoder_seed_{}.bin'.format(SEED))

            train_loss_avg_batchs_best = train_loss_avg_batchs

    print('T - BC -- train_loss_avg_batchs_best : {}'.format(train_loss_avg_batchs_best))
  

## Training of Encoded MLP Model

In [None]:
if IS_TRAIN:

    # Seed all RNGs before the search and the final fit.
    seed_everything(SEED)

    # Hyper-parameter search: Optuna minimises the value returned by
    # Objective_Encoded_MLP; the finished study is saved next to the notebook.
    study=optuna.create_study(direction="minimize")
    study.optimize(Objective_Encoded_MLP,n_trials=3)

    joblib.dump(study,'./study_encoded_mlp_seed_{}.pkl'.format(SEED))

    trial = study.best_trial
    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Single-process setup uses CUDA when available (unless NO_CUDA);
    # otherwise bind to the given local rank and join the NCCL process group.
    if LOCAL_RANK == -1 or NO_CUDA:
        device = t.device("cuda" if t.cuda.is_available() and not NO_CUDA else "cpu")
        n_gpu = t.cuda.device_count()
    else:
        t.cuda.set_device(LOCAL_RANK)
        device = t.device("cuda",LOCAL_RANK)
        n_gpu = 1
        t.distributed.init_process_group(backend='nccl')

    MLPmodel = Encoded_MLP(trial)

    # --- #

    # Re-run the objective on the best trial with IS_OOF set to collect the
    # out-of-fold predictions used later for blending.
    oof_encoded_mlp = Objective_Encoded_MLP(trial,IS_OOF)
    np.save('oof_encoded_mlp_{}.npy'.format(SEED),oof_encoded_mlp)

    # --- #

    # shuffle=False keeps the rows in their original (time) order;
    # drop_last discards the final incomplete batch.
    dataset_T = Train_Val_Dataset(data_X, data_Y)
    dataloader_T = DataLoader(dataset_T,batch_size=TRAIN_BATCH_SIZE,shuffle=False,drop_last=True)

    optimizer = Get_Encoded_MLP_Optimizer(MLPmodel)
    # One optimizer step per GRADIENT_ACCMULATION_STEPS micro-batches.
    num_train_optimization_steps = int(data_X.shape[0] / TRAIN_BATCH_SIZE/ GRADIENT_ACCMULATION_STEPS
                                      ) * NUM_TRAIN_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = num_train_optimization_steps
                                                )
    MLPmodel.to(device)

    # Gradual unfreezing: freeze every parameter, then re-enable them one at
    # a time (last registered first) as training progresses.
    param_list = []
    if IS_GRADUAL_UNFREEZE:
        for param in MLPmodel.parameters():
            param.requires_grad = False
            param_list.append(param)

    # Unfreeze one parameter roughly three times per epoch. The max() guard
    # avoids a modulo-by-zero when the loader has fewer than three batches.
    unfreeze_interval = max(1, len(dataloader_T) // 3)

    train_loss_avg_batchs_best = np.inf

    for epoch in trange(NUM_TRAIN_EPOCHS,desc='epoch'):

        MLPmodel.train()

        # Running sum of the (accumulation-scaled) batch losses of this epoch.
        train_loss_avg_batchs = 0

        for step,batch in enumerate(tqdm(dataloader_T,desc='Iteration')):

            if IS_GRADUAL_UNFREEZE and step % unfreeze_interval == 0:
                if param_list:
                    param = param_list.pop()
                    param.requires_grad = True

            X , Y = batch['X'].to(device) , batch['Y'].to(device)

            logits,loss = MLPmodel(X,Y)

            # Scale so the accumulated gradient matches one large batch.
            loss = loss / GRADIENT_ACCMULATION_STEPS

            train_loss_avg_batchs = train_loss_avg_batchs + loss.item()

            loss.backward()

            # Step only after a full accumulation window.
            if (step + 1) % GRADIENT_ACCMULATION_STEPS == 0:
                optimizer.step()
                scheduler.step()
                MLPmodel.zero_grad()

        train_loss_avg_batchs = train_loss_avg_batchs/len(dataloader_T)
        print('T - MLP -- epoch {} train_loss_avg_batchs: {}'.format(epoch,train_loss_avg_batchs))

        # Keep only the weights of the epoch with the lowest mean training loss.
        if train_loss_avg_batchs_best > train_loss_avg_batchs:
            try:
                os.remove('./best_encoded_mlp_seed_{}.bin'.format(SEED))
            except FileNotFoundError:
                # No earlier checkpoint yet; any other OS error should surface.
                pass

            t.save({'state_dict': MLPmodel.state_dict()},'./best_encoded_mlp_seed_{}.bin'.format(SEED))
            train_loss_avg_batchs_best = train_loss_avg_batchs

    print('T - MLP -- train_loss_avg_batchs_best : {}'.format(train_loss_avg_batchs_best))
    

## Training of XGBoost Model

In [None]:
if IS_TRAIN:

    # Hyper-parameter search: Optuna maximises the value returned by
    # Objective_XGBoost.
    study = optuna.create_study(direction="maximize")
    study.optimize(Objective_XGBoost, n_trials=20) # 60

    trial = study.best_trial

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
    
    # Best parameters plus a fixed tree method.
    # NOTE(review): 'gpu_hist' presumably requires a GPU-enabled XGBoost
    # build - confirm for the target environment.
    best_params = trial.params
    best_params['tree_method'] = 'gpu_hist' 
    
    # Standardise the features, then classify.
    scaler = StandardScaler()
    clf = xgb.XGBClassifier(**best_params)

    pipe_xgb = Pipeline(steps=[('scaler', scaler),('xgb', clf)])
    
    # --- #
    
    # Out-of-fold predictions. Rows that never fall into a validation fold
    # keep the initial value 0.
    oof_xgb = np.zeros(Labels.shape[0])
    
    gkf = PurgedGroupTimeSeriesSplit(
        n_splits=FOLDS,
        max_train_group_size=MAX_TRAIN_GROUP_SIZE, 
        group_gap=GROUP_GAP,
        max_test_group_size=MAX_TEST_GROUP_SIZE 
    )
    splits = list(gkf.split(Labels, groups=groups))     
      
    for fold,(train_ids,val_ids) in enumerate(splits): 

        train_X, train_Y = data_X[train_ids], Labels[train_ids] 
        val_X, val_Y = data_X[val_ids], Labels[val_ids]

        # The whole pipeline (scaler and classifier) is refitted per fold.
        pipe_xgb.fit(train_X, train_Y)

        # Column 1 of predict_proba is stored as the fold's prediction.
        oof_xgb[val_ids] = pipe_xgb.predict_proba(val_X)[:,1]
    
    np.save('oof_xgb_{}.npy'.format(SEED),oof_xgb)
    
    # --- #
    
    # Final model: refit on all rows and persist the fitted pipeline.
    pipe_xgb.fit(data_X,Labels)
    
    joblib.dump(pipe_xgb,"./xgboost_seed_{}.joblib.dat".format(SEED))

## Test to submit

In [None]:
if IS_TRAIN:

    seed_everything(SEED)

    # Restore the tuned Encoded MLP and the fitted XGBoost pipeline.
    study = joblib.load('./study_encoded_mlp_seed_{}.pkl'.format(SEED))
    trial = study.best_trial

    checkpoint = t.load('./best_encoded_mlp_seed_{}.bin'.format(SEED))
    MLPmodel = Encoded_MLP(trial)
    MLPmodel.load_state_dict(checkpoint['state_dict'])
    # Inference only: switch off training-time behaviour (dropout, noise).
    MLPmodel.eval()

    xgboost = joblib.load("./xgboost_seed_{}.joblib.dat".format(SEED))

    oof_encoded_mlp = np.load('oof_encoded_mlp_{}.npy'.format(SEED))
    oof_xgb = np.load('oof_xgb_{}.npy'.format(SEED))

    # NOTE(review): oof_xgb is 0 for rows that were never in a validation
    # fold; those rows still take part in the weight and threshold search.
    weight_encoded_mlp, weight_xgb = weight_opt(oof_encoded_mlp,oof_xgb,Labels)
    pred = weight_encoded_mlp * oof_encoded_mlp + weight_xgb * oof_xgb
    # The utility must be computed from the real returns. Passing the binary
    # Labels makes every contribution non-negative, so the search would
    # always settle on the lowest threshold. resp is cast to float64 to match
    # the dtype of `weight`.
    opt_threshold,opt_utility = decision_threshold_optimisation(
        pred, groups, weight, resp.astype(np.float64), pred.min(), pred.max(), 1000, 1)

    print('Optimal Decision Threshold:', opt_threshold)
    print('Optimal Utility Score:', opt_utility)

    import janestreet
    # Reset the guard flag so make_env() can be called again in this session.
    janestreet.competition.make_env.__called__ = False
    env = janestreet.make_env()
    env_iter = env.iter_test()

    opt_th = opt_threshold
    # Last seen (filled) feature row, used to forward-fill missing values.
    tmp = np.zeros(len(feature_names))
    for (test_df, pred_df) in tqdm(env_iter):
        if test_df['weight'].item() > 0:

            x_tt = test_df.loc[:, feature_names].values
            x_tt[0, :] = fast_fillna(x_tt[0, :], tmp)
            tmp = x_tt[0, :]

            # NOTE(review): x_tt is a NumPy array and no targets are passed;
            # this relies on Encoded_MLP.forward accepting that and returning
            # probabilities comparable to oof_encoded_mlp - confirm.
            with t.no_grad():
                logits1 = MLPmodel(x_tt)
            # Column 1 only, as when the out-of-fold predictions were built.
            logits2 = xgboost.predict_proba(x_tt)[:, 1]

            pred = weight_encoded_mlp * logits1 + weight_xgb * logits2
            # Same comparison (>=) as in the threshold search.
            pred_df.action = np.where(pred >= opt_th, 1, 0).astype(int)

        else:
            pred_df.action = 0

        env.predict(pred_df)
    