In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from os.path import join, isfile
from os import path, scandir, listdir

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

#import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Embedding,  Flatten
from tensorflow.keras.models import Model, Sequential
from keras.callbacks import ReduceLROnPlateau
from keras.optimizers import RMSprop
import keras_tuner as kt

from sklearn.pipeline import Pipeline

from tensorflow.data import Dataset
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
from tensorflow import keras
from sklearn import metrics

# Parameters

In [None]:
# Name of the label column in train.csv.
target = 'target'

# Set to True for a quick smoke run with minimal settings.
DEBUG = False

# Values that are the same in both modes.
SEED = 2017
CVSEED = 2017
BINS = 128

# Mode-dependent values, in the order:
# (N_ESTIMATORS, N_SPLITS, EARLY_STOPPING_ROUNDS, VERBOSE)
if DEBUG:
    N_ESTIMATORS, N_SPLITS, EARLY_STOPPING_ROUNDS, VERBOSE = 1, 2, 1, 100
else:
    N_ESTIMATORS, N_SPLITS, EARLY_STOPPING_ROUNDS, VERBOSE = 20000, 5, 300, 1000

In [None]:
def set_seed(seed=2017):
    """
    Seed the random number generators used in this notebook.

    - seed: integer applied to Python's `random`, NumPy's global RNG and TensorFlow
    """
    random.seed(seed)
    # Exported for the environment; note that changing it at runtime is not
    # expected to alter string hashing in the interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Keras weight initialisation and dropout draw from TensorFlow's own RNG,
    # so it has to be seeded as well for the tuner runs to be repeatable.
    tf.random.set_seed(seed)

set_seed(SEED)

# Datasets

In [None]:
INPUT = Path("../input/tabular-playground-series-oct-2021")

# Only the label column is kept from the competition file (the model inputs are the
# out-of-fold predictions loaded further down), so ask pandas to parse just that
# column instead of reading the whole CSV and discarding the rest.
train = pd.read_csv(INPUT / "train.csv", usecols=[target])
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

In [None]:
def list_all_files(location='../input/tps-oct-lv0', pattern=None, recursive=True):
    """
    Collect the paths of the files found under a directory.

    - location: directory to scan
    - pattern: optional substring; only paths containing it are returned
      (the test is made against the whole joined path, not just the file name)
    - recursive: when True, every subdirectory is scanned as well

    Returns a list of paths, each built by joining `location` with the entry name.
    """
    # Read the directory once and keep the entries for both passes below.
    with scandir(location) as handle:
        listing = list(handle)

    found = [entry.path for entry in listing if isfile(entry.path)]

    if recursive:
        for entry in listing:
            if entry.is_dir():
                # Nested calls are unfiltered; the pattern is applied once, below.
                found += list_all_files(entry.path)

    if not pattern:
        return found
    return [candidate for candidate in found if pattern in candidate]

In [None]:
# Model identifiers matched against the out-of-fold files found under the default
# list_all_files location ('../input/tps-oct-lv0'); each becomes a column of `train`.
names = ['bizen', 'henke', 'hamza', '28smiles','kashif', 'kosta', 'kaveh', 'dlaststark', 'pca', 'xgb2']

In [None]:
# Model identifiers matched against the out-of-fold files found under
# '../input/tps-oct-lv0-sec'; each becomes a column of `train`.
namesec = ['lonnie', 'hgb', 'xgb_d2s', 'ctb_d2s']

In [None]:
def _mean_oof(oof_files, model_name):
    """
    Average every out-of-fold array whose path names `model_name`.

    - oof_files: '/'-separated paths to arrays loadable with np.load
    - model_name: substring looked for in the path component at index 3, i.e. the
      entry directly below '../input/<dataset>/' for the locations used here

    Returns the element-wise mean of the matching arrays.
    Raises FileNotFoundError when nothing matches; np.mean([]) would otherwise
    return NaN (its warning is silenced globally) and create an all-NaN column.
    """
    matched = [np.load(f) for f in oof_files if model_name in f.split('/')[3]]
    if not matched:
        raise FileNotFoundError(f"no out-of-fold file found for model '{model_name}'")
    return np.mean(matched, axis=0)


# Models stored in the default dataset.
pred = list_all_files(pattern='oof')
for model_name in names:
    train[model_name] = _mean_oof(pred, model_name)

# Models stored in the second dataset.
pred = list_all_files(location='../input/tps-oct-lv0-sec', pattern='oof')
for model_name in namesec:
    train[model_name] = _mean_oof(pred, model_name)

train.columns

In [None]:
# LightGBM: files prefixed 2017..2021 (presumably one per seed), averaged into one column.
avv = [np.load(f"../input/tps-oct-lv0/{prefix}lgb_oof.npy") for prefix in range(2017, 2022)]
train['lgb'] = np.mean(avv, axis=0)

# XGBoost: files agg1..agg5, averaged the same way.
avv = [np.load(f"../input/tps-oct-lv0/agg{run}_xgb_oof.npy") for run in range(1, 6)]
train['xgb'] = np.mean(avv, axis=0)

In [None]:
# Display the assembled frame: the target plus one averaged prediction column per model.
train

# Preprocessing

In [None]:
# Every column except the label is a model input. Excluding the target by name
# (rather than slicing off position 0) keeps this correct if the column order changes.
features = train.columns.drop(target)

In [None]:
# Preprocessing applied to the prediction columns before they reach the network:
# map each column to a normal distribution, then cut it into BINS equal-width
# ordinal bins (the bin codes are what the Embedding layer consumes).
# random_state pins the subsample QuantileTransformer draws on large inputs, so
# the transform is the same for every trial and every re-run.
pipe = Pipeline([
    ("scaler", QuantileTransformer(n_quantiles=BINS, output_distribution='normal', random_state=SEED)),
    ('bin', KBinsDiscretizer(n_bins=BINS, encode='ordinal', strategy='uniform')),
])

In [None]:
# Display the model-input columns.
train[features]

In [None]:
# Display the label column.
train[target]

# Model

In [None]:
def make_model(hp):
    """
    Build and compile the network for one tuner trial.

    The input is expected to hold one integer bin code (0..BINS-1) per feature.
    Each code is embedded, the embeddings are flattened and passed through a stack
    of dense layers whose width halves at every level, ending in one sigmoid unit.

    - hp: Keras Tuner hyperparameter container; the search space is declared on it here

    Returns the compiled model: binary cross-entropy loss, Adam with an
    exponentially decaying learning rate, and AUC tracked under the name 'aucroc'.
    """
    # Search space. The declaration order is kept stable on purpose.
    lr = hp.Float('learning_rate', min_value=1.52e-2, max_value=1.62e-2)
    dropout = hp.Float('dropout', min_value=0.115, max_value=0.135)
    embed_dim = hp.Int('embed_dim', min_value=4, max_value=8, step=4)
    hidden_dim = hp.Int('hidden_dim', min_value=136, max_value=152, step=8)
    n_layers = hp.Int('n_layers', min_value=2, max_value=4, step=1)
    act = hp.Choice('activation', values=['swish', 'relu', 'elu', 'tanh', 'sigmoid'])
    drate = hp.Float('decay_rate', min_value=0.880, max_value=0.890)
    eps = hp.Float('epsilon', min_value=1.1e-8, max_value=6.1e-8)

    # One input slot per feature column.
    inputs = Input(train[features].shape[1:])
    X = Embedding(input_dim=BINS, output_dim=embed_dim, embeddings_initializer="glorot_normal")(inputs)
    X = Dropout(dropout)(X)
    X = Flatten()(X)

    for i in range(n_layers):
        # Dense requires an integer unit count; true division would pass a float.
        # Every hidden_dim option is a multiple of 8 and i <= 3, so the floor
        # division is exact and yields the same widths as before.
        X = layers.Dense(hidden_dim // (2**i), activation=act, kernel_initializer=tf.keras.initializers.GlorotNormal())(X)
        X = Dropout(dropout)(X)
    outputs = layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.GlorotNormal())(X)
    model = keras.Model(inputs, outputs)

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=lr,
        decay_steps=400,
        decay_rate=drate)

    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=eps)
    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=optimizer,
                  metrics=[tf.keras.metrics.AUC(name='aucroc')])
    return model

# Keras Tuner

In [None]:
def parse_trial_state(trial):
    """
    Flatten a tuner trial's state into one dict, suitable as a DataFrame row.

    - trial: object whose get_state() returns a dict with 'best_step', 'trial_id',
      'score' and a nested 'hyperparameters' -> 'values' mapping

    Returns a dict holding the three bookkeeping fields followed by every
    hyperparameter value.
    """
    state = trial.get_state()
    summary = {field: state[field] for field in ('best_step', 'trial_id', 'score')}
    # Hyperparameter values are merged last, so they win on any key clash.
    summary.update(state['hyperparameters']['values'])
    return summary

In [None]:
class CVTuner(kt.engine.tuner.Tuner):
    """
    Tuner that scores every trial on a stratified hold-out fold.

    The data are split with StratifiedKFold, but only the first of the N_SPLITS
    folds is trained and evaluated per trial. Its validation ROC AUC is the value
    reported to the oracle under the name 'agg_val_aucroc'.
    """

    def run_trial(self, trial, x, y, batch_size=1024, epochs=100):
        """
        Train one model for `trial` and report its validation AUC to the oracle.

        - trial: trial whose hyperparameters are used to build the model
        - x: feature frame, indexed positionally with .iloc
        - y: labels aligned with x, indexed positionally with .iloc
        - batch_size, epochs: passed straight to model.fit
        """
        cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)

        # Only fold 0 is used: take the first split instead of looping over all
        # of them and skipping the rest.
        fold = 0
        train_indices, test_indices = next(cv.split(x, y))
        print(f"===== fold {fold} =====")

        # Explicit copies: the transformed values are written back into these
        # frames, which must never be ambiguous views of the caller's data.
        x_train = x.iloc[train_indices].copy()
        x_test = x.iloc[test_indices].copy()
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # The preprocessing is fitted on the training part only and then applied
        # to the held-out part.
        x_train[features] = pipe.fit_transform(x_train[features])
        x_test[features] = pipe.transform(x_test[features])

        model = self.hypermodel.build(trial.hyperparameters)
        model.fit(
            x_train, y_train,
            validation_data=(x_test, y_test),
            shuffle=True,
            verbose=0,
            callbacks=[
                # Stop once the validation AUC has not improved for 5 epochs.
                tf.keras.callbacks.EarlyStopping(monitor='val_aucroc', mode='max', patience=5),
            ],
            batch_size=batch_size,
            epochs=epochs,
        )

        fold_pred = model.predict(x_test)[:, -1]
        auc = roc_auc_score(y_test, fold_pred)
        print(f"fold {fold} - nn aucroc: {auc:.6f}\n")

        self.oracle.update_trial(trial.trial_id, {'agg_val_aucroc': auc})

In [None]:
# Bayesian-optimisation search over the space declared in make_model, maximising
# the 'agg_val_aucroc' value that CVTuner.run_trial reports for each trial.
tuner = CVTuner(
  hypermodel=make_model,
  oracle=kt.oracles.BayesianOptimization(
    objective=kt.Objective('agg_val_aucroc', direction='max'),
    max_trials=200,
    #executions_per_trial=3,
    seed=SEED,
  ))

In [None]:
# Launch the search; these keyword arguments match CVTuner.run_trial's parameters.
tuner.search(
    x=train[features],
    y=train[target],
    batch_size=1024,
    epochs=100,
)

In [None]:
# One row per trial: best step, trial id, score and the hyperparameter values.
trials_df = pd.DataFrame(list(map(parse_trial_state, tuner.oracle.trials.values())))

trials_df

In [None]:
# get_best_hyperparameters returns a list; keep its first entry.
best_hp = tuner.get_best_hyperparameters()[0]
# Show the 'values' entry of its config.
best_hp.get_config()['values']

# Log

/////// best average /////////

/// 1024 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units ////////
0.8566036560783801 ver5

/// 512 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units ////////
0.8566107965350984 ver4

/// 256 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units //////// 
0.8566356062584418 ver2

/// 128 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units //////// 
0.8566247198481989 ver3


//// best raw //////

/// 256 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units //////// 
0.8566257218491418 ver6


//// all raw //////

/// 256 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units //////// 
0.8566231753967457 ver8


//// all average //////

/// 512 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units ////////
0.8566152693393069 ver10

/// 256 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units //////// 
0.8566498945718858 ver9

/// 128 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units //////// 
0.8566580880795951 ver11
0.8566806237507989 ver13
0.8566838905038725 ver14
0.856689976609599 ver15
no improvement ver16

/// 64 quant normal kbins uniform with dropout, dstep=400, decreasing hidden units //////// 
0.8565663814933083 ver12