In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
from path import Path
import gc

import optuna
from sklearn.model_selection import StratifiedKFold

from scipy.special import erfinv

import tensorflow as tf
# Turn on TensorFlow's memory-growth option for every GPU it reports.
# NOTE(review): per the TensorFlow docs this has to be configured before the GPUs are
# initialized, which is presumably why it sits directly after the import — keep it there.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu,True)
    
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1
from tensorflow.keras.metrics import AUC

from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.layers import Activation, LeakyReLU
# Register the key 'leaky-relu' in Keras' custom-object registry, mapped to an Activation
# layer that wraps LeakyReLU(alpha=0.2).
# NOTE(review): presumably so 'leaky-relu' can be given as an activation name; none of the
# visible cells use it (Config.activation is 'selu') — confirm it is still needed.
get_custom_objects().update({'leaky-relu':Activation(LeakyReLU(alpha=0.2))})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


/kaggle/input/porto-seguro-safe-driver-prediction/sample_submission.csv
/kaggle/input/porto-seguro-safe-driver-prediction/train.csv
/kaggle/input/porto-seguro-safe-driver-prediction/test.csv


In [2]:
def gpu_cleanup(objects):
    """Clear the Keras backend session and force a garbage-collection pass.

    Does nothing when ``objects`` is falsy (``None``, an empty container, ``0``...).
    Otherwise the local reference to ``objects`` is dropped, ``K.clear_session()``
    is called and ``gc.collect()`` is run.

    NOTE(review): ``del`` only removes this function's own reference; any
    reference the caller still holds keeps the objects alive.
    """
    if not objects:
        return
    del objects
    K.clear_session()
    gc.collect()

In [3]:
class Config:
    """Notebook-wide settings, kept as plain class attributes."""
    # Directory the competition CSV files are read from. `Path` is the one imported
    # from the `path` package at the top of the notebook, so `/` joins path segments.
    input_path = Path('/kaggle/input/porto-seguro-safe-driver-prediction')
    # NOTE(review): the dae_* / reuse_autoencoder values presumably configure a denoising
    # autoencoder stage; their consumers are outside the visible cells — confirm.
    dae_batch_size = 128 
    dae_num_epoch = 50
    dae_architecture = [1500,1500,1500]
    reuse_autoencoder = False
    
    # NOTE(review): presumably the training settings and layer sizes of the supervised
    # network built later in the notebook — confirm against the model-building cell.
    batch_size = 128
    num_epoch = 150
    units = [64,32]
    input_dropout = 0.06
    dropout = 0.08
    activation = 'selu'
    
    # NOTE(review): presumably the number of cross-validation folds, a switch for an
    # architecture/hyper-parameter search, and the random seed — usage not visible here.
    cv_folds = 5
    nas = False
    random_state = 0 

# Single shared instance that the rest of the notebook reads its settings from.
config = Config()

In [4]:
# Load the competition tables; the 'id' column becomes the row index of each frame.
train = pd.read_csv(config.input_path / 'train.csv', index_col='id')
test = pd.read_csv(config.input_path / 'test.csv', index_col='id')
submission = pd.read_csv(config.input_path / 'sample_submission.csv', index_col='id')

# Column groups are identified purely by fragments of their names.
calc_features = [feat for feat in train.columns if '_calc' in feat]
cat_features = [feat for feat in train.columns if '_cat' in feat]

# Split the label off so that train and test carry the same columns.
target = train["target"]
train = train.drop(columns=['target'])

# Remove the '_calc' columns from both frames.
train = train.drop(columns=calc_features)
test = test.drop(columns=calc_features)

# One-hot encode the '_cat' columns.
train = pd.get_dummies(train, columns=cat_features)
test = pd.get_dummies(test, columns=cat_features)

# The two frames are encoded independently, so a category level that occurs in only one
# of them yields a different column set. Index.equals copes with differing lengths (an
# element-wise `==` raises ValueError there) and the message names the odd columns out.
assert train.columns.equals(test.columns), (
    "train/test columns differ after one-hot encoding (symmetric difference: "
    f"{sorted(set(train.columns) ^ set(test.columns))})"
)

In [5]:
print("Applying GaussRank to columns: ", end='')
# Keep every column whose name contains none of the markers '_bin', '_cat', '_missing'.
to_normalize = [
    name for name in train.columns
    if not any(marker in name for marker in ('_bin', '_cat', '_missing'))
]
print(to_normalize)

Applying GaussRank to columns: ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15']


In [6]:
def to_gauss(x):
    """Map values from the open interval (-1, 1) onto standard-normal quantiles.

    Computes ``sqrt(2) * erfinv(x)``, which is the standard normal quantile of
    ``(x + 1) / 2``. Accepts scalars or NumPy arrays.
    """
    return np.sqrt(2) * erfinv(x)


def normalize(data, norm_cols, bound=0.99):
    """Rank-Gauss transform the given columns of ``data`` in place.

    For every column in ``norm_cols`` the rows are ordered by value and the k-th
    smallest row receives the k-th of ``len(data)`` evenly spaced points in
    ``[-bound, bound]`` pushed through ``to_gauss``. Only the rank of a value
    matters, not its magnitude.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is modified in place. The new values are written
        back by index alignment, so the index has to be unique.
    norm_cols : iterable of str
        Names of the columns to transform.
    bound : float, default 0.99
        Half-width of the uniform grid. Must lie strictly between 0 and 1
        because ``erfinv`` is infinite at +/-1.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    ValueError
        If ``bound`` is not strictly between 0 and 1.

    Notes
    -----
    Tied values do not share a rank: they receive distinct neighbouring targets
    in whatever order the sort leaves them.
    """
    if not 0 < bound < 1:
        raise ValueError(f"bound must lie strictly between 0 and 1, got {bound!r}")
    n = data.shape[0]
    # The target grid depends only on the row count, so build it once instead of per column.
    normal = to_gauss(np.linspace(start=-bound, stop=bound, num=n))
    for col in norm_cols:
        sorted_idx = data[col].sort_values().index
        # Assigning a Series aligns on the index, which puts each target back on its own row.
        # The copy keeps columns from ever sharing one underlying buffer.
        data[col] = pd.Series(index=sorted_idx, data=normal.copy())
    return data

# Rank-transform the selected columns of each frame. The two calls are independent, so
# train and test are each ranked against their own rows only.
train = normalize(train, to_normalize)
test = normalize(test, to_normalize)

In [7]:
# Remember the column labels and row ids before the frames are replaced by bare arrays.
features, train_index, test_index = train.columns, train.index, test.index

# From here on `train` and `test` are float32 NumPy matrices, no longer DataFrames.
train = train.to_numpy().astype(np.float32)
test = test.to_numpy().astype(np.float32)

In [13]:
def plt_keras_history(history, measures):
    """Plot the per-epoch curves of the given measures from a Keras ``History``.

    One panel is drawn per measure on a two-column grid. Each panel shows the
    training curve and, when ``history.history`` also holds a
    ``'val_' + measure`` entry, the validation curve.

    Parameters
    ----------
    history : object
        Must expose ``epoch`` (used as x values) and ``history`` (a mapping from
        measure name to its per-epoch values).
    measures : sequence of str
        Keys of ``history.history`` to plot. Nothing is drawn when it is empty.

    Raises
    ------
    KeyError
        If a requested measure is missing from ``history.history``.
    """
    if not measures:
        return
    rows = (len(measures) + 1) // 2  # two panels per row, rounded up
    # squeeze=False always yields a 2-D array of Axes, so flattening needs no special
    # case for a single row. The figure height grows with the number of rows.
    fig, grid = plt.subplots(rows, 2, figsize=(15, 5 * rows), squeeze=False)
    plt.subplots_adjust(top=0.99, bottom=0.01, hspace=0.4, wspace=0.2)
    panels = grid.ravel()
    for panel, measure in zip(panels, measures):
        panel.set_title(measure + ' history')
        panel.plot(history.epoch, history.history[measure],
                   label='Train ' + measure)
        # Validation curves exist only when the model was fitted with validation data.
        val_key = 'val_' + measure
        if val_key in history.history:
            panel.plot(history.epoch, history.history[val_key],
                       label="Validation " + measure)
        panel.set(xlabel='epochs', ylabel=measure)
        panel.legend()
    # An odd number of measures leaves the last grid cell unused; hide its empty frame.
    for spare in panels[len(measures):]:
        spare.set_visible(False)
    plt.show()
    

# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from numba import jit
@jit

def eval_gini(y_true, y_pred):
    """Normalized Gini score of the predictions ``y_pred`` against the labels ``y_true``.

    The labels are reordered by ascending prediction and then visited from the
    highest-scored row down. Each row adds ``label * delta`` to a running total,
    where ``delta`` is the sum of ``1 - label`` over the rows already visited.
    The total is divided by ``ntrue * (n - ntrue)`` and the function returns
    ``1 - 2 * ratio``.

    NOTE(review): assumes ``y_true`` holds 0/1 labels — TODO confirm. Under that
    assumption the ratio is the share of (positive, negative) pairs in which the
    negative is scored higher, so the result equals ``2 * AUC - 1`` when no
    predictions are tied. Tied predictions are ordered however ``np.argsort``
    leaves them.

    NOTE(review): if the labels sum to 0 or to ``n`` (a single class, under the
    0/1 assumption) the denominator is zero; this case is not handled.
    """
    y_true = np.asarray(y_true)
    # Reorder the labels by ascending prediction.
    y_true = y_true[np.argsort(y_pred)]
    ntrue = 0  # running sum of the labels
    gini = 0  # accumulates label * delta
    delta = 0  # running sum of (1 - label) over the rows already visited
    n = len(y_true)
    # Walk from the highest prediction down to the lowest.
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i 
        gini += y_i * delta
        delta += 1 - y_i
    # Scale by ntrue * (n - ntrue) and map the ratio to 1 - 2 * ratio.
    gini = 1 - 2 * gini / (ntrue *(n - ntrue))
    return gini 

  def eval_gini(y_true, y_pred):


4