In [None]:
import numpy as np
import pandas as pd
import sys
import os
from time import time
import tensorflow as tf
import gc
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold

# But why?

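No real reason. Why not? The model below reshapes tabular features into a 5×5×5 volume and runs 3D convolutions over it, purely as an experiment.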

In [None]:
def ff(num_input_columns, BLOCKS = 4, drop_rate=.3, kernel_size = 2):
    """Build and compile a binary classifier that runs 3D convolutions over tabular features.

    The input vector is batch-normalised, passed through dropout, projected to
    125 units by a dense layer and reshaped into a 5x5x5 single-channel volume.
    Each of the ``BLOCKS`` blocks then applies three Conv3D + BatchNormalization
    pairs; the last convolution of a block uses 'valid' padding, so the volume
    shrinks with every block. The filter count starts at 4 and is multiplied by
    4 after each block. A dense head (100 -> 50 -> 1, sigmoid) produces the
    output.

    Args:
        num_input_columns: Number of feature columns per sample.
        BLOCKS: Number of convolutional blocks.
        drop_rate: Dropout rate applied to the normalised input.
        kernel_size: Kernel size passed to every Conv3D layer.

    Returns:
        A compiled ``tf.keras.Model`` (Adam with learning rate 0.001, binary
        cross-entropy loss, AUC metric named ``auc``).

    Raises:
        ValueError: If ``kernel_size`` is an int and the 5x5x5 volume is too
            small to survive ``BLOCKS`` 'valid' convolutions.
    """
    side = 5  # edge length of the cube the dense projection is reshaped into

    # Each block ends with one 'valid' convolution, which removes
    # (kernel_size - 1) from every spatial dimension. Fail early with a clear
    # message instead of a shape error from deep inside the framework.
    if isinstance(kernel_size, int) and BLOCKS * (kernel_size - 1) > side - 1:
        raise ValueError(
            f'BLOCKS={BLOCKS} with kernel_size={kernel_size} shrinks the '
            f'{side}x{side}x{side} volume below one voxel'
        )

    # Input: pass the shape as a tuple; a bare int is not accepted by every
    # Keras version.
    inp = tf.keras.layers.Input(shape=(num_input_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(drop_rate)(x)
    x = tf.keras.layers.Dense(side ** 3, activation = 'relu')(x)
    x = tf.keras.layers.Reshape(target_shape=(side, side, side, 1))(x)

    num_filters = 4
    for i in range(1, BLOCKS + 1):
        # Two size-preserving convolutions followed by one shrinking convolution.
        for j, padding in enumerate(('same', 'same', 'valid'), start=1):
            x = tf.keras.layers.Conv3D(filters = num_filters, kernel_size = kernel_size,
                                       padding=padding, name=f'Block{i}_conv{j}',
                                       activation='relu')(x)
            x = tf.keras.layers.BatchNormalization(name = f'Block{i}_BN{j}')(x)
        num_filters = num_filters * 4

    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(100, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(rate=.1)(x)
    x = tf.keras.layers.Dense(50, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=.1)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs=inp, outputs=x)

    # Name the metric explicitly so the logged key is always `auc` / `val_auc`,
    # independent of how many AUC metrics were created earlier in the session.
    model.compile(optimizer=tf.keras.optimizers.Adam(.001),
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.AUC(name='auc')])
    return model

In [None]:
#model = ff(108, 4)
#tf.keras.utils.plot_model(model, show_shapes=True)

# Data Prep

In [None]:
# Load the feature tables; only the `nan_count` column is read from the *_rg_min files.
train = pd.read_feather('/kaggle/input/september-feather/train_rg')
test = pd.read_feather('/kaggle/input/september-feather/test_rg')
train_nan_count = pd.read_feather('/kaggle/input/september-feather/train_rg_min', columns=['nan_count'])
test_nan_count = pd.read_feather('/kaggle/input/september-feather/test_rg_min', columns=['nan_count'])

ss = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv')

TARGET = 'claim'
# Every column whose name contains the letter 'f' is treated as a raw feature.
FEATURES = [col for col in train.columns if 'f' in col]

######################
# Feature engineering
######################

# Row-wise summary statistics, computed over the raw feature columns only.
ROW_WISE = ['min','max','std','var','mean']
for df in (train, test):
    raw_values = df[FEATURES]  # slice once, before any new column is added
    for stat in ROW_WISE:
        df[stat] = getattr(raw_values, stat)(axis=1)

# Count of missing values per row, scaled by 14.
# NOTE(review): the meaning of the constant 14 is not visible here — confirm.
for df, counts in ((train, train_nan_count), (test, test_nan_count)):
    df['nan_count'] = counts['nan_count'] / 14

# Final feature list fed to the network.
FEATURES = [*FEATURES, *ROW_WISE, 'nan_count']

In [None]:
#######################
# Cross-validation parameters
#######################
NUM_FOLDS = 5 # number of folds passed to StratifiedKFold
NUM_STARTS = 1 # number of models trained (and averaged) per fold
NUM_SPLITS = 1 # number of times the whole K-fold procedure is repeated, each with its own random_state

In [None]:
# Repeated stratified K-fold training.
#   oof   - one column per split, holding out-of-fold predictions for every training row
#   preds - one column per split, holding test predictions averaged over folds and starts
oof = pd.DataFrame()
preds = pd.DataFrame()

# Stop a fit once validation AUC has not improved for 20 epochs. `val_auc` is
# the validation value of the AUC metric compiled into the model by `ff`.
# NOTE(review): restore_best_weights is left at its default, so predictions come
# from the weights of the last epoch run — confirm this is intended.
ES = tf.keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=0, patience=20, verbose=0, mode='max')

#Number of times to do KFold cross val
for random_state in tqdm(range(NUM_SPLITS)):
    skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=random_state)

    # Accumulate in plain NumPy buffers and store them as columns once the split
    # is finished. Writing through `oof[col].iloc[idx] = ...` is a chained
    # assignment (it does not reach `oof` under pandas Copy-on-Write), and using
    # `=` there kept only the last random start instead of averaging all of them.
    oof_split = np.zeros(train.shape[0])
    preds_split = np.zeros(test.shape[0])

    for f, (t_idx, v_idx) in enumerate(skf.split(X=train, y=train[TARGET])):
        tr_temp = train[FEATURES].iloc[t_idx]
        tr_temp_target = train[TARGET].iloc[t_idx]
        val_temp = train[FEATURES].iloc[v_idx]
        val_temp_target = train[TARGET].iloc[v_idx]
        Repeat_start = time()

        #Number of random starts per fold
        for repeat in range(NUM_STARTS):
            # Drop the previous model's graph state before building a new model.
            tf.keras.backend.clear_session()
            start = time()
            model = ff(len(FEATURES), BLOCKS = 4, drop_rate=.3, kernel_size = 2)

            model.fit(tr_temp, tr_temp_target, batch_size=1024, callbacks=[ES], epochs=200,
                      validation_data=(val_temp, val_temp_target))

            # Each start contributes 1/NUM_STARTS of the out-of-fold prediction ...
            oof_split[v_idx] += np.squeeze(model.predict(val_temp, batch_size=100000)) / NUM_STARTS
            # ... and 1/(NUM_STARTS * NUM_FOLDS) of the test prediction.
            preds_split += np.squeeze(model.predict(test[FEATURES],
                                                    batch_size=100000)) / (NUM_STARTS * NUM_FOLDS)
            print(f'{time() - start :.2f}', end=', ')
        print(f'Repeat total: {time() - Repeat_start :.2f}')

    oof[random_state] = oof_split
    preds[f'Fold_{random_state}'] = preds_split

In [None]:
# ROC AUC of the out-of-fold predictions, one score per cross-validation split.
y_true = train[TARGET].values
scores = []
for col in oof.columns:
    scores.append(roc_auc_score(y_true, oof[col].values))

score = np.mean(scores)
print(f'Scores on all Cross validation splits: {scores}')
print(f'Mean AUC from splits: {score}')

In [None]:
# Report how many test predictions are NaN in each column of `preds` ...
print(f'Null Preds: {preds.isnull().sum()}')
# ... then replace any NaN with 0.5 (modifies `preds` in place).
preds.fillna(value=.5, inplace=True)

In [None]:
# Write the out-of-fold predictions and the per-split test predictions to CSV.
oof.to_csv('oof.csv', index=False)
preds.to_csv('predictions.csv', index=False)

# The submitted value is the row-wise mean across all prediction columns.
ss[TARGET] = preds.mean(axis=1)
ss.to_csv('ss.csv', index=False)