# Tensorflow pipeline

I think neural networks will play a pivotal role in this month's competition. Here is a TensorFlow baseline model.

While your GPU is busy with XGBoost, make use of your TPU quota :).

This notebook is adapted from @sishihara's excellent notebook: https://www.kaggle.com/sishihara/neural-network-for-tabular-example

The learning rate scheduler code is from @shivansh002's excellent notebook: https://www.kaggle.com/shivansh002/i-am-groot-tpu-war


# Import Libraries

In [None]:
import gc
import math
import os
import re
from datetime import datetime

# TensorFlow's native logger takes its verbosity from the TF_CPP_MIN_LOG_LEVEL
# *environment variable*; binding a Python name of the same spelling has no
# effect. Export it before TensorFlow is imported so import-time messages are
# filtered too. The Python name is kept so existing references still resolve.
TF_CPP_MIN_LOG_LEVEL = 2
os.environ["TF_CPP_MIN_LOG_LEVEL"] = str(TF_CPP_MIN_LOG_LEVEL)

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Enable TPU/GPU

In [None]:
# Requested accelerator: "TPU" or "GPU". The strategy-setup cell below rewrites
# this to "GPU" when no TPU cluster can be resolved.
DEVICE = "TPU" #or "GPU"

In [None]:

# Pick a tf.distribute strategy for the requested DEVICE.
#
# With DEVICE == "TPU" the cell tries to resolve and initialise a TPU cluster.
# If either step fails, DEVICE is switched to "GPU" so that the default
# strategy below is always bound -- every later cell relies on `strategy`,
# `AUTO` and `REPLICAS` existing.
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # No TPU could be resolved in this environment.
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # Best-effort: report why initialisation failed and fall back to
            # the default strategy instead of leaving `strategy` undefined.
            print("failed to initialize TPU")
            print(f"reason: {err!r}")
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

# Configuration

In [None]:
# --- Output naming -----------------------------------------------------------
# Both values become part of the OOF and submission file names.
modelname = 'nn'
datenow = f'{datetime.now():%d%m%Y_%H%M%S}'

# --- Cross-validation --------------------------------------------------------
# Number of folds to train; it also selects which pre-split input CSV is read.
n_folds = 5

# --- Optimisation ------------------------------------------------------------
epochs = 30
batch_size = 1024
initial_learning_rate = 1e-5
decay_rate = 1e-8

# Helper Method

In [None]:
def visualize_history(history):
    """Plot training vs. validation curves for AUC and for loss.

    `history` is expected to expose a `.history` mapping with the keys
    'auc', 'val_auc', 'loss' and 'val_loss'. The AUC curves are drawn on the
    current figure; a new figure is opened for the loss curves.
    """
    logs = history.history
    # Read every series up front so a missing key fails before anything is drawn.
    curves = {
        metric: (logs[metric], logs[f'val_{metric}'])
        for metric in ('auc', 'loss')
    }
    x_axis = range(len(curves['auc'][0]))

    for position, (metric, (train_values, valid_values)) in enumerate(curves.items()):
        if position:
            # Every panel after the first gets its own figure.
            plt.figure()
        plt.plot(x_axis, train_values, 'bo', label=f'training {metric}')
        plt.plot(x_axis, valid_values, 'r', label=f'validation {metric}')
        plt.title(f'Training and Validation {metric}')
        plt.legend()

# Read Input Data

In [None]:
# Load the pre-split training data (selected by `n_folds`) and the test set.
train = pd.read_csv(f'../input/stratifiedkfoldsplits-oct2021/Stratified{n_folds}Fold_OCT2021_TPS.csv')
test = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')

target_col = 'target'
# Feature columns are the ones whose name starts with a lowercase 'f'.
numerical_cols = [name for name in train.columns if name.startswith('f')]

# Standardise one column at a time: each scaler is fitted on the training
# column and then applied to the matching test column.
for c in numerical_cols:
    prep = StandardScaler().fit(train[[c]])
    train[c] = prep.transform(train[[c]])
    test[c] = prep.transform(test[[c]])

y_train = train['target']
X_train = train.drop(columns=['id', 'target'])
X_test = test.drop(columns='id')

# The raw frames are no longer needed; release them before training.
del train, test
_ = gc.collect()

# Train

In [None]:
# Cross-validated training.
#
# For every fold a fresh network is built inside `strategy.scope()` (so the
# TPU/GPU strategy chosen earlier is actually used), trained on the other
# folds, and used to predict both the held-out rows (stored in `oof_train`)
# and the test set (appended to `y_preds`). Trained models are kept in `models`.
y_preds = []
models = []
oof_train = np.zeros((len(X_train),))

# Horizon of the exponential decay, in epochs: after this many epochs the
# learning rate would have been multiplied by `decay_rate`.
decay_epochs = 400

for fold in range(n_folds):

    print(f'************************** FOLD {fold + 1} **************************')

    # Boolean row masks over the full training frame (same length as X_train).
    train_index = X_train['Fold'] != fold
    valid_index = X_train['Fold'] == fold

    X_tr = X_train.loc[train_index, numerical_cols]
    X_val = X_train.loc[valid_index, numerical_cols]
    y_tr = y_train.loc[train_index]
    y_val = y_train.loc[valid_index]

    # Variables must be created under the strategy scope, otherwise the model
    # silently runs on the default device.
    with strategy.scope():
        model = keras.Sequential([
            layers.Dense(256, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(128, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(1, activation='sigmoid'),
        ])

        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            # Name the metric explicitly: the string shortcut 'AUC' can be
            # auto-suffixed ('auc_1', 'auc_2', ...) for later folds, which
            # would break `monitor='val_auc'` and `history.history['auc']`.
            metrics=[keras.metrics.AUC(name='auc')],
        )

    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_auc',
        patience=10,
        mode = 'max',
        verbose=1,
        restore_best_weights=True
    )

    # from @shivansh002 's excellent notebook https://www.kaggle.com/shivansh002/i-am-groot-tpu-war
    # Use the real number of training rows in this fold; the boolean mask has
    # one entry per row of the whole frame, whatever the fold count.
    steps_per_epoch = math.ceil(len(X_tr) / batch_size)
    scheduler = ExponentialDecay(initial_learning_rate, decay_epochs * steps_per_epoch, decay_rate)

    def lr_for_epoch(epoch, current_lr=None, _schedule=scheduler, _steps=steps_per_epoch):
        """Learning rate at the start of `epoch`.

        LearningRateScheduler passes an epoch index, whereas ExponentialDecay
        is defined over optimizer steps, so the epoch is converted to a step
        count first. The result is returned as a plain float.
        """
        return float(_schedule(epoch * _steps))

    lr = LearningRateScheduler(lr_for_epoch, verbose=1)

    history = model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[early_stopping,lr],
    )

    # Predict with the training batch size (the default of 32 is needlessly
    # slow) and on the same feature columns the model was fitted on.
    oof_train[valid_index.to_numpy()] = model.predict(X_val, batch_size=batch_size).ravel()
    y_pred = model.predict(X_test[numerical_cols], batch_size=batch_size).ravel()

    y_preds.append(y_pred)
    models.append(model)

    del X_tr,X_val,y_tr,y_val,train_index,valid_index
    _ = gc.collect()

In [None]:
# Score the out-of-fold predictions against the full training target, then
# save them (no index column) under a model- and timestamp-specific name.
cv_auc = roc_auc_score(y_train, oof_train)
print(f'CV: {cv_auc}')

oof_path = f'oof_{modelname}_{datenow}.csv'
pd.DataFrame(oof_train).to_csv(oof_path, index=False)

In [None]:
# `history` is rebound on every iteration of the training loop, so the curves
# drawn here are those of the last fold only.
visualize_history(history)

# Submission

In [None]:
# Average the per-fold test predictions and write them into the sample
# submission layout.
y_sub = sum(y_preds) / len(y_preds)

sub = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
sub['target'] = y_sub

submission_path = f'submission_{modelname}_{datenow}.csv'
sub.to_csv(submission_path, index=False)
sub.head()