In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Source

Application of the TF-DF tutorial available at:
https://www.tensorflow.org/decision_forests/tutorials/intermediate_colab

It is highly recommended to tune the hyper-parameters, try other models (e.g. gradient boosting), and apply cross-validation folds.

In [None]:
!pip install tensorflow_decision_forests

In [None]:
!pip install wurlitzer

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import math
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# `sys_pipes` is used further down as a context manager around TF-DF training
# (presumably to surface the native training logs in the notebook output).
# Prefer wurlitzer; fall back to the Colab helper only when wurlitzer is not
# installed. Catching ImportError specifically avoids masking unrelated
# failures (e.g. KeyboardInterrupt) that a bare `except:` would swallow.
try:
    from wurlitzer import sys_pipes
except ImportError:
    from colabtools.googlelog import CaptureLog as sys_pipes

In [None]:
# Set TensorFlow's global random seed.
tf.random.set_seed(seed=22)

In [None]:
import keras
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

### Import Data

In [None]:
# Load the training table, using the `id` column as the row index, and preview it.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-nov-2021/train.csv',
    sep=',',
    index_col='id',
)
train.head()

In [None]:
# Load the test table, using the `id` column as the row index, and preview it.
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-nov-2021/test.csv',
    sep=',',
    index_col='id',
)
test.head()

In [None]:
# Separate the label column from the features:
# `target` keeps the labels, `train` keeps every other column.
target = train.loc[:, 'target']
train = train.drop('target', axis=1)

In [None]:
# Add two very simple row-wise features: the mean and the standard deviation
# of each row.
#
# Both statistics are computed from the base feature columns only. Selecting
# the base columns up front keeps `tot_mean` out of the `tot_std` computation
# and makes the cell safe to re-run (the derived columns are never folded back
# into the statistics).
base_cols = [col for col in train.columns if col not in ('tot_mean', 'tot_std')]

for frame in (train, test):
    # Compute both statistics before assigning either, so neither sees the other.
    row_mean = frame[base_cols].mean(axis=1)
    row_std = frame[base_cols].std(axis=1)
    frame['tot_mean'] = row_mean
    frame['tot_std'] = row_std

### Scaling Data/Split Train df/Convert train/test into tensorflow dataset

In [None]:
# Min-max scale the features. The scaler is fitted on the training frame only
# and then applied to both frames; the results are re-wrapped as DataFrames so
# the original index and column labels are kept.
rs = MinMaxScaler()
rs.fit(train)
train = pd.DataFrame(rs.transform(train), columns=train.columns, index=train.index)
test = pd.DataFrame(rs.transform(test), columns=test.columns, index=test.index)

In [None]:
# Hold out 20% of the rows for validation, stratified on the target,
# with a fixed random_state for a repeatable split.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [None]:
# Wrap the training split as a (features, label) tf.data dataset.
train_pairs = (X_train, y_train)
train_ds = tf.data.Dataset.from_tensor_slices(train_pairs)

In [None]:
# Wrap the validation split as a (features, label) tf.data dataset.
val_pairs = (X_val, y_val)
val_ds = tf.data.Dataset.from_tensor_slices(val_pairs)

In [None]:
# Wrap the (unlabelled) test features as a tf.data dataset.
test_ds = tf.data.Dataset.from_tensor_slices(test)

In [None]:
# Group every dataset into batches of 128 rows.
# NOTE: re-running this cell batches the already-batched datasets again.
train_ds, val_ds, test_ds = (
    dataset.batch(128) for dataset in (train_ds, val_ds, test_ds)
)

## The Model

The example shows that it is possible to train the model in two steps:
1) Normalization > NN > Classification

2) Replace the last layer of the NN (before classification output) with a TF-DF tree model (RF, GB..)

## NN Model

In [None]:
# Callbacks, both watching the validation loss:
#  - reduce_lr: multiply the learning rate by 0.1 after 2 epochs on a plateau
#  - early: stop training after 5 epochs without improvement
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=2,
    verbose=0,
)
early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    verbose=1,
)

In [None]:
# Feed-forward network: four Dense(ELU) -> BatchNormalization -> Dropout
# stages, a named embedding layer ("last"), and a sigmoid classification head.

# The input width follows the training frame instead of being hard-coded.
input_1 = tf.keras.Input(shape=(X_train.shape[1],))

y = tf.keras.layers.Dense(300, activation=tf.nn.elu, kernel_initializer='he_normal')(input_1)
yn = tf.keras.layers.BatchNormalization()(y)
yd = tf.keras.layers.Dropout(0.2)(yn)

y0 = tf.keras.layers.Dense(150, activation=tf.nn.elu, kernel_initializer='he_normal')(yd)
y0n = tf.keras.layers.BatchNormalization()(y0)
y0d = tf.keras.layers.Dropout(0.2)(y0n)

y1 = tf.keras.layers.Dense(150, activation=tf.nn.elu, kernel_initializer='he_normal')(y0d)
y1n = tf.keras.layers.BatchNormalization()(y1)
y1d = tf.keras.layers.Dropout(0.1)(y1n)

y2 = tf.keras.layers.Dense(93, activation=tf.nn.elu, kernel_initializer='he_normal')(y1d)
y2n = tf.keras.layers.BatchNormalization()(y2)
y2d = tf.keras.layers.Dropout(0.1)(y2n)

# Embedding layer: its output is what the headless network exposes to the
# tree model in the gradient-boosting section of this notebook.
last_layer = tf.keras.layers.Dense(93, activation=tf.nn.elu, kernel_initializer='he_normal', name="last")(y2d)

# The head must sit on top of `last_layer` (not `y2d`). Otherwise "last" lies
# outside the trained graph, keeps its random initial weights, and the tree
# model downstream would be fed an untrained projection.
classification_output = tf.keras.layers.Dense(1, activation='sigmoid')(last_layer)

nn_model = tf.keras.models.Model(input_1, classification_output)

In [None]:
# Compile the network for binary classification: Adam optimizer,
# binary cross-entropy loss, AUC reported as the metric.
nn_optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.9,
    epsilon=1e-07,
)
nn_model.compile(
    optimizer=nn_optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC()],
)

In [None]:
# Render the network graph (layer names, shapes and dtypes) to model1.png.
tf.keras.utils.plot_model(nn_model, to_file="model1.png", show_layer_names=True, show_shapes=True, show_dtype=True)

In [None]:
# Train for at most 50 epochs on the training split, validating on the
# held-out split with the LR-reduction and early-stopping callbacks,
# then print the layer summary.
nn_model.fit(
    train_ds,
    epochs=50,
    validation_data=val_ds,
    callbacks=[reduce_lr, early],
)
nn_model.summary()

# Gradient Boosting

In [None]:
# Build the gradient-boosted trees model.
# The network is cut at `last_layer` and used as the preprocessing stage,
# so the trees consume the network's embedding rather than the raw features.
nn_without_head = tf.keras.models.Model(
    inputs=nn_model.inputs,
    outputs=last_layer,
)
df_and_nn_model = tfdf.keras.GradientBoostedTreesModel(
    hyperparameter_template='benchmark_rank1',
    preprocessing=nn_without_head,
)

In [None]:
# Dataset for the tree model: the full scaled training frame with its labels
# (training and validation rows together), in batches of 128.
ds_train = tf.data.Dataset.from_tensor_slices((train, target)).batch(128)

In [None]:
# Attach the AUC metric, then fit the tree model inside `sys_pipes()`.
auc_metric = tf.keras.metrics.AUC()
df_and_nn_model.compile(metrics=[auc_metric])

with sys_pipes():
    df_and_nn_model.fit(ds_train)

In [None]:
# Plot accuracy and log-loss against the number of trees.
# The model is gradient-boosted trees, whose training logs are evaluated on
# the validation split held out during training (not out-of-bag, which is a
# random-forest concept), so the axes are labelled accordingly.
logs = df_and_nn_model.make_inspector().training_logs()
num_trees = [log.num_trees for log in logs]

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(num_trees, [log.evaluation.accuracy for log in logs])
ax_accuracy.set_xlabel("Number of trees")
ax_accuracy.set_ylabel("Accuracy (validation)")

ax_loss.plot(num_trees, [log.evaluation.loss for log in logs])
ax_loss.set_xlabel("Number of trees")
ax_loss.set_ylabel("Logloss (validation)")

plt.show()

### Calibration using Logistic Regression Model

In [None]:
# Tree-model predictions for the training rows (p) and the test rows (p2);
# they are the inputs of the logistic-regression calibrator.
p = df_and_nn_model.predict(ds_train)
p2 = df_and_nn_model.predict(test_ds)

# Candidate values of C for the grid search.
parameters = {
    'C': [
        0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007,
        0.001, 0.005, 0.01, 0.1, 1, 10,
    ],
}

# 10-fold grid search over C, scored by ROC AUC.
lr = LogisticRegression()
clf = GridSearchCV(
    lr,
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
)

In [None]:
# Run the grid search: fit the calibrator on the training predictions and labels.
clf.fit(X=p, y=target)

In [None]:
# Calibrated class probabilities for the test rows, from the best estimator
# found by the grid search.
best_calibrator = clf.best_estimator_
p3 = best_calibrator.predict_proba(p2)

# Submission

In [None]:
# Load the sample submission (indexed by `id`) as the template, and preview it.
sub = pd.read_csv(
    '/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv',
    sep=',',
    index_col='id',
)
sub.head()

In [None]:
# Column 1 of predict_proba holds the positive-class probability.
positive_proba = p3[:, 1]
sub['target'] = positive_proba
sub.head()

In [None]:
# Turn the `id` index back into a regular column, then write the CSV
# without the (now default) row index.
sub = sub.reset_index(drop=False)
sub.to_csv('submission.csv', index=False)

# continue.....