In [None]:
!pip install tensorflow_decision_forests

In [None]:
!pip install wurlitzer

In [None]:
import tensorflow_decision_forests as tfdf
import numpy as np
import pandas as pd
import tensorflow as tf


In [None]:
# Report which TensorFlow Decision Forests release is installed.
print("Found TensorFlow Decision Forests v", tfdf.__version__, sep="")

In [None]:
# Load the train and test tables; the first CSV column becomes the row index.
train = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col=0,
)
# Preview the first rows of the training table.
train.head()

As we can see, the dataset contains numerical and null values. TF-DF supports all types of features (i.e., numerical, categorical and null/missing), so we don't need to do any data preprocessing (e.g., one-hot encoding, normalization).

Labels: Keras metrics expect integers. The label (`claim`) is already stored as an integer, so we don't need to do any additional work here.

In [None]:
# Add a 'nan_count' feature: the number of missing values in each row.
# The same column is added to both tables so train and test share the same feature set.
train['nan_count'] = train.isna().sum(axis='columns')
test['nan_count'] = test.isna().sum(axis='columns')

In [None]:
# Draw a reproducible 30% sample of the rows for training, then take 5% of the
# remaining (non-training) rows as the validation set.
xtrain = train.sample(frac=0.3, random_state=0)
valid = (
    train
    .drop(index=xtrain.index)
    .sample(frac=0.05, random_state=0)
)

In [None]:
# Check how many training rows fall into each 'claim' class (shown as a single-row table).
xtrain['claim'].value_counts().to_frame().transpose()

In [None]:

# Same per-class row count for the validation split (shown as a single-row table).
valid['claim'].value_counts().to_frame().transpose()

In [None]:
#filling nan values with 0 (disabled: TF-DF handles missing values natively, so no imputation is applied)
#xtrain=xtrain.fillna(0)
#valid_data=valid.fillna(0)
#test=test.fillna(0)

In [None]:
# Convert the pandas DataFrames into TensorFlow datasets.
# NOTE: this rebinds `train`, `valid` and `test` from DataFrames to datasets, so
# the cell is not safe to run twice without re-running the cells above it.
train = tfdf.keras.pd_dataframe_to_tf_dataset(
    xtrain,
    label='claim',
)
valid = tfdf.keras.pd_dataframe_to_tf_dataset(
    valid,
    label='claim',
)
# The test dataset is built without a label.
test = tfdf.keras.pd_dataframe_to_tf_dataset(
    test,
)

In [None]:
%%time

# Alternative learner, kept for reference: a Random Forest model.
#model = tfdf.keras.RandomForestModel()

# Train a Gradient Boosted Trees model with num_trees=500.
# At this point `train` and `valid` are the TensorFlow datasets created by the
# conversion cell, not the original DataFrames.
model = tfdf.keras.GradientBoostedTreesModel(num_trees=500)
# Track accuracy as a Keras metric.
model.compile(metrics=["accuracy"])
# Fit on the training dataset, passing the validation dataset alongside it.
model.fit(x=train, validation_data=valid)

In [None]:
# Show the summary report of the trained model.
model.summary()

In [None]:
# List the input features known to the trained model.
inspector = model.make_inspector()
inspector.features()

In [None]:
# Show the variable (feature) importances reported by the trained model.
inspector = model.make_inspector()
inspector.variable_importances()

In [None]:
# Evaluate on the labelled validation dataset.
# The `test` dataset is converted without a label column, so it cannot be used
# to compute loss or accuracy; `valid` carries the 'claim' label.
evaluation = model.evaluate(valid, return_dict=True)
print()

# Print each metric on its own line, rounded to four decimals.
for name, value in evaluation.items():
    print(f"{name}: {value: .4f}")

In [None]:
# Save the trained model to a directory under /tmp.
# NOTE(review): /tmp is typically not persisted after the session — confirm the
# saved model is not needed later.
model.save("/tmp/my_saved_model1")

In [None]:
# Plot the first tree of the model (tree_idx=0), limited to a depth of 3.
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0, max_depth=3)

In [None]:
# Show the evaluation stored in the trained model.
# NOTE(review): for a gradient boosted trees model this is presumably computed
# on validation data — confirm.
model.make_inspector().evaluation()

In [None]:
import matplotlib.pyplot as plt

# Training logs recorded by the learner; each entry holds a tree count and the
# evaluation measured with that many trees.
logs = model.make_inspector().training_logs()
num_trees = [log.num_trees for log in logs]

# The model is a GradientBoostedTreesModel, so the logged evaluation is measured
# on validation data rather than out-of-bag samples; the axis labels say so.
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(num_trees, [log.evaluation.accuracy for log in logs])
ax_accuracy.set_xlabel("Number of trees")
ax_accuracy.set_ylabel("Accuracy (validation)")

ax_loss.plot(num_trees, [log.evaluation.loss for log in logs])
ax_loss.set_xlabel("Number of trees")
ax_loss.set_ylabel("Logloss (validation)")

plt.show()

In [None]:
# Reload the raw training table: the name `train` was rebound to a TensorFlow
# dataset by the conversion cell, so the DataFrame has to be read again.
train = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/train.csv',
    index_col=0,
)


In [None]:
# Recreate the 'nan_count' feature (number of missing values per row).
train['nan_count'] = train.isna().sum(axis='columns')

# Rebuild the train/validation split with the same fractions and seed as before.
xtrain = train.sample(frac=0.3, random_state=0)
valid = (
    train
    .drop(index=xtrain.index)
    .sample(frac=0.05, random_state=0)
)
# Zero-filled copy of the validation rows; the true labels are read from it later.
valid_data = valid.fillna(value=0)

In [None]:
# Rebind `valid` to a labelled TensorFlow dataset built from the validation rows.
valid = tfdf.keras.pd_dataframe_to_tf_dataset(
    valid,
    label='claim',
)

In [None]:
from sklearn.metrics import roc_auc_score

# True labels come from the DataFrame copy of the validation rows; predictions
# come from the model applied to the matching TensorFlow dataset.
y_true = valid_data["claim"]
predictions = model.predict(valid)

# Area under the ROC curve of the validation predictions.
ROC_AUC = roc_auc_score(y_true, predictions)
print("The ROC AUC score is %.5f" % ROC_AUC)

In [None]:
# Load the sample submission and overwrite its 'claim' column with the model's
# predictions for the test dataset.
sample = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
)
test_predictions = model.predict(test)
sample['claim'] = test_predictions

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv(path_or_buf='submission.csv', index=False)

### Reference: 
https://www.kaggle.com/carlmcbrideellis/classification-using-tensorflow-decision-forests

https://www.tensorflow.org/decision_forests/tutorials/beginner_colab