# Notebook Goal

<ul><li>This notebook demonstrates how to train, evaluate and inspect a model with TensorFlow Decision Forests.</li>
<li>TensorFlow Decision Forests (TF-DF) is a library for the training, evaluation, interpretation and inference of Decision Forest models.</li></ul>

# Importing libraries

In [None]:
!pip install tensorflow_decision_forests

In [None]:
import tensorflow_decision_forests as tfdf

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import math

from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [None]:
# Report which TensorFlow Decision Forests release the kernel imported.
print(f"Found TensorFlow Decision Forests v{tfdf.__version__}")

# Datasets

In [None]:
# Load the competition CSV files. For train and test the first CSV column is
# used as the DataFrame index; the sample submission keeps a default index.
train = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/test.csv',
    index_col=0,
)
sub = pd.read_csv(
    '../input/tabular-playground-series-feb-2022/sample_submission.csv'
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Summary statistics, transposed so that every column of `train` becomes a row.
train.describe().transpose()

In [None]:
# Print (rows, columns) for both frames.
for split_name, frame in (('Train', train), ('Test', test)):
    print(f'{split_name} data shape:', frame.shape)

# Duplicate Values

In [None]:
# Count rows that are exact copies of an earlier row (the first occurrence of
# each row is not counted as a duplicate).
duplicates_train = train.duplicated().sum()
duplicates_test = test.duplicated().sum()

print(f'Duplicates in train data: {duplicates_train}')
print(f'Duplicates in test data: {duplicates_test}')

In [None]:
# Remove exact duplicate rows from the training data, keeping the first
# occurrence of each. `train` is rebound to the de-duplicated frame instead of
# being mutated with inplace=True, which keeps the step explicit and chainable.
train = train.drop_duplicates(keep='first')

# Re-count to confirm that no duplicate rows remain.
duplicates_train = train.duplicated().sum()

print('Train data shape:', train.shape)
print('Duplicates in train data: {0}'.format(duplicates_train))

In [None]:
# Convert the classification label into integer class ids.
#
# Details:
# This step is needed when the label is stored as a string, because Keras
# expects classification labels to be integers. `classes` keeps the original
# label values, so an id can be translated back with `classes[id]`.

# Name of the label column.
label = "target"

# Distinct label values; the position of a value in this list is its id.
classes = train[label].unique().tolist()
print("Label classes: {}".format(classes))

# Replace every label value by its position in `classes`.
label_values = train[label]
train[label] = label_values.map(classes.index)

In [None]:
# Split the dataset into a training and a validation dataset.

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly splits a pandas DataFrame into two disjoint DataFrames.

  Every row is assigned independently to the second (held-out) part with
  probability `test_ratio`, so the resulting sizes are only approximately
  proportional to `test_ratio`.

  Args:
    dataset: The DataFrame to split.
    test_ratio: Probability for a row to land in the held-out part.
    seed: Optional seed for the random draw. When given, the split is fully
      reproducible and independent of NumPy's global random state. When None
      (the default), the global NumPy random state is used, as before.

  Returns:
    A tuple `(kept, held_out)` of DataFrames that together contain every row
    of `dataset` exactly once.
  """
  if seed is None:
    # Legacy behaviour: draw from the global NumPy random state.
    draws = np.random.rand(len(dataset))
  else:
    # A dedicated generator keeps the split reproducible for a given seed.
    draws = np.random.default_rng(seed).random(len(dataset))
  held_out = draws < test_ratio
  return dataset[~held_out], dataset[held_out]


# Seed NumPy's global random state so that the random train/validation split
# is identical on every "Restart & Run All".
np.random.seed(42)

train_ds_pd, val_ds_pd = split_dataset(train)
print("{} examples in training, {} examples for testing.".format(
    len(train_ds_pd), len(val_ds_pd)))

# Convert the pandas dataframe into tensorflow datasets

In [None]:
# Wrap both pandas splits as TensorFlow datasets, using `label` as the target column.
train_ds, val_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label)
    for frame in (train_ds_pd, val_ds_pd)
)

In [None]:
# Columns the model is allowed to use, all treated as numerical features.
#
# NOTE(review): the previous version declared ten FeatureUsage objects, but
# "A1T2G6C1" and "A4T0G1C5" were each listed twice, so only eight distinct
# columns were actually selected. Each column is listed once here; if two other
# columns were intended instead of the repeats, add them to this list.
feature_names = [
    "A1T2G6C1",
    "A1T2G7C0",
    "A3T0G3C4",
    "A3T1G3C3",
    "A4T0G1C5",
    "A4T1G4C1",
    "A3T0G5C2",
    "A3T4G3C0",
]

all_features = [
    tfdf.keras.FeatureUsage(name=name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL)
    for name in feature_names
]

# Train the model

In [None]:
# Build a Gradient Boosted Trees model restricted to the columns listed in
# `all_features`; every other input column is ignored.
model_1 = tfdf.keras.GradientBoostedTreesModel(
    features=all_features,
    exclude_non_specified_features=True,
)

# Optional: report accuracy when the model is evaluated.
model_1.compile(metrics=["accuracy"])

# Train on the training split.
model_1.fit(x=train_ds)

# Evaluate the model

In [None]:
# Score the trained model on the validation split; the result is a
# {metric name: value} dictionary.
evaluation = model_1.evaluate(val_ds, return_dict=True)
print()

# One line per metric, rounded to four decimals.
for metric_name, metric_value in evaluation.items():
  print("{}: {:.4f}".format(metric_name, metric_value))

# Model Structure

In [None]:
# Print the summary of the trained model.
model_1.summary()

# Feature Importance

In [None]:
# List the input features as reported by the model inspector.
inspector = model_1.make_inspector()
inspector.features()


In [None]:
# Variable (feature) importances as reported by the model inspector.
inspector = model_1.make_inspector()
inspector.variable_importances()

# Model Self Evaluation

In [None]:
# Evaluation that the model computed on itself during training.
inspector = model_1.make_inspector()
inspector.evaluation()

# Plotting the training logs

In [None]:
# Raw training logs recorded while the model was fitted.
inspector = model_1.make_inspector()
inspector.training_logs()

<p>The cell above displays the raw training logs.</p>

In [None]:
import matplotlib.pyplot as plt

# Each log entry holds the number of trees built so far together with the
# model's self-evaluation (accuracy and loss) at that point.
logs = model_1.make_inspector().training_logs()
num_trees = [log.num_trees for log in logs]

# NOTE(review): model_1 is a Gradient Boosted Trees model. According to the
# TF-DF documentation its self-evaluation is computed on a validation split
# held out from the training data, not out-of-bag (which applies to Random
# Forests), so the axes are labelled "validation" - confirm against the TF-DF
# version in use.
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(num_trees, [log.evaluation.accuracy for log in logs])
ax_accuracy.set(
    title="Accuracy during training",
    xlabel="Number of trees",
    ylabel="Accuracy (validation)",
)

ax_loss.plot(num_trees, [log.evaluation.loss for log in logs])
ax_loss.set(
    title="Loss during training",
    xlabel="Number of trees",
    ylabel="Logloss (validation)",
)

plt.show()

# If you found this notebook interesting and helpful, please consider upvoting!