In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Installing the TensorFlow Decision Forests library
We need to install this library explicitly since it is not yet available on Kaggle by default.

In [None]:

pip install tensorflow_decision_forests

## Importing the necessary libraries
With the library installed, we can now import it together with a few notebook helper utilities.

In [None]:
import tensorflow_decision_forests as tfdf
# Prefer wurlitzer's `sys_pipes`; if that package cannot be imported, fall back
# to colabtools' `CaptureLog`, bound to the same name.
try:
  from wurlitzer import sys_pipes
except ImportError:
  # Only a failed import should trigger the fallback; any other error
  # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
  from colabtools.googlelog import CaptureLog as sys_pipes

from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [None]:
# Load the competition training set into a DataFrame and preview the first rows.
data = pd.read_csv('/kaggle/input/robi-datathon-2-pre-assessment/train.csv')
data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training data.
data.describe()

In [None]:
# Column names, non-null counts and dtypes of the training data.
data.info()

### Checking how many missing values are there 

In [None]:
# Number of missing values per column, largest first.
data.isna().sum().sort_values(ascending=False)

## Pre-processing 

TensorFlow Decision Forests (TF-DF) works with both categorical and numerical data and also accounts for missing values, so no manual pre-processing is done here.

## Training and evaluation

We will split the dataset into a train and a test set with a 90%-10% split; I just like keeping most of the data for training purposes. You will also notice that we ditched scikit-learn and used NumPy and pandas to do the splitting, relying on the randomness provided by the NumPy module. This does leave a question about reproducibility, since the split below is not seeded (scikit-learn's splitter exposes a `random_state` parameter for this).

In [None]:
# Split the dataset into a training and a testing dataset.

def split_dataset(dataset, test_ratio=0.10, seed=None):
  """Randomly splits a pandas DataFrame into a training and a testing part.

  Each row is independently assigned to the test part with probability
  `test_ratio`, so the resulting sizes are only approximately proportional.

  Args:
    dataset: The DataFrame to split.
    test_ratio: Probability for a row to end up in the test part.
    seed: Optional seed for a reproducible split. When None (the default),
      NumPy's global random state is used, exactly as before this parameter
      existed, so `np.random.seed(...)` still controls the result.

  Returns:
    A `(train, test)` tuple of DataFrames that together contain every row of
    `dataset` exactly once.
  """
  if seed is None:
    # Legacy path: draw from the global NumPy RNG.
    draws = np.random.rand(len(dataset))
  else:
    # Dedicated generator so the split does not depend on global RNG state.
    draws = np.random.default_rng(seed).random(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]


# Perform the split and report how many rows landed in each part.
train_ds_pd, test_ds_pd = split_dataset(data)
print(
    "{} examples in training, {} examples for testing.".format(
        *map(len, (train_ds_pd, test_ds_pd))
    )
)

## Tf Datasets

#### We convert the pandas DataFrames to TensorFlow datasets, which are more efficient and provide faster operations. Check them out [here](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)

In [None]:
# Convert both pandas splits to TensorFlow datasets, using the "label" column
# as the prediction target in each.
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label="label")
    for frame in (train_ds_pd, test_ds_pd)
)

## Training and model selection

#### TF-DF provides several learning algorithms for decision trees, namely:
- RandomForest
- GradientBoosted Trees
- CartModel,
- DistributedGradientBoostedTreesModel

##### For this notebook I tried the first two and found the second to perform better

In [None]:
# Build a gradient-boosted trees model with explicit hyperparameters.
model_1 = tfdf.keras.GradientBoostedTreesModel(
    categorical_algorithm="RANDOM",
    split_axis="SPARSE_OBLIQUE",
    growing_strategy="BEST_FIRST_GLOBAL",
    max_depth=8,
    num_trees=500,
)

# Track AUC in addition to the default loss.
model_1.compile(metrics=["AUC"])

# Fit on the training split.
# NOTE(review): `test_ds` is passed as validation data here and is also the
# dataset evaluated afterwards, so that evaluation is not fully independent
# of training — consider a separate validation split.
model_1.fit(x=train_ds, validation_data=test_ds)

## Evaluation
When the model finishes training you can evaluate it on the test dataset that we set aside earlier. Based on the performance here we can then decide to tune hyperparameters or change the learning algorithm to suit our needs

In [None]:
# Evaluate on the held-out split; `return_dict=True` yields {metric name: value}.
evaluation = model_1.evaluate(test_ds, return_dict=True)
print()

# Print every metric rounded to four decimals.
for name in evaluation:
  value = evaluation[name]
  print(f"{name}: {value:.4f}")

## Inspecting model structures and hidden score
The different algorithms of Decision Trees will score the features differently and assign importance, we can inspect them like so

In [None]:
# Print the summary of the trained model.
model_1.summary()

In [None]:
# Show the features reported by the model inspector.
model_1.make_inspector().features()

In [None]:
# Show the variable importances reported by the model inspector.
model_1.make_inspector().variable_importances()

## Inspecting and Plotting the training logs 📈📉
The training logs, which are available through the `make_inspector()` method, contain a wealth of information about how the training progresses. It is more useful to plot the results and inspect the trend.

In [None]:
# Show the raw training logs recorded for the model.
model_1.make_inspector().training_logs()

In [None]:
import matplotlib.pyplot as plt

# Plot accuracy and loss from the training logs against the number of trees.
# The model is gradient-boosted trees, so the logged evaluations are validation
# metrics; "out-of-bag" is Random Forest terminology and does not apply here.
logs = model_1.make_inspector().training_logs()
num_trees = [log.num_trees for log in logs]  # shared x-axis for both panels

fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(num_trees, [log.evaluation.accuracy for log in logs])
ax_accuracy.set(
    title="Accuracy during training",
    xlabel="Number of trees",
    ylabel="Accuracy (validation)",
)

ax_loss.plot(num_trees, [log.evaluation.loss for log in logs])
ax_loss.set(
    title="Loss during training",
    xlabel="Number of trees",
    ylabel="Logloss (validation)",
)

fig.tight_layout()
plt.show()

### We can see that the training process fluctuates quite a lot, but the model converges after about 25 trees

## Submission Time

In [None]:
# Load the competition test set.
updated_test = pd.read_csv('/kaggle/input/robi-datathon-2-pre-assessment/test.csv')
# Convert to a TensorFlow dataset; no `label` argument is passed here, unlike
# for the training/evaluation datasets.
test = tfdf.keras.pd_dataframe_to_tf_dataset(updated_test)
# Run the trained model on the test set.
# NOTE(review): presumably one score per row — confirm the output shape.
preds = model_1.predict(test)

In [None]:
# Store the model outputs in a `label` column of the test frame (in place).
# NOTE(review): the raw outputs are stored as-is; no threshold is applied
# anywhere in this notebook — confirm the expected submission format.
updated_test['label'] = preds

In [None]:
# Keep only the identifier and prediction columns and write them to
# submission.csv without the DataFrame index.
submission = updated_test.loc[:, ["id", "label"]]
submission.to_csv("submission.csv", index=False)

Finally, we shouldn't forget that the model returns a probability between 0 and 1; if hard class labels are needed, we have to define a threshold above which the prediction counts as true, and false otherwise.

## References:
* [Official Tensorflow Decision Forest Blog](https://blog.tensorflow.org/2021/05/introducing-tensorflow-decision-forests.html)
* [Official Tensorflow Page for Decision Forest](https://www.tensorflow.org/decision_forests)
