# Tensorflow Decision Forests

In this notebook we get benchmarks for the gradient boosting model provided with the [Tensorflow Decision Forests](https://www.tensorflow.org/decision_forests) library. For various input sizes (10k to 100k samples) we get AUC scores and training times for models with and without categorical features explicitly specified.

Personally, I don't think this library is ready to be used seriously for these competitions, primarily because it runs so slowly (no GPU/TPU optimizations yet) and requires a Linux environment (which I can only easily access through these notebooks). You can get similar results much more easily and quickly using one of the established gradient boosting frameworks such as XGBoost, LightGBM, or CatBoost.


**Note:** This notebook will take several hours to run

In [None]:
# Global Variables for testing changes to this notebook quickly
# Seed handed to tf.random.set_seed
RANDOM_SEED = 0
# Passed as `num_trees` to every GradientBoostedTreesModel
NUM_TREES = 1000
# Passed as `early_stopping_num_trees_look_ahead` to every model
EARLY_STOP = 25

In [None]:
# Install TFDF library
!pip3 install -q tensorflow_decision_forests --upgrade

In [None]:
import numpy as np
import pandas as pd
import pyarrow
import warnings
import time
import gc
import os

# Hide warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Model and evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Tensorflow
import tensorflow as tf
import tensorflow_decision_forests as tfdf
tf.random.set_seed(RANDOM_SEED)

# Preparing Data

1. Load original data
2. Downcast datatypes wherever possible
3. Split data into halves, use one half for estimation
4. Save data in pandas and tensorflow formats

In [None]:
%%time
# Load original training data
train = pd.read_csv("../input/tabular-playground-series-oct-2021/train.csv")

# Feature usages for the integer columns; read later when a model is
# trained with explicitly specified categorical features
categorical_features = list()

# Downcast training data while keeping track of categorical variables
# NOTE(review): every int64 column except the target is flagged as
# categorical, which would include an integer row-id column if the CSV
# has one -- confirm that this is intended.
for col in train.columns:
    if train[col].dtype == "int64":
        train[col] = train[col].astype('int32')
        # ignore target column
        if col == "target": continue
        categorical_features.append(
            tfdf.keras.FeatureUsage(
                name = col, 
                semantic = tfdf.keras.FeatureSemantic.CATEGORICAL
            )
        )
    elif train[col].dtype == "float64":
        train[col] = pd.to_numeric(train[col], downcast ='float')

# Halve the data, we will get AUC estimates on a holdout set.
# The split is seeded so the same holdout rows are selected on every run.
train, holdout = train_test_split(
    train,
    train_size = 500000,
    stratify = train['target'],
    shuffle = True,
    random_state = RANDOM_SEED
)

# Save pandas dataframes for quick retrieval later
# (feather needs a default index, hence the reset)
train.reset_index(drop = True, inplace = True)
holdout.reset_index(drop = True, inplace = True)
train.to_feather('train_500k.feather')
holdout.to_feather('holdout_full.feather')
del train; gc.collect()

# Create tensorflow data for the holdout set
holdout_tf = tfdf.keras.pd_dataframe_to_tf_dataset(
    holdout,
    label = 'target',
    in_place = True
)

# Save holdout data in tensorflow format
tf.data.experimental.save(holdout_tf, "holdout_tf")

# Garbage Collection (free up memory)
del holdout, holdout_tf; gc.collect()

# Helper Functions

We create several functions to perform the various steps of the training and evaluation process, mostly to avoid having too many things loaded in memory at once.

In [None]:
def get_training_data(n_rows = 10000):
    """Sample `n_rows` rows from the saved training data as a TF-DF dataset.

    Reads 'train_500k.feather', draws a split stratified on 'target' and
    converts the sampled rows into a dataset labelled by 'target'.

    Args:
        n_rows: Number of rows to sample; must satisfy 0 < n_rows < 500000.

    Returns:
        The dataset returned by tfdf.keras.pd_dataframe_to_tf_dataset.

    Raises:
        ValueError: If `n_rows` is outside the valid range.
    """
    # Explicit check rather than `assert`, which is stripped under -O
    if not 0 < n_rows < 500000:
        raise ValueError(
            f'n_rows must satisfy 0 < n_rows < 500000, got {n_rows}'
        )

    train = pd.read_feather('train_500k.feather')

    # Seeded so that repeated calls with the same n_rows return the same
    # rows, i.e. every model variant is trained on an identical sample
    train, unused = train_test_split(
        train,
        train_size = n_rows,
        stratify = train['target'],
        shuffle = True,
        random_state = RANDOM_SEED
    )

    # The remainder of the split is never used; release it before the
    # conversion below allocates the dataset
    del unused

    # Prepare Train Data
    train_df = tfdf.keras.pd_dataframe_to_tf_dataset(
        train,
        label = 'target',
        in_place = True
    )

    return train_df

In [None]:
def train_model(n_rows = 10000, categorical = False):
    """Fit a gradient boosted trees classifier on `n_rows` sampled rows.

    Args:
        n_rows: Number of training rows, forwarded to get_training_data.
        categorical: When True, the model is given the module-level
            `categorical_features` list as explicit feature usages
            (non-specified features are still kept).

    Returns:
        A `(model, seconds)` tuple. `seconds` is the wall-clock time spent
        constructing, compiling and fitting the model, rounded to 6 places.
    """
    dataset = get_training_data(n_rows)
    gc.collect()

    t0 = time.time()

    # Settings common to both model variants
    settings = dict(
        task = tfdf.keras.Task.CLASSIFICATION,
        num_trees = NUM_TREES,
        early_stopping_num_trees_look_ahead = EARLY_STOP,
        verbose = 0,
    )
    # Explicitly defined categoricals are added only on request
    if categorical:
        settings.update(
            features = categorical_features,
            exclude_non_specified_features = False,
        )
    model = tfdf.keras.GradientBoostedTreesModel(**settings)

    # Metric for validation
    model.compile(metrics = [tf.metrics.AUC()])

    # Training
    model.fit(dataset, verbose = 0)
    elapsed = time.time() - t0

    # Drop the training data before returning (free up memory)
    del dataset
    gc.collect()

    return model, round(elapsed, 6)

In [None]:
def get_holdout_preds(model):
    """Predict on the saved holdout dataset.

    Loads the dataset stored under "holdout_tf", runs `model.predict` on it
    and returns column 0 of the output (presumably the positive-class
    score -- confirm against the model's output layout).
    """
    dataset = tf.data.experimental.load("holdout_tf")
    raw_output = model.predict(dataset)

    # Release the dataset before handing the predictions back
    del dataset
    gc.collect()

    return raw_output[:, 0]

In [None]:
def get_holdout_score(y_preds):
    """Return the ROC AUC of `y_preds` against the saved holdout targets.

    Args:
        y_preds: Predicted scores, one per row of 'holdout_full.feather'
            and in the same row order.

    Returns:
        The value of sklearn's roc_auc_score for the holdout 'target' column.
    """
    # Only the label column is needed, so avoid loading every feature
    # column of the holdout frame on each call
    y_true = pd.read_feather(
        'holdout_full.feather',
        columns = ['target']
    )['target']

    return roc_auc_score(y_true, y_preds)

# Benchmarks

In [None]:
def get_benchmarks(training_sizes = (10000, 20000, 30000, 40000, 50000, 75000)):
    """Benchmark training time and holdout AUC over several training sizes.

    For every size, a model is trained first without and then with
    explicitly specified categorical features. Each run is scored on the
    holdout set, reported with `print`, and recorded.

    Args:
        training_sizes: Iterable of training row counts to benchmark.
            Defaults to the sizes originally hard-coded in this function.

    Returns:
        A DataFrame with one row per run and the columns
        'size', 'features', 'time' and 'auc'.
    """
    # (printed heading, value stored in 'features', `categorical` flag)
    variants = (
        ('All Numerical Features', 'numerical', False),
        ('Categorical Features', 'categorical', True),
    )

    # Columns are declared up front so that an empty sweep still yields a
    # frame with the expected columns
    data = {'size': [], 'features': [], 'time': [], 'auc': []}

    for training_size in training_sizes:
        for heading, feature_kind, use_categorical in variants:

            # Train and evaluate one model
            model, training_time = train_model(
                n_rows = training_size,
                categorical = use_categorical
            )
            preds = get_holdout_preds(model)
            score = get_holdout_score(preds)
            print(heading)
            print(f'Rows: {training_size}, Time: {round(training_time, 2)}')
            print(f'Validation Score: {round(score, 6)}\n')

            # save results
            data['size'].append(training_size)
            data['features'].append(feature_kind)
            data['time'].append(training_time)
            data['auc'].append(score)

            # free up memory before the next model is trained
            del model
            gc.collect()

    return pd.DataFrame(data)

In [None]:
# Run the benchmark sweep and keep the resulting DataFrame
# (the output of this cell has been hidden)
data = get_benchmarks()

In [None]:
data

We see that explicitly specifying the categorical features significantly increases the training times but does not result in notably better models, so we will not bother specifying categorical features for subsequent models.