# Auto-Tuning TensorFlow GradientBoostedTreesModel

In [None]:
!pip install -q tensorflow_decision_forests

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import keras_tuner as kt
import matplotlib.pyplot as plt

In [None]:
# Display settings.
# The style sheet is applied first: plt.style.use() overwrites every rcParam
# that the sheet defines, so custom values set before it (such as the font
# size) could be silently replaced by the sheet's own values.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 11})

In [None]:
# Locations of the original competition files.
TRAIN_PATH = '../input/tabular-playground-series-sep-2021/train.csv'
TEST_PATH = '../input/tabular-playground-series-sep-2021/test.csv'

# Read both CSV files into DataFrames and report their dimensions.
train_data, test_data = map(pd.read_csv, (TRAIN_PATH, TEST_PATH))

print('Train data shape:', train_data.shape)
print('Test data shape:', test_data.shape)

## Feature engineering

In [None]:
def add_features(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Add row-wise missing-value count and spread statistics to ``df``.

    Three columns are written: ``n_nans`` (number of missing values per row;
    the original author reports a 45% correlation with the target), ``std``
    and ``var`` (row-wise standard deviation and variance).

    The frame is modified in place and also returned, so the function can be
    used directly or through ``DataFrame.pipe``.

    :param df: Original DataFrame
    :param columns: Names of the columns to aggregate over. When ``None``
        (the default) the module-level ``features`` list is used, which is
        the behaviour existing callers rely on.
    :return: The same DataFrame with the three new columns
    """
    cols = list(features if columns is None else columns)

    # Select the source columns once, before any new column is written,
    # so all three statistics are computed from the same data.
    source = df[cols]

    df['n_nans'] = source.isnull().sum(axis=1)
    df['std'] = source.std(axis=1)
    df['var'] = source.var(axis=1)
    return df

In [None]:
# Raw input columns f1 ... f118 (int and float values of various scale).
features = [f'f{idx}' for idx in range(1, 119)]

# Derive the engineered columns for both the train and the test frame.
train_data = add_features(train_data)
test_data = add_features(test_data)

# From here on the engineered columns are model inputs as well.
features.extend(['n_nans', 'std', 'var'])

# Name of the label column.
target = 'claim'

## Data Processing

In [None]:
# Build a TF Dataset from every labeled sample.
n_samples = len(train_data)

# The conversion helper returns an already batched Dataset (batch_size=64
# according to the original author; the value may depend on the library
# version). It is unbatched here so that it can be split into smaller
# subsets and re-batched with a different batch size.
labeled_frame = train_data[features + [target]]
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(labeled_frame, label=target)
train_ds = train_ds.unbatch()

In [None]:
# Small slices of the data for a quick hyperparameter search.
batch_size = 256

train_subset = 100_000  # Number of samples used for fitting during the search
valid_subset = 10_000   # Number of samples used for validation during the search

# The first `train_subset` samples are used for training ...
train_small_ds = train_ds.take(train_subset)
train_small_ds = train_small_ds.batch(batch_size)

# ... and the `valid_subset` samples right after them for validation.
valid_small_ds = train_ds.skip(train_subset).take(valid_subset)
valid_small_ds = valid_small_ds.batch(batch_size)

In [None]:
# Bigger train / validation split, used to retrain and evaluate the model
# once the best hyperparameters are known.
n_valid_samples = int(0.1 * n_samples)  # 10% of all train samples

# Both splits are derived from the unbatched dataset before `train_ds` is
# rebound: the first `n_valid_samples` samples become the validation set,
# everything after them becomes the training set.
valid_ds, train_ds = (
    train_ds.take(n_valid_samples).batch(batch_size),
    train_ds.skip(n_valid_samples).batch(batch_size),
)

## AutoML with KerasTuner

In [None]:
def build_model(hp):
    """Create a compiled gradient boosted trees model for one tuner trial.

    The search space is declared inline through ``hp``:

    * ``num_trees``: integer from 10 to 710 in steps of 25
    * ``growing_strategy``: ``'BEST_FIRST_GLOBAL'`` or ``'LOCAL'``
    * ``max_depth``: integer from 3 to 16
    * ``subsample``: float from 0.1 to 0.95 in steps of 0.05

    :param hp: Hyperparameters object supplied by KerasTuner
    :return: Compiled GradientBoostedTreesModel model
    """
    model = tfdf.keras.GradientBoostedTreesModel(
        num_trees=hp.Int('num_trees', min_value=10, max_value=710, step=25),
        growing_strategy=hp.Choice(
            'growing_strategy', values=['BEST_FIRST_GLOBAL', 'LOCAL']),
        max_depth=hp.Int('max_depth', min_value=3, max_value=16, step=1),
        subsample=hp.Float('subsample', min_value=0.1, max_value=0.95, step=0.05),
        num_threads=4,
        # Default parameter, spelled out for clarity: missing values are
        # replaced by the mean or the most frequent value.
        missing_value_policy='GLOBAL_IMPUTATION',
    )

    # The AUC metric is named explicitly. Without a name Keras generates one
    # and makes it unique ('auc', 'auc_1', ...) when several models are built
    # in the same session, in which case the metric would no longer be
    # reported under the 'val_auc' key that the tuner objective looks up.
    model.compile(metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    return model

In [None]:
# Keras tuner: maximize the validation AUC (the objective could also be
# 'val_loss').
search_objective = kt.Objective('val_auc', direction='max')

# Bayesian optimization is used here; RandomSearch or Hyperband are
# drop-in alternatives.
tuner = kt.BayesianOptimization(
    build_model,
    objective=search_objective,
    max_trials=20,
    project_name='classifier',
)

# Pick the best parameters using only a small subset of the train data.
tuner.search(
    train_small_ds,
    epochs=1,
    validation_data=valid_small_ds,
)

In [None]:
# Display the results of the hyperparameter search
# (summary of the trials run by `tuner.search` above).
tuner.results_summary()

In [None]:
# Retrieve the top-ranked model of the search. It was trained only on the
# small subset of the train data, but could be used for predictions as is.
best_models = tuner.get_best_models(num_models=1)
best_model = best_models[0]

In [None]:
# Build a fresh, untrained model from the best hyperparameters found
# and fit it on the larger training set.
best_hp = tuner.get_best_hyperparameters()[0]
model = tuner.hypermodel.build(best_hp)

history = model.fit(
    train_ds,
    validation_data=valid_ds,
    shuffle=False,
    workers=4,
    use_multiprocessing=True,
)

In [None]:
# Model self-evaluation as reported by the model inspector.
# NOTE(review): for gradient boosted trees this is presumably computed on the
# validation data rather than on the training data -- confirm against the
# TF-DF inspector documentation.
# NOTE: the name `inspect` shadows the standard-library module of the same
# name; it is kept because the following cell reads it.
inspect = model.make_inspector()
inspect.evaluation()

In [None]:
# Visualize training progress: metric value versus number of trees.
logs = inspect.training_logs()

# The x axis is shared by both panels, so it is computed once.
tree_counts = [log.num_trees for log in logs]

plt.figure(figsize=(12, 4))

# Left panel: accuracy. The axis is labeled "validation" because gradient
# boosted trees log their metrics on validation data; "out-of-bag" is the
# self-evaluation of random forests and does not apply to this model.
plt.subplot(1, 2, 1)
plt.plot(tree_counts, [log.evaluation.accuracy for log in logs])
plt.xlabel('Number of trees')
plt.ylabel('Accuracy (validation)')

# Right panel: loss.
plt.subplot(1, 2, 2)
plt.plot(tree_counts, [log.evaluation.loss for log in logs])
plt.xlabel('Number of trees')
plt.ylabel('Logloss (validation)')

plt.show()

In [None]:
# Evaluate the retrained model on the validation set and print every
# reported metric with four decimals.
evaluation = model.evaluate(valid_ds, return_dict=True)
for metric_name, metric_value in evaluation.items():
    print(f'{metric_name}: {metric_value:.4f}')

In [None]:
# Convert the unlabeled test frame to a TF Dataset and predict on it.
test_inputs = test_data[features]
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_inputs)

# Store the model output as the `claim` column of the test frame.
predictions = model.predict(test_ds, workers=4, use_multiprocessing=True)
test_data['claim'] = predictions

In [None]:
# Write the ids and predicted values for the test set to the submission
# file, then preview the first rows.
submission_columns = ['id', 'claim']
test_data[submission_columns].to_csv('submission.csv', index=False)
test_data[submission_columns].head()