In [None]:
# Install TF-DF with the %pip magic so the package lands in the environment of the
# running kernel (a shell `!pip` may belong to a different interpreter).
# TODO(review): pin the version once the one used for this run is known.
%pip install tensorflow_decision_forests

In [None]:
# Import Python packages
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_decision_forests as tfdf
# Print the installed TF-DF version
print("TensorFlow Decision Forests v", tfdf.__version__, sep="")

In [None]:
# Define helper functions for plotting training evaluation curves

def plot_tfdf_model_training_curves(model, ylabel="RMSE (out-of-bag)"):
    """Plot the RMSE recorded in a TF-DF model's training logs against tree count.

    Adapted from the TF-DF beginner tutorial:
    https://www.tensorflow.org/decision_forests/tutorials/beginner_colab

    Args:
        model: Trained model exposing ``make_inspector()``. Every entry of the
            inspector's ``training_logs()`` must provide ``num_trees`` and
            ``evaluation.rmse``.
        ylabel: Label of the y-axis. The default keeps the original
            "RMSE (out-of-bag)" text; pass another label when the logged
            evaluation is not an out-of-bag one.
            NOTE(review): TF-DF presumably logs a validation-set evaluation
            for gradient boosted trees -- confirm for the model at hand.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    logs = model.make_inspector().training_logs()
    tree_counts = [log.num_trees for log in logs]
    rmse_values = [log.evaluation.rmse for log in logs]

    # A single full-width axes: drawing one panel of a 1x2 subplot grid would
    # leave half of the figure blank.
    _, ax = plt.subplots(figsize=(12, 4))
    ax.plot(tree_counts, rmse_values)
    ax.set_xlabel("Number of trees")
    ax.set_ylabel(ylabel)
    plt.show()

In [None]:
# Walk the Kaggle input directory and print the full path of every file found there
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the competition CSVs into pandas frames (used for data exploration)
train_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2021/train.csv'
)
test_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2021/test.csv'
)

# Wrap the same frames as TensorFlow datasets (used for model training and prediction).
# Only the training dataset is given a label column ("target").
# NOTE(review): presumably every non-label column is used as a feature -- if the CSVs
# hold a row-identifier column, confirm whether it should be dropped first.
train_tfds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_df,
    task=tfdf.keras.Task.REGRESSION,
    label="target",
)
test_tfds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_df,
    task=tfdf.keras.Task.REGRESSION,
)

In [None]:
# Print the column labels of the training frame
print(train_df.columns)

In [None]:
# Preview the first 10 rows of the training frame (shown as the cell's output)
train_df.head(10)

In [None]:
# Show basic summary statistics of the training frame.
# NOTE(review): with default arguments pandas presumably summarizes only the numeric
# columns when the frame mixes dtypes -- confirm if non-numeric columns matter here.
train_df.describe()

In [None]:
# Check for missing values: draw the boolean null-mask of the training frame
# as a heatmap, with the color bar hidden
sns.heatmap(train_df.isnull(), cbar=False)

# Random Forest

In [None]:
# Print the hyper-parameter templates that TF-DF predefines for Random Forest models
print(tfdf.keras.RandomForestModel.predefined_hyperparameters())

In [None]:
# Fit a Random Forest regressor, configured from the "better_default" template,
# on the training dataset
rf_model = tfdf.keras.RandomForestModel(
    task=tfdf.keras.Task.REGRESSION,
    hyperparameter_template="better_default",
)
# Track RMSE as a Keras metric
rf_model.compile(metrics=[tf.keras.metrics.RootMeanSquaredError()])
rf_model.fit(train_tfds)

In [None]:
# Plot RMSE against the number of trees from the Random Forest's training logs
plot_tfdf_model_training_curves(rf_model)

In [None]:
# Create an inspector for the trained Random Forest and display its stored evaluation.
# NOTE: `inspector` is rebound to the gradient boosted model further down, so cells
# that read `inspector` depend on execution order.
inspector = rf_model.make_inspector()
inspector.evaluation()

In [None]:
# Evaluate on train_tfds, the same dataset the model was fitted on:
# this is a training-set score, not a held-out estimate.
rf_model.evaluate(train_tfds)

In [None]:
# Print the model type, objective and evaluation of the model behind `inspector`
for label, getter in (
    ("Model type:", inspector.model_type),
    ("Objective:", inspector.objective),
    ("Evaluation:", inspector.evaluation),
):
    print(label, getter())

In [None]:
# Adapted from https://www.tensorflow.org/decision_forests/tutorials/advanced_colab
# See list of inspector methods from:
# [field for field in dir(inspector) if not field.startswith("_")]
# Print the name of every variable-importance measure available for the model
# behind `inspector`, one per line.
print("Available variable importances:")
for importance in inspector.variable_importances():
    print("\t", importance)

In [None]:
# Show the "SUM_SCORE" variable-importance entry of the model behind `inspector`
inspector.variable_importances()["SUM_SCORE"]

# Gradient Boosted Trees

In [None]:
# As mentioned previously, TF-DF gives you lots of different "default" hyper-parameter settings to choose from.
# Print the templates predefined for Gradient Boosted Trees models.
print(tfdf.keras.GradientBoostedTreesModel.predefined_hyperparameters())

In [None]:
# Fit a Gradient Boosted Trees regressor, configured from the "benchmark_rank1"
# template, on the training dataset
gb_model = tfdf.keras.GradientBoostedTreesModel(
    task=tfdf.keras.Task.REGRESSION,
    hyperparameter_template="benchmark_rank1",
)
# Track RMSE as a Keras metric
gb_model.compile(metrics=[tf.keras.metrics.RootMeanSquaredError()])
gb_model.fit(train_tfds)

In [None]:
# Plot RMSE against the number of trees from the gradient boosted model's training logs.
# NOTE(review): the helper labels the y-axis "RMSE (out-of-bag)"; for gradient boosted
# trees the logged evaluation is presumably a validation-set one -- confirm.
plot_tfdf_model_training_curves(gb_model)

In [None]:
# Rebind `inspector` to the trained gradient boosted model and display its stored
# evaluation; cells that read `inspector` after this point describe gb_model.
inspector = gb_model.make_inspector()
inspector.evaluation()

In [None]:
# Evaluate on train_tfds, the same dataset the model was fitted on:
# this is a training-set score, not a held-out estimate.
gb_model.evaluate(train_tfds)

In [None]:
# Print the model type, objective and evaluation of the model behind `inspector`
for label, getter in (
    ("Model type:", inspector.model_type),
    ("Objective:", inspector.objective),
    ("Evaluation:", inspector.evaluation),
):
    print(label, getter())

In [None]:
# Show the "SUM_SCORE" variable-importance entry of the model behind `inspector`
inspector.variable_importances()["SUM_SCORE"]

In [None]:
# Print the summary of the trained gradient boosted trees model
gb_model.summary()

In [None]:
# Fill the sample submission's `target` column with the gradient boosted model's
# predictions for the test dataset, write it to the Kaggle working directory,
# and preview the first rows.
sample_submission_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv'
).assign(target=gb_model.predict(test_tfds))
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()