# Importing the libraries

In [None]:
import time
import xgboost
import numpy as np
import pandas as pd
import tensorflow as tf
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from urllib.request import urlretrieve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Importing the dataset
For this problem, I considered the CoverType dataset. It has seven classes, 581,012 instances, and 54 features.

In [None]:
# Download the CoverType data. The file has no header row, so pandas assigns
# integer column labels.
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz', header = None)

# String names ("0", "1", ...) for every column except the last one (the
# target). The TensorFlow input pipeline keys its feature dict by these names.
featuer = [str(i) for i in range(len(data.columns) - 1)]

# Rename only the feature columns: zip() stops at the shorter sequence, so the
# target column keeps its original integer label.
col_rename = dict(zip(data.columns, featuer))
df = data.rename(columns=col_rename)

X = df.iloc[:, :-1]

# Shift the class labels from 1..K down to 0..K-1. This is computed explicitly
# instead of mutating `df`'s underlying array in place: that approach only
# works when `.values` happens to be a writable view of the frame, and it
# raises (read-only array) when pandas Copy-on-Write is enabled.
y = df.iloc[:, -1] - 1
assert y.min() == 0, "class labels are expected to start at 1 in the raw data"

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
dftrain, dfeval, y_train, y_eval = train_test_split(
    X, y, test_size=0.3, random_state=2)

## Defining the TensorFlow BoostedTree


In [None]:
# Number of training rows; make_input_fn uses it as both the shuffle buffer
# size and the batch size.
NUM_EXAMPLES = len(y_train)


def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Return a zero-argument input function for a TensorFlow estimator.

    Args:
        X: Feature table; it is converted with ``dict(X)`` so that each
            feature is addressed by its column name.
        y: Labels aligned with the rows of ``X``.
        n_epochs: Passed to ``Dataset.repeat``; ``None`` repeats the data
            indefinitely.
        shuffle: When true, shuffle with a buffer of ``NUM_EXAMPLES`` elements.

    Returns:
        A callable that builds a ``tf.data.Dataset`` batched in chunks of
        ``NUM_EXAMPLES`` elements.
    """
    def input_fn():
        ds = tf.data.Dataset.from_tensor_slices((dict(X), y))
        ds = ds.shuffle(NUM_EXAMPLES) if shuffle else ds
        # Repeat first, then batch, exactly in that order.
        return ds.repeat(n_epochs).batch(NUM_EXAMPLES)

    return input_fn


# Input functions: the training one keeps the defaults (shuffled, repeated
# without limit); the evaluation one makes a single unshuffled pass.
train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, n_epochs=1, shuffle=False)

# Every feature is declared as a float32 numeric column.
NUMERIC_COLUMNS = featuer
feature_columns = [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
]

# A single depth-1 tree over seven classes, with one batch per layer.
est = tf.estimator.BoostedTreesClassifier(
    feature_columns=feature_columns,
    n_batches_per_layer=1,
    n_classes=7,
    n_trees=1,
    max_depth=1,
)

# Train for at most 100 steps.
est.train(train_input_fn, max_steps=100)

# Last expression of the cell: the evaluation metrics are displayed.
est.evaluate(eval_input_fn)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp1rk6_i60', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:ten

{'accuracy': 0.6341392,
 'average_loss': 1.7333094,
 'global_step': 1,
 'loss': 1.7333094}

# Splitting the dataset for the evaluation purpose
Due to the size of the dataset and the computation time, I did not perform hyperparameter tuning. The hyperparameters used here are the optimal values reported in previous studies.

In [None]:
# Split X and y again for the XGBoost pipeline, with the same test_size and
# random_state as the earlier split. Note that this reassigns y_train.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2)

# Training the models
The purpose of this training is to compare two gradient boosting models. The important point in this comparison is that the [TFBT](https://www.tensorflow.org/tutorials/estimator/boosted_trees) model is built on the XGBoost logic.

Also, I modified the TFBT here to follow the scikit-learn estimator conventions. You can check the [class](https://github.com/samanemami/TFBoostedTree/blob/main/TFBT.py) here.

For the number of boosting iterations, I set both to 10, so that we can compare the performance of the two methods at an early stage of learning.

In [None]:
# XGBoost classifier: 10 boosting rounds of depth-2 trees, each fitted on a
# 75% row subsample.
xgb = XGBClassifier(learning_rate=0.5,
                    max_depth=2,
                    n_estimators=10,
                    subsample=0.75,
                    min_child_weight=1)

# Standardize the features before they reach the classifier.
pipe_xgb = Pipeline([("scaler", StandardScaler()), ("clf", xgb)])

# Time the fit with a monotonic clock: perf_counter() is meant for measuring
# intervals and is unaffected by system clock adjustments, unlike time.time().
x0 = time.perf_counter()
pipe_xgb.fit(x_train, y_train)
x1 = time.perf_counter()
fit_ti_xgb = x1-x0

# Despite its name, err_xgb is the mean accuracy on the held-out split
# (Pipeline.score delegates to the classifier's score), not an error rate.
err_xgb = pipe_xgb.score(x_test, y_test)

# XGBoost results
For the comparison, I considered the accuracy of the classification and the training time.

In [None]:
# Gather the two comparison metrics and show them as a one-row table.
result = {
    'XGBoost accuracy': err_xgb,
    'XGBoost Training time': fit_ti_xgb,
}
pd.DataFrame(result, index=["Values"])

Unnamed: 0,XGBoost accuracy,XGBoost Training time
Values,0.706346,33.967959
