# Importing the libraries

In [5]:
import time
import xgboost
import numpy as np
import pandas as pd
from TFBT import TFBT
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from urllib.request import urlretrieve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Importing the dataset
For this problem, I used the CoverType dataset. It has seven classes, 581,012 instances, and 54 features.

In [6]:
# Fetch the CoverType CSV into the working directory and read it into a frame.
url = 'https://archive.ics.uci.edu/ml/datasets/covertype/covertype.csv'
csv_path, _headers = urlretrieve(url, 'covertype.csv')
df = pd.read_csv(csv_path)

# Separate the target column from the predictors; both are kept as NumPy arrays.
y = df['Cover_Type'].to_numpy()
X = df.drop(columns='Cover_Type').to_numpy()

# Re-encoding the class labels
The class labels are re-encoded so that they start at 0.

In [7]:
# Re-encode the class labels as consecutive integers starting at 0
# (for labels 1..K this is the same mapping k -> k - 1 as an explicit loop).
#
# np.unique(..., return_inverse=True)[1] gives, for every sample, the position
# of its label among the sorted unique labels. It builds a NEW array, so:
#   * re-running this cell yields the same result (an in-place "i+1 -> i" loop
#     would merge neighbouring classes on a second run), and
#   * the array that `y` was taken from is not modified behind the scenes.
y = np.unique(y, return_inverse=True)[1]

# Splitting the dataset for evaluation
Because of the size of the dataset and the computation time involved, I did not perform hyperparameter tuning. The hyperparameters used here are the optimal values reported in previous studies.

In [11]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# identical across runs.
split = train_test_split(X, y, test_size=0.3, random_state=2)
x_train, x_test, y_train, y_test = split

# Training the models
The purpose of this training is to compare two gradient boosting models. An important point in this comparison is that the [TFBT](https://www.tensorflow.org/tutorials/estimator/boosted_trees) model was built on top of the XGBoost logic.

I also modified TFBT so that it follows the scikit-learn standard. You can check the [class](https://github.com/samanemami/TFBoostedTree/blob/main/TFBT.py) here.

I set the number of boosting iterations to 10 for both models, so that we can compare the performance of the two methods at an early stage of learning.

In [12]:
def _fit_and_score(pipeline, x_fit, y_fit, x_eval, y_eval):
    """Fit ``pipeline`` and measure both its fitting time and its score.

    Parameters
    ----------
    pipeline : object with ``fit(X, y)`` and ``score(X, y)`` methods
        The estimator (here a scikit-learn ``Pipeline``) to train.
    x_fit, y_fit
        Data passed to ``pipeline.fit``.
    x_eval, y_eval
        Data passed to ``pipeline.score``.

    Returns
    -------
    tuple
        ``(elapsed_seconds, score)`` where ``elapsed_seconds`` covers only the
        ``fit`` call and ``score`` is whatever ``pipeline.score`` returns.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    pipeline.fit(x_fit, y_fit)
    elapsed = time.perf_counter() - start
    return elapsed, pipeline.score(x_eval, y_eval)


# NOTE: the ``err_*`` variables hold the value returned by ``score`` (reported
# as accuracy in the results table), not an error rate. The names are kept
# because the results cell reads them.

# --- TFBT: 10 trees of depth 1 ---
tfbt = TFBT(n_batches_per_layer=1,
            label_vocabulary=None,
            n_trees=10,
            max_depth=1,
            learning_rate=0.1,
            model_dir=None)
pipe_tfbt = Pipeline([("scaler", StandardScaler()), ("clf", tfbt)])
fit_ti_tfbt, err_tfbt = _fit_and_score(
    pipe_tfbt, x_train, y_train, x_test, y_test)

# --- XGBoost: 10 estimators of depth 2 ---
xgb = xgboost.XGBClassifier(learning_rate=0.5,
                            max_depth=2,
                            n_estimators=10,
                            subsample=0.75,
                            min_child_weight=1)
pipe_xgb = Pipeline([("scaler", StandardScaler()), ("clf", xgb)])
fit_ti_xgb, err_xgb = _fit_and_score(
    pipe_xgb, x_train, y_train, x_test, y_test)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Sami\\Desktop\\TFBTtemp', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling 





# Comparing the results
For the comparison, I considered the accuracy of the classification and the training time.

In [14]:
# Gather the score and the fitting time of both models into a one-row table.
result = {
    'TFBT accuracy': err_tfbt,
    'XGBoost accuracy': err_xgb,
    'TFBT Training time': fit_ti_tfbt,
    'XGBoost Training time': fit_ti_xgb,
}
pd.DataFrame(result, index=["Values"])

Unnamed: 0,TFBT accuracy,XGBoost accuracy,TFBT Training time,XGBoost Training time
Values,0.586068,0.706346,1404.223077,17.875942
