# A comparison of three well-known gradient boosting classifiers
### **Seyedsaman Emami**
#### 23/April/2021

* **1. Introduction**
* **2. Data preparation**
    * 2.1 Load data
    * 2.2 Describe the target
    * 2.3 Data distribution
    * 2.4 Features overview
    * 2.5 Correlation
* **3. Model Training**
* **4. Evaluation**
    * 4.1 training time
    * 4.2 Accuracy
* **5. Conclusion**

# 1. Introduction
This is a comparison of three well-known gradient boosting classifiers from three major libraries (namely scikit-learn, TensorFlow Boosted Trees (TFBT), and XGBoost).

**For computational reasons, I set the number of steps (epochs) and the number of trees to 2 and 50, respectively. If you want to achieve better accuracy, set them to 10 and 100, respectively.**

In [None]:
import xgboost
import numpy as np 
import pandas as pd 
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# 2. Data preparation
## 2.1 Load data

In [None]:
# Load the forest cover-type table and split it into features / target.
df = pd.read_csv(
    '/kaggle/input/forest-cover-type-dataset/covtype.csv')
df.head()
df.describe()
X = df.drop('Cover_Type', axis=1).values
# Encode the class labels as consecutive integers starting at 0.
# The inverse index returned by np.unique is a freshly allocated array, so
# `df` is never written to: editing `df[...].values` in place may alias the
# frame's own storage (silently changing every later plot/statistic computed
# from `df`) or fail outright when pandas hands back a read-only array.
y = np.unique(df['Cover_Type'].values, return_inverse=True)[1]
# 70/30 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

## 2.2 Describe the target

In [None]:
# Print pandas' summary statistics for the target column.
target_summary = df['Cover_Type'].describe()
print(target_summary)

## 2.3 Data distribution
### Check the distribution of the cover type

In [None]:
# Distribution of the target classes.
# `sns.distplot` is deprecated; a density-normalised `histplot` with a KDE
# overlay (cut=3) is the documented replacement that draws the same figure.
sns.histplot(df['Cover_Type'], color='r', alpha=0.4,
             stat='density', kde=True, kde_kws={'cut': 3});

## 2.4 Features overview
### Overview of all the features

In [None]:
# Draw one 50-bin histogram per column on a 16x20 figure with small tick labels.
df.hist(
    bins=50,
    figsize=(16, 20),
    xlabelsize=8,
    ylabelsize=8,
)

## 2.5 Correlation
### Examine the features that correlate with Cover type. 

In [None]:
# Correlation of every feature with the target. The target's own entry
# (always 1.0) is dropped so that it is not reported as a "feature".
corr = df.corr()['Cover_Type'].drop('Cover_Type')
# Keep only features whose absolute correlation exceeds 0.5, strongest first.
features_list = corr[corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} features strongly correlated with Cover_Type:\n{}".format(len(features_list), features_list))


In [None]:
# for i in range(0, len(df.columns), 5):
#     sns.pairplot(data=df,
#                 x_vars=df.columns[i:i+5],
#                 y_vars=['Cover_Type'])

# 3. Model preparation

In [None]:
def _dataframe(X, y):
    """Wrap features and labels in int64 DataFrames with string column names.

    Parameters
    ----------
    X : anything accepted by ``pd.DataFrame``
        Feature table.
    y : anything accepted by ``pd.DataFrame``
        Labels.

    Returns
    -------
    tuple
        ``(X, y, feature)``: the feature frame with its columns renamed to
        ``"0", "1", ...``, the label frame, and the list of those new
        column names. Both frames are cast to ``int64``.
    """
    frame = pd.DataFrame(X).astype("int64")
    target = pd.DataFrame(y).astype("int64")

    # Positional column names, as strings.
    feature = [str(position) for position in range(frame.shape[1])]
    frame = frame.rename(columns=dict(zip(frame.columns, feature)))

    return frame, target, feature

In [None]:
def _make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Build a zero-argument input function that yields a ``tf.data.Dataset``.

    Parameters
    ----------
    X : mapping-like
        Converted with ``dict(X)`` into the per-feature inputs
        (presumably a DataFrame; confirm against the caller).
    y : sized
        Labels; ``len(y)`` is used as both shuffle buffer and batch size.
    n_epochs : int or None
        Passed to ``Dataset.repeat``; ``None`` repeats indefinitely.
    shuffle : bool
        Whether to shuffle before repeating.

    Returns
    -------
    callable
        Function that builds the dataset each time it is called.
    """
    def input_fn():
        n_rows = len(y)
        ds = tf.data.Dataset.from_tensor_slices((dict(X), y))
        # The shuffle buffer spans the whole data set.
        ds = ds.shuffle(n_rows) if shuffle else ds
        # Repeat for the requested number of epochs, then emit everything as
        # a single batch (in-memory training does not use mini-batches).
        return ds.repeat(n_epochs).batch(n_rows)
    return input_fn

def _accuracy(evaluate):
    """Extract the accuracy from an evaluation-metrics mapping.

    Parameters
    ----------
    evaluate : dict
        Metric name -> value, e.g. the result of ``est.evaluate(...)``.

    Returns
    -------
    numpy.float64
        The value stored under ``"accuracy"``. When that key is absent the
        first value in the mapping is returned instead, so callers that
        relied on positional lookup keep working.

    Raises
    ------
    ValueError
        If ``evaluate`` is empty.
    """
    if "accuracy" in evaluate:
        # Look the metric up by name rather than trusting dict ordering.
        return np.float64(evaluate["accuracy"])
    if not evaluate:
        raise ValueError("evaluate contains no metrics")
    return np.float64(next(iter(evaluate.values())))

def BostedTree(X, y, step, n_trees=100, max_depth=5, learning_rate=0.1,
               X_eval=None, y_eval=None):
    """Train a TensorFlow boosted-trees classifier and report its accuracy.

    Parameters
    ----------
    X, y : array-like
        Training features and labels; both are converted by ``_dataframe``.
    step : int
        Forwarded to ``est.train`` as ``max_steps``.
    n_trees : int, default 100
    max_depth : int, default 5
    learning_rate : float, default 0.1
        Hyper-parameters forwarded to
        ``tf.estimator.BoostedTreesClassifier``.
    X_eval, y_eval : array-like, optional
        Data the score is computed on. When either one is omitted the score
        is computed on the training data ``(X, y)``, i.e. it is a *training*
        accuracy.

    Returns
    -------
    fit
        The value returned by ``est.train``.
    score
        The accuracy extracted by ``_accuracy`` from ``est.evaluate``.
    """
    X, y, feature = _dataframe(X, y)
    if X_eval is None or y_eval is None:
        X_eval, y_eval = X, y
    else:
        # Same conversion as the training data so the column names match.
        X_eval, y_eval = _dataframe(X_eval, y_eval)[:2]

    # Training and evaluation input functions.
    train_input_fn = _make_input_fn(X, y)
    eval_input_fn = _make_input_fn(X_eval, y_eval,
                                   shuffle=False,
                                   n_epochs=1)

    # Every column is treated as a numeric feature.
    feature_columns = [
        tf.feature_column.numeric_column(feature_name, dtype=tf.float32)
        for feature_name in feature
    ]
    # Number of classes is taken from the training labels.
    n_classes = len(np.unique(y))

    est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                              n_batches_per_layer=1,
                                              n_classes=n_classes,
                                              n_trees=n_trees,
                                              max_depth=max_depth,
                                              learning_rate=learning_rate)
    fit = est.train(train_input_fn, max_steps=step)
    # The input function yields the whole evaluation set as one batch,
    # so a single evaluation step covers all of it.
    score = _accuracy(est.evaluate(eval_input_fn, steps=1))

    return fit, score

# 3. Model Training

In [None]:
# NOTE: despite the `err_` prefix, the three `err_*` variables hold accuracies.

# --- scikit-learn gradient boosting -----------------------------------------
# NOTE(review): criterion="mse" was renamed to "squared_error" in
# scikit-learn 1.0 and the old spelling removed in 1.2; adjust it if this
# notebook is run on a newer release.
skl = GradientBoostingClassifier(max_depth=2,
                                 subsample=0.75,
                                 max_features=None,
                                 learning_rate=0.25,
                                 random_state=2,
                                 criterion="mse",
                                 n_estimators=5)

pipe_skl = Pipeline([("scaler", StandardScaler()), ("clf", skl)])
pipe_skl.fit(x_train, y_train)
err_skl = pipe_skl.score(x_test, y_test)

# --- TensorFlow boosted trees -----------------------------------------------
fit, score = BostedTree(x_train, y_train, 5)
# `score` is measured on the training data. To compare like with like,
# re-evaluate the trained estimator (`Estimator.train` returns the estimator
# itself) on the same held-out split used for the other two models.
x_test_df, y_test_df = _dataframe(x_test, y_test)[:2]
err_tfbt = _accuracy(fit.evaluate(
    _make_input_fn(x_test_df, y_test_df, shuffle=False, n_epochs=1),
    steps=1))

# --- XGBoost ----------------------------------------------------------------
# A learning rate of 0 scales every tree's contribution to zero, so the model
# cannot learn anything; use the same 0.25 as the scikit-learn model.
xgb = xgboost.XGBClassifier(learning_rate=0.25,
                            max_depth=2,
                            n_estimators=5,
                            subsample=1,
                            min_child_weight=1)
pipe_xgb = Pipeline([("scaler", StandardScaler()), ("clf", xgb)])
pipe_xgb.fit(x_train, y_train)
err_xgb = pipe_xgb.score(x_test, y_test)

# 4. Evaluation

In [None]:
# Report the accuracy of each model, one per line. Using sep='\n' avoids the
# stray spaces that print's default separator would put around the newlines.
print('xgboost accuracy: {}'.format(err_xgb),
      'TFBT accuracy: {}'.format(err_tfbt),
      'MART accuracy: {}'.format(err_skl),
      sep='\n')