In [5]:
import numpy as np
from scipy.stats import uniform, randint
from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
def display_scores(scores):
    print("Scores: {0}\nMean: {1:.3f}\nStd: {2:.3f}".format(scores, np.mean(scores), np.std(scores)))

def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate], results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [16]:
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

xgb_model = xgb.XGBRegressor(objective = "reg:squarederror", random_state=42)
xgb_model.fit(X, y)

y_pred = xgb_model.predict(X)
mse = mean_squared_error(y, y_pred)
print(np.sqrt(mse))

0.2401475171547707


In [19]:
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
xgb_model = xgb.XGBClassifier(objective = "binary:logistic", random_state = 42)
xgb_model.fit(X, y)
y_pred = xgb_model.predict(X)
print(confusion_matrix(y, y_pred))


[[212   0]
 [  0 357]]


In [20]:
wine = load_wine()
X = wine.data
y = wine.target
xgb_model = xgb.XGBClassifier(objective = "multi:softprob", random_state=42)
xgb_model.fit(X, y)
y_pred = xgb_model.predict(X)
print(confusion_matrix(y, y_pred))


[[59  0  0]
 [ 0 71  0]
 [ 0  0 48]]


In [13]:
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for train_index, test_index in kfold.split(X): 
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    scores.append(mean_squared_error(y_test, y_pred))
display_scores(np.sqrt(scores))

Scores: [63.93059871 61.44356415 67.49238017 69.51815605 59.9786771 ]
Mean: 64.473
Std: 3.584


In [15]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
scores = cross_val_score(xgb_model, X, y, scoring="neg_mean_squared_error", cv=5)
display_scores(np.sqrt(-scores))

Scores: [62.80101886 65.78389959 62.21211593 66.40836809 67.3001013 ]
Mean: 64.901
Std: 2.022


In [17]:
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
xgb_model = xgb.XGBRegressor()
params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, 
cv=3, verbose=1, n_jobs=1, return_train_score=True)
search.fit(X, y)

report_best_scores(search.cv_results_, 1)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
Model with rank: 1
Mean validation score: 0.464 (std: 0.012)
Parameters: {'colsample_bytree': 0.7902634929450308, 'gamma': 0.1424202471887338, 'learning_rate': 0.041066084206359835, 'max_depth': 2, 'n_estimators': 101, 'subsample': 0.8010716092915446}



In [21]:
report_best_scores(search.cv_results_, 1)

Model with rank: 1
Mean validation score: 0.464 (std: 0.012)
Parameters: {'colsample_bytree': 0.7902634929450308, 'gamma': 0.1424202471887338, 'learning_rate': 0.041066084206359835, 'max_depth': 2, 'n_estimators': 101, 'subsample': 0.8010716092915446}



## XGBoost Early Stopping Rounds

In [22]:
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
xgb_model = xgb.XGBClassifier(objective="binary:logistic", n_estimators=20, random_state=42, eval_metric=["auc", "error", "error@0.6"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
y_pred = xgb_model.predict(X_test)

[0]	validation_0-auc:0.96348	validation_0-error:0.04895	validation_0-error@0.6:0.07692
[1]	validation_0-auc:0.97201	validation_0-error:0.04895	validation_0-error@0.6:0.06294
[2]	validation_0-auc:0.97035	validation_0-error:0.04196	validation_0-error@0.6:0.05594
[3]	validation_0-auc:0.97930	validation_0-error:0.04196	validation_0-error@0.6:0.06294
[4]	validation_0-auc:0.97857	validation_0-error:0.03497	validation_0-error@0.6:0.04895
[5]	validation_0-auc:0.97784	validation_0-error:0.04196	validation_0-error@0.6:0.04196
[6]	validation_0-auc:0.98408	validation_0-error:0.03497	validation_0-error@0.6:0.04895
[7]	validation_0-auc:0.98450	validation_0-error:0.04895	validation_0-error@0.6:0.04196
[8]	validation_0-auc:0.98616	validation_0-error:0.04895	validation_0-error@0.6:0.04895
[9]	validation_0-auc:0.99105	validation_0-error:0.04895	validation_0-error@0.6:0.04196
[10]	validation_0-auc:0.99126	validation_0-error:0.04895	validation_0-error@0.6:0.04196
[11]	validation_0-auc:0.99064	validation_0

In [23]:
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, eval_metric="auc")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
xgb_model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=False)
y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[51  3]
 [ 3 86]]


# Gradient Boost vs XGBoost

In general, gradient boosting works by iteratively adding decision trees to the model, with each subsequent tree attempting to correct the errors of the previous tree. The trees are built by recursively splitting the data into smaller and smaller subsets based on the values of the features. In datasets with many features, this can result in a very large number of possible splits and can make it difficult for the algorithm to find the most informative splits.

XGBoost uses several techniques to address this problem, such as subsampling and column subsampling, which can help to reduce the number of features that the algorithm considers. Subsampling involves training each tree on a random subset of the training data, which can help to reduce the variance of the model and prevent overfitting. Column subsampling involves randomly selecting a subset of the features for each tree, which can help to reduce the impact of noisy or irrelevant features.

In addition, XGBoost uses regularization techniques, such as L1 and L2 regularization, which can help to reduce the impact of uninformative features and prevent overfitting.

### Number of features > Number of samples

In [70]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


data = pd.read_csv('../data/leukemia.csv', sep = ',',header=None)
X = data.iloc[:, :12582]
y = data.iloc[:,-1:]
print(X.shape)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,y_encoded,test_size = 0.2, random_state = 0)

print('==============Gradient Boosting Classifier=================')
clf = GradientBoostingClassifier(n_estimators = 100, learning_rate = 1.0, max_depth=1, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))

print('=============XGBoost Classifier===============')
xgb_model = xgb.XGBClassifier(n_estimators=100,eta=0.3, gamma=0.5, random_state=0)
xgb_model.fit(X_train, y_train)
print(xgb_model.score(X_test, y_test))
y_pred_xgb = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred_xgb))

(57, 12582)
1.0
[[4 0 0]
 [0 3 0]
 [0 0 5]]
0.8333333333333334
[[4 0 0]
 [0 2 1]
 [1 0 4]]


### Number of features < Number of samples

In [72]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.datasets import make_blobs

X_train, y_train = make_blobs(n_samples=1000000, centers=2,random_state=7, cluster_std=1.5)
X_test, y_test = make_blobs(n_samples=10000, centers=2, random_state=7, cluster_std=1.5)
print(X_train.shape)
print('==============Gradient Boosting Classifier=================')
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))

print('=============XGBoost Classifier===============')
xgb_model = xgb.XGBClassifier(n_estimators=100,eta=0.3, gamma=0.5, random_state=0)
xgb_model.fit(X_train, y_train)
print(xgb_model.score(X_test, y_test))
y_pred_xgb = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred_xgb))

(1000000, 2)
0.9932
[[4971   29]
 [  39 4961]]
0.9933
[[4973   27]
 [  40 4960]]
