In [1]:
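# Uncomment the next line if xgboost is not installed; %pip installs into the running kernel's environment.
# %pip install xgboost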

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn import tree

In [3]:
# Load the breast-cancer dataset as a (features, labels) pair instead of a Bunch object.
X, y = load_breast_cancer(return_X_y=True)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
from xgboost import XGBClassifier

In [6]:
# Baseline model: XGBClassifier constructed with no arguments (library defaults),
# trained on the training split only.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [7]:
# Baseline predictions on both splits, consumed by the metric cells that follow.
y_training_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Baseline accuracy on the same data the model was fit on (not a measure of generalization).
accuracy_score(y_train, y_training_pred)

1.0

In [9]:
# Baseline accuracy on the held-out test split.
accuracy_score(y_test, y_test_pred)

0.956140350877193

In [10]:
# Baseline confusion matrix on the test split
# (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_test, y_test_pred)

array([[40,  3],
       [ 2, 69]], dtype=int64)

### Hyperparameter Tuning

In [12]:
# Candidate values per hyperparameter; the randomized search samples combinations from these lists.
params = dict(
    learning_rate=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    max_depth=[3, 4, 5, 6, 8, 10, 12],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0, 0.1, 0.2, 0.3, 0.4],
)

In [11]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [13]:
# Unfitted default-parameter classifier that is handed to the randomized search as its estimator.
rs_xgb = XGBClassifier()

In [16]:
# Sample 5 parameter combinations from `params` and score each one with 5-fold
# cross-validation. The fixed random_state makes the sampled combinations -- and
# therefore best_params_ / best_estimator_ -- identical on every fresh run;
# without it the search result changes each time the notebook is executed.
random_search = RandomizedSearchCV(
    rs_xgb,
    param_distributions=params,
    n_iter=5,
    cv=5,
    random_state=42,
)

In [17]:
# Run the search on the training split only, so the test split stays unseen during tuning.
random_search.fit(X_train, y_train)

In [22]:
# Display the best estimator found by the search.
random_search.best_estimator_

In [23]:
# Display the hyperparameter combination selected by the search.
random_search.best_params_

{'min_child_weight': 5, 'max_depth': 12, 'learning_rate': 0.15, 'gamma': 0.3}

In [24]:
# Train a fresh classifier with the hyperparameters the randomized search selected.
# They are read from best_params_ rather than hand-copied from a previous run's
# output, so this model always matches what the search found in the current run.
# NOTE: this rebinds `xgb`, replacing the baseline model fitted earlier.
xgb = XGBClassifier(**random_search.best_params_)
xgb.fit(X_train, y_train)

In [25]:
# Predictions from the tuned model; these rebind (overwrite) the baseline prediction variables.
y_training_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

In [26]:
# Tuned-model accuracy on the held-out test split.
accuracy_score(y_test, y_test_pred)

0.9736842105263158

In [27]:
# Tuned-model confusion matrix on the test split
# (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_test, y_test_pred)

array([[41,  2],
       [ 1, 70]], dtype=int64)

In [32]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned classifier, using the training split only.
score = cross_val_score(estimator=xgb, X=X_train, y=y_train, cv=5)

In [33]:
# Average of the five per-fold cross-validation scores.
score.mean()

0.9736263736263737