In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [1]:
# XG boost has a seperate library himself
# XG Boost Regressor

In [2]:
!pip install xgboost



In [4]:
from xgboost import XGBRegressor

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [5]:
X, y = make_regression(n_samples= 1000, n_features=2, noise=10, random_state=1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
X_train.shape, X_test.shape

((700, 2), (300, 2))

In [9]:
regressor = XGBRegressor()
regressor

In [10]:
regressor.fit(X_train, y_train)

In [11]:
y_pred = regressor.predict(X_test)
y_pred

array([ -38.688965  ,  -98.05122   ,  165.04544   ,   25.92514   ,
         74.34782   ,   13.568865  ,  -15.671618  ,  148.8151    ,
       -186.5873    ,  104.940285  ,   62.147568  ,   22.332937  ,
        -91.75383   ,  109.22308   ,   22.979282  , -167.97192   ,
        157.17256   ,   46.3618    ,  -22.72984   ,  -14.404492  ,
       -165.92209   ,   10.106001  ,   11.879697  ,   44.285114  ,
         29.714306  ,   17.568573  ,  -80.4218    ,  -87.30698   ,
        132.47734   ,  218.1065    ,  188.05371   ,   31.570398  ,
         93.40019   ,   29.266151  ,   67.66439   ,   26.90779   ,
        -80.3057    ,   99.29118   , -184.46635   ,  -16.061247  ,
        145.21727   ,  138.1913    ,  -45.203396  , -205.61804   ,
        -34.282845  ,   13.602692  ,  -46.370934  ,   82.07315   ,
        -68.82875   ,  -54.19676   ,   49.08629   ,   52.35901   ,
         11.009757  ,  -53.345592  ,   -2.3280745 ,   -3.1329007 ,
        133.48596   ,   55.934177  ,  -85.83297   ,  133.31313

In [12]:
print("R2 Score: ", r2_score(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))

R2 Score:  0.9766041606160056
MAE:  10.46115674730029
MSE:  193.37775910652263


In [13]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators' : [100, 200, 300],
    'learning_rate' : [0.01, 0.05, 0.1, 0.2]
}

In [14]:
grid = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, verbose = 3)

In [15]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END learning_rate=0.01, n_estimators=100;, score=0.805 total time=   0.5s
[CV 2/5] END learning_rate=0.01, n_estimators=100;, score=0.828 total time=   0.1s
[CV 3/5] END learning_rate=0.01, n_estimators=100;, score=0.842 total time=   0.1s
[CV 4/5] END learning_rate=0.01, n_estimators=100;, score=0.833 total time=   0.5s
[CV 5/5] END learning_rate=0.01, n_estimators=100;, score=0.833 total time=   0.1s
[CV 1/5] END learning_rate=0.01, n_estimators=200;, score=0.944 total time=   0.1s
[CV 2/5] END learning_rate=0.01, n_estimators=200;, score=0.957 total time=   0.1s
[CV 3/5] END learning_rate=0.01, n_estimators=200;, score=0.967 total time=   0.1s
[CV 4/5] END learning_rate=0.01, n_estimators=200;, score=0.959 total time=   0.1s
[CV 5/5] END learning_rate=0.01, n_estimators=200;, score=0.958 total time=   0.1s
[CV 1/5] END learning_rate=0.01, n_estimators=300;, score=0.971 total time=   0.2s
[CV 2/5] END learning_rate

In [16]:
grid.best_params_

{'learning_rate': 0.05, 'n_estimators': 100}

In [18]:
best_model = grid.best_estimator_
best_model

In [20]:
y_pred = best_model.predict(X_test)
y_pred

array([-3.74482079e+01, -1.01720413e+02,  1.65499481e+02,  2.55963249e+01,
        7.93823318e+01,  1.30650311e+01, -1.71322918e+01,  1.40817459e+02,
       -1.77701660e+02,  1.10807503e+02,  6.52704926e+01,  2.03763561e+01,
       -8.66564255e+01,  1.05927353e+02,  2.17602329e+01, -1.68059677e+02,
        1.55091232e+02,  4.00303307e+01, -1.87466354e+01, -1.43764763e+01,
       -1.57493530e+02,  9.67679596e+00,  1.87945747e+01,  4.00041046e+01,
        2.81365948e+01,  1.96854877e+01, -7.86676254e+01, -8.23230591e+01,
        1.35507401e+02,  2.09632019e+02,  2.01810638e+02,  2.76435928e+01,
        9.70345840e+01,  3.00471287e+01,  6.28300934e+01,  2.65375385e+01,
       -7.90277863e+01,  1.03970047e+02, -1.86006638e+02, -1.72026958e+01,
        1.42034164e+02,  1.36954865e+02, -4.63105431e+01, -1.99974579e+02,
       -3.63797340e+01,  1.70514622e+01, -4.38418198e+01,  8.35929871e+01,
       -6.75434418e+01, -4.96478729e+01,  4.80380096e+01,  4.26059875e+01,
        1.05303249e+01, -

In [21]:
print("R2 Score: ", r2_score(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))

R2 Score:  0.9794754547745094
MAE:  9.595064400384274
MSE:  169.64514490132566


In [22]:
# XG Boost classification

In [23]:
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [24]:
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=1)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [26]:
classifier = XGBClassifier()
classifier

In [27]:
classifier.fit(X_train, y_train)

In [28]:
y_pred = classifier.predict(X_test)
y_pred

array([0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1])

In [29]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy Score:  0.85
Confusion Matrix: 
 [[121  18]
 [ 27 134]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.82      0.87      0.84       139
           1       0.88      0.83      0.86       161

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



In [31]:
from sklearn.model_selection import GridSearchCV
params = {
    'n_estimators' : [100, 200, 300],
    'learning_rate' : [0.01, 0.05, 0.1, 0.2]
}

In [32]:
grid = GridSearchCV(estimator=classifier, param_grid=params, cv=5, verbose=3)

In [33]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END learning_rate=0.01, n_estimators=100;, score=0.850 total time=   8.0s
[CV 2/5] END learning_rate=0.01, n_estimators=100;, score=0.864 total time=   4.8s
[CV 3/5] END learning_rate=0.01, n_estimators=100;, score=0.807 total time=   5.8s
[CV 4/5] END learning_rate=0.01, n_estimators=100;, score=0.807 total time=   1.8s
[CV 5/5] END learning_rate=0.01, n_estimators=100;, score=0.821 total time=   1.8s
[CV 1/5] END learning_rate=0.01, n_estimators=200;, score=0.850 total time=   5.2s
[CV 2/5] END learning_rate=0.01, n_estimators=200;, score=0.871 total time=   2.4s
[CV 3/5] END learning_rate=0.01, n_estimators=200;, score=0.836 total time=   3.2s
[CV 4/5] END learning_rate=0.01, n_estimators=200;, score=0.857 total time=   0.7s
[CV 5/5] END learning_rate=0.01, n_estimators=200;, score=0.829 total time=   0.7s
[CV 1/5] END learning_rate=0.01, n_estimators=300;, score=0.850 total time=   3.0s
[CV 2/5] END learning_rate

In [34]:
best_model = grid.best_estimator_
best_model

In [35]:
y_pred = best_model.predict(X_test)
y_pred

array([0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1])

In [36]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy Score:  0.8633333333333333
Confusion Matrix: 
 [[122  17]
 [ 24 137]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.88      0.86       139
           1       0.89      0.85      0.87       161

    accuracy                           0.86       300
   macro avg       0.86      0.86      0.86       300
weighted avg       0.86      0.86      0.86       300

