XGBoost for Regression Problem Overview in Python 3.x
Pipeline: 
1. Import the libraries/modules needed
2. Import data
3. Data cleaning and pre-processing
4. Train-test split
5. XGBoost training and prediction
6. Model Evaluation

## Import the libraries/modules needed

In [2]:
## import the libraries needed
import pandas as pd
import numpy as np

## Import data

In [3]:
## Load the Boston housing dataset bundled with scikit-learn and keep it in `boston`.
## NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 --
## confirm the installed version still provides it, or substitute another dataset.
from sklearn.datasets import load_boston
boston = load_boston()
## If you want to practise on a different dataset, import it at this step instead.

## Data cleaning and pre-processing

In [4]:
## Store the dataset's target values on the loaded dataset object under the
## 'PRICE' key, so the target can be read back by name when building y
boston['PRICE'] = boston.target 

In [5]:
## Separate the independent variables (X) from the target column (y)
X, y = boston.data, boston['PRICE']

## Train-test split

In [6]:
## Hold out 30% of the rows as the test set; the fixed random_state makes the
## split reproducible between runs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

## XGBoost training and prediction

In [7]:
## Import the XGBoost regressor, build it with its default hyperparameters
## (no arguments are passed), and fit it on the training split
from xgboost import XGBRegressor
xgb = XGBRegressor()
## Left as the cell's last expression so the notebook echoes the fitted estimator
xgb.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [8]:
## After training the model, predict on the training data itself; these
## predictions are scored against y_train in the evaluation step
y_pred = xgb.predict(X_train)

## Model Evaluation

In [9]:
## Score the training-set predictions with the usual regression metrics
from sklearn import metrics

train_n = len(y_train)
train_n_features = X_train.shape[1]

train_r2 = metrics.r2_score(y_train, y_pred)
print('R^2:', train_r2)
## Adjusted R^2 rescales (1 - R^2) by (n - 1) / (n - p - 1), p = number of features
print('Adjusted R^2:', 1 - (1 - train_r2) * (train_n - 1) / (train_n - train_n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_train, y_pred))

## RMSE is the square root of the MSE, so compute the MSE once and reuse it
train_mse = metrics.mean_squared_error(y_train, y_pred)
print('MSE:', train_mse)
print('RMSE:', np.sqrt(train_mse))

R^2: 0.9703652512761263
Adjusted R^2: 0.9692321579425663
MAE: 1.1372202838208043
MSE: 2.230632123289034
RMSE: 1.4935300878419002


In [10]:
## Apply the fitted model to the held-out test set
y_test_pred = xgb.predict(X_test)

In [11]:
## Score the test-set predictions with the same metrics used on the training set
test_n = len(y_test)
test_n_features = X_test.shape[1]

## acc_xgb holds the test-set R^2
acc_xgb = metrics.r2_score(y_test, y_test_pred)
print('R^2:', acc_xgb)
## Adjusted R^2 rescales (1 - R^2) by (n - 1) / (n - p - 1), p = number of features
print('Adjusted R^2:', 1 - (1 - acc_xgb) * (test_n - 1) / (test_n - test_n_features - 1))
print('MAE:', metrics.mean_absolute_error(y_test, y_test_pred))

## RMSE is the square root of the MSE, so compute the MSE once and reuse it
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
print('MSE:', test_mse)
print('RMSE:', np.sqrt(test_mse))

R^2: 0.8494894736313225
Adjusted R^2: 0.8353109457849979
MAE: 2.4509708843733136
MSE: 15.716320042597493
RMSE: 3.9643814199188117


In [12]:
## Show the shape of the test-set prediction array
y_test_pred.shape

(152,)