In [74]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from scipy.misc import imresize
import matplotlib.pyplot as plt
%matplotlib inline

# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed(7)

# Pull the MNIST train/test splits through Keras, then record the number of
# images in each split and the per-image dimensions for the cells below.
from keras.datasets import mnist
(Xtrain, Ytrain), (Xtest, Ytest) = mnist.load_data()
ntrain, xdim, ydim = Xtrain.shape
ntest = len(Xtest)


In [75]:
# *** DOWNSAMPLE THE IMAGES ***
# Written as a float literal so the scale is 0.25 even where `/` on two ints
# truncates (Python 2); `1/4` would be 0 there and produce empty arrays.
factor = 0.25


def downsample_images(images, factor):
    """Resize every image in a stack by ``factor`` using ``imresize``.

    Parameters
    ----------
    images : array indexed as (image, row, column)
        Stack of 2-D images to shrink.
    factor : float
        Scale handed to ``imresize`` for each image.

    Returns
    -------
    numpy.ndarray
        Float array of shape
        ``(n_images, int(height * factor), int(width * factor))``.
    """
    n_images, height, width = images.shape
    # np.empty is sufficient: every slice is overwritten in the loop below.
    resized = np.empty((n_images, int(height * factor), int(width * factor)))
    for idx in range(n_images):
        resized[idx] = imresize(images[idx], factor)
    return resized


# NOTE(review): scipy.misc.imresize is deprecated and absent from recent SciPy
# releases — confirm the SciPy version this notebook is pinned to.
Xtrain_down = downsample_images(Xtrain, factor)
Xtest_down = downsample_images(Xtest, factor)
    

In [76]:
# *** VECTORIZE IMAGES ***
# Flatten each image into one row, cast to float32 and divide by 255.
# reshape(n, -1) lets NumPy infer the row length from the array itself, so the
# flattening stays correct when image height and width differ (squaring a
# single dimension only works for square images).
# NOTE: this cell rebinds Xtrain/Xtest (and the *_down arrays). Running it a
# second time without re-running the cells above divides by 255 again.
Xtrain_down = Xtrain_down.reshape(ntrain, -1).astype('float32') / 255
Xtest_down  = Xtest_down.reshape(ntest, -1).astype('float32') / 255
Xtrain      = Xtrain.reshape(ntrain, -1).astype('float32') / 255
Xtest       = Xtest.reshape(ntest, -1).astype('float32') / 255
# Categorical labels
# Left disabled: the XGBoost cell passes the integer labels directly as `label=`.
# Ytrain = np_utils.to_categorical(Ytrain, 10)
# Ytest  = np_utils.to_categorical(Ytest, 10)


In [77]:
import os
import xgboost as xgb
import sklearn
from xgboost.sklearn import XGBClassifier

# Wrap the flattened full-resolution images and their labels for XGBoost.
xg_train = xgb.DMatrix(Xtrain, label=Ytrain)
xg_test  = xgb.DMatrix(Xtest, label=Ytest)
# Datasets whose error is reported after every boosting round.
watchlist = [(xg_train, 'train'), (xg_test, 'test')]

# setup parameters for xgboost
params = {
    # use softmax multi-class classification
    'objective': 'multi:softmax',
    # step size (learning rate); lowering it calls for a larger num_round
    'eta': 0.1,
    'max_depth': 6,
    # NOTE(review): newer XGBoost releases appear to replace `silent` with
    # `verbosity` — confirm against the installed version.
    'silent': 1,
    'nthread': 4,
    'num_class': 10,
}

num_round = 5
bst = xgb.train(params, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_test)
# Fraction of test samples whose predicted class differs from the true label,
# computed in one vectorised comparison instead of a per-element Python loop.
error_rate = np.mean(pred.astype(int) != Ytest)
print('predicting, classification error=%f' % error_rate)

# # do the same thing again, but output probabilities
# params['objective'] = 'multi:softprob'
# bst = xgb.train(params, xg_train, num_round, watchlist );
# # Note: this convention has been changed since xgboost-unity
# # get prediction, this is in 1D array, need reshape to (ndata, nclass)
# yprob = bst.predict( xg_test ).reshape( Ytest.shape[0], 10 )
# ylabel = np.argmax(yprob, axis=1)

# print ('predicting, classification error=%f' % (sum( int(ylabel[i]) != Ytest[i] 
#                                                     for i in range(len(Ytest))) / float(len(Ytest)) ))

# # print("total %i/%i" % (np.sum(labels == preds), len(preds)))


[0]	train-merror:0.1321	test-merror:0.1446
[1]	train-merror:0.105	test-merror:0.1185
[2]	train-merror:0.091467	test-merror:0.1062
[3]	train-merror:0.084317	test-merror:0.0965
[4]	train-merror:0.0803	test-merror:0.0914
predicting, classification error=0.091400
[0]	train-merror:0.1321	test-merror:0.1446
[1]	train-merror:0.105	test-merror:0.1185
[2]	train-merror:0.091467	test-merror:0.1062
[3]	train-merror:0.084317	test-merror:0.0965
[4]	train-merror:0.0803	test-merror:0.0914
predicting, classification error=0.091400


**Control Overfitting**

When you observe high training accuracy but low test accuracy, you have likely encountered an overfitting problem.
There are, in general, two ways to control overfitting in xgboost:

- The first way is to directly control model complexity
This includes max_depth, min_child_weight and gamma

- The second way is to add randomness to make training robust to noise
This includes subsample and colsample_bytree
You can also reduce the step size eta, but remember to increase num_round when you do so.

**Handle Imbalanced Dataset**

For common cases such as ad click-through logs, the dataset is extremely imbalanced. This can affect the training of the xgboost model, and there are two ways to improve it.

- If you care only about the ranking order (AUC) of your prediction
Balance the positive and negative weights, via scale_pos_weight
Use AUC for evaluation

- If you care about predicting the right probability
In such a case, you cannot re-balance the dataset
Instead, setting the parameter max_delta_step to a finite number (say 1) will help convergence