## Load data

In [40]:
from sklearn import datasets

# Fetch the bundled iris dataset, then unpack its feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

## Peek at data

In [53]:
# Display only the first ten rows of the feature matrix (four feature columns per sample).
X[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [52]:
# Display the full label vector; the classes are encoded as the integers 0, 1 and 2.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Split into train - test sets

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
import xgboost as xgb

# Wrap each split, together with its labels, in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

## Set parameters

In [32]:
# Booster configuration handed to xgb.train.
param = {
    'max_depth': 4,                 # maximum depth of each tree
    'eta': 0.25,                    # learning rate (step-size shrinkage per boosting round)
    'verbosity': 0,                 # silent logging; replaces the deprecated 'silent' flag
    'objective': 'multi:softprob',  # multiclass objective that outputs one probability per class
    'num_class': 3,                 # number of target classes
}

num_round = 30       # number of boosting rounds

## Train

In [33]:
# Fit a booster on the training matrix for num_round boosting rounds.
mdl = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

In [34]:
# Score the held-out matrix; the trailing bare expression displays the raw predictions.
preds = mdl.predict(data=dtest)
preds

array([[0.00459253, 0.98662657, 0.0087809 ],
       [0.98445857, 0.01234693, 0.00319446],
       [0.00293247, 0.00578737, 0.99128014],
       [0.00404118, 0.9867971 , 0.0091618 ],
       [0.00444779, 0.95553106, 0.04002117],
       [0.9892936 , 0.00749626, 0.00321015],
       [0.00352417, 0.9928308 , 0.00364506],
       [0.01488824, 0.04948338, 0.93562835],
       [0.00431239, 0.9596322 , 0.03605538],
       [0.00352417, 0.9928308 , 0.00364506],
       [0.01510952, 0.0519454 , 0.932945  ],
       [0.9900856 , 0.00594494, 0.00396953],
       [0.9851767 , 0.01162656, 0.00319679],
       [0.9908354 , 0.00594944, 0.00321515],
       [0.9908354 , 0.00594944, 0.00321515],
       [0.00270989, 0.9926495 , 0.00464055],
       [0.00226469, 0.00307733, 0.99465793],
       [0.00351361, 0.9898543 , 0.00663213],
       [0.00351512, 0.9902818 , 0.00620302],
       [0.00226469, 0.00307733, 0.99465793],
       [0.9908354 , 0.00594944, 0.00321515],
       [0.00753851, 0.10053787, 0.89192367],
       [0.

This is a matrix of class probabilities: one row per test sample, one column per class. Select the class with the highest probability for each sample.

In [35]:
import numpy as np

# For every row, take the column index holding the largest probability.
# One vectorized argmax along axis 1 replaces the per-row Python loop.
best_preds = np.argmax(preds, axis=1)

## Evaluate model

In [36]:
from sklearn.metrics import precision_score

# Macro-averaged precision: the unweighted mean of the per-class precision scores.
macro_precision = precision_score(y_test, best_preds, average='macro')
print(macro_precision)

1.0


## Save model

In [37]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23,
# so prefer the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the trained booster as a compressed pickle; dump returns the list of files written.
joblib.dump(mdl, 'iris_model.pkl', compress=True)

['iris_model.pkl']

## Load model

In [38]:
# Restore the booster from the file written by joblib.dump in the "Save model" cell.
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
mdl_loaded = joblib.load('iris_model.pkl')

In [39]:
# Sanity check: the reloaded booster must reproduce the original predictions exactly.
reloaded_preds = mdl_loaded.predict(dtest)
(reloaded_preds == preds).all()

True