## Load data

In [2]:
from sklearn import datasets

# Load the Iris dataset bundled with scikit-learn and unpack it into
# the feature matrix (X) and the class labels (y).
iris = datasets.load_iris()
X, y = iris.data, iris.target

## Peek at data

In [5]:
# Show the first five feature rows.
X[0:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [6]:
# Show the first five class labels.
y[0:5]

array([0, 0, 0, 0, 0])

In [7]:
# Print the dataset's bundled description text.
print(iris.DESCR)

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

## Split into train - test sets

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
import xgboost as xgb

# Wrap both splits in XGBoost DMatrix objects, the input type passed to
# xgb.train and Booster.predict below.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

## Set parameters

In [10]:
# Booster hyper-parameters passed to xgb.train.
# NOTE(review): newer XGBoost releases appear to replace 'silent' with
# 'verbosity' -- confirm against the installed version before changing it.
param = dict(
    max_depth=4,                 # maximum depth of each tree
    eta=0.4,                     # learning rate (step size per boosting round)
    silent=1,                    # logging mode: 1 = quiet
    objective='multi:softprob',  # multiclass objective; predictions are per-class probabilities
    num_class=3,                 # number of target classes
)

# Number of boosting rounds (training iterations).
num_round = 30

## Train

In [11]:
# Fit the booster on the training matrix for `num_round` boosting rounds.
mdl = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

In [12]:
# Predict on the held-out set and show the first five prediction rows.
preds = mdl.predict(dtest)
preds[0:5]

array([[0.00376102, 0.9888777 , 0.00736123],
       [0.98399264, 0.01437339, 0.00163404],
       [0.00152796, 0.00357033, 0.99490166],
       [0.00296785, 0.9905234 , 0.00650873],
       [0.0031758 , 0.9605059 , 0.03631826]], dtype=float32)

This is a matrix of probabilities, with one row per sample and one column per class. For each sample, select the class with the highest probability.

In [13]:
import numpy as np

# Pick, for every row of the probability matrix, the index of the largest
# value -- i.e. the predicted class label. A single vectorized argmax
# along axis 1 replaces the per-row Python loop.
best_preds = np.argmax(preds, axis=1)

## Evaluate model

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted class matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=best_preds)
print(test_accuracy)

1.0


## Save model

In [15]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; import the standalone `joblib` package (a scikit-learn dependency)
# directly so this cell keeps working on current installs.
import joblib

# Serialize the trained booster to a compressed file on disk.
joblib.dump(mdl, 'iris_model.pkl', compress=True)

['iris_model.pkl']

## Load model

In [16]:
# Restore the booster from the file written in the save step.
# joblib files are pickle-based: only load files from a trusted source.
mdl_loaded = joblib.load(filename='iris_model.pkl')

In [17]:
# Round-trip check: the reloaded model must reproduce the original
# predictions element for element.
reloaded_preds = mdl_loaded.predict(dtest)
(reloaded_preds == preds).all()

True