## XGBoost Cloud Prediction - Iris Classification

In [2]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import boto3
import re
from sagemaker import get_execution_role
import sagemaker

# SDK 2 serializers and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

Matplotlib is building the font cache; this may take a moment.


### Invoke SageMaker Prediction service

In [3]:
# Acquire a realtime endpoint
# Attaches to an endpoint by name; this notebook does not create it, so
# 'xgboost-iris-v1' is presumably already deployed — TODO confirm before running.

endpoint_name = 'xgboost-iris-v1'
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [4]:
# Send request payloads as CSV text. No deserializer is configured in this
# notebook, and predict() hands back the raw response bytes (see output below).
predictor.serializer = CSVSerializer()

In [5]:
# Load the validation file so the endpoint's predictions can be compared with
# the known labels. The file is read as headerless (its first line is data),
# so column names are supplied here: label first, then the four features.

validation_columns = [
    'encoded_class',
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
df_all = pd.read_csv('iris_validation.csv', header=None, names=validation_columns)

In [6]:
# Preview the first rows to confirm the columns were parsed as expected
df_all.head()

Unnamed: 0,encoded_class,sepal_length,sepal_width,petal_length,petal_width
0,1,5.8,2.7,4.1,1.0
1,0,4.8,3.4,1.6,0.2
2,1,6.0,2.2,4.0,1.0
3,2,6.4,3.1,5.5,1.8
4,2,6.7,2.5,5.8,1.8


In [7]:
# Confirm the column labels assigned at load time
df_all.columns

Index(['encoded_class', 'sepal_length', 'sepal_width', 'petal_length',
       'petal_width'],
      dtype='object')

In [8]:
# The predictor is given a plain NumPy array, so select only the four feature
# columns (the label column is left out) and take the underlying values.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
arr_test = df_all[feature_columns].values

In [9]:
# Confirm the feature matrix is a NumPy ndarray
type(arr_test)

numpy.ndarray

In [10]:
# Dimensions of the feature matrix: (rows, features)
arr_test.shape

(45, 4)

In [12]:
# Peek at the first five feature rows
arr_test[:5]

array([[5.8, 2.7, 4.1, 1. ],
       [4.8, 3.4, 1.6, 0.2],
       [6. , 2.2, 4. , 1. ],
       [6.4, 3.1, 5.5, 1.8],
       [6.7, 2.5, 5.8, 1.8]])

In [14]:
# Smoke test: send only the first two rows to check that the endpoint responds
result = predictor.predict(arr_test[:2])

In [17]:
# Raw response: bytes holding one numeric label per line
result

b'1.0\n0.0\n'

In [21]:
# For large set of data we can split input data into chunks
#
# The endpoint replies with raw bytes holding one numeric label per input row
# (e.g. b'1.0\n0.0\n'). Numbers are extracted with a numeric-token pattern
# instead of splitting on "anything that is not a digit or dot": that split
# treats signs and exponent markers as separators and silently corrupts such
# values ('-1.0' -> '1.0', '1e-05' -> '1' and '05').

pattern = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

predictions = []
for arr in np.array_split(arr_test,10):
    print(arr.shape)
    if arr.shape[0] == 0:
        # array_split yields empty chunks when there are fewer rows than
        # chunks; there is nothing to send to the endpoint for those.
        continue
    result = predictor.predict(arr)
    result = re.findall(pattern,result.decode("utf-8"))
    predictions += [int(float(r)) for r in result]

# Every input row must yield exactly one label; otherwise the predicted labels
# could not be lined up with the rows of the validation frame further down.
assert len(predictions) == len(arr_test), (
    f'expected {len(arr_test)} predictions, got {len(predictions)}')

(5, 4)
(5, 4)
(5, 4)
(5, 4)
(5, 4)
(4, 4)
(4, 4)
(4, 4)
(4, 4)
(4, 4)


In [22]:
# Should equal the number of rows in arr_test (one prediction per row)
len(predictions)

45

In [25]:
# First few predicted class codes
predictions[:5]

[1, 0, 1, 2, 2]

In [28]:
from sklearn import preprocessing
# Rebuild the species-name <-> integer-code mapping locally so numeric labels
# can be turned back into names. The fitted order maps 0/1/2 to
# setosa/versicolor/virginica.
# NOTE(review): assumes the model was trained with this same encoding — confirm.
le = preprocessing.LabelEncoder()
le.fit(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

In [30]:
# Position in classes_ is the integer code for that species name
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype='<U15')

In [33]:
# Translate the integer ground-truth labels back into species names
actual_codes = df_all['encoded_class']
df_all['class'] = le.inverse_transform(actual_codes)

In [35]:
# Translate the endpoint's integer predictions into species names and store
# them next to the actual labels (relies on predictions being in row order)
predicted_names = le.inverse_transform(predictions)
df_all['predicted_class'] = predicted_names

In [37]:
# Preview actual vs. predicted species side by side
df_all.head()

Unnamed: 0,encoded_class,sepal_length,sepal_width,petal_length,petal_width,class,predicted_class
0,1,5.8,2.7,4.1,1.0,Iris-versicolor,Iris-versicolor
1,0,4.8,3.4,1.6,0.2,Iris-setosa,Iris-setosa
2,1,6.0,2.2,4.0,1.0,Iris-versicolor,Iris-versicolor
3,2,6.4,3.1,5.5,1.8,Iris-virginica,Iris-virginica
4,2,6.7,2.5,5.8,1.8,Iris-virginica,Iris-virginica


In [38]:
# Cross-tabulate actual species (rows) against predicted species (columns);
# off-diagonal cells are misclassifications.
print('Confusion matrix - Actual versus Predicted')
actual, predicted = df_all['class'], df_all['predicted_class']
pd.crosstab(actual, predicted)

Confusion matrix - Actual versus Predicted


predicted_class,Iris-setosa,Iris-versicolor,Iris-virginica
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,16,0,0
Iris-versicolor,0,10,1
Iris-virginica,0,1,17


In [39]:
import sklearn.metrics as metrics

# Per-class precision / recall / F1 comparing actual and predicted species
report = metrics.classification_report(
    df_all['class'],
    df_all['predicted_class'],
)
print(report)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        16
Iris-versicolor       0.91      0.91      0.91        11
 Iris-virginica       0.94      0.94      0.94        18

       accuracy                           0.96        45
      macro avg       0.95      0.95      0.95        45
   weighted avg       0.96      0.96      0.96        45

