### XGBoost Endpoint Versions

#### Invoke a specific version and the generic endpoint

In [20]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

import boto3
import re
from sagemaker import get_execution_role
import sagemaker

# sdk 2 serializer and deserializer
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

In [5]:
# Attach a predictor to an endpoint that is already deployed.
# The endpoint hosts two model versions.

endpoint_name = 'xgboost-bikerental'
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),  # send request payloads as CSV
)


In [6]:
# Load the test data set from a local CSV file into a DataFrame
df_all = pd.read_csv('bike_test_rev3.csv')

In [7]:
# Preview the first five rows of the test data
df_all.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,20,3,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,4


In [8]:
# The endpoint should receive feature values only, so keep every column
# except the first one ('datetime').
arr_test = df_all.iloc[:, 1:]

In [9]:
# Check the dimensions (rows, columns) of the feature frame
arr_test.shape

(6493, 13)

In [10]:
# No variant specified - the endpoint distributes requests based on the configured weights
# target_variant = None

result = predictor.predict(arr_test[:5],target_variant=None)

In [11]:
# Display the raw response returned by the endpoint
print(result)

b'2.332122325897217,1.9005593061447144,1.6316101551055908,1.0376962423324585,0.9574321508407593'


### Variant specified - Endpoint routes the request to the specified variant

In [12]:
# Route the request explicitly to the 'version-0-90-2' variant
result = predictor.predict(arr_test[:5],target_variant='version-0-90-2')
print(result)

b'2.332122325897217,1.9005593061447144,1.6316101551055908,1.0376962423324585,0.9574321508407593'


In [13]:
# Route the request explicitly to the 'version-1-2-2' variant
result = predictor.predict(arr_test[:5],target_variant='version-1-2-2')
print(result)

b'2.332122325897217\n1.9005593061447144\n1.6316101551055908\n1.0376962423324585\n0.9574321508407593\n'


### Split the input data into chunks

There are thousands of rows in this data set that need inference.<br>
It is good practice to split the data into chunks to prevent payload-size or timeout errors.



In [14]:
# The response is parsed with a regular expression because the two XGBoost
# versions delimit predictions differently: 0-90-2 returns comma-separated
# values, whereas 1-2-2 returns newline-separated values.

# Split on commas and whitespace only. Splitting on every non-digit character
# would also split on '-' and 'e', silently corrupting negative predictions
# and scientific notation (e.g. '-0.25' or '1.5e-05').
pattern = r'[,\s]+'

def inference_by_version(targetVariant = None, n_chunks = 10):
    """Run inference on ``arr_test`` in chunks and return all predictions.

    The rows of the module-level ``arr_test`` are split into ``n_chunks``
    nearly equal parts so that each request sent through the module-level
    ``predictor`` stays small. The shape of every chunk is printed as a
    simple progress indicator.

    Parameters
    ----------
    targetVariant : str or None, optional
        Variant name forwarded as ``target_variant`` to ``predictor.predict``.
        ``None`` (the default) expresses no variant preference.
    n_chunks : int, optional
        Number of chunks to split ``arr_test`` into. Defaults to 10.

    Returns
    -------
    list of float
        Predictions of all chunks, concatenated in chunk order.

    Raises
    ------
    ValueError
        If a non-empty token in the response cannot be parsed as a float.
    """
    predictions = []
    for arr in np.array_split(arr_test, n_chunks):
        result = predictor.predict(arr, target_variant=targetVariant)
        # The response is raw bytes; decode, then split on the delimiters.
        result = re.split(pattern, result.decode('utf8'))

        print(arr.shape)
        # A trailing delimiter yields an empty token - skip those.
        predictions += [float(r) for r in result if r != ""]

    return predictions

### Use all available variants

In [15]:
# Variants to invoke; None means no preference to variant
targetVariants = [None, 'version-0-90-2', 'version-1-2-2']

# Map str(variant) to the name of the column that will hold its predictions
countColumns = {
    str(variant): ('count_' + variant if variant else 'count_all')
    for variant in targetVariants
}

In [16]:
# Show the variant -> column-name mapping
print(countColumns)

{'None': 'count_all', 'version-0-90-2': 'count_version-0-90-2', 'version-1-2-2': 'count_version-1-2-2'}


In [17]:
# List the variants that will be invoked
for variant in targetVariants:
    print(f'Target Variant: {variant}')

Target Variant: None
Target Variant: version-0-90-2
Target Variant: version-1-2-2


In [21]:
# Run chunked inference once per variant and store each result in its own column
for variant in targetVariants:
    print(f'Target Variant: {variant}')
    log_predictions = inference_by_version(variant)
    # expm1 inverts log1p - presumably the model predicts log1p(count); TODO confirm
    df_all[countColumns[str(variant)]] = np.expm1(log_predictions)

Target Variant: None
(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
Target Variant: version-0-90-2
(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
Target Variant: version-1-2-2
(650, 13)
(650, 13)
(650, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)
(649, 13)


In [25]:
# Names of the prediction columns added to df_all
countColumns.values()

dict_values(['count_all', 'count_version-0-90-2', 'count_version-1-2-2'])

In [23]:
# Compare the predictions of all variants side by side
df_all[list(countColumns.values())]

Unnamed: 0,count_all,count_version-0-90-2,count_version-1-2-2
0,9.299778,9.299778,9.299778
1,5.689635,5.689635,5.689635
2,4.112099,4.112099,4.112099
3,1.822707,1.822707,1.822707
4,1.604999,1.604999,1.604999
...,...,...,...
6488,257.326183,257.326183,257.326183
6489,171.026137,171.026137,171.026137
6490,133.685980,133.685980,133.685980
6491,92.357978,92.357978,92.357978


In [24]:
# Summary statistics of the predictions for each variant
df_all[list(countColumns.values())].describe()

Unnamed: 0,count_all,count_version-0-90-2,count_version-1-2-2
count,6493.0,6493.0,6493.0
mean,183.41239,183.41239,183.41239
std,171.869936,171.869936,171.869936
min,0.419208,0.419208,0.419208
25%,41.106633,41.106633,41.106633
50%,141.825441,141.825441,141.825441
75%,273.490148,273.490148,273.490148
max,942.930326,942.930326,942.930326
