### XGBoost Cloud Prediction - Diabetes prediction

In [3]:
import math
import os
import re
import sys

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import sklearn.metrics as metrics
from sagemaker import get_execution_role

# SDK 2 serializers and deserializers
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

### Invoke SageMaker Prediction service

In [28]:
# Acquire the real-time SageMaker endpoint.
# NOTE(review): this only builds a client-side Predictor for the named endpoint;
# the endpoint itself is presumably already deployed under this name — confirm.

endpoint_name = 'xgboost-diabetes-v1'
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [29]:
# Serialize request payloads with the SDK's CSVSerializer before they are sent to the endpoint.
predictor.serializer = CSVSerializer()

In [4]:
# Column names are stored as comma-separated text in a sidecar file.
# Strip each name so a trailing newline or stray space in the file cannot
# leak into a column label (most likely the last one).
with open('diabetes_training_column_list.txt','r') as f:
    diabetes_column_names = [name.strip() for name in f.read().split(',')]
diabetes_column_names

['diabetes_class',
 'preg_count',
 'glucose_concentration',
 'diastolic_bp',
 'triceps_skin_fold_thickness',
 'two_hr_serum_insulin',
 'bmi',
 'diabetes_pedi',
 'age']

In [5]:
# Load the validation file used to measure predictive quality.
# Column labels come from diabetes_column_names; header=None states explicitly
# that the first line of the file is data, which is what passing names implies.
df_all = pd.read_csv('diabetes_validation_imputed.csv', names=diabetes_column_names, header=None)

In [6]:
# Preview the first rows to check that values landed under the expected column names.
df_all.head()

Unnamed: 0,diabetes_class,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age
0,0,1.0,130.0,70.0,13.0,105.0,25.9,0.472,22
1,1,8.0,133.0,72.0,33.0,207.0,32.9,0.27,39
2,0,4.0,137.0,68.0,14.0,148.0,24.8,0.143,21
3,0,2.0,88.0,74.0,19.0,53.0,29.0,0.229,22
4,1,9.0,130.0,70.0,33.0,207.0,34.2,0.652,45


In [7]:
# Confirm the column labels that read_csv applied.
df_all.columns

Index(['diabetes_class', 'preg_count', 'glucose_concentration', 'diastolic_bp',
       'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
       'diabetes_pedi', 'age'],
      dtype='object')

In [21]:
# The predictor is given a plain array of feature values (label column excluded).
# Select the feature columns by name instead of dropping the label: a later cell
# adds 'predicted_diabetes_class' to df_all, and drop() would then pick that
# column up as an extra feature if this cell were re-run.
feature_columns = [col for col in diabetes_column_names if col != 'diabetes_class']
arr_test = np.array(df_all[feature_columns])

In [23]:
# Sanity check: arr_test should be a NumPy array rather than a DataFrame.
type(arr_test)

numpy.ndarray

In [24]:
# arr_test is already an ndarray, so display it directly instead of copying it
# through np.array() first.
arr_test

array([[1.00e+00, 1.30e+02, 7.00e+01, ..., 2.59e+01, 4.72e-01, 2.20e+01],
       [8.00e+00, 1.33e+02, 7.20e+01, ..., 3.29e+01, 2.70e-01, 3.90e+01],
       [4.00e+00, 1.37e+02, 6.80e+01, ..., 2.48e+01, 1.43e-01, 2.10e+01],
       ...,
       [4.00e+00, 9.70e+01, 6.00e+01, ..., 2.82e+01, 4.43e-01, 2.20e+01],
       [6.00e+00, 1.25e+02, 7.80e+01, ..., 2.76e+01, 5.65e-01, 4.90e+01],
       [8.00e+00, 1.96e+02, 7.60e+01, ..., 3.75e+01, 6.05e-01, 5.70e+01]])

In [25]:
# One row per validation record, one column per feature (label excluded).
arr_test.shape

(231, 8)

In [44]:
# For comparison: df_all still carries the label column, so it is wider than arr_test.
df_all.shape

(231, 9)

In [27]:
# Show the first five feature rows in full (no column elision).
arr_test[:5]

array([[1.00e+00, 1.30e+02, 7.00e+01, 1.30e+01, 1.05e+02, 2.59e+01,
        4.72e-01, 2.20e+01],
       [8.00e+00, 1.33e+02, 7.20e+01, 3.30e+01, 2.07e+02, 3.29e+01,
        2.70e-01, 3.90e+01],
       [4.00e+00, 1.37e+02, 6.80e+01, 1.40e+01, 1.48e+02, 2.48e+01,
        1.43e-01, 2.10e+01],
       [2.00e+00, 8.80e+01, 7.40e+01, 1.90e+01, 5.30e+01, 2.90e+01,
        2.29e-01, 2.20e+01],
       [9.00e+00, 1.30e+02, 7.00e+01, 3.30e+01, 2.07e+02, 3.42e+01,
        6.52e-01, 4.50e+01]])

In [45]:
# Smoke-test the endpoint with just the first two rows before scoring the full set.
result = predictor.predict(arr_test[:2])

In [46]:
# Inspect the raw response: bytes holding one score per line, each newline-terminated.
result

b'0.08117855340242386\n0.9971675276756287\n'

In [74]:
# For a large data set, split the rows into chunks so each request stays small,
# then stitch the per-chunk predictions back together in input order.

N_CHUNKS = 10  # number of requests the rows are spread across

predictions = []
for arr in np.array_split(arr_test, N_CHUNKS):
    if len(arr) == 0:
        # array_split yields empty chunks when there are fewer rows than
        # chunks; there is nothing to score, so do not send an empty request.
        continue
    print(arr.shape)
    result = predictor.predict(arr)
    # The response is bytes with one score per line; splitlines() also copes
    # with a trailing newline or CRLF line endings.
    result = result.decode("utf-8").splitlines()
    prediction = [float(r) for r in result if r.strip()]
    predictions += prediction

# Every input row must have produced exactly one prediction; fail here rather
# than when the predictions are attached to the DataFrame.
assert len(predictions) == len(arr_test), (
    f"expected {len(arr_test)} predictions, got {len(predictions)}"
)


(24, 8)
(23, 8)
(23, 8)
(23, 8)
(23, 8)
(23, 8)
(23, 8)
(23, 8)
(23, 8)
(23, 8)


In [75]:
# Scratch check of the chunk sizes printed above: 23*10 undercounts by one
# because the first chunk holds 24 rows (24 + 9*23 = 231).
23*10

230

In [76]:
# Should equal the number of rows in arr_test: one prediction per input row.
len(predictions)

231

In [77]:
# First few raw scores parsed from the endpoint responses.
predictions[:5]

[0.08117855340242386,
 0.9971675276756287,
 0.015525220893323421,
 0.009723171591758728,
 0.9987726807594299]

In [78]:
def proba_to_class(arr_res,margin=.5):
    """Convert scores into binary class labels.

    Parameters
    ----------
    arr_res : iterable
        Scores to threshold. Each element is passed through ``float()``, so
        numeric strings are accepted as well as numbers.
    margin : float, default 0.5
        Decision threshold. A score greater than or equal to ``margin`` maps
        to class 1; anything lower maps to class 0.

    Returns
    -------
    list of int
        One 0/1 label per input score, in the same order.
    """
    return [1 if float(score) >= margin else 0 for score in arr_res]

In [79]:
# Last few raw scores, as a spot check on the tail of the list.
predictions[-5:]

[0.23487329483032227,
 0.9930289387702942,
 0.0006288688164204359,
 0.9792784452438354,
 0.996086597442627]

In [80]:
# Threshold the raw scores at proba_to_class's default margin (0.5) to get 0/1 labels.
binary_predictions = proba_to_class(predictions)

In [81]:
# First five predicted labels, to compare against the actual labels in df_all.
binary_predictions[:5]

[0, 1, 0, 0, 1]

In [86]:
# Actual labels for the first five rows, selected by column name
# ('diabetes_class' is the first column of df_all).
df_all['diabetes_class'].head(5)

0    0
1    1
2    0
3    0
4    1
Name: diabetes_class, dtype: int64

In [87]:
# Attach the predicted labels next to the actual ones.
# NOTE: this mutates df_all in place, so from here on df_all has one more column
# than when it was loaded; anything re-derived from df_all must account for that.
df_all['predicted_diabetes_class']=binary_predictions

In [89]:
# Side-by-side view of actual and predicted labels.
df_all[['diabetes_class','predicted_diabetes_class']]

Unnamed: 0,diabetes_class,predicted_diabetes_class
0,0,0
1,1,1
2,0,0
3,0,0
4,1,1
...,...,...
226,0,0
227,1,1
228,0,0
229,1,1


In [91]:
print('Confusion matrix - Actual versus Predicted')

# Rows are the actual class, columns are the predicted class.
pd.crosstab(df_all['diabetes_class'],df_all['predicted_diabetes_class'])

Confusion matrix - Actual versul Predicted


predicted_diabetes_class,0,1
diabetes_class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,137,15
1,20,59


In [92]:
# Per-class precision, recall and F1 for actual vs. predicted labels.
# (sklearn.metrics is imported with the other imports at the top of the notebook.)
print(metrics.classification_report(df_all['diabetes_class'], df_all['predicted_diabetes_class']))

              precision    recall  f1-score   support

           0       0.87      0.90      0.89       152
           1       0.80      0.75      0.77        79

    accuracy                           0.85       231
   macro avg       0.83      0.82      0.83       231
weighted avg       0.85      0.85      0.85       231

