Let's begin with import of required packages

In [2]:
#CONFIGURATION STEP

import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference
import boto3
import re
from sagemaker import get_execution_role

Get an read HIGGS dataset. Split it to train/test

In [4]:
!wget https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/higgs_head_2M.csv
    
dataset = pd.read_csv('./higgs_head_2M.csv',header=None)
print(dataset.head())

train = dataset.head(1000000)
test = dataset.tail(1000000)

--2018-10-04 11:13:09--  https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/higgs_head_2M.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.82.108
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.82.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1461001471 (1.4G) [text/plain]
Saving to: ‘higgs_head_2M.csv.1’


2018-10-04 11:14:33 (16.6 MB/s) - ‘higgs_head_2M.csv.1’ saved [1461001471/1461001471]

    0         1         2         3         4         5         6         7   \
0  1.0  0.869293 -0.635082  0.225690  0.327470 -0.689993  0.754202 -0.248573   
1  1.0  0.907542  0.329147  0.359412  1.497970 -0.313010  1.095531 -0.557525   
2  1.0  0.798835  1.470639 -1.635975  0.453773  0.425629  1.104875  1.282322   
3  0.0  1.344385 -0.876626  0.935913  1.992050  0.882454  1.786066 -1.646778   
4  1.0  1.105009  0.321356  1.522401  0.882808 -1.205349  0.681466 -1.070464   

         8         9     ...           19        20        21

In [12]:
!rm train.csv
!rm validation.csv

Load train/test data into separate CSV files

In [13]:
train.to_csv('higgs_train.csv', index=None, header=None)
test.to_csv('higgs_validation.csv', index=None, header=None)

Upload files to S3

In [14]:
bucket = 'okremnyo'
prefix = 'sagemaker/DEMO-xgboost-dm'

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'higgs_train/train.csv')).upload_file('higgs_train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'higgs_validation/validation.csv')).upload_file('higgs_validation.csv')

In [15]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'xgboost')

s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/higgs_train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/higgs_validation/'.format(bucket, prefix), content_type='csv')

Run the SageMaker Xgboost algorithm

In [17]:
sess = sagemaker.Session()
role = get_execution_role()

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.c5.18xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)
xgb.set_hyperparameters(max_depth=6,
                        eta=0.3,
                        silent=0,
                        objective='binary:logistic',
                        num_round=50)

xgb.fit({'train': s3_input_train})

INFO:sagemaker:Creating training-job with name: xgboost-2018-10-04-11-42-29-329


...............
[31mArguments: train[0m
[31m[2018-10-04:11:44:53:INFO] Running standalone xgboost training.[0m
[31m[2018-10-04:11:44:53:INFO] Path /opt/ml/input/data/validation does not exist![0m
[31m[2018-10-04:11:44:53:INFO] File size need to be processed in the node: 482.3mb. Available memory size in the node: 132119.0mb[0m
[31m[2018-10-04:11:44:53:INFO] Determined delimiter of CSV input is ','[0m
[31m[11:44:53] S3DistributionType set as FullyReplicated[0m
[31m[11:44:53] 1000000x28 matrix with 28000000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[11:44:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6[0m
[31m[0]#011train-error:0.319795[0m
[31m[11:44:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6[0m
[31m[1]#011train-error:0.311113[0m
[31m[11:44:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots


Billable seconds: 41


Run the prediction

In [18]:
xgb_predictor = xgb.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2018-10-04-11-48-22-811
INFO:sagemaker:Creating endpoint with name xgboost-2018-10-04-11-42-29-329


--------------------------------------------------------------!

In [19]:
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

Get the predictions

In [20]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

predictions = predict(test.drop(0, axis=1).as_matrix())

Get the SageMaker Xgboost classification report

In [21]:
from sklearn.metrics import accuracy_score
print('Accuracy = ', accuracy_score(test[0], np.round(predictions)))

from sklearn.metrics import classification_report
print(classification_report(test[0], np.round(predictions)))

Accuracy =  0.733854
             precision    recall  f1-score   support

        0.0       0.72      0.71      0.71    470855
        1.0       0.74      0.76      0.75    529145

avg / total       0.73      0.73      0.73   1000000

