In [1]:
%%bash
# Stage the Covertype archive at /tmp/covtype/raw/covtype.data.gz.
# The cell is safe to re-run: an already staged archive is left alone, a copy
# in the working directory is moved into place, and only when neither exists
# is the archive downloaded.
set -euo pipefail

raw_dir=/tmp/covtype/raw
archive="$raw_dir/covtype.data.gz"
mkdir -p "$raw_dir"

if [ -f "$archive" ]; then
    echo "Using existing $archive"
elif [ -f covtype.data.gz ]; then
    mv covtype.data.gz "$archive"
else
    # Download to a temporary name first so an interrupted transfer never
    # leaves a truncated file where the loader expects the archive.
    wget -O "$archive.part" 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
    mv "$archive.part" "$archive"
fi

In [35]:
import numpy as np
import os

# Where the raw archive lives and where processed train/test CSVs would go.
data_dir = "/tmp/covtype/"
processed_subdir = "standardized"
raw_data_file = os.path.join(data_dir, "raw", "covtype.data.gz")
(train_features_file,
 train_labels_file,
 test_features_file,
 test_labels_file) = (
    os.path.join(data_dir, processed_subdir, relative_path)
    for relative_path in (
        "train/csv/features.csv",
        "train/csv/labels.csv",
        "test/csv/features.csv",
        "test/csv/labels.csv",
    )
)

# Load the comma-separated raw table into a single 2-D array.
print("Reading raw data from {}".format(raw_data_file))
raw = np.loadtxt(raw_data_file, delimiter=',')

# Shuffle the rows in place with a fixed seed, then take the first 90% of
# rows for training and the remainder for testing. The last column holds the
# label; every other column is a feature.
np.random.seed(0)
np.random.shuffle(raw)
train_size = int(0.9 * len(raw))
train_features, train_labels = raw[:train_size, :-1], raw[:train_size, -1]
test_features, test_labels = raw[train_size:, :-1], raw[train_size:, -1]

Reading raw data from /tmp/covtype/raw/covtype.data.gz


In [3]:
# Number of training rows (length of the first axis of train_features).
print('size - ', len(train_features))

size -  2091643


In [36]:
import numpy as np
import os
import io
import sagemaker.amazon.common as smac 

import boto3
import os
import sagemaker 
import matplotlib.pyplot as plt

import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker.amazon.amazon_estimator import get_image_uri


In [165]:
# Instance type used for the training job and passed to the deployment helper.
# Alternatives tried previously: 'ml.p3.8xlarge', 'ml.p2.xlarge'.
instance_type = 'ml.m4.xlarge'
# Instance count read by both the training and the deployment helper.
gpu_train_instance_count = 1

# Hyperparameters forwarded to the k-NN estimator.
hyperparams = dict(
    feature_dim=54,
    k=5,
    sample_size=200000,
    predictor_type='classifier',
)

In [159]:
def trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path, s3_test_data=None):
    """
    Create a k-NN Estimator from the given hyperparams and fit it to the training data.

    The instance count and instance type are read from the module-level
    ``gpu_train_instance_count`` and ``instance_type`` variables.

    Args:
        s3_train_data: location of the training data, passed to ``fit`` as the
            'train' channel.
        hyperparams: dict forwarded to ``set_hyperparameters`` as keyword arguments.
        output_path: output location handed to the Estimator.
        s3_test_data: optional location of test data; when not None it is passed
            to ``fit`` as the 'test' channel.

    Returns:
        The fitted Estimator. Nothing is deployed here; use
        ``predictor_from_estimator`` to obtain a predictor.
    """
    # set up the estimator
    knn = sagemaker.estimator.Estimator(
        get_image_uri(boto3.Session().region_name, "knn"),
        get_execution_role(),
        train_instance_count=gpu_train_instance_count,
        train_instance_type=instance_type,
        output_path=output_path,
        sagemaker_session=sagemaker.Session())
    knn.set_hyperparameters(**hyperparams)

    # train a model. fit_input maps channel names to data locations; the
    # 'test' channel is only included when test data was supplied
    fit_input = {'train': s3_train_data}
    if s3_test_data is not None:
        fit_input['test'] = s3_test_data
    knn.fit(fit_input)
    return knn


def predictor_from_estimator(knn_estimator, estimator_name, instance_type, endpoint_name=None):
    """
    Deploy ``knn_estimator`` and return the resulting predictor, configured
    with a 'text/csv' content type, the CSV serializer and the JSON deserializer.

    ``estimator_name`` is accepted but not used by this function. The initial
    instance count is read from the module-level ``gpu_train_instance_count``.
    """
    deploy_options = {
        'initial_instance_count': gpu_train_instance_count,
        'instance_type': instance_type,
        'endpoint_name': endpoint_name,
    }
    predictor = knn_estimator.deploy(**deploy_options)

    # Requests are sent as CSV; responses are parsed as JSON.
    predictor.content_type = 'text/csv'
    predictor.serializer = csv_serializer
    predictor.deserializer = json_deserializer
    return predictor

In [160]:
import io
import sagemaker.amazon.common as smac

# Report the shapes of the arrays about to be serialized.
print('train_features shape = ', train_features.shape)
print('train_labels shape = ', train_labels.shape)

# Serialize the training features and labels into an in-memory byte buffer
# using the SageMaker helper imported as `smac`.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, train_features, train_labels)
# Rewind the buffer so the upload in the next cell reads it from the start.
# As the last expression of the cell, the new position (0) is displayed.
buf.seek(0)

train_features shape =  (2091643, 54)
train_labels shape =  (2091643,)


0

In [161]:
import boto3
import os
import sagemaker

bucket = sagemaker.Session().default_bucket() # modify to your bucket name
prefix = 'knn-blog-2019-02-22'
key = 'recordio-pb-data'

# S3 object keys are '/'-separated regardless of the local OS, so build the
# key explicitly instead of with os.path.join, and derive the URI from the
# same key so the upload target and the reported location cannot drift apart.
train_key = '/'.join([prefix, 'train', key])
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket, train_key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-us-west-2-346891358411/knn-blog-2019-02-22/train/recordio-pb-data


In [162]:
# Report the shapes of the arrays about to be serialized.
print('test_features shape = ', test_features.shape)
print('test_labels shape = ', test_labels.shape)

# Serialize the test features and labels into a fresh in-memory buffer and
# rewind it so the upload reads from the start.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, test_features, test_labels)
buf.seek(0)

# S3 object keys are '/'-separated regardless of the local OS, so build the
# key explicitly instead of with os.path.join, and derive the URI from the
# same key so the upload target and the reported location cannot drift apart.
test_key = '/'.join([prefix, 'test', key])
boto3.resource('s3').Bucket(bucket).Object(test_key).upload_fileobj(buf)
s3_test_data = 's3://{}/{}'.format(bucket, test_key)
print('uploaded test data location: {}'.format(s3_test_data))

test_features shape =  (232405, 54)
test_labels shape =  (232405,)
uploaded test data location: s3://sagemaker-us-west-2-346891358411/knn-blog-2019-02-22/test/recordio-pb-data


In [168]:
# Model artifacts are written under <bucket>/<prefix>/default_example/output.
output_path = ''.join(('s3://', bucket, '/', prefix, '/default_example/output'))

# Fit the estimator on the uploaded training data, with the uploaded test
# data supplied as the optional test channel.
knn_estimator = trained_estimator_from_hyperparams(
    s3_train_data,
    hyperparams,
    output_path,
    s3_test_data=s3_test_data,
)

INFO:sagemaker:Creating training-job with name: knn-2019-02-24-01-14-33-194


2019-02-24 01:14:33 Starting - Starting the training job...
2019-02-24 01:14:34 Starting - Launching requested ML instances......
2019-02-24 01:15:39 Starting - Preparing the instances for training......
2019-02-24 01:16:34 Downloading - Downloading input data...
2019-02-24 01:17:12 Training - Downloading the training image..
[31mDocker entrypoint called with argument(s): train[0m
[31m[02/24/2019 01:17:44 INFO 140635007137600] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'index_metric': u'L2', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'faiss_index_ivf_nlists': u'auto', u'epochs': u'1', u'index_type': u'faiss.Flat', u'_faiss_index_nprobe': u'5', u'_kvstore': u'dist_async', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000'}[0m
[31m[02/24/2019 01:17:44 INFO 140635007137600] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'sample_size': u'


2019-02-24 01:17:41 Training - Training image download completed. Training in progress.[31m[02/24/2019 01:17:54 INFO 140635007137600] push reservoir to kv... 1 num_workers 0 rank[0m
[31m[02/24/2019 01:17:54 INFO 140635007137600] ...done (200000)[0m
[31m[02/24/2019 01:17:54 INFO 140635007137600] #progress_metric: host=algo-1, completed 100 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 419, "sum": 419.0, "min": 419}, "Number of Batches Since Last Reset": {"count": 1, "max": 419, "sum": 419.0, "min": 419}, "Number of Records Since Last Reset": {"count": 1, "max": 2091643, "sum": 2091643.0, "min": 2091643}, "Total Batches Seen": {"count": 1, "max": 419, "sum": 419.0, "min": 419}, "Total Records Seen": {"count": 1, "max": 2091643, "sum": 2091643.0, "min": 2091643}, "Max Records Seen Between Resets": {"count": 1, "max": 2091643, "sum": 2091643.0, "min": 2091643}, "Reset Count": {"count": 1, "max": 1, "sum": 1.0, "min": 1}}, "EndTime":

In [169]:
# Training jobs from earlier experiments, kept for reference:
#job_name = 'knn-2019-02-22-07-10-08-640' #k = 15 GPU - 1
#job_name = 'knn-2019-02-22-08-05-17-194' #k = 5  GPU - 1
#job_name = 'knn-2019-02-22-08-48-28-391'  #k = 10  GPU - 1
#job_name = 'knn-2019-02-22-14-38-53-090'# k = 10 GPU - 4
#job_name = 'knn-2019-02-22-20-56-36-167'# k = 5 GPU - 4
#job_name = 'knn-2019-02-22-18-20-14-286'# k = 15 GPU - 4
#job_name = 'knn-2019-02-22-17-08-40-401'# k = 5 GPU - 8
#job_name = 'knn-2019-02-22-17-26-51-801'# k = 10 GPU - 8
#job_name = 'knn-2019-02-22-17-48-58-072'# k = 10 GPU - 8
# NOTE(review): this name is hard-coded; update it to the job created by the
# training cell when re-running, otherwise the timing describes an older job.
job_name = 'knn-2019-02-24-01-14-33-194' #k =5 mlm4large

sagemaker_client = boto3.client('sagemaker')
# Describe the job once and read both timestamps from the same response
# instead of issuing two identical API calls.
job_description = sagemaker_client.describe_training_job(TrainingJobName=job_name)
start_train_time = job_description['TrainingStartTime']
end_train_time = job_description['TrainingEndTime']
run_time = end_train_time - start_train_time
print('time required for training the model %d data point: %s' % (train_features.shape[0], run_time))

time required for training the model 2091643 data point: 0:02:55.571000


In [None]:
import time 
model_name = 'knn_%s'% instance_type
# Derive the endpoint name from the configured instance type so the name stays
# truthful when instance_type changes. Dots are replaced with hyphens in both
# the instance type and the timestamp suffix (which keeps names unique per run).
endpoint_name = 'knn-%s-%s' % (instance_type.replace('.', '-'),
                               str(time.time()).replace('.', '-'))
print('setting up the endpoint..')
predictor = predictor_from_estimator(knn_estimator, model_name, instance_type, endpoint_name=endpoint_name)

In [155]:
# Split the test set into 100 near-equal batches so each request stays small.
batches = np.array_split(test_features, 100)
print('data split into 100 batches, of size %d.' % batches[0].shape[0])

# obtain an np array with the predictions for the entire test set
start_time = time.time()
predictions = []
for batch in batches:
    result = predictor.predict(batch)
    # each entry of result['predictions'] carries the label under 'predicted_label'
    cur_predictions = np.array([entry['predicted_label'] for entry in result['predictions']])
    predictions.append(cur_predictions)
predictions = np.concatenate(predictions)
run_time = time.time() - start_time

test_size = test_labels.shape[0]
# Guard the elementwise comparison below: a length mismatch would otherwise
# not produce a per-row comparison and the accuracy would be meaningless.
assert predictions.shape[0] == test_size, \
    'expected %d predictions, got %d' % (test_size, predictions.shape[0])
# Vectorized count of matches (the builtin sum iterates element by element).
num_correct = np.sum(predictions == test_labels)
accuracy = num_correct / float(test_size)
print('time required for predicting %d data point: %.2f seconds' % (test_size, run_time))
print('accuracy of model: %.1f%%' % (accuracy * 100) )

data split into 100 batches, of size 2325.
time required for predicting 232405 data point: 17.59 seconds
accuracy of model: 90.8%


In [106]:
def delete_endpoint(predictor):
    """
    Delete the endpoint named by ``predictor.endpoint``, best-effort.

    A service error (for example because the endpoint no longer exists) is
    reported together with its message and does not raise. Anything else,
    including KeyboardInterrupt, propagates to the caller.
    """
    client = boto3.client('sagemaker')
    try:
        client.delete_endpoint(EndpointName=predictor.endpoint)
        print('Deleted {}'.format(predictor.endpoint))
    except client.exceptions.ClientError as error:
        # Include the service message so that an already-deleted endpoint can
        # be told apart from other failures such as permissions or throttling.
        print('Could not delete {}: {}'.format(predictor.endpoint, error))

# Remove the hosted endpoint referenced by `predictor` now that it is no longer needed.
delete_endpoint(predictor)

Deleted knn-ml-m4-xlarge-1550858202-8549948


In [151]:
# Sanity check of the batching: split the test features into 100 pieces and
# show the shape of the first piece followed by the shape of the full array.
batches = np.array_split(test_features, 100)
print(batches[0].shape, test_features.shape, sep='\n')

(2325, 54)
(232405, 54)


In [134]:
# Show the hyperparameters currently defined in the kernel.
print(hyperparams)

{'feature_dim': 54, 'k': 5, 'sample_size': 200000, 'predictor_type': 'classifier'}


In [135]:
# Show the instance type currently defined in the kernel.
# NOTE(review): the saved output of this cell differs from the value assigned
# in the configuration cell ('ml.m4.xlarge'); it presumably comes from an
# earlier run — re-run the notebook top to bottom to refresh it.
print(instance_type)

ml.p2.xlarge


In [136]:
# Show the instance count currently defined in the kernel.
# NOTE(review): the saved output of this cell differs from the value assigned
# in the configuration cell (1); it presumably comes from an earlier run —
# re-run the notebook top to bottom to refresh it.
print(gpu_train_instance_count)

4
