In [1]:
import os

import boto3
import numpy as np
import pandas as pd
import sagemaker


In [2]:
# Column names are supplied explicitly as the integers 0..36 for both files.
column_names = list(range(37))
train = pd.read_csv('./fall_train.csv', names=column_names, index_col=False)
test = pd.read_csv('./fall_test.csv', names=column_names, index_col=False)



Training and Validation Set

    Target Variable as first column followed by input features
    Training, Validation files do not have a column header



In [3]:
# Column 0 is taken as the integer label; all remaining columns become
# float32 feature arrays.
train_labels = train.iloc[:, 0].values.astype("int")
test_labels = test.iloc[:, 0].values.astype("int")

train_features = train.iloc[:, 1:].values.astype("float32")
test_features = test.iloc[:, 1:].values.astype("float32")

Upload Data to S3


In [4]:
# Bucket name and object keys for the data files.
bucket_name = 'sagemaker-eu-central-1-19111535'
training_file_key = 'model_data/fall_train.csv'
validation_file_key = 'model_data/fall_validation.csv'
test_file_key = 'model_data/fall_test.csv'

# Full s3:// URIs: the model output prefix, then one URI per data file
# built from a shared "bucket/key" template.
s3_uri_template = r's3://{0}/{1}'
s3_model_output_location = r's3://{0}/model_data/model'.format(bucket_name)
s3_training_file_location = s3_uri_template.format(bucket_name, training_file_key)
s3_validation_file_location = s3_uri_template.format(bucket_name, validation_file_key)
s3_test_file_location = s3_uri_template.format(bucket_name, test_file_key)

In [5]:
# Show the resolved S3 locations, one per line.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep='\n',
)

s3://sagemaker-eu-central-1-19111535/model_data/model
s3://sagemaker-eu-central-1-19111535/model_data/fall_train.csv
s3://sagemaker-eu-central-1-19111535/model_data/fall_validation.csv
s3://sagemaker-eu-central-1-19111535/model_data/fall_test.csv


In [6]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3,
# and a file name is referred to as a key name.
# Files stored in S3 are automatically replicated across 3 different availability zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file to upload; opened in binary mode.
        bucket: Name of the destination S3 bucket.
        key: Object key (name) to store the file under.

    Returns:
        Whatever ``upload_fileobj`` returns for the upload.
    """
    with open(filename, 'rb') as source_file:
        target_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return target_object.upload_fileobj(source_file)

In [7]:
# Upload each local CSV file to its object key in the bucket.
for local_file, object_key in (
    ('fall_train.csv', training_file_key),
    ('fall_validation.csv', validation_file_key),
    ('fall_test.csv', test_file_key),
):
    write_to_s3(local_file, bucket_name, object_key)


Training Algorithm Docker Image

    AWS Maintains a separate image for every region and algorithm



In [8]:
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# Execution role: it carries the permissions needed to train and deploy
# models, and the SageMaker service is trusted to assume it.
role = get_execution_role()
print(role)

arn:aws:iam::043090642581:role/service-role/AmazonSageMaker-ExecutionRole-20191028T163679


In [9]:
# Find your region: the region name of a default boto3 session.
boto3.Session().region_name

'eu-central-1'

Build Model

In [10]:
# SageMaker session object; it is passed to the estimator as `sagemaker_session`.
sess = sagemaker.Session()

In [11]:
# Look up the algorithm container image for the current region, then build the
# estimator:
#   - how many instances to use for distributed training and what machine type
#   - where the trained model artifacts need to be stored
#   - optionally, a name for the training job via base_job_name
# Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
region = boto3.Session().region_name
container_path = get_image_uri(region, 'xgboost', repo_version='0.90-1')

estimator = sagemaker.estimator.Estimator(
    container_path,
    role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',  # 'ml.m5.4xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-fall-v1',
)

In [12]:
# Hyperparameters for the XGBoost training algorithm.
# XGBoost Training Parameter Reference:
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
#
# Earlier settings: max_depth=5,eta=0.1,subsample=0.7,num_round=150
#
# NOTE(review): "reg:linear" is a deprecated alias of "reg:squarederror" in
# XGBoost 0.90. The labels look like 0/1 classes (predictions are rounded and
# compared with 0 later on), so "binary:logistic" may be a better fit - confirm
# before changing, since it alters the trained model.
xgb_hyperparameters = {
    'max_depth': 6,
    'objective': "reg:linear",
    'eta': 0.12,
    'subsample': 0.73,
    'num_round': 200,
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [13]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'max_depth': 6,
 'objective': 'reg:linear',
 'eta': 0.12,
 'subsample': 0.73,
 'num_round': 200}

In [14]:
# Display the container image name held by the estimator.
estimator.image_name

'492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3'

Specify Training Data Location and Optionally, Validation Data Location

In [15]:
# Input channel definitions for the training and validation files.
# content type can be libsvm or csv for XGBoost
training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type="csv",
)
validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type="csv",
)

In [16]:
# Show the generated channel configuration for each input, one per line.
print(training_input_config.config, validation_input_config.config, sep='\n')

{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-eu-central-1-19111535/model_data/fall_train.csv'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-eu-central-1-19111535/model_data/fall_validation.csv'}}, 'ContentType': 'csv'}


Train the model

In [17]:
# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
data_channels = {
    'train': training_input_config,
    'validation': validation_input_config,
}
estimator.fit(data_channels)

2019-11-19 15:18:52 Starting - Starting the training job...
2019-11-19 15:18:56 Starting - Launching requested ML instances......
2019-11-19 15:19:55 Starting - Preparing the instances for training......
2019-11-19 15:21:06 Downloading - Downloading input data...
2019-11-19 15:21:35 Training - Downloading the training image..[31mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[31mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[31mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[31mINFO:root:Determined delimiter of CSV input is ','[0m
[31mINFO:root:Determined delimiter of CSV input is ','[0m
[31mINFO:root:Determined delimiter of CSV input is ','[0m
[31m[15:21:56] 559x36 matrix with 20124 entries loaded from /opt/ml/input/data/train?f

Deploy Model


In [18]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m5.large',#'ml.m5.4xlarge',
                             endpoint_name = 'xgboost-fall-v1')


-------------------------------------------------------------------------------------!

Run Predictions

In [19]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests are serialized as CSV. No deserializer is installed; the
# evaluation loop converts each response with float() itself.
predictor.serializer = csv_serializer
predictor.content_type = 'text/csv'
predictor.deserializer = None

In [20]:
# Evaluate the model on the test set by counting correct predictions per class:
#   rate_no_fall - samples correctly predicted as label 0
#   rate_fall    - samples correctly predicted as any non-zero label
# Every test sample is scored, starting from the first one (index 0); the
# previous loop started at index 1 and silently skipped it.
rate_fall = 0
rate_no_fall = 0
for features, label in zip(test_features, test_labels):
    # The endpoint response is converted to a float and rounded to the
    # nearest integer so it can be compared with the integer label.
    pred = round(float(predictor.predict([features])))
    if pred == label:
        if pred == 0:
            rate_no_fall += 1
        else:
            rate_fall += 1
        
    


In [21]:
# Correct non-zero-label predictions, correct zero-label predictions,
# total correct predictions, and the number of test samples (one per line).
print(
    rate_fall,
    rate_no_fall,
    rate_fall + rate_no_fall,
    len(test_features),
    sep='\n',
)


48
137
185
187
