In [1]:
# import libraries test 1
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   
import io
import time
import json
import sagemaker.amazon.common as smac
%matplotlib inline
# Region -> XGBoost container image URI (each region has its own XGBoost container).
containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
}

# Region name reported by a fresh boto3 session.
my_region = boto3.session.Session().region_name

# The lookup raises KeyError when my_region is not one of the four keys above.
print(
    "Success - the MySageMakerInstance is in the " + my_region
    + " region. You will use the " + containers[my_region]
    + " container for your SageMaker endpoint."
)

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [2]:
import sagemaker
from sagemaker import get_execution_role

# S3 bucket and key prefix used to build the S3 locations in the rest of the notebook.
bucket_name, prefix = 'demo-saeed', 'fraudcredit-pipeline-boto3'

# SageMaker SDK session (used later for the S3 upload).
sagemaker_session = sagemaker.Session()

# SageMaker-compatible execution role used by this Notebook Instance.
role = get_execution_role()

## Pre-processing
#### Pre-processing Model Training

In [3]:
!tar -czvf sklearn_fd_featurizer.tar.gz sklearn_fd_featurizer.py

src_path = sagemaker_session.upload_data(
    path='{}'.format('sklearn_fd_featurizer.tar.gz'), 
    bucket=bucket_name,
    key_prefix='{}/{}'.format(prefix, 'src_path'))
print(src_path)

sklearn_fd_featurizer.py
s3://demo-saeed/fraudcredit-pipeline-boto3/src_path/sklearn_fd_featurizer.tar.gz


In [4]:
import datetime

# Timestamp (YYYY-MM-DD-HH-MM-SS) used as the suffix of the resource names in this notebook.
# NOTE(review): this rebinds `time`, hiding the `time` module imported in the first cell.
time = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
training_job_name = 'fd-preprocess-model-training-job-' + time

sm = boto3.client('sagemaker')

# Single 'train' channel: everything under <prefix>/raw_train, fully replicated.
preprocess_train_channel = {
    'ChannelName': 'train',
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': 's3://{}/{}/raw_train'.format(bucket_name, prefix),
            'S3DataDistributionType': 'FullyReplicated',
        }
    },
}

# Submit the training job that runs sklearn_fd_featurizer.py in the scikit-learn container.
# NOTE(review): the image URI and 'sagemaker_region' are hard-coded to us-east-1.
resp = sm.create_training_job(
    TrainingJobName=training_job_name,
    RoleArn=role,
    AlgorithmSpecification={
        'TrainingImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3',
        'TrainingInputMode': 'File',
    },
    HyperParameters={
        # Entry-point script and the S3 location of the packaged source (src_path).
        'sagemaker_program': "sklearn_fd_featurizer.py",
        'sagemaker_submit_directory': src_path,
        'sagemaker_region': "us-east-1",
        'sagemaker_job_name': training_job_name,
    },
    InputDataConfig=[preprocess_train_channel],
    OutputDataConfig={
        'S3OutputPath': 's3://{}/{}/preprocessed-model'.format(bucket_name, prefix)
    },
    ResourceConfig={
        'InstanceType': 'ml.m4.xlarge',
        'InstanceCount': 1,
        'VolumeSizeInGB': 30,
    },
    StoppingCondition={'MaxRuntimeInSeconds': 600},
    Tags=[],
)

# Display the job name.
training_job_name

'fd-preprocess-model-training-job-2019-05-31-14-06-30'

In [6]:
sm = boto3.client('sagemaker')
myPreprocesssedModelName = 'fd-preprocessing-model-{}'.format(time)

# create_training_job only submits the job. Block until it has finished so that the model
# artifacts referenced below actually exist; the waiter raises if the job ends up Failed.
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=training_job_name)

try:
    # Describe the job once and reuse the response instead of issuing one API call per field.
    preprocess_job = sm.describe_training_job(TrainingJobName=training_job_name)
    preprocess_hyperparameters = preprocess_job['HyperParameters']

    # Register a model that reuses the training image, the trained artifacts and the
    # script settings of the pre-processing training job.
    sm.create_model(
        ModelName=myPreprocesssedModelName,
        PrimaryContainer={
            'Image': preprocess_job['AlgorithmSpecification']['TrainingImage'],
            'ModelDataUrl': preprocess_job['ModelArtifacts']['S3ModelArtifacts'],
            'Environment': {
                'SAGEMAKER_PROGRAM': preprocess_hyperparameters['sagemaker_program'],
                'SAGEMAKER_REGION': preprocess_hyperparameters['sagemaker_region'],
                'SAGEMAKER_SUBMIT_DIRECTORY': preprocess_hyperparameters['sagemaker_submit_directory'],
            },
        },
        ExecutionRoleArn=role
    )
except Exception as e:
    print(e)
    print('Unable to create model.')
    # Bare raise re-raises the same exception with its original traceback.
    raise

#### Pre-processing Transformation

In [7]:
# Display the name given to the pre-processing model.
myPreprocesssedModelName

'fd-preprocessing-model-2019-05-31-14-06-30'

In [9]:
sm = boto3.client('sagemaker')

# Batch-transform the raw training CSV with the pre-processing model and write the result
# under <prefix>/preprocessed_train/.
myTransformJobName='fd-TransformJob-train-{}'.format(time)
transformerS3OutputPath_train = 's3://{}/{}/preprocessed_train/'.format(bucket_name, prefix)
response = sm.create_transform_job(
    TransformJobName=myTransformJobName,
    ModelName = myPreprocesssedModelName,
    MaxConcurrentTransforms=1,
    MaxPayloadInMB=6,
    BatchStrategy='MultiRecord',

    TransformInput={
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': 's3://{}/{}/raw_train'.format(bucket_name, prefix)
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    },
    TransformOutput={
        'S3OutputPath': transformerS3OutputPath_train,
        'Accept': 'text/csv',
        'AssembleWith': 'Line'
    },
    TransformResources={
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    },
    Tags=[]
)
print(myTransformJobName)

# create_transform_job only submits the job. Block until it has finished so the cells that
# read from transformerS3OutputPath_train find the output; the waiter raises if the job fails.
sm.get_waiter('transform_job_completed_or_stopped').wait(TransformJobName=myTransformJobName)

# Describe the transformed output as a fully replicated CSV input.
s3_input_processed_train = sagemaker.session.s3_input(
    transformerS3OutputPath_train, 
    distribution='FullyReplicated',
    content_type='text/csv', 
    s3_data_type='S3Prefix')
print(s3_input_processed_train.config)

fd-TransformJob-train-2019-05-31-14-06-30
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://demo-saeed/fraudcredit-pipeline-boto3/preprocessed_train/'}}, 'ContentType': 'text/csv'}


In [None]:
# S3 URI of the train.csv.out object under the preprocessed_train prefix.
data_location = 's3://{}/{}/preprocessed_train/train.csv.out'.format(bucket_name, prefix)
data_location

In [None]:
# header=None: no row is treated as a header, so the columns get integer labels.
data = pd.read_csv(data_location, header=None)
# Preview the first five rows.
data.head(n=5)

In [10]:
sm = boto3.client('sagemaker')

# Batch-transform the raw validation CSV with the pre-processing model and write the result
# under <prefix>/preprocessed_validation/.
myTransformJobName='fd-TransformJob-validation-{}'.format(time)
transformerS3OutputPath_validation = 's3://{}/{}/preprocessed_validation/'.format(bucket_name, prefix)
response = sm.create_transform_job(
    TransformJobName=myTransformJobName,
    ModelName = myPreprocesssedModelName,
    MaxConcurrentTransforms=1,
    MaxPayloadInMB=6,
    BatchStrategy='MultiRecord',

    TransformInput={
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': 's3://{}/{}/raw_validation'.format(bucket_name, prefix)
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    },
    TransformOutput={
        'S3OutputPath': transformerS3OutputPath_validation,
        'Accept': 'text/csv',
        'AssembleWith': 'Line'
    },
    TransformResources={
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    },
    Tags=[]
)
print(myTransformJobName)

# create_transform_job only submits the job. Block until it has finished so the cells that
# read from transformerS3OutputPath_validation find the output; the waiter raises if the job fails.
sm.get_waiter('transform_job_completed_or_stopped').wait(TransformJobName=myTransformJobName)

# Describe the transformed output as a fully replicated CSV input.
s3_input_processed_validation = sagemaker.session.s3_input(
    transformerS3OutputPath_validation, 
    distribution='FullyReplicated',
    content_type='text/csv', 
    s3_data_type='S3Prefix')
print(s3_input_processed_validation.config)

fd-TransformJob-validation-2019-05-31-14-06-30
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://demo-saeed/fraudcredit-pipeline-boto3/preprocessed_validation/'}}, 'ContentType': 'text/csv'}


In [11]:
sm = boto3.client('sagemaker')

# Batch-transform the raw test CSV with the pre-processing model and write the result
# under <prefix>/preprocessed_test/.
myTransformJobName='fd-TransformJob-test-{}'.format(time)
transformerS3OutputPath_test = 's3://{}/{}/preprocessed_test/'.format(bucket_name, prefix)
response = sm.create_transform_job(
    TransformJobName=myTransformJobName,
    ModelName = myPreprocesssedModelName,
    MaxConcurrentTransforms=1,
    MaxPayloadInMB=6,
    BatchStrategy='MultiRecord',

    TransformInput={
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': 's3://{}/{}/raw_test'.format(bucket_name, prefix)
            }
        },
        'ContentType': 'text/csv',
        'CompressionType': 'None',
        'SplitType': 'Line'
    },
    TransformOutput={
        'S3OutputPath': transformerS3OutputPath_test,
        'Accept': 'text/csv',
        'AssembleWith': 'Line'
    },
    TransformResources={
        'InstanceType': 'ml.m5.4xlarge',
        'InstanceCount': 1
    },
    Tags=[]
)

print(myTransformJobName)

# create_transform_job only submits the job. Block until it has finished so the cells that
# read from transformerS3OutputPath_test find the output; the waiter raises if the job fails.
sm.get_waiter('transform_job_completed_or_stopped').wait(TransformJobName=myTransformJobName)

# Describe the transformed output as a fully replicated CSV input.
s3_input_processed_test = sagemaker.session.s3_input(
    transformerS3OutputPath_test, 
    distribution='FullyReplicated',
    content_type='text/csv', 
    s3_data_type='S3Prefix')
print(s3_input_processed_test.config)

fd-TransformJob-test-2019-05-31-14-06-30
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://demo-saeed/fraudcredit-pipeline-boto3/preprocessed_test/'}}, 'ContentType': 'text/csv'}


# Linear Learner Model

In [12]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the 'linear-learner' image URI for the region of a fresh boto3 session.
container = get_image_uri(
    boto3.Session().region_name,
    'linear-learner',
)
container

'382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1'

In [13]:
# S3 location passed to the linear-learner training job as its output path.
output_location = 's3://{bucket}/{key_prefix}/model-output'.format(
    bucket=bucket_name,
    key_prefix=prefix,
)
print('training artifacts will be uploaded to: {location}'.format(location=output_location))

training artifacts will be uploaded to: s3://demo-saeed/fraudcredit-pipeline-boto3/model-output


In [14]:
# Display the S3 prefix that receives the transformed training data.
transformerS3OutputPath_train

's3://demo-saeed/fraudcredit-pipeline-boto3/preprocessed_train/'

In [15]:
myModelTrainingJobName = 'fd-linear-TrainingJob-{}'.format(time)

try:
    # Submit the linear-learner training job with three CSV channels (train / validation / test),
    # each pointing at the S3 prefix written by the matching transform job.
    response = sm.create_training_job(
        TrainingJobName=myModelTrainingJobName,
        RoleArn=role,
        AlgorithmSpecification={
            'TrainingInputMode': 'File',
            'TrainingImage': container,
        },
        HyperParameters={
            'predictor_type': 'binary_classifier',
            'feature_dim': '30',
            'mini_batch_size': '200',
        },
        InputDataConfig=[
            {
                'ChannelName': channel_name,
                'ContentType': 'text/csv',
                'CompressionType': 'None',
                'DataSource': {
                    'S3DataSource': {
                        'S3DataType': 'S3Prefix',
                        'S3Uri': channel_uri,
                        'S3DataDistributionType': 'FullyReplicated',
                    }
                },
            }
            for channel_name, channel_uri in (
                ('train', transformerS3OutputPath_train),
                ('validation', transformerS3OutputPath_validation),
                ('test', transformerS3OutputPath_test),
            )
        ],
        OutputDataConfig={'S3OutputPath': output_location},
        ResourceConfig={
            'InstanceCount': 4,
            'InstanceType': 'ml.m4.2xlarge',
            'VolumeSizeInGB': 50,
        },
        StoppingCondition={'MaxRuntimeInSeconds': 86400},
    )
except Exception as e:
    # Report the failure, then propagate it so the cell still errors out.
    print(e)
    print('Unable to create training job.')
    raise e

In [17]:
sm = boto3.client('sagemaker')

# create_training_job only submits the job. Block until the linear-learner job has finished
# so its model artifacts exist; the waiter raises if the job ends up Failed.
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=myModelTrainingJobName)

try:
    # Register the trained linear-learner artifacts as a model served by the same image.
    sm.create_model(
        ModelName='fd-linear-model-{}'.format(time),
        PrimaryContainer={
            'Image': container,
            'ModelDataUrl': sm.describe_training_job( TrainingJobName=myModelTrainingJobName )['ModelArtifacts']['S3ModelArtifacts']
        },
        ExecutionRoleArn=role
    )
except Exception as e:
    print(e)
    print('Unable to create model.')
    # Bare raise re-raises the same exception with its original traceback.
    raise

# Model Pipeline

In [18]:
sm = boto3.client('sagemaker')
pipline_model_name = 'fd-pipline-model-{}'.format(time)

# Both training jobs must have finished before their artifacts can be referenced.
# The waiter returns immediately for a job that is already complete and raises if one failed.
training_job_waiter = sm.get_waiter('training_job_completed_or_stopped')
training_job_waiter.wait(TrainingJobName=training_job_name)
training_job_waiter.wait(TrainingJobName=myModelTrainingJobName)

# Describe each job once and reuse the responses instead of issuing one API call per field.
preprocess_job = sm.describe_training_job(TrainingJobName=training_job_name)
preprocess_hyperparameters = preprocess_job['HyperParameters']
linear_job = sm.describe_training_job(TrainingJobName=myModelTrainingJobName)

# Two-container model: the pre-processing container first, then the linear-learner container.
response = sm.create_model(
    ModelName=pipline_model_name,
    Containers=[
        {
            'Image': preprocess_job['AlgorithmSpecification']['TrainingImage'],
            'ModelDataUrl': preprocess_job['ModelArtifacts']['S3ModelArtifacts'],
            'Environment': {
                'SAGEMAKER_PROGRAM': preprocess_hyperparameters['sagemaker_program'],
                'SAGEMAKER_REGION': preprocess_hyperparameters['sagemaker_region'],
                'SAGEMAKER_SUBMIT_DIRECTORY': preprocess_hyperparameters['sagemaker_submit_directory'],
            },
        },
        {
            'Image': linear_job['AlgorithmSpecification']['TrainingImage'],
            'ModelDataUrl': linear_job['ModelArtifacts']['S3ModelArtifacts'],
        },
    ],
    ExecutionRoleArn=role
)
response

{'ModelArn': 'arn:aws:sagemaker:us-east-1:079329190341:model/fd-pipline-model-2019-05-31-14-06-30',
 'ResponseMetadata': {'RequestId': 'b91c43c4-4320-44f5-a430-e87214aa1354',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b91c43c4-4320-44f5-a430-e87214aa1354',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '98',
   'date': 'Fri, 31 May 2019 14:21:26 GMT'},
  'RetryAttempts': 0}}

In [19]:
# Display the name given to the two-container pipeline model.
pipline_model_name

'fd-pipline-model-2019-05-31-14-06-30'

In [20]:
myEndpointConfigName = 'fd-endpoint-conf-pipline-{}'.format(time)

try:
    # Endpoint configuration with a single 'prod' variant that serves the pipeline model
    # on one ml.c4.xlarge instance.
    prod_variant = {
        'VariantName': 'prod',
        'ModelName': pipline_model_name,
        'InstanceType': 'ml.c4.xlarge',
        'InitialInstanceCount': 1,
    }
    sm.create_endpoint_config(
        EndpointConfigName=myEndpointConfigName,
        ProductionVariants=[prod_variant],
    )
except Exception as e:
    # Report the failure, then propagate it so the cell still errors out.
    print(e)
    print('Unable to create endpoint configuration.')
    raise e

In [21]:
# Display the name given to the endpoint configuration.
myEndpointConfigName

'fd-endpoint-conf-pipline-2019-05-31-14-06-30'

In [22]:
myEndpointName = 'fd-scikit-est-model-pipe-inference'
try:
    # update_endpoint fails when the endpoint does not exist yet, so look the name up first
    # and create the endpoint on the first run. NameContains is a substring filter, hence
    # the exact-name comparison.
    matching_endpoints = sm.list_endpoints(NameContains=myEndpointName, MaxResults=100)['Endpoints']
    endpoint_exists = any(
        endpoint['EndpointName'] == myEndpointName for endpoint in matching_endpoints
    )
    if endpoint_exists:
        sm.update_endpoint(
            EndpointName=myEndpointName,
            EndpointConfigName=myEndpointConfigName
        )
    else:
        sm.create_endpoint(
            EndpointName=myEndpointName,
            EndpointConfigName=myEndpointConfigName
        )
except Exception as e:
    print(e)
    print('Unable to create or update endpoint.')
    # Bare raise re-raises the same exception with its original traceback.
    raise

# Creating or updating an endpoint is asynchronous. Block until it is InService so the
# inference cells talk to the new configuration; the waiter raises if the deployment fails.
sm.get_waiter('endpoint_in_service').wait(EndpointName=myEndpointName)

# Pipeline Inference

In [None]:
# S3 URI of the train.csv.out object under the preprocessed_train prefix.
data_location = 's3://{}/{}/preprocessed_train/train.csv.out'.format(bucket_name, prefix)

# header=None: no row is treated as a header, so the columns get integer labels.
df = pd.read_csv(data_location, header=None)
df.head()

In [None]:
# First row only, every column except column 0.
# NOTE(review): presumably column 0 holds the label — confirm against the featurizer output.
X_test = df.iloc[:1, 1:]
# Column names kept from the original cell, still disabled:
# X_test.columns =['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
#        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
#        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
X_test

In [None]:
# input_Data = ",".join( map( str, X_test.iloc[0] ) )
# input_Data 

In [None]:
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor

sess = sagemaker.Session()

# Rows to score, taken from X_test as a NumPy array.
payload = X_test.values
# NOTE(review): actual_rings is not read anywhere in the visible cells — confirm it can be dropped.
actual_rings = 10

# Client for the deployed endpoint: sends the payload as CSV and asks for a JSON response.
predictor = RealTimePredictor(
    endpoint=myEndpointName,
    sagemaker_session=sess,
    content_type=CONTENT_TYPE_CSV,
    serializer=csv_serializer,
    accept=CONTENT_TYPE_JSON,
)

# No deserializer is configured, so the raw response is printed as returned.
print(predictor.predict(payload))
