In [1]:
import sklearn
import pandas as pd
import numpy as np
import boto3
import pprint
import os
import time


import re

import sagemaker
from sagemaker import get_execution_role

In [2]:
sess = sagemaker.Session()

In [3]:
role = get_execution_role()
print(role)

arn:aws:iam::750253866451:role/sage_role


In [4]:
bucket_name = 'saurav-ml-sagemaker'

training_folder = r'ChurnModel/training/'
test_folder = r'ChurnModel/test/'
validation_folder = r'ChurnModel/validation/'

In [5]:
s3_model_output_location = r's3://{}/ChurnModel/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name,test_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name, validation_folder)

In [6]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)
print(s3_test_file_location)


s3://saurav-ml-sagemaker/ChurnModel/model
s3://saurav-ml-sagemaker/ChurnModel/training/
s3://saurav-ml-sagemaker/ChurnModel/validation/
s3://saurav-ml-sagemaker/ChurnModel/test/


In [7]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'linear-learner', 'latest')

In [8]:
linear_job = "Demo-linear" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print('Job Name is {}'.format(linear_job))

Job Name is Demo-linear2020-03-30-03-01-30


In [9]:
linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_training_file_location,
                    "S3DataDistributionType": "ShardedByS3Key"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None",
            "RecordWrapperType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_validation_file_location,
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "text/csv",
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }

    ],
    "OutputDataConfig": {
        "S3OutputPath": s3_model_output_location
    },
    "HyperParameters": {
                            "feature_dim": "13",
                            "mini_batch_size": "100",
                            "predictor_type": "binary_classifier",
                            "epochs": "10",
                            "num_models": "32",
                            "loss": "absolute_loss"
        
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60
    }
}

In [10]:
region = boto3.Session().region_name
sm = boto3.client('sagemaker')

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=linear_job)
if status == 'Failed':
    message = sm.describe_training_job(TrainingJobName=linear_job)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

InProgress


In [None]:
#HOST

In [None]:
#SETUP A MODEL

In [12]:
linear_hosting_container = {
    'Image': container,
    'ModelDataUrl': sm.describe_training_job(TrainingJobName=linear_job)['ModelArtifacts']['S3ModelArtifacts']
}

create_model_response = sm.create_model(
    ModelName=linear_job,
    ExecutionRoleArn=role,
    PrimaryContainer=linear_hosting_container)

print(create_model_response['ModelArn'])

arn:aws:sagemaker:us-east-1:750253866451:model/demo-linear2020-03-30-03-01-30


In [16]:
sm.describe_training_job(TrainingJobName=linear_job)['ModelArtifacts']['S3ModelArtifacts']

's3://saurav-ml-sagemaker/ChurnModel/model/Demo-linear2020-03-30-03-01-30/output/model.tar.gz'

In [None]:
#CREATE AN ENDPOINT CONFIG

In [17]:
linear_endpoint_config = 'DEMO-linear-endpoint-config-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[{
        'InstanceType': 'ml.m4.xlarge',
        'InitialInstanceCount': 1,
        'ModelName': linear_job,
        'VariantName': 'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

DEMO-linear-endpoint-config-2020-03-30-03-08-30
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:750253866451:endpoint-config/demo-linear-endpoint-config-2020-03-30-03-08-30


In [None]:
#CREATE AN ENDPOINT

In [18]:
%%time

linear_endpoint = 'DEMO-linear-endpoint-' + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint,
    EndpointConfigName=linear_endpoint_config)
print(create_endpoint_response['EndpointArn'])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Status: " + status)

sm.get_waiter('endpoint_in_service').wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

if status != 'InService':
    raise Exception('Endpoint creation did not succeed')

DEMO-linear-endpoint-202003300309
arn:aws:sagemaker:us-east-1:750253866451:endpoint/demo-linear-endpoint-202003300309
Status: Creating
Arn: arn:aws:sagemaker:us-east-1:750253866451:endpoint/demo-linear-endpoint-202003300309
Status: InService
CPU times: user 190 ms, sys: 33.2 ms, total: 223 ms
Wall time: 7min 31s


In [None]:
#PREDICT

In [19]:
prefix = "ChurnModel/test/"

data_key = prefix + 'test.csv'
data_location = 's3://{}/{}'.format(bucket_name, data_key)
test = pd.read_csv(data_location)

In [20]:
test.head()

Unnamed: 0,0,-0.5787359118285802,1.7427397119690895,1.0959875190286517,-1.0959875190286514,-0.4400359548576646,0.19816383219544578,-1.387537586562431,0.11735002143511637,-0.911583494040172,-1.5477679860172207,0.970242550937133,0.21653375188734358,-1.0028039309990795
0,0,-0.578736,1.74274,1.095988,-1.095988,2.063884,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,-1.002804
1,0,-0.578736,1.74274,-0.912419,0.912419,-1.588528,-1.422847,-0.695982,-1.225848,0.807737,0.646092,-1.03067,-0.412123,-1.002804
2,0,-0.578736,-0.573809,1.095988,-1.095988,-1.050496,-1.327494,-0.004426,-1.225848,0.807737,-1.547768,-1.03067,1.578357,0.997204
3,1,-0.578736,1.74274,1.095988,-1.095988,-1.45402,-0.087897,-0.350204,-1.225848,-0.911583,0.646092,-1.03067,0.327318,-1.002804
4,0,-0.578736,1.74274,1.095988,-1.095988,0.739496,-0.946079,1.378686,-1.225848,0.807737,0.646092,0.970243,0.729344,-1.002804


In [21]:
test_X = test.iloc[:,1:]

In [23]:
test_X.head()

Unnamed: 0,-0.5787359118285802,1.7427397119690895,1.0959875190286517,-1.0959875190286514,-0.4400359548576646,0.19816383219544578,-1.387537586562431,0.11735002143511637,-0.911583494040172,-1.5477679860172207,0.970242550937133,0.21653375188734358,-1.0028039309990795
0,-0.578736,1.74274,1.095988,-1.095988,2.063884,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,-1.002804
1,-0.578736,1.74274,-0.912419,0.912419,-1.588528,-1.422847,-0.695982,-1.225848,0.807737,0.646092,-1.03067,-0.412123,-1.002804
2,-0.578736,-0.573809,1.095988,-1.095988,-1.050496,-1.327494,-0.004426,-1.225848,0.807737,-1.547768,-1.03067,1.578357,0.997204
3,-0.578736,1.74274,1.095988,-1.095988,-1.45402,-0.087897,-0.350204,-1.225848,-0.911583,0.646092,-1.03067,0.327318,-1.002804
4,-0.578736,1.74274,1.095988,-1.095988,0.739496,-0.946079,1.378686,-1.225848,0.807737,0.646092,0.970243,0.729344,-1.002804


In [24]:
test_y = test.iloc[:,0]

In [25]:
test_y.head()

0    0
1    0
2    0
3    1
4    0
Name: 0, dtype: int64

In [38]:
train = pd.read_csv('train.csv')

In [39]:
train_X = train.iloc[:,1:]
train_y = train.iloc[:,0]

In [32]:
import io
import json
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=',', fmt='%g')
    return csv.getvalue().decode().rstrip()

In [33]:
runtime= boto3.client('runtime.sagemaker')

payload = np2csv(test_X)
response = runtime.invoke_endpoint(EndpointName=linear_endpoint,
                                   ContentType='text/csv',
                                   Body=payload)
result = json.loads(response['Body'].read().decode())
test_pred = np.array([r['score'] for r in result['predictions']])

In [36]:
test_pred

array([ 0.08512215, -0.04367315, -0.01518775, ..., -0.02131663,
       -0.01934228, -0.00701834])

In [40]:
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(np.abs(test_y - np.median(train_y))) ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear,3))

Test MAE Baseline : 0.207
Test MAE Linear: 0.236


In [41]:
test_pred_class = (test_pred > 0.5)+0;
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class))*100
baseline_accuracy = np.mean((test_y == test_pred_baseline))*100

print("Prediction Accuracy:", round(prediction_accuracy,1), "%")
print("Baseline Accuracy:", round(baseline_accuracy,1), "%")

Prediction Accuracy: 79.3 %
Baseline Accuracy: 79.3 %


In [42]:
test_pred_class

array([0, 0, 0, ..., 0, 0, 0])