In [56]:
import boto3
import sagemaker

region = boto3.Session().region_name

session = sagemaker.Session()
bucket = 'sagemaker-titanic'
prefix = 'autopilot'

role = sagemaker.get_execution_role()
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [57]:
import os
import pandas as pd

data = pd.read_csv('s3://{}/data/train.csv'.format(bucket))
test_data = pd.read_csv('s3://{}/data/test.csv'.format(bucket))

In [58]:
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [59]:
from sklearn.model_selection import train_test_split

train_data, valid_data = train_test_split(data, test_size=0.2, random_state=0)

In [60]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [61]:
target_colname = ['Survived']
feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

train_data = train_data[target_colname + feature_names]
valid_data = valid_data[target_colname + feature_names]
test_data = test_data[feature_names]

valid_data_no_target = valid_data.drop(columns=target_colname)
valid_target = valid_data[target_colname]

In [62]:
s3 = boto3.resource('s3')

train_file = 'train_data.csv'
train_data.to_csv(train_file, index=False, header=True)
s3.Bucket(bucket).upload_file(train_file, os.path.join(prefix, 'train', train_file))

valid_file = 'validation_data.csv'
valid_data_no_target.to_csv(valid_file, index=False, header=True)
s3.Bucket(bucket).upload_file(valid_file, os.path.join(prefix, 'validation', valid_file))

test_file = 'test_data.csv'
test_data.to_csv(test_file, index=False, header=True)
s3.Bucket(bucket).upload_file(test_file, os.path.join(prefix, 'test', test_file))

In [63]:
train_data_s3_path = 's3://{}/{}/train/{}'.format(bucket, prefix, train_file)
valid_data_s3_path = 's3://{}/{}/validation/{}'.format(bucket, prefix, valid_file)
test_data_s3_path = 's3://{}/{}/test/{}'.format(bucket, prefix, test_file)

In [64]:
input_data_config = [{
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix', 
            'S3Uri': 's3://{}/{}/train/{}'.format(bucket, prefix, train_file)
        }
    }, 
    'TargetAttributeName': 'Survived'
}]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket, prefix)
}

In [69]:
from time import gmtime, strftime, sleep, time
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-titanic-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

AutoMLJobName: automl-titanic-29-12-11-18


In [70]:
sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name, 
                      InputDataConfig=input_data_config, 
                      OutputDataConfig=output_data_config, 
                      RoleArn=role)

start_time = time()

print('JobStatus - Secondary Status - Elapsed Time')
print('--------------------------------------------------')

describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print(describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']

    print(describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'] + " - " + str(time() - start_time))
    sleep(30)

JobStatus - Secondary Status - Elapsed Time
--------------------------------------------------
InProgress - Starting
InProgress - Starting - 0.12175917625427246
InProgress - AnalyzingData - 30.269807815551758
InProgress - AnalyzingData - 60.37604832649231
InProgress - AnalyzingData - 90.49548625946045
InProgress - AnalyzingData - 120.60664916038513
InProgress - AnalyzingData - 150.68300104141235
InProgress - AnalyzingData - 180.78416442871094
InProgress - AnalyzingData - 210.91369891166687
InProgress - AnalyzingData - 241.0105264186859
InProgress - AnalyzingData - 271.14640831947327
InProgress - AnalyzingData - 301.33691334724426
InProgress - AnalyzingData - 331.4434859752655
InProgress - AnalyzingData - 361.5741968154907
InProgress - AnalyzingData - 391.70083951950073
InProgress - AnalyzingData - 421.8382956981659
InProgress - AnalyzingData - 451.9442024230957
InProgress - AnalyzingData - 482.05213260650635
InProgress - AnalyzingData - 512.1750540733337
InProgress - AnalyzingData - 54

In [71]:
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

CandidateName: tuning-job-1-16fbd3b9746b4b8893-213-6d4157e2
FinalAutoMLJobObjectiveMetricName: validation:f1
FinalAutoMLJobObjectiveMetricValue: 0.8558499813079834


In [72]:
from pprint import pprint
pprint(best_candidate)

{'CandidateName': 'tuning-job-1-16fbd3b9746b4b8893-213-6d4157e2',
 'CandidateStatus': 'Completed',
 'CandidateSteps': [{'CandidateStepArn': 'arn:aws:sagemaker:ap-northeast-1:667022276241:processing-job/db-1-05a9df0c86b346ef970248180f00a2f4939f2caad046463daa5cac4a96',
                     'CandidateStepName': 'db-1-05a9df0c86b346ef970248180f00a2f4939f2caad046463daa5cac4a96',
                     'CandidateStepType': 'AWS::SageMaker::ProcessingJob'},
                    {'CandidateStepArn': 'arn:aws:sagemaker:ap-northeast-1:667022276241:training-job/automl-tit-dpp5-1-2034ead343384d7db114d30bbbbd559156d2dcfe20964',
                     'CandidateStepName': 'automl-tit-dpp5-1-2034ead343384d7db114d30bbbbd559156d2dcfe20964',
                     'CandidateStepType': 'AWS::SageMaker::TrainingJob'},
                    {'CandidateStepArn': 'arn:aws:sagemaker:ap-northeast-1:667022276241:transform-job/automl-tit-dpp5-rpb-1-993f9c362ba54d8e950bebdf66edfa4b2515f21e5',
                     'Candida

In [73]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, SortBy='FinalObjectiveMetricValue')['Candidates']
for index, candidate in enumerate(candidates):
    print (str(index+1) + "  " + candidate['CandidateName'] + "  " + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))

1  tuning-job-1-16fbd3b9746b4b8893-213-6d4157e2  0.8558499813079834
2  tuning-job-1-16fbd3b9746b4b8893-069-501bc3d9  0.8489000201225281
3  tuning-job-1-16fbd3b9746b4b8893-226-0c62c38d  0.8476099967956543
4  tuning-job-1-16fbd3b9746b4b8893-174-6ff21d9f  0.8462399840354919
5  tuning-job-1-16fbd3b9746b4b8893-190-78ec06a3  0.8462399840354919
6  tuning-job-1-16fbd3b9746b4b8893-160-bf232ced  0.8447700142860413
7  tuning-job-1-16fbd3b9746b4b8893-237-6254dd97  0.8406800031661987
8  tuning-job-1-16fbd3b9746b4b8893-074-ace60724  0.8392800092697144
9  tuning-job-1-16fbd3b9746b4b8893-102-356d2e76  0.8377900123596191
10  tuning-job-1-16fbd3b9746b4b8893-093-63302ddc  0.8377900123596191


In [74]:
model_name = 'automl-titanic-model-' + timestamp_suffix
model = sm.create_model(Containers=best_candidate['InferenceContainers'],
                        ModelName=model_name, 
                        ExecutionRoleArn=role)
print('Model ARN corresponding to the best candidate is : {}'.format(model['ModelArn']))

Model ARN corresponding to the best candidate is : arn:aws:sagemaker:ap-northeast-1:667022276241:model/automl-titanic-model-29-12-11-18


In [75]:
pprint(best_candidate['InferenceContainers'])

[{'Environment': {'AUTOML_SPARSE_ENCODE_RECORDIO_PROTOBUF': '1',
                  'AUTOML_TRANSFORM_MODE': 'feature-transform',
                  'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'application/x-recordio-protobuf',
                  'SAGEMAKER_PROGRAM': 'sagemaker_serve',
                  'SAGEMAKER_SUBMIT_DIRECTORY': '/opt/ml/model/code'},
  'Image': '354813040037.dkr.ecr.ap-northeast-1.amazonaws.com/sagemaker-sklearn-automl:0.2-1-cpu-py3',
  'ModelDataUrl': 's3://sagemaker-titanic/autopilot/output/automl-titanic-29-12-11-18/data-processor-models/automl-tit-dpp5-1-2034ead343384d7db114d30bbbbd559156d2dcfe20964/output/model.tar.gz'},
 {'Environment': {'MAX_CONTENT_LENGTH': '20971520',
                  'SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT': 'text/csv',
                  'SAGEMAKER_INFERENCE_OUTPUT': 'predicted_label',
                  'SAGEMAKER_INFERENCE_SUPPORTED': 'predicted_label,probability,probabilities'},
  'Image': '354813040037.dkr.ecr.ap-northeast-1.amazonaws.com/sag

In [76]:
# timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
transform_job_name = 'automl-titanic-transform-' + timestamp_suffix

transform_input = {
    'DataSource': {
        'S3DataSource': {
            'S3DataType': 'S3Prefix',
            'S3Uri': valid_data_s3_path
        }
    },
    'ContentType': 'text/csv',
    'CompressionType': 'None',
    'SplitType': 'Line'
}

transform_output = {
    'S3OutputPath': 's3://{}/{}/inference-results'.format(bucket, prefix),
}

transform_resources = {
    'InstanceType': 'ml.m5.4xlarge',
    'InstanceCount': 1
}

sm.create_transform_job(
    TransformJobName = transform_job_name,
    ModelName = model_name, 
    TransformInput = transform_input,
    TransformOutput = transform_output,
    TransformResources = transform_resources
)

{'TransformJobArn': 'arn:aws:sagemaker:ap-northeast-1:667022276241:transform-job/automl-titanic-transform-29-12-11-18',
 'ResponseMetadata': {'RequestId': 'e74488a7-cba6-4f1a-9172-4a05b962b968',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e74488a7-cba6-4f1a-9172-4a05b962b968',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '118',
   'date': 'Fri, 29 Jan 2021 13:27:27 GMT'},
  'RetryAttempts': 0}}

In [77]:
s3_output_key = '{}/inference-results/validation_data.csv.out'.format(prefix)
local_inference_results_path = 'inference_results.csv'

s3 = boto3.resource('s3')
inference_results_bucket = s3.Bucket(bucket)
inference_results_bucket.download_file(s3_output_key, local_inference_results_path);

valid_pred = pd.read_csv(local_inference_results_path, names=['predicted_label'])
valid_pred[1:]

Unnamed: 0,predicted_label
1,0
2,0
3,0
4,1
5,1
...,...
175,1
176,0
177,1
178,0


In [81]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(valid_target['Survived'], valid_pred[1:]['predicted_label'])

In [82]:
acc

0.776536312849162

In [80]:
valid_target

Unnamed: 0,Survived
495,0
648,0
278,0
31,1
255,1
...,...
780,1
837,0
215,1
833,0
