# 0. 事前準備

あらかじめS3バケットとIAMロールを作成しておく。IAMロールには、SageMakerの全権限に加えて、S3およびECRへの書き込み権限を含める。

In [1]:
# Name of the S3 bucket used below for the uploaded dataset and the training outputs.
bucket = 'speg03-sagemaker'
# IAM role passed to the SageMaker estimators as `role=`.
role = 'AmazonSageMaker-ExecutionRole'

# 1. データセットの準備

In [2]:
import os

import numpy as np
from tensorflow.keras.datasets.cifar10 import load_data

In [3]:
# Download CIFAR-10 through Keras and unpack the (train, test) tuples.
(x_train, y_train), (x_test, y_test) = load_data()

# Create the output directories from Python rather than `!mkdir -p .../{train,val}`:
# brace expansion is a bash feature that /bin/sh is not required to support, and
# IPython itself treats `{...}` inside a `!` command as Python interpolation.
for split in ('train', 'val'):
    os.makedirs(f'./data/cifar10/{split}', exist_ok=True)

# Store each split as a single .npz archive with arrays `x` and `y`;
# the Keras test split is written out as the validation set.
np.savez('./data/cifar10/train/cifar10_train.npz', x=x_train, y=y_train)
np.savez('./data/cifar10/val/cifar10_val.npz', x=x_test, y=y_test)

# 2. ローカルモードで学習

In [4]:
import os
from sagemaker.tensorflow import TensorFlow

In [5]:
# Estimator configured with train_instance_type='local'.
# `limit_data_rate` is forwarded to src/train.py as a hyperparameter
# (presumably the fraction of the dataset to use -- confirm in train.py).
estimator = TensorFlow(
    entry_point='train.py',
    source_dir='src',
    role=role,
    framework_version='1.12.0',
    py_version='py3',
    train_instance_type='local',
    train_instance_count=1,
    hyperparameters={'limit_data_rate': 0.01},
    code_location=f's3://{bucket}/local_output',  # no trailing slash
    output_path=f's3://{bucket}/local_output/',   # with trailing slash
)

In [6]:
# Point both channels at the .npz directories on local disk, using absolute
# file:// URIs built from the notebook's working directory.
current_dir = os.path.abspath(os.curdir)
estimator.fit(
    inputs={
        'train': f'file://{current_dir}/data/cifar10/train',
        'validation': f'file://{current_dir}/data/cifar10/val',
    },
)

INFO:sagemaker:Creating training-job with name: sagemaker-tensorflow-scriptmode-2019-03-14-14-02-54-858


Creating tmpu0f2i2v4_algo-1-u2ppf_1 ... 
[1BAttaching to tmpu0f2i2v4_algo-1-u2ppf_1
[36malgo-1-u2ppf_1  |[0m 2019-03-14 14:03:00,858 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training
[36malgo-1-u2ppf_1  |[0m 2019-03-14 14:03:00,901 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-u2ppf_1  |[0m 2019-03-14 14:03:01,197 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-u2ppf_1  |[0m 2019-03-14 14:03:01,236 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-u2ppf_1  |[0m 2019-03-14 14:03:01,260 sagemaker-containers INFO     Invoking user script
[36malgo-1-u2ppf_1  |[0m 
[36malgo-1-u2ppf_1  |[0m Training Env:
[36malgo-1-u2ppf_1  |[0m 
[36malgo-1-u2ppf_1  |[0m {
[36malgo-1-u2ppf_1  |[0m     "additional_framework_parameters": {},
[36malgo-1-u2ppf_1  |[0m     "channel_input_dirs": {
[36malgo-1-u2ppf_1  |[0m         "tr

# 3. データセットのアップロード

In [7]:
from sagemaker import Session

In [8]:
# SageMaker session, created with default settings; used below to upload the dataset to S3.
session = Session()

In [9]:
# Upload the local training directory to the bucket under data/cifar10/train.
# The value returned by upload_data is shown as the cell output.
train_data = session.upload_data(
    './data/cifar10/train',
    bucket=bucket,
    key_prefix='data/cifar10/train',
)
train_data

's3://speg03-sagemaker/data/cifar10/train'

In [10]:
# Upload the local validation directory to the bucket under data/cifar10/val.
# The value returned by upload_data is shown as the cell output.
val_data = session.upload_data(
    './data/cifar10/val',
    bucket=bucket,
    key_prefix='data/cifar10/val',
)
val_data

's3://speg03-sagemaker/data/cifar10/val'

# 4. リモートで学習

In [11]:
# Estimator for the remote training job on an ml.p2.xlarge instance.
# This rebinds `estimator`, replacing the local-mode estimator defined earlier.
estimator = TensorFlow(
    role=role,
    source_dir='src',
    entry_point='train.py',
    train_instance_count=1,
    train_instance_type='ml.p2.xlarge',
    framework_version='1.12.0',
    py_version='py3',

    # Forwarded to src/train.py as hyperparameters.
    hyperparameters=dict(
        epoch=100,
        batch_size=128
    ),

    # Each regex captures the non-whitespace token following the metric label
    # (presumably from the Keras progress line printed by train.py -- confirm).
    # Raw strings keep `\S` as a regex token; in a plain string literal it is an
    # invalid escape sequence that Python warns about.
    metric_definitions=[
        dict(Name='train_loss', Regex=r'- loss: (\S+)'),
        dict(Name='train_acc', Regex=r'- acc: (\S+)'),
        dict(Name='val_loss', Regex=r'- val_loss: (\S+)'),
        dict(Name='val_acc', Regex=r'- val_acc: (\S+)')
    ],

    code_location=f's3://{bucket}/output',  # no trailing slash
    output_path=f's3://{bucket}/output/'    # with trailing slash
)

In [12]:
# Launch training on the uploaded S3 channels. wait=False is passed, so the call
# is not expected to block until the job finishes (confirm against the SDK docs).
estimator.fit(
    inputs={'train': train_data, 'validation': val_data},
    wait=False,
)

INFO:sagemaker:Creating training-job with name: sagemaker-tensorflow-scriptmode-2019-03-14-14-04-17-290
