# Fashion-MNIST PyTorch image classification w/ Tensorboard
Source
- https://tutorials.pytorch.kr/intermediate/tensorboard_tutorial.html
- https://github.com/aws/amazon-sagemaker-examples/blob/master/frameworks/pytorch/get_started_mnist_train.ipynb

## Initial setup

In [47]:
# install_needed = True
install_needed = False

In [48]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U 'sagemaker[local]'
    !{sys.executable} -m pip install -U sagemaker-experiments # SageMaker Experiments SDK 
    !{sys.executable} -m pip install -U sagemaker             # SageMaker Python SDK
    !/bin/bash ./local/local_mode_setup.sh
    IPython.Application.instance().kernel.do_shutdown(True)

## Prepare dataset

In [49]:
# imports
import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from time import strftime

In [50]:
# transforms
transform = transforms.Compose(
    [transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))])

# datasets
trainset = torchvision.datasets.FashionMNIST('./data',
    download=True,
    train=True,
    transform=transform)
testset = torchvision.datasets.FashionMNIST('./data',
    download=True,
    train=False,
    transform=transform)

## Set up the SageMaker environment

In [51]:
import os
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

from smexperiments.experiment import Experiment ### SM Experiment
from smexperiments.trial import Trial           ### SM Experiment

from sagemaker.debugger import TensorBoardOutputConfig ### For TensorBoard 

sagemaker_session = sagemaker.Session()

role = get_execution_role()

bucket = sagemaker_session.default_bucket()
prefix = "tensorboard_pytorch_fashion_mnist"
tensorboard_logs_path = "s3://{}/{}/logs".format(bucket, prefix) ### For TensorBoard
output_path = "s3://{}/{}/output".format(bucket, prefix)

print("Bucket: {}".format(bucket))
print("SageMaker ver: " + sagemaker.__version__)
print("Tensorboard log path: {}".format(tensorboard_logs_path))

Bucket: sagemaker-ap-northeast-2-889750940888
SageMaker ver: 2.70.0
Tensorboard log path: s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/logs


## Uploading the data to s3

In [52]:
!aws s3 cp ./data/FashionMNIST/raw s3://{bucket}/{prefix}/data --recursive

upload: data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/data/t10k-labels-idx1-ubyte.gz
upload: data/FashionMNIST/raw/t10k-labels-idx1-ubyte to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/data/t10k-labels-idx1-ubyte
upload: data/FashionMNIST/raw/train-labels-idx1-ubyte to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/data/train-labels-idx1-ubyte
upload: data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/data/train-labels-idx1-ubyte.gz
upload: data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/data/t10k-images-idx3-ubyte.gz
upload: data/FashionMNIST/raw/t10k-images-idx3-ubyte to s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/data/t10k-images-idx3-ubyte
upload: data

In [53]:
train_location = 's3://{}/{}/data'.format(bucket, prefix)
test_location = 's3://{}/{}/data'.format(bucket, prefix)

In [54]:
!aws s3 ls {train_location} --recursive

2021-12-10 03:21:06    7840016 tensorboard_pytorch_fashion_mnist/data/t10k-images-idx3-ubyte
2021-12-10 03:21:06    4422102 tensorboard_pytorch_fashion_mnist/data/t10k-images-idx3-ubyte.gz
2021-12-10 03:21:06      10008 tensorboard_pytorch_fashion_mnist/data/t10k-labels-idx1-ubyte
2021-12-10 03:21:06       5148 tensorboard_pytorch_fashion_mnist/data/t10k-labels-idx1-ubyte.gz
2021-12-10 03:21:06   47040016 tensorboard_pytorch_fashion_mnist/data/train-images-idx3-ubyte
2021-12-10 03:21:06   26421880 tensorboard_pytorch_fashion_mnist/data/train-images-idx3-ubyte.gz
2021-12-10 03:21:06      60008 tensorboard_pytorch_fashion_mnist/data/train-labels-idx1-ubyte
2021-12-10 03:21:06      29515 tensorboard_pytorch_fashion_mnist/data/train-labels-idx1-ubyte.gz


In [55]:
!aws s3 ls {test_location} --recursive

2021-12-10 03:21:06    7840016 tensorboard_pytorch_fashion_mnist/data/t10k-images-idx3-ubyte
2021-12-10 03:21:06    4422102 tensorboard_pytorch_fashion_mnist/data/t10k-images-idx3-ubyte.gz
2021-12-10 03:21:06      10008 tensorboard_pytorch_fashion_mnist/data/t10k-labels-idx1-ubyte
2021-12-10 03:21:06       5148 tensorboard_pytorch_fashion_mnist/data/t10k-labels-idx1-ubyte.gz
2021-12-10 03:21:06   47040016 tensorboard_pytorch_fashion_mnist/data/train-images-idx3-ubyte
2021-12-10 03:21:06   26421880 tensorboard_pytorch_fashion_mnist/data/train-images-idx3-ubyte.gz
2021-12-10 03:21:06      60008 tensorboard_pytorch_fashion_mnist/data/train-labels-idx1-ubyte
2021-12-10 03:21:06      29515 tensorboard_pytorch_fashion_mnist/data/train-labels-idx1-ubyte.gz


## Local mode training

In [56]:
# from sagemaker.local import LocalSession
# sagemaker_session = LocalSession()

In [57]:
from sagemaker.debugger import TensorBoardOutputConfig ### For TensorBoard 

# An error occurred (ValidationException) when calling the CreateTrainingJob operation:
# "LocalPath" of "TensorBoardOutputConfig" cannot start with the following reserved path: [/opt/ml, /tmp, /usr/local/nvidia]

tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=tensorboard_logs_path,
    container_local_output_path='/pytorch/tensors'
) 

In [58]:
hyperparameters_local = {"batch-size": 128,
                         "epochs": 1,
                         "learning-rate": 1e-3,
                         "log-interval": 100,
                         "tensorboard-logs-path": tensorboard_logs_path} # Not working in local mode

In [59]:
# set local_mode to be True if you want to run the training script
# on the machine that runs this notebook

local_mode = True

if local_mode:
    instance_type = "local"
else:
    instance_type = "ml.c5.xlarge"

est_local = PyTorch(
            entry_point="train.py",
            source_dir="code",  # directory of your training script
            role=role,
            framework_version="1.8.1",
            py_version="py3",
            instance_type=instance_type,
            instance_count=1,
            output_path=output_path,
            hyperparameters=hyperparameters_local,
            tensorboard_output_config=tensorboard_output_config
)

In [60]:
channels = {"training": train_location, "testing": test_location}
est_local.fit(inputs=channels)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: pytorch-training-2021-12-10-03-21-18-149
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-rs61j:
    command: train
    container_name: t8imoy5bnm-algo-1-rs61j
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/pytorch-training:1.8.1-cpu-py3
    networks:
      sagemaker-local:
        aliases:
        - algo-1-rs61j
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmp9q0ols2m/algo-1-rs61j/output/data:/opt/ml/output/data
    - /tmp/tmp9q0ols2m/algo-1-rs61j/input

Creating t8imoy5bnm-algo-1-rs61j ... 
Creating t8imoy5bnm-algo-1-rs61j ... done
Attaching to t8imoy5bnm-algo-1-rs61j
[36mt8imoy5bnm-algo-1-rs61j |[0m 2021-12-10 03:21:22,970 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mt8imoy5bnm-algo-1-rs61j |[0m 2021-12-10 03:21:22,972 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36mt8imoy5bnm-algo-1-rs61j |[0m 2021-12-10 03:21:22,982 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mt8imoy5bnm-algo-1-rs61j |[0m 2021-12-10 03:21:22,985 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mt8imoy5bnm-algo-1-rs61j |[0m 2021-12-10 03:21:23,125 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
[36mt8imoy5bnm-algo-1-rs61j |[0m /opt/conda/bin/python3.6 -m pip install -r requirements.txt
[36mt8imoy5bnm-algo-1-rs61j |[0m Collecting tensorboard<2.4
[36mt8i

## Managed training

### Experiments

In [61]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name,
                                          tags=[
                                              {
                                                  'Key': 'modelname',
                                                  'Value': 'fashion-mnist'
                                              },
                                          ])

In [62]:
def create_trial(experiment_name, i_type, i_cnt, spot):
    create_date = strftime("%m%d-%H%M%s")
    
    algo = 'dp'
    
    spot = 's' if spot else 'd'
    i_tag = 'test'
    
    if i_type == 'ml.p3.16xlarge':
        i_tag = 'p3'
    elif i_type == 'ml.p2.8xlarge':
        i_tag = 'p2'
    elif i_type == 'ml.p3dn.24xlarge':
        i_tag = 'p3dn'
    elif i_type == 'ml.p4d.24xlarge':
        i_tag = 'p4d'
    else:
        i_tag = 'others'
        
    trial = "-".join([i_tag,str(i_cnt),algo, spot])
       
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

### Debugger rules

In [63]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

rules = [
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

### Debugger Profiling

In [64]:
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500, framework_profile_params=FrameworkProfile(num_steps=10)
)

### Training environments

In [65]:
metric_definitions = [{'Name': 'average loss',
                       'Regex': 'Average loss: ([0-9\\.]+)'},
                      {'Name': 'accuracy',
                       'Regex': 'Accuracy: [0-9]+/[0-9]+, ([0-9\\.]+)'}]

In [66]:
type(metric_definitions)

list

In [67]:
hyperparameters = {"batch-size": 128,
                   "epochs": 100,
                   "learning-rate": 1e-3,
                   "log-interval": 100,
                   "tensorboard-logs-path": tensorboard_logs_path}

In [68]:
# set local_mode to be True if you want to run the training script
# on the machine that runs this notebook

local_mode = False

instance_count = 1

if local_mode:
    instance_type = "local"
else:
    instance_type = "ml.p3.2xlarge"

estimator = PyTorch(
            entry_point="train.py",
            source_dir="code",  # directory of your training script
            role=role,
            framework_version="1.8.1",
            py_version="py3",
            instance_type=instance_type,
            instance_count=instance_count,
            output_path=output_path,
            hyperparameters=hyperparameters,
            tensorboard_output_config=tensorboard_output_config,
            base_job_name='pytorch-tensorboard',
            metric_definitions=metric_definitions,
            profiler_config=profiler_config,
            rules=rules,
            disable_profiler=False # default: False
)

In [69]:
experiment_name = 'pytorch-tensorboard'
do_spot_training=False

create_experiment(experiment_name)
job_name = create_trial(experiment_name, instance_type, instance_count, do_spot_training)
job_name

'pytorch-tensorboard-others-1-dp-d-1210-03211639106518'

In [70]:
channels = {"training": train_location, "testing": test_location}
estimator.fit(inputs=channels,
              experiment_config={
                  'TrialName': job_name,
                  'TrialComponentDisplayName': job_name,
                },
              wait=False)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: pytorch-tensorboard-2021-12-10-03-22-01-596


In [None]:
job_name=estimator.latest_training_job.name
sagemaker_session.logs_for_job(job_name=job_name, wait=True)

2021-12-10 03:22:01 Starting - Starting the training job...
2021-12-10 03:22:17 Starting - Launching requested ML instancesLossNotDecreasing: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
......
2021-12-10 03:23:33 Starting - Preparing the instances for training.........
2021-12-10 03:25:03 Downloading - Downloading input data
2021-12-10 03:25:03 Training - Downloading the training image...........................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-12-10 03:29:25,856 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-12-10 03:29:25,878 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-12-10 03:29:25,886 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-12-10 03:29:26,297 sagemaker-training-toolkit INFO     Installing

### Download profiler report

In [38]:
rule_output_path = estimator.output_path + '/' + estimator.latest_training_job.job_name + "/rule-output"
rule_output_path

's3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output'

In [39]:
! aws s3 ls {rule_output_path} --recursive

2021-12-09 13:55:22     389321 tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-report.html
2021-12-09 13:55:21     239418 tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb
2021-12-09 13:55:17        191 tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json
2021-12-09 13:55:17      10527 tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
2021-12-09 13:55:17       1927 tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
2021-12-09 13:55:17        129 tensorboard_pytorch_fashion_mn

In [41]:
os.makedirs('./profiler', exist_ok=True)

In [42]:
! aws s3 cp {rule_output_path} ./profiler --recursive

download: s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json to profiler/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
download: s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json to profiler/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
download: s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/output/pytorch-tensorboard-2021-12-09-13-46-49-120/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json to profiler/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
download: s3://sagemaker-ap-northeast-2-889750940888/tensorboard_pytorch_fashion_mnist/output/pytorch-tensorbo

# Screenshots

![tensorboard](image/01.tensorboard.png)

![SM-experiments](image/02.SM-experiments.png)

![SM-debugger](image/03.SM-debugger.png)