In [1]:
import os
import ray
import logging
import hydra
import hydra.experimental
from hydra.utils import get_original_cwd

import numpy as np
import torch
from torchvision import datasets, transforms

# from torchfly_dev.training.trainer import Trainer
from torchfly_dev.training.trainer2 import Trainer2
from torchfly_dev.utils import configure_logging

from model import CNNNet
from dataloader import get_data_loader

# Logger for this notebook, named after the current module via ``__name__``.
logger = logging.getLogger(__name__)


In [2]:
# Initialise Hydra against the "config" directory, with strict mode turned off.
hydra.experimental.initialize(config_dir="config", strict=False)

# Compose the configuration starting from config.yaml (this pulls in the
# defaults it lists) and print the resulting tree for inspection.
config = hydra.experimental.compose(config_file="config.yaml")
print(config.pretty())

logging:
  color: true
  steps_interval: -1
  level: DEBUG
  seconds_interval: 2
saving:
  steps_interval: -1
  keep_checkpoint_every_num_seconds: 3600
  num_checkpoints_to_keep: 2
  resume_mode: true
  seconds_interval: 2
training:
  batch_size: 32
  fp16: true
  fp16_opt_level: O1
  gradient_accumulation_steps: 1
  learning_rate: 1.0e-05
  max_grad_norm: 1.0
  num_gpus_per_node: 1
  optimizer: AdamW
  random_seed: 1
  total_num_epochs: 10
  total_num_steps: 10



In [3]:
# Set up logging from the composed config before any training code runs.
# NOTE(review): presumably driven by the `logging:` section of the config —
# confirm against torchfly_dev.utils.configure_logging.
configure_logging(config)

In [4]:
# Build the train/validation loaders and the model from the composed config.
train_loader, val_loader = get_data_loader(config)
model = CNNNet(config)

# Hand the config, model and both loaders to the trainer, then start training.
trainer = Trainer2(
    config=config,
    model=model,
    train_loader=train_loader,
    validation_loader=val_loader,
)
trainer.train()

[2020-03-09 22:32:42,996][torchfly_dev.training.trainer2][INFO] - logging:
  color: true
  steps_interval: -1
  level: DEBUG
  seconds_interval: 2
saving:
  steps_interval: -1
  keep_checkpoint_every_num_seconds: 3600
  num_checkpoints_to_keep: 2
  resume_mode: true
  save_dir: /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints
  seconds_interval: 2
training:
  batch_size: 32
  fp16: true
  fp16_opt_level: O1
  gradient_accumulation_steps: 1
  learning_rate: 1.0e-05
  max_grad_norm: 1.0
  num_gpus_per_node: 1
  optimizer: AdamW
  random_seed: 1
  total_num_epochs: 10
  total_num_steps: 10
  validation_steps_interval: 1874

[2020-03-09 22:32:42,999][torchfly_dev.training.trainer2][INFO] - Restoring the latest checkpoint
[2020-03-09 22:32:43,012][torchfly_dev.training.checkpointer][INFO] - Loading checkpoint /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_3780_state.pth


2020-03-09 22:32:43,020	INFO resource_spec.py:212 -- Starting Ray with 6.79 GiB memory available for workers and up to 3.4 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


[2020-03-09 22:32:43,544][torchfly_dev.training.trainer2][INFO] - {'node_ip_address': '192.168.50.105', 'redis_address': '192.168.50.105:30528', 'object_store_address': '/tmp/ray/session_2020-03-09_22-32-43_016659_15777/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2020-03-09_22-32-43_016659_15777/sockets/raylet', 'webui_url': None, 'session_dir': '/tmp/ray/session_2020-03-09_22-32-43_016659_15777'}
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_func

[2020-03-09 22:33:17,375][torchfly_dev.training.checkpointer][DEBUG] - Removing /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_6794_state.pth!
[2020-03-09 22:33:18,933][torchfly_dev.training.trainer2][INFO] - Train Epoch: [4/10] [95.3600%]    Loss: 0.249221
[2020-03-09 22:33:19,368][torchfly_dev.training.checkpointer][DEBUG] - Waiting for history job to finish!
[2020-03-09 22:33:19,379][torchfly_dev.training.checkpointer][DEBUG] - Removing /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_7007_state.pth!
[2020-03-09 22:33:21,285][torchfly_dev.training.trainer2][INFO] - Validation Accuracy 92.72
[2020-03-09 22:33:21,304][torchfly_dev.training.trainer2][INFO] - Train Epoch: [4/10] [99.7867%]    Loss: 0.258159
[2020-03-09 22:33:21,337][torchfly_dev.training.trainer2][INFO] - Epoch duration: 0:00:18.402624
[2020-03-09 22:33:21,343][torchfly_dev.training.trainer2][INFO] - Epoch 5/10
[2020-03-09 22:33:21,369][to

[2020-03-09 22:33:55,999][torchfly_dev.training.checkpointer][DEBUG] - Waiting for history job to finish!
[2020-03-09 22:33:56,010][torchfly_dev.training.checkpointer][DEBUG] - Removing /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_10701_state.pth!
[2020-03-09 22:33:56,026][torchfly_dev.training.trainer2][INFO] - Train Epoch: [6/10] [94.9333%]    Loss: 0.246989
[2020-03-09 22:33:58,332][torchfly_dev.training.trainer2][INFO] - Validation Accuracy 93.86
[2020-03-09 22:33:58,347][torchfly_dev.training.checkpointer][DEBUG] - Waiting for history job to finish!
[2020-03-09 22:33:58,360][torchfly_dev.training.checkpointer][DEBUG] - Removing /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_10930_state.pth!
[2020-03-09 22:33:58,366][torchfly_dev.training.trainer2][INFO] - Train Epoch: [6/10] [99.6800%]    Loss: 0.261812
[2020-03-09 22:33:58,418][torchfly_dev.training.trainer2][INFO] - Epoch duration: 0:00:18.3967

[2020-03-09 22:34:32,481][torchfly_dev.training.checkpointer][DEBUG] - Removing /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_14482_state.pth!
[2020-03-09 22:34:32,507][torchfly_dev.training.trainer2][INFO] - Train Epoch: [8/10] [96.7467%]    Loss: 0.196909
[2020-03-09 22:34:34,572][torchfly_dev.training.trainer2][INFO] - Validation Accuracy 94.81
[2020-03-09 22:34:34,588][torchfly_dev.training.checkpointer][DEBUG] - Waiting for history job to finish!
[2020-03-09 22:34:34,600][torchfly_dev.training.checkpointer][DEBUG] - Removing /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_14713_state.pth!
[2020-03-09 22:34:34,607][torchfly_dev.training.trainer2][INFO] - Train Epoch: [8/10] [99.5733%]    Loss: 0.187791
[2020-03-09 22:34:34,674][torchfly_dev.training.trainer2][INFO] - Epoch duration: 0:00:18.136659
[2020-03-09 22:34:34,678][torchfly_dev.training.trainer2][INFO] - Epoch 9/10
[2020-03-09 22:34:36,596][

[2020-03-09 22:35:08,706][torchfly_dev.training.trainer2][INFO] - Train Epoch: [10/10] [92.4267%]    Loss: 0.167272
[2020-03-09 22:35:11,565][torchfly_dev.training.trainer2][INFO] - Validation Accuracy 95.42
[2020-03-09 22:35:11,584][torchfly_dev.training.checkpointer][DEBUG] - Waiting for history job to finish!
[2020-03-09 22:35:11,595][torchfly_dev.training.checkpointer][DEBUG] - Removing /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_18381_state.pth!
[2020-03-09 22:35:11,601][torchfly_dev.training.trainer2][INFO] - Train Epoch: [10/10] [99.4667%]    Loss: 0.172042
[2020-03-09 22:35:11,683][torchfly_dev.training.trainer2][INFO] - Epoch duration: 0:00:19.163649


{}

In [None]:
@hydra.main(config_path="config/config.yaml", strict=False)
def main(config=None):
    """Build the data loaders, model and trainer from ``config`` and train.

    ``config`` is expected to be supplied by the ``hydra.main`` decorator,
    which is pointed at ``config/config.yaml``; it defaults to ``None`` only
    so that ``main()`` can be called without arguments.
    """
    # Data loaders and model are both constructed from the same config object.
    train_loader, val_loader = get_data_loader(config)
    model = CNNNet(config)

    # Wire everything into the trainer and run the training loop.
    trainer = Trainer2(
        config=config,
        model=model,
        train_loader=train_loader,
        validation_loader=val_loader,
    )
    trainer.train()


# Entry-point guard: call main() only when this code runs as the top-level
# module. In a notebook kernel ``__name__`` is "__main__", so executing this
# cell starts training immediately.
if __name__ == "__main__":
    main()


[2020-03-09 22:26:51,686][torchfly_dev.training.trainer2][INFO] - logging:
  color: true
  steps_interval: -1
  level: DEBUG
  seconds_interval: 2
saving:
  steps_interval: -1
  keep_checkpoint_every_num_seconds: 3600
  num_checkpoints_to_keep: 2
  resume_mode: true
  save_dir: /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints
  seconds_interval: 2
training:
  batch_size: 32
  fp16: true
  fp16_opt_level: O1
  gradient_accumulation_steps: 1
  learning_rate: 1.0e-05
  max_grad_norm: 1.0
  num_gpus_per_node: 1
  optimizer: AdamW
  random_seed: 1
  total_num_epochs: 10
  total_num_steps: 10
  validation_steps_interval: 1874

[2020-03-09 22:26:51,690][torchfly_dev.training.trainer2][INFO] - Restoring the latest checkpoint
[2020-03-09 22:26:51,704][torchfly_dev.training.checkpointer][INFO] - Loading checkpoint /home/yuheng/Desktop/final-project-torchfly/examples/TestTorchfly/MNIST/Checkpoints/iter_18719_state.pth


2020-03-09 22:26:51,714	INFO resource_spec.py:212 -- Starting Ray with 7.47 GiB memory available for workers and up to 3.76 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


[2020-03-09 22:26:52,186][torchfly_dev.training.trainer2][INFO] - {'node_ip_address': '192.168.50.105', 'redis_address': '192.168.50.105:52861', 'object_store_address': '/tmp/ray/session_2020-03-09_22-26-51_710253_14629/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2020-03-09_22-26-51_710253_14629/sockets/raylet', 'webui_url': None, 'session_dir': '/tmp/ray/session_2020-03-09_22-26-51_710253_14629'}
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_func