## Training a Wide ResNet with Dask Classifier

In [1]:
from dask.distributed import Client

In [2]:
import os

# The relative ./exp-dask/... paths used further down are resolved against the
# experiments checkout, so switch into it first. The location can be
# overridden with the ADADAMP_EXPERIMENTS_DIR environment variable; when it is
# unset the original hard-coded path is used, so existing setups are unchanged.
os.chdir(os.environ.get('ADADAMP_EXPERIMENTS_DIR', '/home/ubuntu/adadamp-experiments'))

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import csv
from adadamp.adadamp import DaskClassifier, DaskClassifierIncreasingLR

In [3]:
# training client
from dask.distributed import Client

def _prep():
    """Import distributed's torch protocol module in whichever process runs this.

    The imported name is not used; the import is performed only for its
    side effects (presumably registering torch serialization support with
    distributed -- TODO confirm).
    """
    from distributed.protocol import torch

# processes=False: the client summary printed by this cell reports an
# inproc:// scheduler with a single worker.
client = Client(processes=False)
# Execute _prep through the client so the import above also happens on the
# worker side.
client.run(_prep)
# Bare expression: shows the client summary as the cell output.
client

0,1
Client  Scheduler: inproc://172.31.40.124/24831/1  Dashboard: http://172.31.40.124:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 16.48 GB


In [4]:
# Wide_ResNet is defined in the project-local `model` module.
from model import Wide_ResNet

# Send that module's source file to the workers through the client
# (presumably so Wide_ResNet can be imported there as well -- TODO confirm).
client.upload_file("./exp-dask/model.py")

In [5]:
# CIFAR-10 loading, following the PyTorch tutorial:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# Each image is converted to a tensor and then normalized with mean 0.5 and
# std 0.5 on each of the three channels.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Both splits are stored under the same root and share the transform above.
train_set = torchvision.datasets.CIFAR10(
    root='./exp-dask/data',
    train=True,
    download=True,
    transform=transform,
)
test_set = torchvision.datasets.CIFAR10(
    root='./exp-dask/data',
    train=False,
    download=True,
    transform=transform,
)

# Human-readable class names; their count is what the model cell uses
# (module__num_classes=len(classes)).
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
# Model for experiment 1:
#    "Decaying learning rate" follows the original implementation:
#    the batch size is constant, while the learning rate repeatedly
#    decays by a factor of 5 at a sequence of steps.
# As I understand it, this is the "control" experiment: neither the number
# of workers nor the batch size is touched.
# NOTE(review): the description above is about a *decaying* learning rate,
# but the estimator used is DaskClassifierIncreasingLR -- confirm this is the
# intended class for this experiment.

# Use the first CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = DaskClassifierIncreasingLR(
    # network: Wide ResNet 16x4 with dropout, one output per class
    module=Wide_ResNet,
    module__depth=16,
    module__widen_factor=4,
    module__dropout_rate=0.3,
    module__num_classes=len(classes),
    # objective
    loss=nn.CrossEntropyLoss,
    # optimizer: SGD with Nesterov momentum and weight decay
    optimizer=torch.optim.SGD,
    optimizer__lr=0.1,
    optimizer__momentum=0.9,
    optimizer__nesterov=True,
    optimizer__weight_decay=0.5e-3,
    # training schedule and placement
    batch_size=128,
    max_epochs=200,
    device=device,
    grads_per_worker=128,
    client=client
)

In [None]:
# Kept from the original cell; nothing visible in this notebook reads it.
args = (model, train_set, test_set)
# TODO: pass the test set to training as well.
# The statement that used to sit here, `_, test_y = test_sest`, referenced an
# undefined name (typo of `test_set`) and raised NameError before fit() could
# run. `test_y` was never used, so the statement is dropped rather than renamed.
model.fit(train_set)

| Wide-Resnet 16x4
Initial Learning Rate: 0.1
Epoch: 1 (acc: 0.09953999519348145)
Epoch: 2 (acc: 0.09959999471902847)
Epoch: 3 (acc: 0.09953999519348145)
Epoch: 4 (acc: 0.09963999688625336)
Epoch: 5 (acc: 0.09963999688625336)
Epoch: 6 (acc: 0.09978000074625015)
Epoch: 7 (acc: 0.09971999377012253)
Epoch: 8 (acc: 0.09933999925851822)
Epoch: 9 (acc: 0.09947999566793442)
Epoch: 10 (acc: 0.09953999519348145)
Epoch: 11 (acc: 0.09950000047683716)
Epoch: 12 (acc: 0.09947999566793442)
Epoch: 13 (acc: 0.09939999878406525)
Epoch: 14 (acc: 0.09961999952793121)
Epoch: 15 (acc: 0.09961999952793121)
Epoch: 16 (acc: 0.09941999614238739)
Epoch: 17 (acc: 0.09957999736070633)
Epoch: 18 (acc: 0.09963999688625336)
Epoch: 19 (acc: 0.09950000047683716)
Epoch: 20 (acc: 0.09950000047683716)
Epoch: 21 (acc: 0.09963999688625336)
Epoch: 22 (acc: 0.09973999857902527)
Epoch: 23 (acc: 0.09969999641180038)
Epoch: 24 (acc: 0.09959999471902847)
Epoch: 25 (acc: 0.09953999519348145)
Epoch: 26 (acc: 0.09953999519348145)
E

In [None]:
# Records collected on the model in its `curr_metas` attribute
# (presumably one dict per training epoch -- TODO confirm).
toCSV = model.curr_metas

# Check before opening the file: mode 'w' truncates the target, so failing on
# toCSV[0] afterwards would leave an empty file in place of earlier results.
if len(toCSV) == 0:
    raise ValueError("model.curr_metas is empty; nothing to write to CSV")

# newline='' is what the csv module expects for files it writes to.
with open('./exp-dask/exp1-decreaseingLR-const-workers-v0.csv', 'w', encoding='utf8', newline='') as output_file:
    # Column names are taken from the keys of the first record.
    fc = csv.DictWriter(output_file, fieldnames=toCSV[0].keys())
    fc.writeheader()
    fc.writerows(toCSV)