## Training a Wide ResNet with Dask Classifier

- Rerun LR decrease starting with 0.05
- Rerun BS increase starting with 0.05
- Run 2nd/3rd hybrid exps

Make a simple dist.py that reads from a CSV.
- Subclass the Dask classifier to return results based on the training CSV
- Sleep during partial_fit; override it to sleep on different workers
- Override score and _get_grads (to sleep)
--- Function called 

In [1]:
from dask.distributed import Client

In [2]:
import os
os.chdir('/Users/joeholt/Developer/next-lab/adadamp-experiments/')

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import csv
from copy import copy
from adadamp.adadamp import DaskClassifier, DaskClassifierExpiriments

In [3]:
# training client
from dask.distributed import Client
from dask.distributed import performance_report

def _prep():
    """Import ``distributed.protocol.torch`` in the process that runs this.

    The function is handed to ``client.run`` so that the import happens on the
    Dask workers. Presumably this registers distributed's torch serializers
    there -- confirm against the distributed documentation.
    """
    import distributed.protocol.torch  # noqa: F401 -- imported only for its side effects

# processes=False asks Client to run its local workers as threads of this
# process rather than as separate processes.
client = Client(processes=False)
# Run _prep on the workers so each of them imports distributed.protocol.torch.
client.run(_prep)
# Bare expression: displays the client summary as the notebook cell output.
client

0,1
Client  Scheduler: inproc://192.168.0.103/56690/1  Dashboard: http://192.168.0.103:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 17.18 GB


In [4]:
# Show the working directory to confirm the earlier os.chdir call took effect;
# the relative ./exp-dask/... paths used in this notebook depend on it.
os.getcwd()

'/Users/joeholt/Developer/next-lab/adadamp-experiments'

In [5]:
# Network class used as `module` in the classifier arguments.
# NOTE(review): presumably this resolves to ./exp-dask/model.py (the file
# uploaded below) -- confirm that directory is on sys.path.
from model import Wide_ResNet

# Send model.py to the Dask workers so they can import it as well.
client.upload_file("./exp-dask/model.py")

In [5]:
# Training-time preprocessing: random 32x32 crop after 4 pixels of padding and
# a random horizontal flip, then tensor conversion and per-channel normalization.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # (mean, std) per channel -- presumably the CIFAR-10 statistics; confirm
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Evaluation-time preprocessing: the same tensor conversion and normalization
# as above, without the random crop/flip augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

In [6]:
# CIFAR-10 train and test splits stored under ./exp-dask/data (download=True
# fetches them when missing); each split gets its own transform pipeline.
train_set = torchvision.datasets.CIFAR10(root='./exp-dask/data', train=True, download=True, transform=transform_train)
test_set = torchvision.datasets.CIFAR10(root='./exp-dask/data', train=False, download=True, transform=transform_test)
# Human-readable class names; in the cells shown here only len(classes) is used
# (as module__num_classes).
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Model for experiment 1:
#    "Decaying learning rate" follows the original implementation;
#     the batch size is constant, while the learning rate repeatedly
#     decays by a factor of 5 at a sequence of steps.
# My understanding is that this is the "control" experiment, where we are not touching the number of
# workers nor the batch size.
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda:0"
# Keyword arguments shared by every DaskClassifierExpiriments(**args) call below.
# NOTE(review): the module__* / optimizer__* names look like prefixed
# constructor arguments for Wide_ResNet and SGD -- confirm against adadamp.
args = dict(
    module=Wide_ResNet,
    module__depth=16,
    module__widen_factor=4,
    module__dropout_rate=0.3,
    module__num_classes=len(classes),
    loss=nn.CrossEntropyLoss,
    optimizer=torch.optim.SGD,
    optimizer__lr=0.1,  # the experiments below call set_lr(0.05) at epoch 0
    optimizer__momentum=0.9,
    optimizer__nesterov=True,
    optimizer__weight_decay=0.5e-3,
    batch_size=128,  # the experiments below call set_bs(128) at epoch 0
    max_epochs=200,
    device=device,
    grads_per_worker=128,
    client=client
)

In [8]:
# v0 - broken dist.py, LR 1
# v1 - fixed dist.py, LR 1
# v2 - LR 0.02
# v3 - LR 0.02, decaying learning rate, fixed data load

def write_stats(hist, exp, epoch):
    """
    Write a training history to ./exp-dask/stats/<exp>/exp-<epoch>.csv

    Parameters:
    hist: list of dicts, one per epoch; every dict becomes one CSV row
    exp: experiment name, used as the sub-directory under ./exp-dask/stats
    epoch: label placed in the file name (e.g. 'ep20' or 'final')

    The output directory is created when it does not exist yet. The columns
    are the union of the keys of all rows, in first-seen order; a row that
    lacks a column gets an empty cell. An empty (or None) history writes
    nothing and returns None.
    """
    if not hist:
        # Nothing to write; indexing hist[0] for the header would fail here.
        return

    path = './exp-dask/stats/{}/exp-{}.csv'.format(exp, epoch)
    # A fresh experiment name has no directory yet; open() would otherwise
    # raise FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Collect the header from every row, not only the first: DictWriter raises
    # ValueError for a row holding a key that is missing from fieldnames.
    fieldnames = list(dict.fromkeys(key for row in hist for key in row))

    with open(path, 'w', encoding='utf8', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(hist)

In [9]:
def test(epoch, model, test_set):
    """
    tests if model meets certain testing standards:
    - by 10th epoch, accuracy is over 70%
    """
    if epoch == 10:
        print("[TEST] Testing accuracy for 10th epoch is over 70%")
        score = model.score(test_set)
        acc = model._meta['score__acc']
        if acc < 0.70:
            print("[TEST] Test failed with {} accuracy".format(acc))
            return False
        else:
            print("[TEST] Test passed with {} accuracy".format(acc))
    
    return True

In [10]:
def train(model, train_set, test_set, n_epochs=200, epoch_sched=(), lr_sched=(), bs_sched=(), log_interval=1, exp='increasing-bs'):
    """
    Train based on experiment params

    Parameters:
    model: classifier exposing set_lr, set_bs, partial_fit, score, get_params, meta_ and _meta
    train_set: dataset passed to model.partial_fit once per epoch
    test_set: dataset passed to model.score after every epoch
    n_epochs: number of epochs to run
    epoch_sched: update lr and bs at epochs in this list
    lr_sched: update lr to value at matching epoch. Should be same length as epoch_sched
    bs_sched: update bs to value at matching epoch. Should be same length as epoch_sched
    log_interval: write the history to CSV every this many epochs; 0 or None disables the periodic writes
    exp: experiment name forwarded to write_stats

    Returns:
    list with one dict per completed epoch holding the epoch number, the test
    score, model.get_params() and model.meta_

    The schedules are looked up by epoch number, so epoch_sched does not have
    to be sorted; if an epoch is listed more than once the last entry wins.
    Training stops early when test() reports a failure.
    """
    assert len(epoch_sched) == len(lr_sched) == len(bs_sched), "Invalid schedules. Epoch, lr and bs schedules should all be the same length."

    # Index the updates by epoch. Comparing only against the head of the list
    # would stall every later update as soon as one entry is out of order.
    updates = {
        sched_epoch: (lr, bs)
        for sched_epoch, lr, bs in zip(epoch_sched, lr_sched, bs_sched)
    }

    history = []
    for epoch in range(n_epochs):
        # check for updates
        if epoch in updates:
            lr, bs = updates[epoch]
            model.set_lr(lr)
            model.set_bs(bs)
            print("[UPDATE] Updated model params:\n\tlr: {}\n\tbs: {}".format(lr, bs))
            if epoch != 0:
                # Report the train-set loss/accuracy at the moment of the switch.
                print("[UPDATE] Running loss on train set...", end=" ")
                model.score(train_set)
                # NOTE(review): reads the private _meta here but the public
                # meta_ below -- confirm both expose the same values.
                print(model._meta['score__loss'], '(acc:', model._meta['score__acc'], ')')
        # run
        print("[Epoch {}]".format(epoch), end="")
        model.partial_fit(train_set)
        score = model.score(test_set)
        datum = {"epoch": epoch, "score": score, **model.get_params(), **model.meta_}
        print(" Score: {}".format(score))
        history.append(datum)

        # test
        if not test(epoch, model, test_set):
            print("[TEST] Test failed, exiting")
            break

        # A falsy log_interval skips the periodic dump instead of dividing by zero.
        if log_interval and epoch % log_interval == 0:
            write_stats(history, exp, 'ep{}'.format(epoch))

    return history

In [None]:
# Decreasing-LR experiment: the batch size stays at 128 while the learning rate
# is divided by 5 at epochs 60, 120 and 180 (0.05 -> 0.01 -> 0.002 -> 0.0004).
# Entry i reads "at epoch exp0_epochs[i], set lr to exp0_lr[i] and bs to exp0_bs[i]".
exp0_epochs = [0, 60, 120, 180]
exp0_lr = [
    0.05, 
    0.05 / 5, 
    0.05 / 5 / 5, 
    0.05 / 5 / 5 / 5
]
exp0_bs = [128, 128, 128, 128]
model = DaskClassifierExpiriments(**args)
# train, recording a Dask performance report to dask-report.html
# hist is pre-bound so the name exists even if training raises inside the with-block
hist = None
with performance_report(filename="dask-report.html"):
    hist = train(model, 
             train_set, 
             test_set, 
             n_epochs=200, 
             epoch_sched=exp0_epochs, 
             lr_sched=exp0_lr, 
             bs_sched=exp0_bs, 
             log_interval=20,
             exp='decreasing-lr'
            )

In [12]:
# Save the complete history of the decreasing-lr run. The label is 'final'
# (not '-final') so the file is named exp-final.csv like the final dumps of
# the other experiments instead of exp--final.csv.
write_stats(hist, 'decreasing-lr', 'final')

In [11]:
# Increasing-BS experiment: the learning rate stays at 0.05 while the batch
# size is multiplied by 5 at epochs 60, 120 and 180 (128 -> 640 -> 3200 -> 16000).
# Entry i reads "at epoch exp1_epochs[i], set lr to exp1_lr[i] and bs to exp1_bs[i]".
exp1_epochs = [0, 60, 120, 180]
exp1_lr = [0.05, 0.05, 0.05, 0.05]
exp1_bs = [128, 640, 3200, 16000]
model = DaskClassifierExpiriments(**args)
# train
hist = train(model, 
             train_set, 
             test_set, 
             n_epochs=200, 
             epoch_sched=exp1_epochs, 
             lr_sched=exp1_lr, 
             bs_sched=exp1_bs, 
             log_interval=20,
             exp='increasing-bs'
            )

[UPDATE] Updated model params:
	lr: 0.05
	bs: 128
[Epoch 0]

  warn("Model appears not to update with weight difference {diff}")


 Score: 0.5943999886512756
[Epoch 1] Score: 0.6757999658584595
[Epoch 2] Score: 0.7128999829292297
[Epoch 3] Score: 0.7423999905586243
[Epoch 4] Score: 0.770799994468689
[Epoch 5] Score: 0.7875999808311462
[Epoch 6] Score: 0.7892999649047852
[Epoch 7] Score: 0.8071999549865723
[Epoch 8] Score: 0.8120999932289124
[Epoch 9] Score: 0.8226000070571899
[Epoch 10] Score: 0.8353999853134155
[TEST] Testing accuracy for 10th epoch is over 70%
[TEST] Test passed with 0.8330999612808228 accuracy
[Epoch 11] Score: 0.8224999904632568
[Epoch 12] Score: 0.8246999979019165
[Epoch 13] Score: 0.8355000019073486
[Epoch 14] Score: 0.8416999578475952
[Epoch 15] Score: 0.8370999693870544
[Epoch 16] Score: 0.8308999538421631
[Epoch 17] Score: 0.8429999947547913
[Epoch 18] Score: 0.8490999937057495
[Epoch 19] Score: 0.8333999514579773
[Epoch 20] Score: 0.8337999582290649
[Epoch 21] Score: 0.848800003528595
[Epoch 22] Score: 0.8454999923706055
[Epoch 23] Score: 0.8416000008583069
[Epoch 24] Score: 0.8499000072

In [12]:
# Save the complete history of the increasing-bs run as exp-final.csv.
write_stats(hist, 'increasing-bs', 'final')

In [13]:
# Hybrid experiment: at epoch 60 the batch size grows from 128 to 640 with the
# learning rate unchanged; at epochs 120 and 180 the learning rate is divided
# by 5 (0.05 -> 0.01 -> 0.002) while the batch size stays at 640.
exp2_epochs = [0, 60, 120, 180]
exp2_lr = [0.05, 
           0.05, 
           0.05 / 5, 
           0.05 / 5 / 5
          ]
exp2_bs = [128, 640, 640, 640]
model = DaskClassifierExpiriments(**args)
# train
hist = train(model, 
             train_set, 
             test_set, 
             n_epochs=200, 
             epoch_sched=exp2_epochs, 
             lr_sched=exp2_lr, 
             bs_sched=exp2_bs, 
             log_interval=20,
             exp='hybrid'
            )

[UPDATE] Updated model params:
	lr: 0.05
	bs: 128
[Epoch 0] Score: 0.6014999747276306
[Epoch 1] Score: 0.6804999709129333
[Epoch 2] Score: 0.7328000068664551
[Epoch 3] Score: 0.7621999979019165
[Epoch 4] Score: 0.7834999561309814
[Epoch 5] Score: 0.8009999990463257
[Epoch 6] Score: 0.8064000010490417
[Epoch 7] Score: 0.8100000023841858
[Epoch 8] Score: 0.8197999596595764
[Epoch 9] Score: 0.812999963760376
[Epoch 10] Score: 0.8154999613761902
[TEST] Testing accuracy for 10th epoch is over 70%
[TEST] Test passed with 0.8159999847412109 accuracy
[Epoch 11] Score: 0.8373000025749207
[Epoch 12] Score: 0.8402000069618225
[Epoch 13] Score: 0.8319000005722046
[Epoch 14] Score: 0.8525999784469604
[Epoch 15] Score: 0.8307999968528748
[Epoch 16] Score: 0.8412999510765076
[Epoch 17] Score: 0.838699996471405
[Epoch 18] Score: 0.837399959564209
[Epoch 19] Score: 0.8515999913215637
[Epoch 20] Score: 0.84579998254776
[Epoch 21] Score: 0.8432999849319458
[Epoch 22] Score: 0.8391000032424927
[Epoch 23] 

In [14]:
# Save the complete history of the hybrid run as exp-final.csv.
write_stats(hist, 'hybrid', 'final')

In [15]:
# Second hybrid experiment: the batch size is multiplied by 5 at epochs 60 and
# 120 (128 -> 640 -> 3200) with the learning rate held at 0.05; at epoch 180
# the learning rate is divided by 5 (0.05 -> 0.01) while the batch size stays at 3200.
exp3_epochs = [0, 60, 120, 180]
exp3_lr = [0.05, 
           0.05, 
           0.05, 
           0.05 / 5
          ]
exp3_bs = [128, 640, 3200, 3200]
model = DaskClassifierExpiriments(**args)
# train
hist = train(model, 
             train_set, 
             test_set, 
             n_epochs=200, 
             epoch_sched=exp3_epochs, 
             lr_sched=exp3_lr, 
             bs_sched=exp3_bs, 
             log_interval=20,
             exp='hybrid-2'
            )

[UPDATE] Updated model params:
	lr: 0.05
	bs: 128
[Epoch 0] Score: 0.6290000081062317
[Epoch 1] Score: 0.694599986076355
[Epoch 2] Score: 0.7508999705314636
[Epoch 3] Score: 0.7793999910354614
[Epoch 4] Score: 0.8016999959945679
[Epoch 5] Score: 0.8024999499320984
[Epoch 6] Score: 0.8096999526023865
[Epoch 7] Score: 0.832099974155426
[Epoch 8] Score: 0.8307999968528748
[Epoch 9] Score: 0.8369999527931213
[Epoch 10] Score: 0.8292999863624573
[TEST] Testing accuracy for 10th epoch is over 70%
[TEST] Test passed with 0.8325999975204468 accuracy
[Epoch 11] Score: 0.8430999517440796
[Epoch 12] Score: 0.8443999886512756
[Epoch 13] Score: 0.8333999514579773
[Epoch 14] Score: 0.8466999530792236
[Epoch 15] Score: 0.8345999717712402
[Epoch 16] Score: 0.8388999700546265
[Epoch 17] Score: 0.8547999858856201
[Epoch 18] Score: 0.8427000045776367
[Epoch 19] Score: 0.8279999494552612
[Epoch 20] Score: 0.8407999873161316
[Epoch 21] Score: 0.856499969959259
[Epoch 22] Score: 0.8574000000953674
[Epoch 23

In [16]:
# Save the complete history of the hybrid-2 run as exp-final.csv.
write_stats(hist, 'hybrid-2', 'final')