## Training a Wide ResNet with Dask Classifier

- Rerun LR decrease starting with 0.05
- Rerun BS increase starting with 0.05
- Run 2nd/3rd hybrid exps

Make simple dist.py that reads from CSV. 
- Subclass the Dask classifier to return values based on the training CSV
- Sleep during `partial_fit`; override it so the sleep happens on different workers
- Override `score` and `_get_grads` (to sleep)
--- Function called 

In [1]:
from dask.distributed import Client

In [2]:
import os
import importlib  
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import csv
from copy import copy

In [3]:
# import simulator
# The relative './exp-dask/...' paths used later in this notebook resolve
# against this working directory.
os.chdir('/home/ubuntu/adadamp-experiments/')
# "exp-dask" contains a hyphen, so it cannot be named in a plain `import`
# statement; import_module accepts the dotted name as a string instead.
classifier = importlib.import_module("exp-dask.classifier")
# NOTE(review): this statement resolves a top-level module called
# `classifier` through sys.modules/sys.path; it does not read the
# `classifier` variable bound above -- confirm which module copy is intended.
from classifier import DaskClassifierExpiriments

In [4]:
# training client
from dask.distributed import Client
from dask.distributed import performance_report

def _prep():
    """Import distributed's torch protocol module (passed to client.run below)."""
    # Imported only for its side effect; the local name is discarded.
    # Presumably this registers torch-aware serialization -- confirm against
    # the distributed documentation.
    from distributed.protocol import torch

# processes=False: the captured output below shows an in-process scheduler
# address (inproc://...) with a single worker.
client = Client(processes=False)
# Execute _prep through the client before any training work is submitted.
client.run(_prep)
# Bare expression: the notebook renders the client summary as the cell output.
client

0,1
Client  Scheduler: inproc://172.31.40.124/27181/1  Dashboard: http://172.31.40.124:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 16.48 GB


In [5]:
from model import Wide_ResNet

# Send the model source file to the cluster; presumably so workers can import
# `model` when they receive the network -- confirm.
client.upload_file("./exp-dask/model.py")

In [6]:
# Per-channel constants handed to transforms.Normalize in both pipelines
# (first argument, second argument).
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random crop and horizontal flip, then tensor conversion
# and normalization.
train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_train = transforms.Compose(train_steps)

# Test pipeline: no augmentation, only tensor conversion and normalization.
test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_test = transforms.Compose(test_steps)

In [7]:
# Both CIFAR-10 splits share one root directory; only the `train` flag and
# the transform differ between them.
data_root = './exp-dask/data'
train_set = torchvision.datasets.CIFAR10(
    root=data_root,
    train=True,
    download=True,
    transform=transform_train,
)
test_set = torchvision.datasets.CIFAR10(
    root=data_root,
    train=False,
    download=True,
    transform=transform_test,
)
# Class labels; their count is passed to the model as module__num_classes.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [8]:
def write_stats(hist, exp, epoch):
    """
    Write the training history to ./exp-dask/stats/<exp>/exp-<epoch>.csv

    Parameters:
    hist: sequence of dicts, one per recorded epoch
    exp: experiment name; used as the sub-directory under ./exp-dask/stats
    epoch: label inserted into the file name (e.g. 'ep20' or 'final')

    Notes:
    - An empty history writes nothing (it used to raise IndexError after
      the file had already been truncated).
    - The output directory is created when it does not exist yet.
    - The header is the union of the keys of all rows in first-seen order,
      so a row with a key the first row lacks no longer makes DictWriter
      raise ValueError; rows missing a key get an empty cell.
    """
    if not hist:
        print("[STATS] No history to write for {}/exp-{}".format(exp, epoch))
        return

    out_dir = './exp-dask/stats/{}'.format(exp)
    os.makedirs(out_dir, exist_ok=True)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in hist for key in row))

    with open('{}/exp-{}.csv'.format(out_dir, epoch), 'w', encoding='utf8', newline='') as output_file:
        fc = csv.DictWriter(output_file, fieldnames=fieldnames)
        fc.writeheader()
        fc.writerows(hist)

In [9]:
def test(epoch, model, test_set):
    """
    tests if model meets certain testing standards:
    - by 10th epoch, accuracy is over 70%
    """
    if epoch == 10:
        print("[TEST] Testing accuracy for 10th epoch is over 70%")
        score = model.score(test_set)
        acc = model._meta['score__acc']
        if acc < 0.70:
            print("[TEST] Test failed with {} accuracy".format(acc))
            return False
        else:
            print("[TEST] Test passed with {} accuracy".format(acc))
    
    return True

In [10]:
def train(model, train_set, test_set, n_epochs=200, dampings=None, log_interval=1, exp='increasing-bs'):
    """
    Train `model` for up to `n_epochs` epochs, applying a damping schedule.

    Parameters:
    model: estimator providing partial_fit, score, get_params, meta_ and _meta
    train_set: dataset passed to model.partial_fit (and to model.score when
        the damping changes after epoch 0)
    test_set: dataset scored after every epoch
    n_epochs: maximum number of epochs to run
    dampings: dict mapping epoch -> value assigned to model.damping_ at the
        start of that epoch. None (the default) means "no updates"; it used
        to crash with AttributeError.
    log_interval: write the history to CSV every `log_interval` epochs;
        0 or None disables the periodic writes (0 used to raise
        ZeroDivisionError)
    exp: experiment name forwarded to write_stats

    Returns:
    list with one dict per completed epoch (epoch, score, model params and
    model.meta_). Training stops early when test() reports a failure; the
    failing epoch is still included in the returned history.
    """
    schedule = {} if dampings is None else dampings

    history = []
    for epoch in range(n_epochs):
        # check for updates (falsy entries are skipped, as before)
        bs = schedule.get(epoch)
        if bs:
            model.damping_ = bs
            print("[UPDATE] Updated model params")
            if epoch != 0:
                print("[UPDATE] Running loss on train set...", end=" ")
                # Return value unused; the numbers are read back from _meta.
                model.score(train_set)
                print(model._meta['score__loss'], '(acc:', model._meta['score__acc'], ')')
        # run
        print("[Epoch {}]".format(epoch), end="")
        model.partial_fit(train_set)
        score = model.score(test_set)
        datum = {"epoch": epoch, "score": score, **model.get_params(), **model.meta_}
        print(" Score: {}".format(score))
        history.append(datum)

        # test
        if not test(epoch, model, test_set):
            print("[TEST] Test failed, exiting")
            break

        if log_interval and epoch % log_interval == 0:
            write_stats(history, exp, 'ep{}'.format(epoch))

    return history

In [15]:
### Const Batch Size (512), decreasing LR

# Baseline hyper-parameters; this run uses them unchanged.
orig_lr, orig_bs = 0.05, 512
exp_lr, exp_bs = orig_lr, orig_bs
exp_momentum = 0.9

# Damping schedule keyed by epoch: multiplied by 5 at epochs 60, 120 and 180.
# train() assigns the value to model.damping_ when that epoch starts.
dampings = {60 * step: exp_bs * 5 ** step for step in range(4)}

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Keyword arguments for the classifier. max_batch_size equals the starting
# batch size in this experiment.
args = {
    "module": Wide_ResNet,
    "module__depth": 16,
    "module__widen_factor": 4,
    "module__dropout_rate": 0.3,
    "module__num_classes": len(classes),
    "loss": nn.CrossEntropyLoss,
    "optimizer": torch.optim.SGD,
    "optimizer__lr": exp_lr,
    "optimizer__momentum": exp_momentum,
    "optimizer__nesterov": True,
    "optimizer__weight_decay": 0.5e-3,
    "max_epochs": 200,
    "device": device,
    "grads_per_worker": 128,
    "client": client,
    "max_batch_size": 512,
    "lr": exp_lr,
    "batch_size": exp_bs,
}
model = DaskClassifierExpiriments(**args)

# Train under a Dask performance report, then persist the full history.
# hist stays None if train() raises before returning.
exp_name = 'dec-lr-512bs'
hist = None
with performance_report(filename="dask-report.html"):
    hist = train(
        model,
        train_set,
        test_set,
        n_epochs=200,
        dampings=dampings,
        log_interval=20,
        exp=exp_name,
    )
write_stats(hist, exp_name, 'final')

[UPDATE] Updated model params
[Epoch 0] Score: 0.48979997634887695
[Epoch 1] Score: 0.6232999563217163
[Epoch 2] Score: 0.6603999733924866
[Epoch 3] Score: 0.7120999693870544
[Epoch 4] Score: 0.7465999722480774
[Epoch 5] Score: 0.7512999773025513
[Epoch 6] Score: 0.7569999694824219
[Epoch 7] Score: 0.7965999841690063
[Epoch 8] Score: 0.7978999614715576
[Epoch 9] Score: 0.8180999755859375
[Epoch 10] Score: 0.8219999670982361
[TEST] Testing accuracy for 10th epoch is over 70%
[TEST] Test passed with 0.823699951171875 accuracy
[Epoch 11] Score: 0.8235999941825867
[Epoch 12] Score: 0.8337999582290649
[Epoch 13] Score: 0.8313999772071838
[Epoch 14] Score: 0.8382999897003174
[Epoch 15] Score: 0.840399980545044
[Epoch 16] Score: 0.8459999561309814
[Epoch 17] Score: 0.8436999917030334
[Epoch 18] Score: 0.8467999696731567
[Epoch 19] Score: 0.8513999581336975
[Epoch 20] Score: 0.8542999625205994
[Epoch 21] Score: 0.8543999791145325
[Epoch 22] Score: 0.8554999828338623
[Epoch 23] Score: 0.8635999

  warn(f"Model appears not to update with diff={rel_error}")


 Score: 0.9052000045776367
[Epoch 73] Score: 0.9065999984741211
[Epoch 74] Score: 0.9071999788284302
[Epoch 75] Score: 0.9035999774932861
[Epoch 76] Score: 0.9041000008583069
[Epoch 77] Score: 0.9049999713897705
[Epoch 78] Score: 0.9048999547958374
[Epoch 79] Score: 0.9080999493598938
[Epoch 80] Score: 0.9048999547958374
[Epoch 81] Score: 0.9057999849319458
[Epoch 82] Score: 0.9047999978065491
[Epoch 83] Score: 0.9017999768257141
[Epoch 84] Score: 0.9053999781608582
[Epoch 85] Score: 0.9061999917030334
[Epoch 86] Score: 0.9088000059127808
[Epoch 87] Score: 0.9049999713897705
[Epoch 88] Score: 0.9025999903678894
[Epoch 89] Score: 0.9055999517440796
[Epoch 90] Score: 0.9070000052452087
[Epoch 91] Score: 0.9059999585151672
[Epoch 92] Score: 0.9067999720573425
[Epoch 93] Score: 0.9062999486923218
[Epoch 94] Score: 0.905299961566925
[Epoch 95] Score: 0.9052000045776367
[Epoch 96] Score: 0.9052000045776367
[Epoch 97] Score: 0.902899980545044
[Epoch 98] Score: 0.9045999646186829
[Epoch 99] Sc

  warn(f"Model appears not to update with diff={rel_error}")


 Score: 0.9092999696731567
[Epoch 127] Score: 0.9088000059127808
[Epoch 128] Score: 0.9088000059127808
[Epoch 129] Score: 0.9088000059127808
[Epoch 130] Score: 0.9072999954223633
[Epoch 131] Score: 0.910099983215332
[Epoch 132] Score: 0.9099999666213989
[Epoch 133] Score: 0.9073999524116516
[Epoch 134] Score: 0.9126999974250793
[Epoch 135] Score: 0.9103999733924866
[Epoch 136] Score: 0.9085999727249146
[Epoch 137] Score: 0.9103999733924866
[Epoch 138] Score: 0.9097999930381775
[Epoch 139] Score: 0.9088999629020691
[Epoch 140] Score: 0.9089999794960022
[Epoch 141] Score: 0.910099983215332
[Epoch 142] Score: 0.9107999801635742
[Epoch 143] Score: 0.9115999937057495
[Epoch 144] Score: 0.9121999740600586
[Epoch 145] Score: 0.9092999696731567
[Epoch 146] Score: 0.9103999733924866
[Epoch 147] Score: 0.910099983215332
[Epoch 148] Score: 0.9122999906539917
[Epoch 149] Score: 0.909500002861023
[Epoch 150] Score: 0.909500002861023
[Epoch 151] Score: 0.9085999727249146
[Epoch 152] Score: 0.9068999

In [16]:
# Baseline hyper-parameters; this run doubles both the learning rate and
# the batch size.
orig_lr, orig_bs = 0.05, 128
exp_lr, exp_bs = orig_lr * 2, orig_bs * 2
exp_momentum = 0.9

# Damping schedule keyed by epoch: multiplied by 5 at epochs 60, 120 and 180.
# train() assigns the value to model.damping_ when that epoch starts.
dampings = {60 * step: exp_bs * 5 ** step for step in range(4)}

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Keyword arguments for the classifier.
args = {
    "module": Wide_ResNet,
    "module__depth": 16,
    "module__widen_factor": 4,
    "module__dropout_rate": 0.3,
    "module__num_classes": len(classes),
    "loss": nn.CrossEntropyLoss,
    "optimizer": torch.optim.SGD,
    "optimizer__lr": exp_lr,
    "optimizer__momentum": exp_momentum,
    "optimizer__nesterov": True,
    "optimizer__weight_decay": 0.5e-3,
    "max_epochs": 200,
    "device": device,
    "grads_per_worker": 128,
    "client": client,
    "max_batch_size": 5120,
    "lr": exp_lr,
    "batch_size": exp_bs,
}
model = DaskClassifierExpiriments(**args)

# Train under a Dask performance report, then persist the full history.
# hist stays None if train() raises before returning.
exp_name = 'large-bs-0'
hist = None
with performance_report(filename="dask-report.html"):
    hist = train(
        model,
        train_set,
        test_set,
        n_epochs=200,
        dampings=dampings,
        log_interval=20,
        exp=exp_name,
    )
write_stats(hist, exp_name, 'final')

[UPDATE] Updated model params
[Epoch 0]

KeyboardInterrupt: 

In [42]:
# Prints a marker string; this cell has no other effect.
print('done')

done


In [43]:
# Baseline hyper-parameters; this run doubles both the learning rate and
# the batch size and uses a momentum of 0.98.
orig_lr, orig_bs = 0.05, 128
exp_lr, exp_bs = orig_lr * 2, orig_bs * 2
exp_momentum = 0.98

# Damping schedule keyed by epoch: multiplied by 5 at epochs 60, 120 and 180.
# train() assigns the value to model.damping_ when that epoch starts.
dampings = {60 * step: exp_bs * 5 ** step for step in range(4)}

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Keyword arguments for the classifier.
args = {
    "module": Wide_ResNet,
    "module__depth": 16,
    "module__widen_factor": 4,
    "module__dropout_rate": 0.3,
    "module__num_classes": len(classes),
    "loss": nn.CrossEntropyLoss,
    "optimizer": torch.optim.SGD,
    "optimizer__lr": exp_lr,
    "optimizer__momentum": exp_momentum,
    "optimizer__nesterov": True,
    "optimizer__weight_decay": 0.5e-3,
    "max_epochs": 200,
    "device": device,
    "grads_per_worker": 128,
    "client": client,
    "max_batch_size": 5120,
    "lr": exp_lr,
    "batch_size": exp_bs,
}
model = DaskClassifierExpiriments(**args)

# Train under a Dask performance report, then persist the full history.
# hist stays None if train() raises before returning.
exp_name = 'large-bs-1'
hist = None
with performance_report(filename="dask-report.html"):
    hist = train(
        model,
        train_set,
        test_set,
        n_epochs=200,
        dampings=dampings,
        log_interval=20,
        exp=exp_name,
    )
write_stats(hist, exp_name, 'final')

[UPDATE] Updated model params
[Epoch 0] Score: 0.47829997539520264
[Epoch 1] Score: 0.5977999567985535
[Epoch 2] Score: 0.665399968624115
[Epoch 3] Score: 0.6798999905586243
[Epoch 4] Score: 0.6807999610900879
[Epoch 5] Score: 0.7182999849319458
[Epoch 6] Score: 0.7193999886512756
[Epoch 7] Score: 0.7482999563217163
[Epoch 8] Score: 0.6961999535560608
[Epoch 9] Score: 0.7603999972343445
[Epoch 10] Score: 0.7512999773025513
[TEST] Testing accuracy for 10th epoch is over 70%
[TEST] Test passed with 0.7529999613761902 accuracy
[Epoch 11] Score: 0.7337999939918518
[Epoch 12] Score: 0.7601000070571899
[Epoch 13] Score: 0.7540000081062317
[Epoch 14] Score: 0.7477999925613403
[Epoch 15] Score: 0.7642999887466431
[Epoch 16] Score: 0.7534999847412109
[Epoch 17] Score: 0.722000002861023
[Epoch 18] Score: 0.7458999752998352
[Epoch 19] Score: 0.7356999516487122
[Epoch 20] Score: 0.7387999892234802
[Epoch 21] Score: 0.7529000043869019
[Epoch 22] Score: 0.7604999542236328
[Epoch 23] Score: 0.7419999