## Simulate running on multiple GPUs

In [1]:
from dask.distributed import Client

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
# Every relative path used later in the notebook (./exp-dask/...) is resolved
# against this working directory. Set ADADAMP_EXPERIMENTS_DIR to point at a
# checkout on another machine; the fallback is the original hard-coded location.
os.chdir(os.environ.get('ADADAMP_EXPERIMENTS_DIR', '/Users/joeholt/Developer/next-lab/adadamp-experiments/'))

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import csv
import pandas as pd
from copy import copy
from adadamp.adadamp import DaskClassifierSimulator

In [4]:
# training client
from dask.distributed import Client

def _prep():
    """Import ``distributed.protocol.torch`` purely for its import-time side effects.

    The imported name is never used; the function exists so it can be handed
    to ``client.run`` and executed on the workers.
    """
    import distributed.protocol.torch  # noqa: F401  (imported for side effects only)

# Start a Dask client with processes=False (the summary below shows an
# in-process `inproc://` scheduler with a single worker).
client = Client(processes=False)
# Run _prep through the client so distributed.protocol.torch gets imported
# wherever the client executes it.
client.run(_prep)
# Bare expression: shows the client/cluster summary as the cell output.
client

0,1
Client  Scheduler: inproc://10.139.67.201/28856/1  Dashboard: http://10.139.67.201:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 17.18 GB


In [5]:
# Per-channel mean / std handed to Normalize in both pipelines.
_CHANNEL_MEAN = (0.4914, 0.4822, 0.4465)
_CHANNEL_STD = (0.2023, 0.1994, 0.2010)
# Directory the CIFAR10 files are stored in (downloaded there if absent).
_DATA_ROOT = './exp-dask/data'

# Training pipeline: random crop and horizontal flip for augmentation, then
# tensor conversion and normalization.
_train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
]
transform_train = transforms.Compose(_train_steps)

# Test pipeline: tensor conversion and normalization only, no augmentation.
_test_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
]
transform_test = transforms.Compose(_test_steps)

train_set = torchvision.datasets.CIFAR10(
    root=_DATA_ROOT, train=True, download=True, transform=transform_train
)
test_set = torchvision.datasets.CIFAR10(
    root=_DATA_ROOT, train=False, download=True, transform=transform_test
)

# Class names; the number of entries is used as the model's num_classes.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
def write_stats(hist, directory, filename):
    """
    Write per-epoch stat dicts to ``./exp-dask/<directory>/<filename>`` as CSV.

    Parameters:
    hist: iterable of dicts, one per CSV row
    directory: sub-directory of ./exp-dask to write into (created if missing)
    filename: name of the file inside that directory

    The header is the union of the keys of all rows in first-seen order; a row
    that lacks a key gets an empty cell. When `hist` is empty nothing is
    written and any existing file is left untouched.
    """
    rows = list(hist)
    if not rows:
        # No rows means no header can be derived; bail out before opening the
        # file so a previous result is not truncated.
        return

    # Union of keys rather than just the first row's keys: DictWriter raises
    # ValueError on a row containing a key that is not in `fieldnames`.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    path = './exp-dask/{}/{}'.format(directory, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf8', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(rows)

In [7]:
def train(model, train_set, test_set, stats_df, n_epochs=200, epoch_sched=[], lr_sched=[], bs_sched=[], log_interval=1, stats_dir=''):
    """
    Train based on experiment params, one epoch per row of `stats_df`.

    For every epoch the matching stats row is handed to `model.set_sim`, any
    scheduled lr / bs update is applied, then `model.partial_fit(train_set)`
    and `model.score(test_set)` are called and a record is appended to the
    history.

    Parameters:
    model: object exposing set_sim, set_lr, set_bs, partial_fit, score,
        get_params, meta_ and _meta (e.g. a DaskClassifierSimulator)
    train_set: data passed to model.partial_fit, and to model.score when a
        scheduled update happens after epoch 0
    test_set: data passed to model.score after every epoch
    stats_df: DataFrame with one row per epoch; each row is converted to a dict
    n_epochs: upper bound on the number of epochs; fewer are run when
        stats_df has fewer rows
    epoch_sched: update lr and bs at epochs in this list. Order does not
        matter; if an epoch is listed twice the first entry wins
    lr_sched: update lr to value at matching epoch. Should be same length as epoch_sched
    bs_sched: update bs to value at matching epoch. Should be same length as epoch_sched
    log_interval: write the history so far via write_stats every
        `log_interval` epochs; 0 or None disables these periodic writes
    stats_dir: directory argument forwarded to write_stats

    Returns:
    list of dicts, one per epoch, holding "epoch", "score", the model's
    get_params() and its meta_
    """
    assert len(epoch_sched) == len(lr_sched) == len(bs_sched), "Invalid schedules. Epoch, lr and bs schedules should all be the same length."

    stats = list(stats_df.T.to_dict().values())
    print("[SETUP] Loaded {} epochs of stats".format(len(stats)))

    # Index the schedule by epoch. Matching only the head of a list would let
    # one out-of-order or duplicated entry silently block every later update.
    # The schedule arguments are only read, never mutated.
    schedule = {}
    for sched_epoch, sched_lr, sched_bs in zip(epoch_sched, lr_sched, bs_sched):
        schedule.setdefault(sched_epoch, (sched_lr, sched_bs))

    history = []
    for epoch in range(min(n_epochs, len(stats))):

        # set stats
        model.set_sim(stats[epoch])

        # check for updates
        if epoch in schedule:
            lr, bs = schedule[epoch]
            model.set_lr(lr)
            model.set_bs(bs)
            print("[UPDATE] Updated model params:\n\tlr: {}\n\tbs: {}".format(lr, bs))
            if epoch != 0:
                print("[UPDATE] Running loss on train set...", end=" ")
                # Return value is not needed: the figures printed next are read
                # from model._meta (presumably refreshed by score() - confirm).
                model.score(train_set)
                print(model._meta['score__loss'], '(acc:', model._meta['score__acc'], ')')
        # run
        print("[Epoch {}]".format(epoch), end="")
        model.partial_fit(train_set)
        score = model.score(test_set)
        datum = {"epoch": epoch, "score": score, **model.get_params(), **model.meta_}
        print(" Score: {}".format(score))
        history.append(datum)

        # A falsy log_interval disables periodic writes instead of raising
        # ZeroDivisionError on the modulo.
        if log_interval and epoch % log_interval == 0:
            write_stats(history, stats_dir, 'results-ep{}.csv'.format(epoch))

    return history

In [8]:
from model import Wide_ResNet

# Send the local model.py through the Dask client - presumably so the workers
# can import the `model` module that defines Wide_ResNet (confirm against the
# Client.upload_file documentation).
client.upload_file("./exp-dask/model.py")

In [9]:
# Use the first CUDA device when torch reports one, otherwise stay on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Keyword arguments for the simulator. The `module__*` and `optimizer__*`
# entries carry the settings for the Wide_ResNet module and the SGD optimizer.
args = {
    "module": Wide_ResNet,
    "module__depth": 16,
    "module__widen_factor": 4,
    "module__dropout_rate": 0.3,
    "module__num_classes": len(classes),
    "loss": torch.nn.CrossEntropyLoss,
    "optimizer": torch.optim.SGD,
    "optimizer__lr": 0.1,
    "optimizer__momentum": 0.9,
    "optimizer__nesterov": True,
    "optimizer__weight_decay": 0.5e-3,
    "batch_size": 128,
    "max_epochs": 200,
    "device": device,
    "grads_per_worker": 128,
    "client": client,
}

In [10]:
# Schedule for experiment 0. Entry i reads as: "at epoch exp0_epochs[i], set
# the learning rate to exp0_lr[i] and the batch size to exp0_bs[i]".
exp0_epochs = [0, 60, 120, 180]

# Learning rate starts at 0.05 and is divided by 5 at each later milestone.
exp0_lr = [0.05]
while len(exp0_lr) < len(exp0_epochs):
    exp0_lr.append(exp0_lr[-1] / 5)

# Batch size stays at 128 for every milestone.
exp0_bs = [128] * len(exp0_epochs)

model = DaskClassifierSimulator(**args)
stats_df = pd.read_csv('./exp-dask/stats/decreasing-lr/exp--final.csv')

# train
exp0_schedule = dict(
    epoch_sched=exp0_epochs,
    lr_sched=exp0_lr,
    bs_sched=exp0_bs,
)
hist = train(
    model,
    train_set,
    test_set,
    stats_df,
    n_epochs=200,
    log_interval=20,
    stats_dir="sim-results/dec-lr",
    **exp0_schedule,
)

Loaded 200 epochs of stats
[UPDATE] Updated model params:
	lr: 0.05
	bs: 128
[Epoch 0] Score: 0.123456789


NameError: name 'test' is not defined

In [None]:
# Persist the full training history. `history` only exists inside train();
# the value it returns is bound to `hist` at notebook level.
# NOTE(review): the periodic result files end in .csv but this name has no
# extension - confirm whether it should be 'results-final.csv'.
write_stats(hist, 'sim-results/dec-lr', 'results-final')