## Simulate running on multiple GPUs

In [1]:
from dask.distributed import Client

In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import os
os.chdir('/Users/joeholt/Developer/next-lab/adadamp-experiments/')

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import csv
import pandas as pd
from copy import copy
from adadamp.adadamp import DaskClassifierSimulator

In [11]:
# training client
from dask.distributed import Client

def _prep():
    from distributed.protocol import torch

client = Client(processes=False)
client.run(_prep)
client

0,1
Client  Scheduler: inproc://192.168.0.103/56901/1  Dashboard: http://192.168.0.103:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 17.18 GB


In [30]:
def train(model, train_set, test_set, sim_data, n_epochs=200, log_interval=1):
    """
    Train based on expiriment params
    
    Parameters:
    epoch_sched: update lr and bs at epochs in this list
    lr_sched: update lr to value at matching epoch. Should be same length as epoch_sched
    bs_sched: update bs to value at matching epoch. Should be same length as epoch_sched
    """
    assert len(epoch_sched) == len(lr_sched) == len(bs_sched), "Invalid schedules. Epoch, lr and bs schedules should all be the same length."
    
    
    history = []
    for epoch in range(n_epochs):
        sim = sim_data.pop(0)
        model.set_sim(sim)
        # run
        print("[Epoch {}]".format(epoch), end="")
        model.partial_fit(train_set)
        score = model.score(test_set)
        datum = {"epoch": epoch, "score": score, **model.get_params(), **model.meta_}
        print(" Score: {}".format(score))
        history.append(datum)
        
        if epoch % log_interval == 0:
            write_stats(history, exp, 'ep{}'.format(epoch))

    return history

In [31]:
from model import Wide_ResNet

client.upload_file("./exp-dask/model.py")

In [32]:
filename = "./exp-dask/stats/decreasing-lr/exp--final.csv"
stats_df = pd.read_csv(filename)

In [33]:
device = "cpu" if not torch.cuda.is_available() else "cuda:0"
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
args = dict(
    stats_df=stats_df,
    module=Wide_ResNet,
    module__depth=16,
    module__widen_factor=4,
    module__dropout_rate=0.3,
    module__num_classes=len(classes),
    loss=torch.nn.CrossEntropyLoss,
    optimizer=torch.optim.SGD,
    optimizer__lr=0.1,
    optimizer__momentum=0.9,
    optimizer__nesterov=True,
    optimizer__weight_decay=0.5e-3,
    batch_size=128,
    max_epochs=200,
    device=device,
    grads_per_worker=128,
    client=client
)

In [18]:
model = DaskClassifierSimulator(**args)

In [19]:
model.initialize()
print(model._sim_data)

None


In [20]:
stats_df.head

<bound method NDFrame.head of      epoch   score                       module  \
0        0  0.5991  <class 'model.Wide_ResNet'>   
1        1  0.6763  <class 'model.Wide_ResNet'>   
2        2  0.7157  <class 'model.Wide_ResNet'>   
3        3  0.7778  <class 'model.Wide_ResNet'>   
4        4  0.7679  <class 'model.Wide_ResNet'>   
..     ...     ...                          ...   
195    195  0.9195  <class 'model.Wide_ResNet'>   
196    196  0.9190  <class 'model.Wide_ResNet'>   
197    197  0.9204  <class 'model.Wide_ResNet'>   
198    198  0.9211  <class 'model.Wide_ResNet'>   
199    199  0.9205  <class 'model.Wide_ResNet'>   

                                                 loss  \
0    <class 'torch.nn.modules.loss.CrossEntropyLoss'>   
1    <class 'torch.nn.modules.loss.CrossEntropyLoss'>   
2    <class 'torch.nn.modules.loss.CrossEntropyLoss'>   
3    <class 'torch.nn.modules.loss.CrossEntropyLoss'>   
4    <class 'torch.nn.modules.loss.CrossEntropyLoss'>   
..             