## Simulate running on multiple GPUs

In [1]:
from dask.distributed import Client

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
os.chdir('/home/ubuntu/adadamp-experiments/')

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import csv
import importlib
import pandas as pd
from time import time
from copy import copy

In [4]:
# Import the simulator.
# The package directory is named "exp-dask"; the hyphen makes it impossible to
# name in a regular `import` statement, hence importlib.
os.chdir('/home/ubuntu/adadamp-experiments/')
classifier = importlib.import_module("exp-dask.classifier")
# Bind the class from the module object loaded above. A `from classifier import ...`
# statement would ignore that variable: it looks for a separate top-level module
# called "classifier" on sys.path and loads the same file a second time.
DaskClassifierSimulator = classifier.DaskClassifierSimulator

In [5]:
# training client
from dask.distributed import Client

def _prep():
    """Import distributed's torch protocol module; executed on the workers via client.run."""
    # Imported only for its import-time side effects (the name is never used).
    # NOTE(review): presumably this registers torch serializers with distributed - confirm.
    from distributed.protocol import torch

# processes=False keeps the scheduler and worker inside this process
# (the recorded output shows an inproc:// scheduler with a single worker).
client = Client(processes=False)
client.run(_prep)
# Last expression of the cell: displays the client/cluster summary.
client

0,1
Client  Scheduler: inproc://172.31.40.124/3069/1  Dashboard: http://172.31.40.124:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 16.48 GB


In [6]:
# Evaluation pipeline: tensor conversion followed by per-channel normalisation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# Training pipeline: random crop and horizontal flip, then the same two steps as above.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010)),
])

# CIFAR-10 splits, downloaded into ./exp-dask/data when not already present.
# The training split is still created first so the download messages keep their order.
train_set = torchvision.datasets.CIFAR10(
    root='./exp-dask/data',
    train=True,
    download=True,
    transform=transform_train,
)
test_set = torchvision.datasets.CIFAR10(
    root='./exp-dask/data',
    train=False,
    download=True,
    transform=transform_test,
)

# Class names; run_sim uses len(classes) as the number of model outputs.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Network definition used by run_sim (passed as `module=Wide_ResNet`).
# NOTE(review): `from model import ...` only resolves if the directory holding
# model.py (exp-dask/) is on sys.path - confirm for a fresh kernel.
from model import Wide_ResNet

# Send the same file to the Dask workers so they can import `model` as well.
client.upload_file("./exp-dask/model.py")

In [8]:
def write_stats(hist, directory, filename):
    """
    Write a list of per-epoch stat dicts to ./exp-dask/<directory>/<filename> as CSV.

    Parameters:
    hist: list of dicts, one per CSV row. Nothing is written when it is empty
    directory: folder below ./exp-dask/ that receives the file; created if missing
    filename: name of the CSV file; an existing file is overwritten
    """
    if not hist:
        # No rows means no header can be derived; skip instead of raising IndexError.
        return

    path = './exp-dask/{}/{}'.format(directory, filename)
    # Create the target folder up front so a long run does not die on its first write.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Union of the keys of all rows, in order of first appearance. Using only the
    # first row's keys would make DictWriter raise on any later row with an extra key.
    fieldnames = list(dict.fromkeys(key for row in hist for key in row))

    with open(path, 'w', encoding='utf8', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(hist)

In [9]:
def train(model, train_set, test_set, stats_df, n_epochs=200, log_interval=1, stats_dir=''):
    """
    Replay recorded per-epoch stats through a model, scoring it after every epoch.

    Parameters:
    model: object providing preprocess, set_sim, partial_fit, score, get_params and meta_
    train_set: training data; passed through model.preprocess once, then to model.partial_fit every epoch
    test_set: test data; passed through model.preprocess once, then to model.score every epoch
    stats_df: DataFrame with one row per epoch; each row is handed to model.set_sim as a dict
    n_epochs: maximum number of epochs to run; capped at the number of rows in stats_df
    log_interval: write the history so far to 'results-ep<epoch>.csv' every this many epochs,
        starting with epoch 0. 0 or None disables the periodic writes
    stats_dir: directory argument passed to write_stats

    Returns:
    list of dicts, one per epoch, holding the epoch number, the score, model.get_params() and model.meta_
    """

    stats = list(stats_df.T.to_dict().values())
    print("[SETUP] Loaded {} epochs of stats".format(len(stats)))

    # preprocess data
    start = time()
    train_set = model.preprocess(train_set)
    test_set = model.preprocess(test_set)
    print("[SETUP] Pre-Processed in {} seconds".format(time() - start))

    history = []
    for epoch in range(min(n_epochs, len(stats))):

        start = time()

        # set stats (indexed rather than popped from the front, which is O(n) per epoch)
        model.set_sim(stats[epoch])

        # run
        print("[Epoch {}]".format(epoch), end="")
        model.partial_fit(train_set)
        score = model.score(test_set)
        datum = {"epoch": epoch, "score": score, **model.get_params(), **model.meta_}
        print(" Score: {} in {} seconds".format(score, time() - start))
        history.append(datum)

        # A falsy log_interval turns periodic logging off instead of raising ZeroDivisionError.
        if log_interval and epoch % log_interval == 0:
            write_stats(history, stats_dir, 'results-ep{}.csv'.format(epoch))

    return history

In [10]:
def run_sim(stats_df_loc, out_folder, timings, n_epochs=200, grads_per_worker=128, max_bs=99999999):
    """
    Run one simulated training experiment driven by a recorded stats CSV.

    Uses the notebook globals Wide_ResNet, DaskClassifierSimulator, client, classes,
    train_set and test_set, so the setup cells must have been run first.

    Parameters:
    stats_df_loc: path of the stats CSV. It must provide optimizer__momentum and either
        lr_/batch_size_ (renamed below) or partial_fit__lr/partial_fit__batch_size
    out_folder: folder below ./exp-dask/ that receives the result CSVs; created if missing
    timings: dict with the keys 'mult', 'score', 'deepcopy' and 'grad128', forwarded to model.set_times
    n_epochs: upper bound on the number of epochs handed to train
    grads_per_worker: forwarded to DaskClassifierSimulator
    max_bs: forwarded to DaskClassifierSimulator as max_batch_size

    Raises:
    KeyError: if timings lacks one of the required keys
    ValueError: if the stats CSV contains no rows
    """
    # Validate the timings before doing any work, so a typo is not discovered
    # only after the model has been built.
    required_timings = ('mult', 'score', 'deepcopy', 'grad128')
    missing = [key for key in required_timings if key not in timings]
    if missing:
        raise KeyError("timings is missing required keys: {}".format(missing))

    print("Running simulation....\n\n")
    start = time()

    stats_df = pd.read_csv(stats_df_loc)
    if stats_df.empty:
        raise ValueError("No epochs found in stats file: {}".format(stats_df_loc))

    if 'lr_' in stats_df.columns:
        stats_df = stats_df.rename(columns={"lr_": "partial_fit__lr", "batch_size_": "partial_fit__batch_size"})

    # Initial hyper-parameters come from the first recorded epoch. Positional .iloc
    # is used per column so the lookup does not depend on index labels and each
    # value keeps its column dtype.
    init_lr = stats_df['partial_fit__lr'].iloc[0]
    init_bs = stats_df['partial_fit__batch_size'].iloc[0]
    momentum = stats_df['optimizer__momentum'].iloc[0]
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # write_stats writes below ./exp-dask/<out_folder>; make sure it exists before
    # training starts rather than failing on the first write.
    os.makedirs('./exp-dask/{}'.format(out_folder), exist_ok=True)

    # NOTE(review): max_epochs stays fixed at 200 regardless of n_epochs - confirm this is intended.
    args = dict(
        module=Wide_ResNet,
        module__depth=16,
        module__widen_factor=4,
        module__dropout_rate=0.3,
        module__num_classes=len(classes),
        loss=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.SGD,
        optimizer__lr=init_lr,
        optimizer__momentum=momentum,
        optimizer__nesterov=True,
        optimizer__weight_decay=0.5e-3,
        batch_size=init_bs,
        max_epochs=200,
        device=device,
        grads_per_worker=grads_per_worker,
        client=client,
        lr=init_lr,
        max_batch_size=max_bs
    )
    model = DaskClassifierSimulator(**args)
    model.set_times(timings['mult'], timings['score'], timings['deepcopy'], timings['grad128'])

    # train
    hist = train(model,
             train_set,
             test_set,
             stats_df,
             n_epochs=n_epochs,
             log_interval=20,
             stats_dir=out_folder
            )
    write_stats(hist, out_folder, 'results-final.csv')

    print('Finished simulation in {} seconds'.format(time() - start))
    

In [None]:
# test exp: one-epoch smoke run with every simulated timing set to zero.
# NOTE: this overwrites the global `timings`; re-run the relevant timings cell
# before launching a real experiment afterwards.
timings = { 'mult': True, 'score': 0.0, 'deepcopy': 0.0, 'grad128': 0.0 }
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
# run_sim/write_stats write below ./exp-dask/<out_path>/.
out_path = 'sim-results/test'
run_sim(stats_path, out_path, timings, n_epochs=1, grads_per_worker=1)

# Simulating grads per worker

In [12]:
# Timings for the grads-per-worker simulation.
# 5.39407711148262 is kept for reference; presumably a measured scoring time - confirm.
SCORE_TIME = 0.0
DEEPCOPY_TIME = 2 * 1.55e-3  # seconds
GRAD_TIME_128 = 78.32e-3  # presumably seconds, as in the other timings cells - confirm
timings = dict(
    mult=True,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [14]:
# Sweep grads_per_worker over one recorded schedule; each value gets its own output folder.
# NOTE: uses whichever global `timings` was defined last - run the timings cell above first.
stats_path = './exp-dask/stats/dec-lr-512bs/exp-final.csv'
for grads_per_worker in [32, 64, 128, 256, 512]:
    # run_sim/write_stats write below ./exp-dask/<out_path>/.
    out_path = 'sim-results/grads_per_worker_{}_test'.format(grads_per_worker)
    run_sim(stats_path, out_path, timings, n_epochs=200, grads_per_worker=grads_per_worker)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 18.027846813201904 seconds
[Epoch 0] Score: 0.489799976348877 in 17.758254051208496 seconds
[Epoch 1] Score: 0.6232999563217163 in 17.55052399635315 seconds
[Epoch 2] Score: 0.6603999733924866 in 17.918328285217285 seconds
[Epoch 3] Score: 0.7120999693870544 in 17.600931882858276 seconds
[Epoch 4] Score: 0.7465999722480774 in 17.818158864974976 seconds
[Epoch 5] Score: 0.7512999773025513 in 17.59074354171753 seconds
[Epoch 6] Score: 0.7569999694824219 in 17.739930868148804 seconds
[Epoch 7] Score: 0.7965999841690063 in 17.759063243865967 seconds
[Epoch 8] Score: 0.7978999614715576 in 17.564911603927612 seconds
[Epoch 9] Score: 0.8180999755859375 in 17.859560012817383 seconds
[Epoch 10] Score: 0.8219999670982361 in 17.58000612258911 seconds
[Epoch 11] Score: 0.8235999941825867 in 17.793426275253296 seconds
[Epoch 12] Score: 0.8337999582290649 in 17.658540725708008 seconds
[Epoch 13] Score: 0.83139997720

## Simulation with a faster network bandwidth than the one simulated in #2 (comment)

In [11]:
# Timings for the faster-network simulation (all values in seconds).
# 5.39407711148262 is kept for reference; presumably a measured scoring time - confirm.
SCORE_TIME = 0.0
DEEPCOPY_TIME = 0.127488e-3
GRAD_TIME_128 = 78.32e-3
timings = dict(
    mult=True,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [12]:
# exp 2.1 - dec-lr-P-machine-fast
# NOTE: uses the global `timings`; run the faster-network timings cell of this section first.
# The double hyphen in "exp--final.csv" is the file name this cell loaded when it was run.
stats_path = './exp-dask/stats/decreasing-lr/exp--final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/dec-lr-mult-machines-fast'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 16.835314750671387 seconds
[Epoch 0] Score: 0.5990999937057495 in 39.403810262680054 seconds
[Epoch 1] Score: 0.6762999892234802 in 36.34965133666992 seconds
[Epoch 2] Score: 0.7156999707221985 in 36.57399344444275 seconds
[Epoch 3] Score: 0.7777999639511108 in 36.37462520599365 seconds
[Epoch 4] Score: 0.7678999900817871 in 36.30929780006409 seconds
[Epoch 5] Score: 0.7994999885559082 in 36.40059733390808 seconds
[Epoch 6] Score: 0.8030999898910522 in 36.31229782104492 seconds
[Epoch 7] Score: 0.8100000023841858 in 36.40604519844055 seconds
[Epoch 8] Score: 0.8186999559402466 in 36.391464710235596 seconds
[Epoch 9] Score: 0.8077999949455261 in 36.32364058494568 seconds
[Epoch 10] Score: 0.8211999535560608 in 36.37368655204773 seconds
[Epoch 11] Score: 0.8321999907493591 in 36.41829776763916 seconds
[Epoch 12] Score: 0.8306999802589417 in 36.52753829956055 seconds
[Epoch 13] Score: 0.8325999975204468 i

In [12]:
# exp 2.2 - increasing batch p machines
# NOTE: uses the global `timings`; run the faster-network timings cell of this section first.
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/inc-bs-mult-machines-fast'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 16.845895528793335 seconds
[Epoch 0] Score: 0.5943999886512756 in 39.52752900123596 seconds
[Epoch 1] Score: 0.6757999658584595 in 36.32488012313843 seconds
[Epoch 2] Score: 0.7128999829292297 in 36.45812368392944 seconds
[Epoch 3] Score: 0.7423999905586243 in 36.31684446334839 seconds
[Epoch 4] Score: 0.7707999944686891 in 36.29584360122681 seconds
[Epoch 5] Score: 0.7875999808311462 in 36.53813362121582 seconds
[Epoch 6] Score: 0.7892999649047852 in 36.384525537490845 seconds
[Epoch 7] Score: 0.8071999549865723 in 36.45299315452576 seconds
[Epoch 8] Score: 0.8120999932289124 in 36.519848346710205 seconds
[Epoch 9] Score: 0.8226000070571899 in 36.42199158668518 seconds
[Epoch 10] Score: 0.8353999853134155 in 36.29736399650574 seconds
[Epoch 11] Score: 0.8224999904632568 in 36.34862685203552 seconds
[Epoch 12] Score: 0.8246999979019165 in 36.5230278968811 seconds
[Epoch 13] Score: 0.8355000019073486 in

In [13]:
# exp 2.3 - hybrid 1 P machine
# NOTE: uses the global `timings`; run the faster-network timings cell of this section first.
stats_path = './exp-dask/stats/hybrid/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/hybrid1-mult-machines-fast'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 17.11887836456299 seconds
[Epoch 0] Score: 0.6014999747276306 in 36.57847809791565 seconds
[Epoch 1] Score: 0.6804999709129333 in 36.73394250869751 seconds
[Epoch 2] Score: 0.7328000068664551 in 36.50412917137146 seconds
[Epoch 3] Score: 0.7621999979019165 in 36.4488422870636 seconds
[Epoch 4] Score: 0.7834999561309814 in 36.51984262466431 seconds
[Epoch 5] Score: 0.8009999990463257 in 36.4921133518219 seconds
[Epoch 6] Score: 0.8064000010490417 in 36.401299476623535 seconds
[Epoch 7] Score: 0.8100000023841858 in 36.48680830001831 seconds
[Epoch 8] Score: 0.8197999596595764 in 36.39083909988403 seconds
[Epoch 9] Score: 0.812999963760376 in 36.69667673110962 seconds
[Epoch 10] Score: 0.8154999613761902 in 36.41271209716797 seconds
[Epoch 11] Score: 0.8373000025749207 in 36.42565107345581 seconds
[Epoch 12] Score: 0.8402000069618225 in 36.58277082443237 seconds
[Epoch 13] Score: 0.8319000005722046 in 36.

In [14]:
# exp 2.4 - hybrid 2 P machine
# NOTE: uses the global `timings`; run the faster-network timings cell of this section first.
stats_path = './exp-dask/stats/hybrid-2/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/hybrid2-mult-machines-fast'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 16.99332880973816 seconds
[Epoch 0] Score: 0.6290000081062317 in 36.68964242935181 seconds
[Epoch 1] Score: 0.694599986076355 in 36.457961082458496 seconds
[Epoch 2] Score: 0.7508999705314636 in 36.5741183757782 seconds
[Epoch 3] Score: 0.7793999910354614 in 36.41502809524536 seconds
[Epoch 4] Score: 0.8016999959945679 in 36.72556400299072 seconds
[Epoch 5] Score: 0.8024999499320984 in 36.422792196273804 seconds
[Epoch 6] Score: 0.8096999526023865 in 36.416125535964966 seconds
[Epoch 7] Score: 0.8320999741554259 in 36.516974687576294 seconds
[Epoch 8] Score: 0.8307999968528748 in 36.55851697921753 seconds
[Epoch 9] Score: 0.8369999527931213 in 36.47108435630798 seconds
[Epoch 10] Score: 0.8292999863624573 in 36.424689292907715 seconds
[Epoch 11] Score: 0.8430999517440796 in 36.49501609802246 seconds
[Epoch 12] Score: 0.8443999886512756 in 36.853909492492676 seconds
[Epoch 13] Score: 0.8333999514579773 

In [15]:
# exp 11 - large bs 1 P machine
# NOTE: uses the global `timings`; run the faster-network timings cell of this section first.
stats_path = './exp-dask/stats/large-bs-0/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/.
out_path = 'sim-results/large-bs1-mult-machines-fast'
# max_bs is forwarded to the simulator as max_batch_size.
run_sim(stats_path, out_path, timings, max_bs=5120)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 17.09493923187256 seconds
[Epoch 0] Score: 0.5884999632835388 in 19.347447395324707 seconds
[Epoch 1] Score: 0.6516000032424927 in 19.218594789505005 seconds
[Epoch 2] Score: 0.7285000085830688 in 19.239405632019043 seconds
[Epoch 3] Score: 0.7662000060081482 in 19.223100900650024 seconds
[Epoch 4] Score: 0.7896999716758728 in 19.312239408493042 seconds
[Epoch 5] Score: 0.7972999811172485 in 19.263160467147827 seconds
[Epoch 6] Score: 0.8024999499320984 in 19.23438024520874 seconds
[Epoch 7] Score: 0.8089999556541443 in 19.220362663269043 seconds
[Epoch 8] Score: 0.8148999810218811 in 19.541128396987915 seconds
[Epoch 9] Score: 0.8120999932289124 in 19.248046875 seconds
[Epoch 10] Score: 0.8294000029563904 in 19.23132562637329 seconds
[Epoch 11] Score: 0.8185999989509583 in 19.22953224182129 seconds
[Epoch 12] Score: 0.8331999778747559 in 19.217407703399658 seconds
[Epoch 13] Score: 0.8416999578475952 

In [16]:
# exp 12 - large bs 2 P machine
# NOTE: uses the global `timings`; run the faster-network timings cell of this section first.
# The recorded run of this cell ended in a KeyboardInterrupt, so its results may be incomplete.
stats_path = './exp-dask/stats/large-bs-1/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/.
out_path = 'sim-results/large-bs2-mult-machines-fast'
# max_bs is forwarded to the simulator as max_batch_size.
run_sim(stats_path, out_path, timings, max_bs=5120)

Running simulation....


[SETUP] Loaded 200 epochs of stats


KeyboardInterrupt: 







## First Set of Experiments

In [11]:
# Timings for the normal experiments (all values in seconds).
# 'mult' is False for the experiments labelled "1 machine" below.
# 5.39407711148262 is kept for reference; presumably a measured scoring time - confirm.
SCORE_TIME = 0.0
DEEPCOPY_TIME = 0.05855
GRAD_TIME_128 = 0.07832
timings = dict(
    mult=False,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [18]:
# exp 1 - dec-lr-1-machine
# NOTE: uses the global `timings`; run the "normal experiments" timings cell ('mult': False) first.
# The double hyphen in "exp--final.csv" is the file name this cell loaded when it was run.
stats_path = './exp-dask/stats/decreasing-lr/exp--final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/dec-lr-1-machine'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 17.500604391098022 seconds
[Epoch 0] Score: 0.5990999937057495 in 60.29955196380615 seconds
[Epoch 1] Score: 0.6762999892234802 in 60.30539417266846 seconds
[Epoch 2] Score: 0.7156999707221985 in 59.774914503097534 seconds
[Epoch 3] Score: 0.7777999639511108 in 60.207035779953 seconds
[Epoch 4] Score: 0.7678999900817871 in 59.78869414329529 seconds
[Epoch 5] Score: 0.7994999885559082 in 60.308109283447266 seconds
[Epoch 6] Score: 0.8030999898910522 in 59.963244676589966 seconds
[Epoch 7] Score: 0.8100000023841858 in 60.10037112236023 seconds
[Epoch 8] Score: 0.8186999559402466 in 59.96827411651611 seconds
[Epoch 9] Score: 0.8077999949455261 in 60.28385829925537 seconds
[Epoch 10] Score: 0.8211999535560608 in 60.00210118293762 seconds
[Epoch 11] Score: 0.8321999907493591 in 60.082077980041504 seconds
[Epoch 12] Score: 0.8306999802589417 in 59.640068769454956 seconds
[Epoch 13] Score: 0.8325999975204468 

In [25]:
# exp 5 - large bs 1 1 machine
# NOTE: uses the global `timings`; run the "normal experiments" timings cell ('mult': False) first.
stats_path = './exp-dask/stats/large-bs-0/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/.
out_path = 'sim-results/large-bs1-1-machine'
# max_bs is forwarded to the simulator as max_batch_size.
run_sim(stats_path, out_path, timings, max_bs=5120)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 17.082221746444702 seconds
[Epoch 0] Score: 0.5884999632835388 in 30.94543957710266 seconds
[Epoch 1] Score: 0.6516000032424927 in 30.91340970993042 seconds
[Epoch 2] Score: 0.7285000085830688 in 30.86415672302246 seconds
[Epoch 3] Score: 0.7662000060081482 in 31.18868088722229 seconds
[Epoch 4] Score: 0.7896999716758728 in 30.9765408039093 seconds
[Epoch 5] Score: 0.7972999811172485 in 30.914204597473145 seconds
[Epoch 6] Score: 0.8024999499320984 in 31.085127592086792 seconds
[Epoch 7] Score: 0.8089999556541443 in 30.978079080581665 seconds
[Epoch 8] Score: 0.8148999810218811 in 30.929124355316162 seconds
[Epoch 9] Score: 0.8120999932289124 in 30.941798210144043 seconds
[Epoch 10] Score: 0.8294000029563904 in 30.995038747787476 seconds
[Epoch 11] Score: 0.8185999989509583 in 30.99557065963745 seconds
[Epoch 12] Score: 0.8331999778747559 in 30.947920083999634 seconds
[Epoch 13] Score: 0.84169995784759

In [12]:
# exp 2 - inc-bs
# NOTE: uses the global `timings`; run the "normal experiments" timings cell ('mult': False) first.
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/inc-bs-1-machine'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 18.031914472579956 seconds
[Epoch 0] Score: 0.5943999886512756 in 67.52644038200378 seconds
[Epoch 1] Score: 0.6757999658584595 in 59.98326015472412 seconds
[Epoch 2] Score: 0.7128999829292297 in 59.854674100875854 seconds
[Epoch 3] Score: 0.7423999905586243 in 59.890366554260254 seconds
[Epoch 4] Score: 0.7707999944686891 in 60.007487058639526 seconds
[Epoch 5] Score: 0.7875999808311462 in 60.24218225479126 seconds
[Epoch 6] Score: 0.7892999649047852 in 59.979164361953735 seconds
[Epoch 7] Score: 0.8071999549865723 in 60.02729058265686 seconds
[Epoch 8] Score: 0.8120999932289124 in 59.98218131065369 seconds
[Epoch 9] Score: 0.8226000070571899 in 59.84120202064514 seconds
[Epoch 10] Score: 0.8353999853134155 in 59.62069082260132 seconds
[Epoch 11] Score: 0.8224999904632568 in 59.88772654533386 seconds
[Epoch 12] Score: 0.8246999979019165 in 60.31302237510681 seconds
[Epoch 13] Score: 0.8355000019073486

In [13]:
# exp 3 - hybrid 1 1 machine
# NOTE: uses the global `timings`; run the "normal experiments" timings cell ('mult': False) first.
stats_path = './exp-dask/stats/hybrid/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/hybrid1-1-machine'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 18.295875072479248 seconds
[Epoch 0] Score: 0.6014999747276306 in 60.282591104507446 seconds
[Epoch 1] Score: 0.6804999709129333 in 59.86171579360962 seconds
[Epoch 2] Score: 0.7328000068664551 in 60.11501145362854 seconds
[Epoch 3] Score: 0.7621999979019165 in 59.94831562042236 seconds
[Epoch 4] Score: 0.7834999561309814 in 60.12259292602539 seconds
[Epoch 5] Score: 0.8009999990463257 in 59.74931573867798 seconds
[Epoch 6] Score: 0.8064000010490417 in 60.086275577545166 seconds
[Epoch 7] Score: 0.8100000023841858 in 60.28357291221619 seconds
[Epoch 8] Score: 0.8197999596595764 in 59.819730043411255 seconds
[Epoch 9] Score: 0.812999963760376 in 59.997806549072266 seconds
[Epoch 10] Score: 0.8154999613761902 in 59.752153158187866 seconds
[Epoch 11] Score: 0.8373000025749207 in 59.817837953567505 seconds
[Epoch 12] Score: 0.8402000069618225 in 60.43029856681824 seconds
[Epoch 13] Score: 0.831900000572204

In [14]:
# exp 4 - hybrid 2 1 machine
# NOTE: uses the global `timings`; run the "normal experiments" timings cell ('mult': False) first.
stats_path = './exp-dask/stats/hybrid-2/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/hybrid2-1-machine'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 20.62927269935608 seconds
[Epoch 0] Score: 0.6290000081062317 in 59.91703271865845 seconds
[Epoch 1] Score: 0.694599986076355 in 60.07186412811279 seconds
[Epoch 2] Score: 0.7508999705314636 in 60.29324007034302 seconds
[Epoch 3] Score: 0.7793999910354614 in 59.68764781951904 seconds
[Epoch 4] Score: 0.8016999959945679 in 59.633214712142944 seconds
[Epoch 5] Score: 0.8024999499320984 in 60.19446587562561 seconds
[Epoch 6] Score: 0.8096999526023865 in 60.63145613670349 seconds
[Epoch 7] Score: 0.8320999741554259 in 60.047417879104614 seconds
[Epoch 8] Score: 0.8307999968528748 in 59.9759304523468 seconds
[Epoch 9] Score: 0.8369999527931213 in 60.12659549713135 seconds
[Epoch 10] Score: 0.8292999863624573 in 59.90186953544617 seconds
[Epoch 11] Score: 0.8430999517440796 in 59.623199224472046 seconds
[Epoch 12] Score: 0.8443999886512756 in 59.77467894554138 seconds
[Epoch 13] Score: 0.8333999514579773 in 

In [15]:
# exp 6 - large bs 2 1 machine
# NOTE: uses the global `timings`; run the "normal experiments" timings cell ('mult': False) first.
stats_path = './exp-dask/stats/large-bs-1/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/.
out_path = 'sim-results/large-bs2-1-machine'
# max_bs is forwarded to the simulator as max_batch_size.
run_sim(stats_path, out_path, timings, max_bs=5120)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 18.92701292037964 seconds
[Epoch 0] Score: 0.4782999753952026 in 31.559585571289062 seconds
[Epoch 1] Score: 0.5977999567985535 in 31.12877869606018 seconds
[Epoch 2] Score: 0.665399968624115 in 31.154720783233643 seconds
[Epoch 3] Score: 0.6798999905586243 in 31.17481565475464 seconds
[Epoch 4] Score: 0.6807999610900879 in 31.187304496765137 seconds
[Epoch 5] Score: 0.7182999849319458 in 31.188523292541504 seconds
[Epoch 6] Score: 0.7193999886512756 in 31.131200790405273 seconds
[Epoch 7] Score: 0.7482999563217163 in 31.13277268409729 seconds
[Epoch 8] Score: 0.6961999535560608 in 31.131117343902588 seconds
[Epoch 9] Score: 0.7603999972343445 in 31.12556290626526 seconds
[Epoch 10] Score: 0.7512999773025513 in 31.473087549209595 seconds
[Epoch 11] Score: 0.7337999939918518 in 31.136924982070923 seconds
[Epoch 12] Score: 0.7601000070571899 in 31.121317863464355 seconds
[Epoch 13] Score: 0.7540000081062

In [16]:
# Timings for the multi-machine set-ups (values in seconds).
# 5.39407711148262 is kept for reference; presumably a measured scoring time - confirm.
SCORE_TIME = 0.0
DEEPCOPY_TIME = 0.0  # set internally based on N workers
GRAD_TIME_128 = 0.07832
timings = dict(
    mult=True,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [17]:
# exp 11 - large bs 1 P machine
# NOTE: uses the global `timings`; run the multi-machine timings cell ('mult': True) above first.
stats_path = './exp-dask/stats/large-bs-0/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/.
out_path = 'sim-results/large-bs1-mult-machines'
# max_bs is forwarded to the simulator as max_batch_size.
run_sim(stats_path, out_path, timings, max_bs=5120)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 17.18851327896118 seconds
[Epoch 0] Score: 0.5884999632835388 in 19.5691659450531 seconds
[Epoch 1] Score: 0.6516000032424927 in 19.500764846801758 seconds
[Epoch 2] Score: 0.7285000085830688 in 19.450701236724854 seconds
[Epoch 3] Score: 0.7662000060081482 in 19.43295645713806 seconds
[Epoch 4] Score: 0.7896999716758728 in 19.566246509552002 seconds
[Epoch 5] Score: 0.7972999811172485 in 19.443339109420776 seconds
[Epoch 6] Score: 0.8024999499320984 in 19.420652627944946 seconds
[Epoch 7] Score: 0.8089999556541443 in 19.434909343719482 seconds
[Epoch 8] Score: 0.8148999810218811 in 19.40559458732605 seconds
[Epoch 9] Score: 0.8120999932289124 in 19.432981729507446 seconds
[Epoch 10] Score: 0.8294000029563904 in 19.50568175315857 seconds
[Epoch 11] Score: 0.8185999989509583 in 19.56595015525818 seconds
[Epoch 12] Score: 0.8331999778747559 in 19.421775102615356 seconds
[Epoch 13] Score: 0.84169995784759

In [None]:
# exp 7 - dec-lr-P-machine
# NOTE: uses the global `timings`; run the multi-machine timings cell ('mult': True) above first.
# The double hyphen in "exp--final.csv" is intentional: other cells load this same path successfully.
stats_path = './exp-dask/stats/decreasing-lr/exp--final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/dec-lr-mult-machines'
run_sim(stats_path, out_path, timings)

In [None]:
# exp 8 - increasing batch p machines
# NOTE: uses the global `timings`; run the multi-machine timings cell ('mult': True) above first.
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/inc-bs-mult-machines'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 16.900656938552856 seconds
[Epoch 0] Score: 0.5943999886512756 in 36.509028911590576 seconds
[Epoch 1] Score: 0.6757999658584595 in 36.62124061584473 seconds
[Epoch 2] Score: 0.7128999829292297 in 36.762213706970215 seconds
[Epoch 3] Score: 0.7423999905586243 in 36.45364427566528 seconds
[Epoch 4] Score: 0.7707999944686891 in 36.55291485786438 seconds
[Epoch 5] Score: 0.7875999808311462 in 36.46657967567444 seconds
[Epoch 6] Score: 0.7892999649047852 in 36.54781794548035 seconds
[Epoch 7] Score: 0.8071999549865723 in 36.41838455200195 seconds
[Epoch 8] Score: 0.8120999932289124 in 36.44644212722778 seconds
[Epoch 9] Score: 0.8226000070571899 in 36.46036744117737 seconds
[Epoch 10] Score: 0.8353999853134155 in 36.74067401885986 seconds
[Epoch 11] Score: 0.8224999904632568 in 36.499744176864624 seconds
[Epoch 12] Score: 0.8246999979019165 in 36.542011976242065 seconds
[Epoch 13] Score: 0.8355000019073486

In [None]:
# exp 9 - hybrid 1 P machine
# NOTE: uses the global `timings`; run the multi-machine timings cell ('mult': True) above first.
stats_path = './exp-dask/stats/hybrid/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/hybrid1-mult-machines'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 19.526923418045044 seconds
[Epoch 0] Score: 0.6014999747276306 in 36.77826762199402 seconds
[Epoch 1] Score: 0.6804999709129333 in 36.57283806800842 seconds
[Epoch 2] Score: 0.7328000068664551 in 37.00673222541809 seconds
[Epoch 3] Score: 0.7621999979019165 in 36.65171551704407 seconds
[Epoch 4] Score: 0.7834999561309814 in 36.68503284454346 seconds
[Epoch 5] Score: 0.8009999990463257 in 36.57041096687317 seconds
[Epoch 6] Score: 0.8064000010490417 in 36.655731439590454 seconds
[Epoch 7] Score: 0.8100000023841858 in 36.68675351142883 seconds
[Epoch 8] Score: 0.8197999596595764 in 36.76146578788757 seconds
[Epoch 9] Score: 0.812999963760376 in 36.6549117565155 seconds
[Epoch 10] Score: 0.8154999613761902 in 37.05028772354126 seconds
[Epoch 11] Score: 0.8373000025749207 in 36.66020750999451 seconds
[Epoch 12] Score: 0.8402000069618225 in 36.65378379821777 seconds
[Epoch 13] Score: 0.8319000005722046 in 3

In [None]:
# exp 10 - hybrid 2 P machine
# NOTE: uses the global `timings`; run the multi-machine timings cell ('mult': True) above first.
stats_path = './exp-dask/stats/hybrid-2/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/ (run_sim defaults: n_epochs=200, grads_per_worker=128).
out_path = 'sim-results/hybrid2-mult-machines'
run_sim(stats_path, out_path, timings)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 19.294373512268066 seconds
[Epoch 0] Score: 0.6290000081062317 in 36.81075620651245 seconds
[Epoch 1] Score: 0.694599986076355 in 36.79961657524109 seconds
[Epoch 2] Score: 0.7508999705314636 in 36.95730757713318 seconds
[Epoch 3] Score: 0.7793999910354614 in 36.75439763069153 seconds
[Epoch 4] Score: 0.8016999959945679 in 36.584755420684814 seconds
[Epoch 5] Score: 0.8024999499320984 in 37.01908612251282 seconds
[Epoch 6] Score: 0.8096999526023865 in 36.613943099975586 seconds
[Epoch 7] Score: 0.8320999741554259 in 36.70985221862793 seconds
[Epoch 8] Score: 0.8307999968528748 in 36.67209267616272 seconds
[Epoch 9] Score: 0.8369999527931213 in 36.70244026184082 seconds
[Epoch 10] Score: 0.8292999863624573 in 36.69097828865051 seconds
[Epoch 11] Score: 0.8430999517440796 in 36.6531183719635 seconds
[Epoch 12] Score: 0.8443999886512756 in 36.67304301261902 seconds
[Epoch 13] Score: 0.8333999514579773 in 

In [None]:
# exp 12 - large bs 2 P machine
# NOTE: uses the global `timings`; run the multi-machine timings cell ('mult': True) above first.
stats_path = './exp-dask/stats/large-bs-1/exp-final.csv'
# Results are written below ./exp-dask/<out_path>/.
out_path = 'sim-results/large-bs2-mult-machines'
# max_bs is forwarded to the simulator as max_batch_size.
run_sim(stats_path, out_path, timings, max_bs=5120)

Running simulation....


[SETUP] Loaded 200 epochs of stats
[SETUP] Pre-Processed in 19.96380925178528 seconds
[Epoch 0] Score: 0.4782999753952026 in 19.74583601951599 seconds
[Epoch 1] Score: 0.5977999567985535 in 19.877222299575806 seconds
[Epoch 2] Score: 0.665399968624115 in 19.882566928863525 seconds
[Epoch 3] Score: 0.6798999905586243 in 19.965288877487183 seconds
[Epoch 4] Score: 0.6807999610900879 in 19.72574472427368 seconds
[Epoch 5] Score: 0.7182999849319458 in 19.639118432998657 seconds
[Epoch 6] Score: 0.7193999886512756 in 19.830687284469604 seconds
[Epoch 7] Score: 0.7482999563217163 in 19.884809017181396 seconds
[Epoch 8] Score: 0.6961999535560608 in 19.88707995414734 seconds
[Epoch 9] Score: 0.7603999972343445 in 19.92339301109314 seconds
[Epoch 10] Score: 0.7512999773025513 in 20.053954601287842 seconds
[Epoch 11] Score: 0.7337999939918518 in 19.621588230133057 seconds
[Epoch 12] Score: 0.7601000070571899 in 19.854926109313965 seconds
[Epoch 13] Score: 0.7540000081062