## Simulate DaskClassifier based on Timing Data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import csv
import importlib
import pandas as pd
from time import time
from copy import copy
from dask.distributed import Client

In [3]:
# import simulator
# Every path used in this notebook is written relative to the directory that
# contains `exp-dask/`.  Only step up one level when we are not already there,
# so re-running this cell does not keep walking further up the tree.
if not os.path.isdir('exp-dask'):
    os.chdir('..')
classifier = importlib.import_module("exp-dask.classifier")
# NOTE(review): the statement below imports the top-level module named
# `classifier` found on sys.path, not the `classifier` variable bound above --
# confirm both resolve to the same file.
from classifier import DaskClassifierSimulator

In [4]:
# training client
def _prep():
    """Import ``distributed.protocol.torch`` in whichever process runs this function.

    It is handed to ``client.run`` below, so the import happens on the workers.
    NOTE(review): presumably the import registers Dask's torch serialization
    support as a side effect -- confirm against the distributed docs.
    """
    from distributed.protocol import torch

# Threads-based client (no separate worker processes).
client = Client(processes=False)
client.run(_prep)

# The model definition is imported locally and its source file is also
# shipped to the workers.
from model import Wide_ResNet
client.upload_file("./exp-dask/model.py")

# Last expression of the cell: renders the client summary as the cell output.
client

0,1
Client  Scheduler: inproc://192.168.0.107/32491/1  Dashboard: http://192.168.0.107:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 17.18 GB


In [5]:
# Gather and prepare data
# Normalisation constants (mean, std per channel) shared by both pipelines.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)
DATA_ROOT = './exp-dask/data'

# Training pipeline: random crop + horizontal flip, then tensor conversion
# and normalisation.
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ]
)

# Test pipeline: tensor conversion and normalisation only.
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    ]
)

train_set = torchvision.datasets.CIFAR10(
    root=DATA_ROOT, train=True, download=True, transform=transform_train
)
test_set = torchvision.datasets.CIFAR10(
    root=DATA_ROOT, train=False, download=True, transform=transform_test
)

# Label names; their count is used as the number of model outputs.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
def write_stats(hist, directory, filename):
    """Write a list of per-epoch records to ``./exp-dask/<directory>/<filename>`` as CSV.

    Parameters
    ----------
    hist : list of dict
        One mapping per CSV row.  The header is the union of all keys, in the
        order they are first seen; rows missing a key get an empty cell.
    directory : str
        Sub-directory of ``./exp-dask`` to write into.  It is created
        (including parents) when it does not exist yet.
    filename : str
        Name of the CSV file inside ``directory``.

    Notes
    -----
    Nothing is written when ``hist`` is empty, because no header can be
    derived from zero rows.
    """
    if not hist:
        # Avoids the IndexError that `hist[0]` would raise on an empty history.
        return

    out_dir = os.path.join('.', 'exp-dask', directory)
    # A fresh output folder must not make the whole simulation fail at the
    # first checkpoint.
    os.makedirs(out_dir, exist_ok=True)

    # dict.fromkeys de-duplicates while keeping first-seen order, so a key that
    # only appears in later rows still gets a column instead of a ValueError.
    fieldnames = list(dict.fromkeys(key for row in hist for key in row))

    with open(os.path.join(out_dir, filename), 'w', encoding='utf8', newline='') as output_file:
        fc = csv.DictWriter(output_file, fieldnames=fieldnames)
        fc.writeheader()
        fc.writerows(hist)

In [7]:
def train(model, train_set, test_set, stats_df, n_epochs=200, log_interval=1, stats_dir=''):
    """
    Replay recorded per-epoch stats through the simulator model.

    Each row of ``stats_df`` is turned into a dict.  For every simulated epoch
    the matching dict is passed to ``model.set_sim``, the model runs
    ``partial_fit`` on the preprocessed training data and is scored on the
    preprocessed test data.  The number of epochs is the smaller of
    ``n_epochs`` and the number of stat rows.

    Whenever ``epoch % log_interval == 0`` the history so far is written via
    ``write_stats`` to ``stats_dir`` as ``results-ep<epoch>.csv``.

    Returns the list of per-epoch records (epoch, score, model params and
    ``model.meta_``).
    """
    # one dict per row of the stats frame, to load into the simulator
    epoch_stats = [row for row in stats_df.T.to_dict().values()]
    print("[SETUP] Loaded {} epochs of stats".format(len(epoch_stats)))

    # preprocess both splits once, up front
    setup_started = time()
    train_set = model.preprocess(train_set)
    test_set = model.preprocess(test_set)
    print("[SETUP] Pre-Processed in {} seconds".format(time() - setup_started))

    n_rounds = min(n_epochs, len(epoch_stats))
    history = []
    for epoch in range(n_rounds):
        epoch_started = time()

        # hand this epoch's recorded stats to the simulator
        model.set_sim(epoch_stats[epoch])

        # one simulated epoch followed by evaluation
        print("[Epoch {}]".format(epoch), end="")
        model.partial_fit(train_set)
        score = model.score(test_set)

        # later sources override earlier ones on key clashes
        record = {"epoch": epoch, "score": score}
        record.update(model.get_params())
        record.update(model.meta_)
        print(" Score: {} in {} seconds".format(score, time() - epoch_started))
        history.append(record)

        # periodic checkpoint of everything recorded so far
        if epoch % log_interval == 0:
            write_stats(history, stats_dir, 'results-ep{}.csv'.format(epoch))

    return history

In [8]:
def run_sim(stats_df_loc, out_folder, timings, n_epochs=200, grads_per_worker=128, max_bs=99999999):
    """
    Build a ``DaskClassifierSimulator`` from recorded training stats and replay them.

    Reads the stats CSV at ``stats_df_loc``, maps legacy column names onto the
    current ones, seeds the simulator with the learning rate, batch size and
    momentum found in the first row, applies the ``timings`` entries
    (``'mult'``, ``'score'``, ``'deepcopy'``, ``'grad128'``), runs ``train``
    and finally writes the history to ``out_folder`` as ``results-final.csv``.

    Relies on the notebook-level ``client``, ``classes``, ``train_set`` and
    ``test_set``.
    """
    print("Running simulation....\n\n")
    sim_started = time()

    # stats from an actual training run
    stats_df = pd.read_csv(stats_df_loc)

    # old versions of the DaskClassifier had different column names
    if 'lr_' in stats_df.columns:
        legacy_names = {"lr_": "partial_fit__lr", "batch_size_": "partial_fit__batch_size"}
        stats_df = stats_df.rename(columns=legacy_names)

    # initial hyper-parameters are taken from the first CSV row
    init_lr, init_bs, momentum = (
        stats_df[column][0]
        for column in ('partial_fit__lr', 'partial_fit__batch_size', 'optimizer__momentum')
    )
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # constructor arguments for the simulator
    sim_kwargs = {
        "module": Wide_ResNet,
        "module__depth": 16,
        "module__widen_factor": 4,
        "module__dropout_rate": 0.3,
        "module__num_classes": len(classes),
        "loss": torch.nn.CrossEntropyLoss,
        "optimizer": torch.optim.SGD,
        "optimizer__lr": init_lr,
        "optimizer__momentum": momentum,
        "optimizer__nesterov": True,
        "optimizer__weight_decay": 0.5e-3,
        "batch_size": init_bs,
        "max_epochs": 200,
        "device": device,
        "grads_per_worker": grads_per_worker,
        "client": client,
        "lr": init_lr,
        "max_batch_size": max_bs,
    }

    # create model and set initial timings
    model = DaskClassifierSimulator(**sim_kwargs)
    model.set_times(*(timings[key] for key in ('mult', 'score', 'deepcopy', 'grad128')))

    # replay the recorded run, checkpointing every 20 epochs
    hist = train(
        model,
        train_set,
        test_set,
        stats_df,
        n_epochs=n_epochs,
        log_interval=20,
        stats_dir=out_folder,
    )
    write_stats(hist, out_folder, 'results-final.csv')

    print('Finished simulation in {} seconds'.format(time() - sim_started))
    

In [None]:
# test exp: a single epoch with every timing set to zero
timings = dict(mult=True, score=0.0, deepcopy=0.0, grad128=0.0)
out_path = 'sim-results/test'
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings, n_epochs=1)

# Simulating grads per worker
Simulate the bs=512 experiments with different numbers of grads per worker

In [12]:
## Timings for new simulation
SCORE_TIME = 0.0  # scoring cost zeroed out; previous value: 5.39407711148262
DEEPCOPY_TIME = 2 * 1.55e-3  # seconds
GRAD_TIME_128 = 78.32e-3
timings = dict(
    mult=True,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [None]:
# Replay the same recorded run once per grads-per-worker setting.
stats_path = './exp-dask/stats/dec-lr-512bs/exp-final.csv'
for grads_per_worker in (32, 64, 128, 256, 512):
    out_path = 'sim-results/grads_per_worker_{}_test'.format(grads_per_worker)
    run_sim(
        stats_df_loc=stats_path,
        out_folder=out_path,
        timings=timings,
        n_epochs=200,
        grads_per_worker=grads_per_worker,
    )

## Simulations with faster network bandwidth #2 (comment).

In [11]:
## Timings for new simulation
SCORE_TIME = 0.0  # scoring cost zeroed out; previous value: 5.39407711148262
DEEPCOPY_TIME = 0.127488e-3  # seconds
GRAD_TIME_128 = 78.32e-3  # seconds
timings = dict(
    mult=True,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [None]:
# exp 2.1 - dec-lr, P machines, fast
# NOTE(review): 'exp--final.csv' has a double dash, unlike the 'exp-final.csv'
# used by the other experiments -- confirm the file name.
out_path = 'sim-results/dec-lr-mult-machines-fast'
stats_path = './exp-dask/stats/decreasing-lr/exp--final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 2.2 - increasing batch size, P machines
out_path = 'sim-results/inc-bs-mult-machines-fast'
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 2.3 - hybrid 1, P machines
out_path = 'sim-results/hybrid1-mult-machines-fast'
stats_path = './exp-dask/stats/hybrid/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 2.4 - hybrid 2, P machines
out_path = 'sim-results/hybrid2-mult-machines-fast'
stats_path = './exp-dask/stats/hybrid-2/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 11 - large bs 1, P machines (batch size capped at 5120)
out_path = 'sim-results/large-bs1-mult-machines-fast'
stats_path = './exp-dask/stats/large-bs-0/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings, max_bs=5120)

In [None]:
# exp 12 - large bs 2, P machines (batch size capped at 5120)
out_path = 'sim-results/large-bs2-mult-machines-fast'
stats_path = './exp-dask/stats/large-bs-1/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings, max_bs=5120)







## Simulations for 1 machine with normal network bandwidth

In [11]:
## TIMINGS FOR NORMAL EXPERIMENTS
SCORE_TIME = 0.0  # scoring cost zeroed out; previous value: 5.39407711148262
DEEPCOPY_TIME = 0.05855  # seconds
GRAD_TIME_128 = 0.07832  # seconds
timings = dict(
    mult=False,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [None]:
# exp 1 - dec-lr, 1 machine
# NOTE(review): 'exp--final.csv' has a double dash, unlike the 'exp-final.csv'
# used by the other experiments -- confirm the file name.
out_path = 'sim-results/dec-lr-1-machine'
stats_path = './exp-dask/stats/decreasing-lr/exp--final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 5 - large bs 1, 1 machine (batch size capped at 5120)
out_path = 'sim-results/large-bs1-1-machine'
stats_path = './exp-dask/stats/large-bs-0/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings, max_bs=5120)

In [None]:
# exp 2 - inc-bs, 1 machine
out_path = 'sim-results/inc-bs-1-machine'
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 3 - hybrid 1, 1 machine
out_path = 'sim-results/hybrid1-1-machine'
stats_path = './exp-dask/stats/hybrid/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 4 - hybrid 2, 1 machine
out_path = 'sim-results/hybrid2-1-machine'
stats_path = './exp-dask/stats/hybrid-2/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 6 - large bs 2, 1 machine (batch size capped at 5120)
out_path = 'sim-results/large-bs2-1-machine'
stats_path = './exp-dask/stats/large-bs-1/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings, max_bs=5120)

## Simulations for multiple machines with normal network bandwidth

In [16]:
# timings for multi machine set ups
SCORE_TIME = 0.0  # scoring cost zeroed out; previous value: 5.39407711148262
DEEPCOPY_TIME = 0.0  # set internally based on N workers
GRAD_TIME_128 = 0.07832  # seconds
timings = dict(
    mult=True,
    score=SCORE_TIME,
    deepcopy=DEEPCOPY_TIME,
    grad128=GRAD_TIME_128,
)

In [None]:
# exp 11 - large bs 1, P machines (batch size capped at 5120)
out_path = 'sim-results/large-bs1-mult-machines'
stats_path = './exp-dask/stats/large-bs-0/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings, max_bs=5120)

In [None]:
# exp 7 - dec-lr, P machines
# NOTE(review): 'exp--final.csv' has a double dash, unlike the 'exp-final.csv'
# used by the other experiments -- confirm the file name.
out_path = 'sim-results/dec-lr-mult-machines'
stats_path = './exp-dask/stats/decreasing-lr/exp--final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 8 - increasing batch size, P machines
out_path = 'sim-results/inc-bs-mult-machines'
stats_path = './exp-dask/stats/increasing-bs/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 9 - hybrid 1, P machines
out_path = 'sim-results/hybrid1-mult-machines'
stats_path = './exp-dask/stats/hybrid/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 10 - hybrid 2, P machines
out_path = 'sim-results/hybrid2-mult-machines'
stats_path = './exp-dask/stats/hybrid-2/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings)

In [None]:
# exp 12 - large bs 2, P machines (batch size capped at 5120)
out_path = 'sim-results/large-bs2-mult-machines'
stats_path = './exp-dask/stats/large-bs-1/exp-final.csv'
run_sim(stats_df_loc=stats_path, out_folder=out_path, timings=timings, max_bs=5120)