In [44]:
from dask.distributed import Client, progress, get_worker
import os
import binascii

# Replace with IP address of the dask scheduler
client = Client("tcp://131.180.106.138:8786") 

In [58]:
#Constants
BATCH_SIZES = [100,500,1000,1500,2000]
XCLBIN_PATH = "a.xclbin"
PLATFORM = "alveo"

In [46]:
#Download the CIFAR 10 dataset 
!wget https://raw.githubusercontent.com/modestyachts/CIFAR-10.1/master/datasets/cifar10.1_v4_data.npy

--2020-11-01 17:23:59--  https://raw.githubusercontent.com/modestyachts/CIFAR-10.1/master/datasets/cifar10.1_v4_data.npy
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.36.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.36.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6208640 (5.9M) [application/octet-stream]
Saving to: ‘cifar10.1_v4_data.npy.10’


2020-11-01 17:24:00 (51.7 MB/s) - ‘cifar10.1_v4_data.npy.10’ saved [6208640/6208640]



In [54]:
def run_on_worker(ibuf_normal, index):
    from multiprocessing import Process,Queue
    import numpy as np
    import time
    
    def forked_process(queue, ibuf_normal):
        from driver import FINNAccelDriver
        from pynq.ps import Clocks
        finnDriver = FINNAccelDriver(len(ibuf_normal), XCLBIN_PATH, PLATFORM)
        ibuf_folded = finnDriver.fold_input(ibuf_normal)
        ibuf_packed = finnDriver.pack_input(ibuf_folded)
        finnDriver.copy_input_data_to_device(ibuf_packed)
        finnDriver.execute()
        obuf_packed = np.empty_like(finnDriver.obuf_packed_device)
        finnDriver.copy_output_data_from_device(obuf_packed)
        obuf_folded = finnDriver.unpack_output(obuf_packed)
        obuf_normal = finnDriver.unfold_output(obuf_folded)
        
        if PLATFORM != "alveo":
            fclk_mhz = Clocks.fclk0_mhz
        else:
            fclk_mhz = finnDriver.fclk_mhz
        queue.put((obuf_normal, fclk_mhz))
    
    
    # We need to run the Pynq overlay in a new forked process since it cannot be run in a non-Main thread    
    t0 = time.time()
    queue = Queue()
    p = Process(target=forked_process, args=(queue, ibuf_normal))
    p.start()
    result, fclk_mhz = queue.get()
    p.join()
    t1 = time.time()
    print("EXECUTION TIME ON THIS WORKER (s): ", t1 - t0)
    return {
        'data': result,
        'time': t1 - t0,
        'index': index,
        'fclk_mhz': fclk_mhz
    }

In [59]:
import time
import numpy as np
import json

num_of_workers = len(client.scheduler_info()["workers"])
full_cifar = np.load('cifar10.1_v4_data.npy')


for BATCH_SIZE in BATCH_SIZES:
    print("BATCH_SIZE:", BATCH_SIZE)
    partial_cifar = full_cifar[:BATCH_SIZE]
    t0 = time.time()
    
    # Split up the file into equal sized chunks based on number of available dask workers
    data_split = []
    start = 0
    chunk_size = int(len(partial_cifar)/num_of_workers)
    for i in range(num_of_workers - 1):
        data_split.append(partial_cifar[start: start+chunk_size])
        start += chunk_size
    data_split.append(partial_cifar[start:]) #Last partition

    # Scatter the data to the workers before calling run_on_worker on the workers
    distributed_data = client.scatter(data_split)
    futures = client.map(run_on_worker, distributed_data, range(num_of_workers))
    results = client.gather(futures)
    print("Received data from workers.")

    # Reorder the response based on original input order
    results.sort(key = lambda result: result['index'])  

    # Concatenate the result where each is an ndarray of the shape (BATCH_SIZE/num_of_workers, 1)
    merged_result = np.concatenate([r['data'] for r in results]) # FINAL RESULTS (CLASS LABELS)
    
    t1 = time.time()
    runtime = t1 - t0
    res = {}
    res["runtime[ms]"] = runtime*1000
    res["throughput[images/s]"] = BATCH_SIZE / runtime
    res["DRAM_in_bandwidth[Mb/s]"] = np.prod((BATCH_SIZE, 32, 32, 1, 3))*0.000001 / runtime
    res["DRAM_out_bandwidth[Mb/s]"] = np.prod((BATCH_SIZE, 1, 1))*0.000001 / runtime
    res["fclk[mhz]"] = results[0]['fclk_mhz']
    res["N"] = BATCH_SIZE
    print(res)
    print("**************************")    

BATCH_SIZE: 100
Received data from workers.
{'runtime[ms]': 17267.237186431885, 'throughput[images/s]': 5.791314436717022, 'DRAM_in_bandwidth[Mb/s]': 0.01779091794959469, 'DRAM_out_bandwidth[Mb/s]': 5.791314436717022e-06, 'fclk[mhz]': 100.0, 'N': 100}
**************************
