In [17]:
from dask.distributed import Client, progress, protocol
from distributed.protocol import serialize,deserialize
from time import time

# Connect to the already-running Dask scheduler. The bare `client` expression
# at the end of the cell makes the notebook render the cluster summary.
SCHEDULER_ADDRESS = "tcp://131.180.106.138:8786"
client = Client(SCHEDULER_ADDRESS)
client

0,1
Client  Scheduler: tcp://131.180.106.138:8786,Cluster  Workers: 1  Cores: 40  Memory: 101.01 GB


In [18]:
def run_on_worker(data):
    """Report the type of the received object and return its length.

    Intended to be submitted to the cluster (see the ``client.submit`` calls
    in this notebook), so the ``print`` runs wherever the task executes.

    Args:
        data: Any object supporting ``len()``.

    Returns:
        dict: ``{'length': len(data)}``.
    """
    received_type = type(data)
    print("Received data of type: ", received_type)
    item_count = len(data)
    return {'length': item_count}

In [19]:
# Our dataset - a mock CSV file with ~4.4 million rows.
# Report its on-disk size, converted from bytes to MB (1 MB = 1024 * 1024 bytes here).
import os

BYTES_PER_MB = 1024 * 1024
dataset_bytes = os.path.getsize('test.csv')
print("Size of dataset in MB: ", dataset_bytes / BYTES_PER_MB)

Size of dataset in MB:  523.6366100311279


### Using Arrow serialization:

In [20]:
from pyarrow import csv

from time import perf_counter

# Benchmark: read the CSV into an Arrow table, scatter it to the cluster, then
# run a task against the scattered copy.
# Intervals use perf_counter(): it is monotonic and high-resolution, whereas
# time() follows the wall clock, which can be adjusted while a measurement runs.
t0 = perf_counter()
data_to_scatter = csv.read_csv("test.csv")
print("Time to read csv: ", perf_counter() - t0)

t1 = perf_counter()
scattered_data = client.scatter(data_to_scatter)
print("Time to scatter: ", perf_counter() - t1)

# Pass the scattered future to the worker function and wait for its result.
x = client.submit(run_on_worker, scattered_data)
ret = x.result()
print("Received from workers: ", ret)

Time to read csv:  0.6191794872283936
Time to scatter:  6.664686918258667
Received from workers:  {'length': 4400001}


In [21]:
print("Serialized data:")
print(serialize(data_to_scatter))

# Measure time for serialization+deserialization roundtrip
%time roundtrip = deserialize(*serialize(data_to_scatter))
roundtrip == data_to_scatter

Serialized data:
({'type': 'pyarrow.lib.Table', 'type-serialized': b'\x80\x04\x95\x19\x00\x00\x00\x00\x00\x00\x00\x8c\x0bpyarrow.lib\x94\x8c\x05Table\x94\x93\x94.', 'serializer': 'dask'}, [<pyarrow.lib.Buffer object at 0x7f0d3989cdc0>])
CPU times: user 230 ms, sys: 156 ms, total: 386 ms
Wall time: 384 ms


True

### Using Dask's default serialization for DataFrames (pickle):

In [22]:
import pandas as pd

from time import perf_counter

# Benchmark: read the CSV into a pandas DataFrame, scatter it to the cluster,
# then run a task against the scattered copy.
# Intervals use perf_counter(): it is monotonic and high-resolution, whereas
# time() follows the wall clock, which can be adjusted while a measurement runs.
t0 = perf_counter()
data_to_scatter = pd.read_csv("test.csv")
print("Time to read csv: ", perf_counter() - t0)

t1 = perf_counter()
scattered_data = client.scatter(data_to_scatter)
print("Time to scatter: ", perf_counter() - t1)

# Pass the scattered future to the worker function and wait for its result.
x = client.submit(run_on_worker, scattered_data)
ret = x.result()
print("Received from workers: ", ret)

Time to read csv:  8.168189525604248
Time to scatter:  9.629390716552734
Received from workers:  {'length': 4400001}


In [23]:
print("Serialized data:")
print(serialize(data_to_scatter)[0])

# Measure time for serialization+deserialization roundtrip

%time roundtrip = deserialize(*serialize(data_to_scatter))
roundtrip == data_to_scatter

Serialized data:
{'serializer': 'pickle'}
CPU times: user 2.94 s, sys: 1.46 s, total: 4.41 s
Wall time: 4.24 s


Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4399996,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4399997,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4399998,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4399999,True,True,True,True,True,True,True,True,True,True,True,True,True,True
