In [1]:
from dask.distributed import Client, LocalCluster

# Making a client
1. The Client satisfies most of the standard concurrent.futures - PEP-3148 interface with .submit, .map functions and Future objects, allowing the immediate and direct submission of tasks.

2. The Client registers itself as the default Dask scheduler, and so runs all dask collections like dask.array, dask.bag, dask.dataframe and dask.delayed

3. The Client has additional methods for manipulating data remotely.


In [2]:
# Client() with no arguments starts a local cluster and connects to it
# (the summary rendered below reports "Cluster type: distributed.LocalCluster").
client = Client()
# Keep `client` as the last expression so the notebook renders its summary:
# dashboard URL, number of workers, total threads and total memory.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:55059,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:55072,Total threads: 2
Dashboard: http://127.0.0.1:55074/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55065,
Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-xodp2la6,Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-xodp2la6

0,1
Comm: tcp://127.0.0.1:55071,Total threads: 2
Dashboard: http://127.0.0.1:55076/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55064,
Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-77aevlpw,Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-77aevlpw

0,1
Comm: tcp://127.0.0.1:55073,Total threads: 2
Dashboard: http://127.0.0.1:55077/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55062,
Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-bl60b2o6,Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-bl60b2o6

0,1
Comm: tcp://127.0.0.1:55070,Total threads: 2
Dashboard: http://127.0.0.1:55075/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:55063,
Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-kwk0hg1h,Local directory: /Users/prajualpillai/Desktop/prajual/Personal_git/Dask/dask-worker-space/worker-kwk0hg1h


## 1. Dask Array

In [24]:
import dask.array as da
# Two lazy 3-D arrays of random floats; each is split into 1000 x 1000 x 5
# blocks so that no single task has to hold a whole array in memory.
x = da.random.random((10000,10000,10), chunks=(1000,1000,5))
y = da.random.random((10000,10000,10), chunks=(1000,1000,5))

# Element-wise arcsin of both inputs, then reduce over the last two axes,
# leaving one value per index along axis 0 (shape (10000,)).
z = (da.arcsin(x)+da.arcsin(y)).sum(axis=(1,2))

# persist() returns a *new* collection backed by results held on the cluster;
# it does not modify `z` in place. Rebind `z` so that later cells reuse the
# persisted chunks instead of recomputing the whole graph from scratch.
z = z.persist()
z

Unnamed: 0,Array,Chunk
Bytes,78.12 kiB,7.81 kiB
Shape,"(10000,)","(1000,)"
Count,10 Tasks,10 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 78.12 kiB 7.81 kiB Shape (10000,) (1000,) Count 10 Tasks 10 Chunks Type float64 numpy.ndarray",10000  1,

Unnamed: 0,Array,Chunk
Bytes,78.12 kiB,7.81 kiB
Shape,"(10000,)","(1000,)"
Count,10 Tasks,10 Chunks
Type,float64,numpy.ndarray


Dask is very inefficient when doing very small computations, so it is best practice to keep the chunk size relatively large, i.e. to use fewer, larger chunks.

In [20]:
%%time
x = da.random.random(10_000_000, chunks=(100000,))
x.sum().compute()

CPU times: user 133 ms, sys: 33.5 ms, total: 167 ms
Wall time: 206 ms


5000107.910778593

## Dataframe operations

In [23]:
import os
from pathlib import Path

import dask.dataframe as dd

# Directory holding the AMEX default-prediction CSV files. Set the
# AMEX_DATA_DIR environment variable to run this notebook on another machine;
# the fallback is the original author's local checkout, so existing behaviour
# is unchanged when the variable is not set.
AMEX_DATA_DIR = Path(
    os.environ.get(
        "AMEX_DATA_DIR",
        "/Users/prajualpillai/Desktop/prajual/Personal_git/amex_default_prediction/amex-default-prediction",
    )
)
# Kept as a plain string so `path` has the same type and default value as before.
path = str(AMEX_DATA_DIR / "train_data.csv")

# Build the Dask dataframe from the CSV and preview its first rows.
ddf = dd.read_csv(path)
ddf.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827
