In [1]:
import dask
from dask_kubernetes import KubeCluster
import numpy as np

In [2]:
# Specify a remote deployment using a load balancer: set the dask-kubernetes
# "scheduler-service-type" config key to "LoadBalancer".
dask.config.set({"kubernetes.scheduler-service-type": "LoadBalancer"})

<dask.config.set at 0x7f4f38734c40>

In [3]:
# Build the cluster from the worker spec in worker-spec.yaml, in the 'dask'
# namespace, using deploy_mode='remote'. The recorded run reports that the
# scheduler pod is created on the cluster, which may take some time.
cluster = KubeCluster.from_yaml('worker-spec.yaml', namespace='dask', deploy_mode='remote')

Creating scheduler pod on cluster. This may take some time.


In [4]:
# Turn on adaptive scaling, bounded to between 1 and 100 workers.
cluster.adapt(minimum=1, maximum=100)

distributed.deploy.adaptive - INFO - Adaptive scaling started: minimum=1 maximum=100


<distributed.deploy.adaptive.Adaptive at 0x7f4eba80f760>

In [5]:
# Example usage
from dask.distributed import Client
import dask.array as da

# Connect Dask to the cluster
client = Client(cluster)


In [6]:
# Show the handshake info of the client's scheduler connection; in the
# recorded run it lists the compression, Python version and pickle protocol.
client.scheduler_comm.comm.handshake_info()

{'compression': 'lz4', 'python': (3, 8, 0), 'pickle-protocol': 5}

In [7]:
# Create a large array of ones (1000 x 1000 x 1000) and calculate the mean
array = da.ones((1000, 1000, 1000))
print(array.mean().compute())  # Should print 1.0

1.0


In [8]:
print(array.mean().compute())

1.0


In [9]:
print(array.sum().compute())

1000000000.0


In [10]:
dir(array)

['A',
 'T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_function__',
 '__array_priority__',
 '__array_ufunc__',
 '__await__',
 '__bool__',
 '__class__',
 '__complex__',
 '__dask_graph__',
 '__dask_keys__',
 '__dask_layers__',
 '__dask_optimize__',
 '__dask_postcompute__',
 '__dask_postpersist__',
 '__dask_scheduler__',
 '__dask_tokenize__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__div__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__le__',
 '__len__',
 '__long__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rlshift__',
 '__rmatmul__',
 '__rmod__',


In [11]:
# Select indices 0 and 10 along the first axis and sum the result.
# The axis is spelled out on purpose: plain NumPy's np.take flattens the input
# when `axis` is omitted, whereas this call on a dask array picked whole
# slices along axis 0 in the recorded run (2 * 1000 * 1000 ones -> 2000000.0).
np.take(array, indices=[0, 10], axis=0).sum().compute()

2000000.0

In [12]:
from time import sleep

def inc(x):
    """Return ``x + 1`` after pausing for one second.

    The one-second ``sleep`` makes each call slow enough that the difference
    between sequential and delayed execution shows up in the timings.
    """
    sleep(1)
    incremented = x + 1
    return incremented

def add(x, y):
    """Return ``x + y`` after pausing for one second."""
    sleep(1)
    total_of_pair = x + y
    return total_of_pair

In [13]:
%%time
# This takes three seconds to run because we call each
# function sequentially, one after the other

x = inc(1)
y = inc(2)
z = add(x, y)

distributed.deploy.adaptive - INFO - Retiring workers [1, 2, 3, 4, 5, 6]


CPU times: user 14.3 ms, sys: 695 µs, total: 15 ms
Wall time: 3 s


In [14]:
from dask import delayed


In [15]:
%%time
# This runs immediately, all it does is build a graph

x = delayed(inc)(1)
y = delayed(inc)(2)
z = delayed(add)(x, y)

CPU times: user 837 µs, sys: 0 ns, total: 837 µs
Wall time: 620 µs


In [16]:
%%time
z.compute()

CPU times: user 6.51 ms, sys: 746 µs, total: 7.25 ms
Wall time: 2.04 s


5

In [17]:
data = range(1,100)

In [18]:
# Collect one lazy inc() call per input value.
results = []

for x in data:
    y = delayed(inc)(x)
    results.append(y)

# The reduction is wrapped with delayed() too, so `total` is still a Delayed
# object until .compute() is called (see the "Before computing" output).
total = delayed(sum)(results)
print("Before computing:", total)  # Let's see what type of thing total is
result = total.compute()
print("After computing :", result)  # After it's computed

Before computing: Delayed('sum-88cd7034-ba93-466a-8cc4-20e3e217b918')


distributed.deploy.adaptive - INFO - Retiring workers [6]
distributed.deploy.adaptive - INFO - Retiring workers [7]
distributed.deploy.adaptive - INFO - Retiring workers [8]
distributed.deploy.adaptive - INFO - Retiring workers [9]
distributed.deploy.adaptive - INFO - Retiring workers [10, 11]
distributed.deploy.adaptive - INFO - Retiring workers [12]
distributed.deploy.adaptive - INFO - Retiring workers [13, 14]
distributed.deploy.adaptive - INFO - Retiring workers [15]
distributed.deploy.adaptive - INFO - Retiring workers [16, 17]
distributed.deploy.adaptive - INFO - Retiring workers [18]
distributed.deploy.adaptive - INFO - Retiring workers [22]


After computing : 5049


In [19]:
total.compute()

distributed.deploy.adaptive - INFO - Retiring workers [32, 33]
distributed.deploy.adaptive - INFO - Retiring workers [34, 35]
distributed.deploy.adaptive - INFO - Retiring workers [36, 37]
distributed.deploy.adaptive - INFO - Retiring workers [24, 38]
distributed.deploy.adaptive - INFO - Retiring workers [25, 26, 27]
distributed.deploy.adaptive - INFO - Retiring workers [28, 29]


5049

In [20]:
def double(x):
    """Return ``2 * x`` after pausing for one second."""
    sleep(1)
    doubled = 2 * x
    return doubled

def is_even(x):
    """Return ``True`` when ``x`` leaves no remainder on division by 2."""
    remainder = x % 2
    return not remainder

In [21]:
%%time
results = []
for x in data:
    # The even/odd decision is made eagerly while building the graph;
    # only the chosen function call is deferred: double() for even inputs,
    # inc() for odd inputs.
    y = delayed(double if is_even(x) else inc)(x)
    results.append(y)

total = delayed(sum)(results)
total.compute()

distributed.deploy.adaptive - INFO - Retiring workers [38, 39]
distributed.deploy.adaptive - INFO - Retiring workers [40, 41, 42]
distributed.deploy.adaptive - INFO - Retiring workers [43, 44]
distributed.deploy.adaptive - INFO - Retiring workers [45, 46, 47]
distributed.deploy.adaptive - INFO - Retiring workers [48, 30]


CPU times: user 97.8 ms, sys: 4.06 ms, total: 102 ms
Wall time: 9.18 s


7450

In [22]:
%%time
def compute(x):
    """Apply ``double`` to even inputs and ``inc`` to odd inputs.

    Defined once, outside the loop: the definition does not depend on the
    loop variable, so re-creating it on every iteration was unnecessary.
    Here the even/odd decision is made inside the task, when it runs.
    """
    if is_even(x):  # even
        return double(x)
    return inc(x)  # odd


results = []
for x in data:
    y = delayed(compute)(x)
    results.append(y)

total = delayed(sum)(results)
total.compute()

distributed.deploy.adaptive - INFO - Retiring workers [48, 49]
distributed.deploy.adaptive - INFO - Retiring workers [50, 51, 52]
distributed.deploy.adaptive - INFO - Retiring workers [53, 54]
distributed.deploy.adaptive - INFO - Retiring workers [56, 57, 55]
distributed.deploy.adaptive - INFO - Retiring workers [58, 59]


CPU times: user 96.8 ms, sys: 2.86 ms, total: 99.7 ms
Wall time: 9.22 s


7450

In [23]:
# Draw the task graph. Per the error recorded below, this requires both the
# `graphviz` python library and the `graphviz` system library; the recorded
# run raised RuntimeError.
total.visualize()

RuntimeError: Drawing dask graphs requires the `graphviz` python library and the `graphviz` system library to be installed.

In [24]:
from dask import compute


In [25]:
import dask.bag as db
b = db.from_sequence(range(1,100))

In [26]:
def iseven(x):
    """Return ``True`` when ``x`` is divisible by 2.

    Used as the key function for the bag ``foldby`` / ``groupby`` calls.
    """
    return x % 2 == 0

In [27]:
# NOTE: this rebinds the name `add` that an earlier cell defined with a
# one-second sleep; from here on `add` returns immediately.
def add(x, y):
    """Return ``x + y``; the binary operator passed to ``foldby``."""
    return x + y

In [28]:
dict(b.foldby(iseven, add))

{False: 2500, True: 2450}

In [29]:
b.foldby(iseven, add)

dask.bag<foldby-b, npartitions=1>

In [30]:
b.foldby(iseven, add).compute()

[(False, 2500), (True, 2450)]

In [31]:
grouped = b.groupby(iseven)

In [58]:
badsum = grouped.map(lambda kv: (kv[0], sum(kv[1])))

In [41]:
f = client.scatter(badsum)

In [42]:
f

In [45]:
# Note I killed the worker in between
f

In [59]:
f = client.scatter(badsum)

AttributeError: 'NoneType' object has no attribute 'scatter'

In [51]:
f

In [49]:
dict(f.result())

{False: 2500, True: 2450}

In [55]:
# introduced network failure
f.result()

CancelledError: Bag-e76d6bcb53e9b16eb366eaf56753f911

In [56]:
f

In [34]:
from bokeh.io import output_notebook, push_notebook
from bokeh.models.sources import ColumnDataSource
from bokeh.plotting import figure, show
import numpy as np
output_notebook()

# set up plot background: evaluate the objective on an N x N grid
# spanning [-5, 5] in both directions
N = 500
x = np.linspace(-5, 5, N)
y = np.linspace(-5, 5, N)
xx, yy = np.meshgrid(x, y)
# Same expression as the score computed by rosenbrock(), applied to the grid;
# the log is taken before the values are drawn as an image.
d = (1 - xx)**2 + 2 * (yy - xx**2)**2
d = np.log(d)

# Figure covering the same range as the grid; `p` is reused by the later
# optimisation cell. The trailing `;` suppresses the cell's output repr.
p = figure(x_range=(-5, 5), y_range=(-5, 5))
p.image(image=[d], x=-5, y=-5, dw=10, dh=10, palette="Spectral11");

In [35]:
c = client
# a simple function with interesting minima
import time

def rosenbrock(point):
    """Score ``point`` with the rosenbrock function.

    ``point`` is indexed as ``point[0]`` and ``point[1]``. A 0.1 second pause
    precedes the calculation. Returns the tuple ``(point, score)`` so the
    caller can match each score to its input.
    """
    time.sleep(0.1)
    first = point[0]
    second = point[1]
    offset_term = (1 - first)**2
    valley_term = 2 * (second - first**2)**2
    score = offset_term + valley_term
    return point, score

In [36]:
from dask.distributed import as_completed
from random import uniform

scale = 5                  # Initial random perturbation scale
best_point = (0, 0)        # Initial guess
best_score = float('inf')  # Best score so far
startx = [uniform(-scale, scale) for _ in range(10)]
starty = [uniform(-scale, scale) for _ in range(10)]

# set up plot
source = ColumnDataSource({'x': startx, 'y': starty, 'c': ['grey'] * 10})
p.circle(source=source, x='x', y='y', color='c')
t = show(p, notebook_handle=True)

# initial 10 random points
futures = [c.submit(rosenbrock, (x, y)) for x, y in zip(startx, starty)]
iterator = as_completed(futures)

# TODO(holden): non-blocking?
for res in iterator:
    # take a completed point, is it an improvement?
    point, score = res.result()
    if score < best_score:
        best_score, best_point = score, point
        print(score, point)

    # perturb the best point seen so far to get the next candidate
    x, y = best_point
    newx, newy = (x + uniform(-scale, scale), y + uniform(-scale, scale))

    # update plot
    source.stream({'x': [newx], 'y': [newy], 'c': ['grey']}, rollover=20)
    push_notebook(document=t)

    # add new point, dynamically, to work on the cluster
    new_point = c.submit(rosenbrock, (newx, newy))
    iterator.add(new_point)  # Start tracking new task as well

    # Narrow search and consider stopping
    scale *= 0.99
    if scale < 0.001:
        break

# Show the best point found. `point` is only the most recently completed
# candidate, which need not be the best one, and it is undefined if no
# future ever completes.
best_point

274.9968983831787 (-3.4867626132507934, 0.8688940387929129)
79.3181876216135 (-1.5771031276959055, -3.5408767405583386)
48.924487263212505 (1.125739911854228, -3.6778424467386115)
30.91761466597759 (-0.46507024218626647, -3.5765439564473613)
16.194367016853636 (-1.5767571355373242, 0.30044925871571504)
15.570869280246132 (0.22295295116176383, -2.6858967743095215)
3.858299566386609 (-0.9414776958116242, 1.0972877231912648)
1.9027530006092823 (1.1216676324503725, 0.28655452220734823)
1.2357323461215586 (1.523088161324638, 3.0133792372382953)
0.21964675807654302 (1.0498205374468363, 1.431641500338622)
0.14538148603176626 (0.6397415881970008, 0.3209648185907157)
0.05146953430945986 (1.1441835224489465, 1.4330119909570365)
0.01963222152816608 (0.9190099427459704, 0.9254273798349462)
0.014427074938178994 (0.9932950595216985, 0.9018349611448154)
0.0011627889627663522 (1.012223184703376, 1.0471055814973145)
0.0009948908303475353 (1.0200906386469326, 1.023391059405376)
0.0007619015784544736 (1.

(0.9989807388737378, 1.0003551107165987)

In [37]:
dir(c)

['__aenter__',
 '__aexit__',
 '__await__',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_asynchronous',
 '_cancel',
 '_close',
 '_connecting_to_scheduler',
 '_dec_ref',
 '_deserializers',
 '_ensure_connected',
 '_expand_key',
 '_expand_resources',
 '_expand_retries',
 '_gather',
 '_gather_future',
 '_gather_keys',
 '_gather_remote',
 '_gather_semaphore',
 '_get_dataset',
 '_get_futures_error',
 '_get_scheduler_info',
 '_get_task_stream',
 '_get_versions',
 '_graph_to_futures',
 '_handle_cancelled_key',
 '_handle_error',
 '_handle_key_in_memory',
 '_handle_lost_data',
 '_handle_report',
 '_handle_restart',
 '_handle_retried_key

In [38]:
help(c)

Help on Client in module distributed.client object:

class Client(builtins.object)
 |  Client(address=None, loop=None, timeout='__no_default__', set_as_default=True, scheduler_file=None, security=None, asynchronous=False, name=None, heartbeat_interval=None, serializers=None, deserializers=None, extensions=[<class 'distributed.pubsub.PubSubClientExtension'>], direct_to_workers=None, connection_limit=512, **kwargs)
 |  
 |  Connect to and submit computation to a Dask cluster
 |  
 |  The Client connects users to a Dask cluster.  It provides an asynchronous
 |  user interface around functions and futures.  This class resembles
 |  executors in ``concurrent.futures`` but also allows ``Future`` objects
 |  within ``submit/map`` calls.  When a Client is instantiated it takes over
 |  all ``dask.compute`` and ``dask.persist`` calls by default.
 |  
 |  It is also common to create a Client without specifying the scheduler
 |  address , like ``Client()``.  In this case the Client creates a
 |  

In [39]:
c.list_datasets()

()

In [60]:
client

  client._reconnect()()


TypeError: 'coroutine' object is not callable

In [63]:
f