In [None]:
# Make sure we're on ray 1.9
from ray.data.grouped_dataset import GroupedDataset

In [None]:
#tag::start-ray-local[]
import ray
# num_cpus: in theory auto sensed, in practice... eh
# ignore_reinit_error: lets this cell be re-run in a live kernel instead of
# raising because Ray is already initialised.
ray.init(num_cpus=20, ignore_reinit_error=True)
#end::start-ray-local[]

In [None]:
#tag::local_fun[]
def hi():
    """Return a message naming the host and process ID this call ran in."""
    from os import getpid
    from socket import gethostname
    hostname = gethostname()
    pid = getpid()
    return f"Running on {hostname} in pid {pid}"
#end::local_fun[]

In [None]:
# A plain call: runs right here in the notebook's own Python process
hi()

In [None]:
#tag::remote_fun[]
@ray.remote
def remote_hi():
    """Ray task: return a message naming the host and process ID it ran in."""
    # Imported inside the body, so the imports run wherever the task executes.
    from os import getpid
    from socket import gethostname
    return f"Running on {gethostname()} in pid {getpid()}"
# .remote() submits the task and hands back a future instead of the result
future = remote_hi.remote()
# ray.get() waits for the task to finish and returns its value
ray.get(future)
#end::remote_fun[]

In [None]:
#tag::sleepy_task_hello_world[]
import timeit

def slow_task(x):
    """Stand in for expensive work: pause for two seconds, then return ``x`` unchanged."""
    from time import sleep
    sleep(2) # Do something sciency/business
    return x

@ray.remote
def remote_task(x):
    """Ray task wrapper: run ``slow_task`` on ``x`` and return its result."""
    result = slow_task(x)
    return result

things = range(10)

# Both pipelines are lazy: no work starts until the timing lambdas below drain them.
very_slow_result = (slow_task(item) for item in things)
slowish_result = (remote_task.remote(item) for item in things)

# Sequential: each two-second call finishes before the next one begins.
slow_time = timeit.timeit(lambda: list(very_slow_result), number=1)
# Remote: submit every task first, then wait on all of the futures together.
fast_time = timeit.timeit(lambda: ray.get(list(slowish_result)), number=1)
print(f"In sequence {slow_time}, in parallel {fast_time}")
#end::sleepy_task_hello_world[]

In [None]:
# The iterator built for the timing run has already been consumed,
# so launch a fresh batch of tasks and collect their results.
slowish_result = (remote_task.remote(item) for item in things)
ray.get(list(slowish_result))

In [None]:
# Note: if we were on a "real" cluster we'd have to do more magic to install it on all the nodes in the cluster.
!pip install bs4

In [None]:
#tag::mini_crawl_task[]
@ray.remote
def crawl(url, depth=0, maxdepth=1, maxlinks=4):
    """Fetch ``url`` and recursively crawl the pages it links to.

    Every followed link is submitted as a nested Ray task before any of
    their results are awaited.

    Args:
        url: Address of the page to fetch.
        depth: Recursion depth of this call; the seed page is depth 0.
        maxdepth: Links are followed only from pages whose depth is at most
            ``maxdepth``. Pages one level deeper are still fetched, but
            their links are not followed.
        maxlinks: Maximum number of links to follow from any single page.
            The same limit is passed down to every nested crawl.

    Returns:
        A list of ``(url, html_text)`` tuples for this page and every page
        reached from it, or an empty list when ``url`` is not a web link.
    """
    import requests
    from bs4 import BeautifulSoup
    try:
        response = requests.get(url)
    except (requests.exceptions.InvalidSchema, requests.exceptions.MissingSchema):
        return [] # Skip non-web links
    links = [(url, response.text)]
    if depth > maxdepth:
        return links # base case
    soup = BeautifulSoup(response.text, 'html.parser')
    link_futures = []
    for link in soup.find_all('a'):
        # Don't branch too much: we're still in local mode and the web is big
        if len(link_futures) >= maxlinks:
            break
        href = link.get("href")
        if href is None:
            continue # Anchor without a target, nothing to follow
        link_futures.append(
            crawl.remote(href, depth=depth + 1, maxdepth=maxdepth, maxlinks=maxlinks))
    # Wait for all the nested crawls and fold their pages into our own result.
    for child_links in ray.get(link_futures):
        links += child_links
    return links

# Crawl outward from a single seed page and wait for the collected (url, html) pairs.
ray.get(crawl.remote("http://holdenkarau.com/"))
#end::mini_crawl_task[]

In [None]:
#tag::actor[]
@ray.remote
class HelloWorld:
    """Actor that greets callers and remembers how many it has greeted so far."""

    def __init__(self):
        # Running count of greetings handed out by this actor instance.
        self.value = 0

    def greet(self):
        """Bump the counter and return a greeting carrying the new count."""
        self.value = self.value + 1
        return f"Hi user #{self.value}"

# Make an instance of the actor
hello_actor = HelloWorld.remote()

# Call the actor twice; both calls go to the same instance, so its counter
# carries over from the first call to the second
print(ray.get(hello_actor.greet.remote()))
print(ray.get(hello_actor.greet.remote()))
#end::actor[]


In [None]:
#tag::ds[]
# Create a Dataset of URL strings. We could also load these from a text file with ray.data.read_text()
urls = ray.data.from_items([
    "https://github.com/scalingpythonml/scalingpythonml",
    "https://github.com/ray-project/ray"])

def fetch_page(url):
    """Download ``url`` and return the response body as text."""
    import requests
    return requests.get(url).text

# Run fetch_page over every URL in the dataset
pages = urls.map(fetch_page)
# Look at a page to make sure it worked
pages.take(1)
#end::ds[]

In [None]:
#tag::ray_wordcount_on_ds[]
# Break every page into tokens and pair each token with a count of 1.
# NOTE(review): split(" ") only splits on single spaces, so runs of spaces yield
# empty tokens and newlines/tabs stay inside tokens - confirm whether split() was intended.
words = (
    pages
    .flat_map(lambda page: page.split(" "))
    .map(lambda word: (word, 1))
)
# Group the (word, 1) pairs by the word itself.
grouped_words = words.groupby(lambda pair: pair[0])
#end::ray_wordcount_on_ds[]

In [None]:
# Count the rows in each group, i.e. how many times each word occurred
word_counts = grouped_words.count()

In [None]:
# Print some of the resulting rows to eyeball the counts
word_counts.show()