# Getting Started with Ray Core

For a book on distributed Python, it's not without a certain irony that Python on its own is
largely ineffective for distributed computing.
Its interpreter is effectively single-threaded, which makes it difficult to, for example, leverage multiple CPUs on
the same machine, let alone a whole cluster of machines, using plain Python.
That means you need extra tooling, and luckily the Python ecosystem has some options for you.
For instance, libraries like `multiprocessing` can help you distribute work on a single machine, but not beyond.

In [1]:
import ray
# Start Ray for this session; in the recorded run this launched a local
# instance and logged the dashboard address.
ray.init()

2022-10-06 17:10:40,049	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265


0,1
Python version:,3.9.13
Ray version:,2.0.0
Dashboard:,http://127.0.0.1:8265


In [2]:
# tag::retrieve[]
import time

database = [  # <1>
    "Learning",
    "Ray",
    "Flexible",
    "Distributed",
    "Python",
    "for",
    "Data",
    "Science",
]


def retrieve(item):
    """Look up entry ``item`` of ``database`` after an index-dependent delay.

    Sleeps for ``item / 10`` seconds first, then returns the pair
    ``(item, database[item])``.
    """
    delay_seconds = item / 10.
    time.sleep(delay_seconds)  # <2>
    record = database[item]
    return item, record
# end::retrieve[]

In [3]:
# tag::duration[]
def print_runtime(input_data, start_time, decimals=1):
    """Print the seconds elapsed since *start_time*, then each data item.

    Args:
        input_data: Iterable of items; each one is printed on its own line.
        start_time: A ``time.time()`` timestamp taken when the work started.
        decimals: Number of decimal places shown for the elapsed seconds.
    """
    elapsed = time.time() - start_time
    print(f'Runtime: {elapsed:.{decimals}f} seconds, data:')
    # Joining keeps the "one item per line" layout and still emits a single
    # empty line when there are no items at all.
    print("\n".join(str(entry) for entry in input_data))

# Baseline: fetch all eight items one after another and time the whole run.
start = time.time()
data = list(map(retrieve, range(8)))  # <1>
print_runtime(data, start)  # <2>
# end::duration[]

Runtime: 2.8 seconds, data:
(0, 'Learning')
(1, 'Ray')
(2, 'Flexible')
(3, 'Distributed')
(4, 'Python')
(5, 'for')
(6, 'Data')
(7, 'Science')


In [1]:
def decorator(func):
    """Wrap *func* so a message is printed before and after every call.

    The wrapper forwards any positional and keyword arguments to *func* and
    hands back whatever *func* returns, so decorating a function does not
    change how it is called or what it produces.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that prints "before f", calls *func*, prints "after f" and
        returns the result of *func*.
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def ins_func(*args, **kwargs):
        print("before f")
        result = func(*args, **kwargs)
        print("after f")
        return result
    return ins_func

# NOTE(review): this rebinds the name ``sum`` at module level, hiding the
# builtin of the same name for the cells that follow.
@decorator
def sum():
    """Print the value of 1 + 1 (a tiny function to show ``decorator`` at work)."""
    two = 1 + 1
    print(two)


sum()

before f
2
after f


In [4]:
# tag::remote[]
@ray.remote  # <1>
def retrieve_task(item):
    """Ray task that passes *item* to ``retrieve`` and returns its result."""
    outcome = retrieve(item)  # <2>
    return outcome
# end::remote[]

In [11]:
# Calling .remote() yields an ObjectRef (displayed below) rather than the
# (item, value) tuple itself.
f = retrieve_task.remote(0)
f

ObjectRef(8849b62d89cb30f9ffffffffffffffffffffffff0100000001000000)

In [6]:
# tag::duration_remote[]
start = time.time()
data_references = []  # <1>
for item in range(8):
    # Each call only hands back a reference; results are collected below.
    data_references.append(retrieve_task.remote(item))
data = ray.get(data_references)  # <2>
print_runtime(data, start, 2)
# end::duration_remote[]

Runtime: 0.81 seconds, data:
(0, 'Learning')
(1, 'Ray')
(2, 'Flexible')
(3, 'Distributed')
(4, 'Python')
(5, 'for')
(6, 'Data')
(7, 'Science')


In [7]:
data_references

[ObjectRef(c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000),
 ObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000),
 ObjectRef(c2668a65bda616c1ffffffffffffffffffffffff0100000001000000),
 ObjectRef(32d950ec0ccf9d2affffffffffffffffffffffff0100000001000000),
 ObjectRef(e0dc174c83599034ffffffffffffffffffffffff0100000001000000),
 ObjectRef(f4402ec78d3a2607ffffffffffffffffffffffff0100000001000000),
 ObjectRef(f91b78d7db9a6593ffffffffffffffffffffffff0100000001000000),
 ObjectRef(82891771158d68c1ffffffffffffffffffffffff0100000001000000)]

In [8]:
database

['Learning',
 'Ray',
 'Flexible',
 'Distributed',
 'Python',
 'for',
 'Data',
 'Science']

In [9]:
# tag::object_store[]
# Hand the database list to Ray via ray.put and keep the returned ObjectRef,
# which the remote tasks below resolve with ray.get.
database_object_ref = ray.put(database)  # <1>

In [10]:
database_object_ref

ObjectRef(00ffffffffffffffffffffffffffffffffffffff0100000001000000)

In [12]:


@ray.remote
def retrieve_task(item):
    """Ray task returning ``(item, value)`` using the database behind ``database_object_ref``.

    Resolves the shared reference first, then sleeps ``item / 10`` seconds
    before looking up the entry.
    """
    shared_database = ray.get(database_object_ref)  # <2>
    delay_seconds = item / 10.
    time.sleep(delay_seconds)
    value = shared_database[item]
    return item, value
# end::object_store[]

In [15]:
# ray.get turns the reference into the task's actual return value (shown below).
f = retrieve_task.remote(0)
ray.get(f)

(0, 'Learning')

In [7]:
database_object_ref

ObjectRef(00ffffffffffffffffffffffffffffffffffffff0100000001000000)

In [8]:
# tag::duration_object_store[]
start = time.time()
data_references = [retrieve_task.remote(item) for item in range(8)]
all_data = []

# Drain the pending references in batches of (up to) two, printing each batch
# as soon as it is available.
while data_references:  # <1>
    finished, data_references = ray.wait(  # <2>
        data_references,
        # Never ask for more results than are still pending: ray.wait rejects
        # a num_returns larger than the number of refs it is given, which
        # would otherwise break the last batch for an odd number of tasks.
        num_returns=min(2, len(data_references)),
        timeout=7.0,
    )
    data = ray.get(finished)
    print_runtime(data, start, 3)  # <3>
    all_data.extend(data)  # <4>

# end::duration_object_store[]
print_runtime(all_data, start)

Runtime: 0.137 seconds, data:
(0, 'Learning')
(1, 'Ray')
Runtime: 0.337 seconds, data:
(2, 'Flexible')
(3, 'Distributed')
Runtime: 0.538 seconds, data:
(4, 'Python')
(5, 'for')
Runtime: 0.740 seconds, data:
(6, 'Data')
(7, 'Science')
Runtime: 0.7 seconds, data:
(0, 'Learning')
(1, 'Ray')
(2, 'Flexible')
(3, 'Distributed')
(4, 'Python')
(5, 'for')
(6, 'Data')
(7, 'Science')


In [9]:
# tag::task_dependency[]
@ray.remote
def follow_up_task(retrieve_result):  # <1>
    """Ray task that retrieves the item following the one in *retrieve_result*.

    *retrieve_result* is unpacked as a two-element pair whose first element is
    the original item index. Returns ``(retrieve_result, follow_up_result)``.
    """
    original_item, _unused_value = retrieve_result
    next_item = original_item + 1
    follow_up_result = retrieve(next_item)  # <2>
    return retrieve_result, follow_up_result  # <3>


retrieve_refs = [retrieve_task.remote(item) for item in [0, 2, 4, 6]]
follow_up_refs = [follow_up_task.remote(ref) for ref in retrieve_refs]  # <4>

# Fetch the results once and print them with a plain loop; a comprehension
# used only for print() would just build a throwaway list of None.
result = ray.get(follow_up_refs)
for pair in result:
    print(pair)
# end::task_dependency[]

((0, 'Learning'), (1, 'Ray'))
((2, 'Flexible'), (3, 'Distributed'))
((4, 'Python'), (5, 'for'))
((6, 'Data'), (7, 'Science'))


In [16]:
# tag::actors[]
@ray.remote  # <1>
class DataTracker:
    """Actor that keeps a running count, starting at zero."""

    def __init__(self):
        self._counts = 0

    def increment(self):
        """Raise the stored count by one."""
        self._counts = self._counts + 1

    def counts(self):
        """Return the current count."""
        current = self._counts
        return current
# end::actors[]

In [17]:
# tag::actors_remote[]
@ray.remote
def retrieve_tracker_task(item, tracker):  # <1>
    """Ray task that retrieves *item* and records the access on *tracker*.

    Args:
        item: Index into the database stored behind ``database_object_ref``.
        tracker: Actor handle exposing an ``increment`` method.

    Returns:
        The pair ``(item, value)`` for the requested index.
    """
    obj_store_data = ray.get(database_object_ref)
    time.sleep(item / 10.)
    # Wait for the actor to apply the increment before this task finishes.
    # A fire-and-forget call could still be in flight when the driver later
    # asks the actor for its count, because calls coming from different
    # callers are not ordered relative to each other.
    ray.get(tracker.increment.remote())  # <2>
    return item, obj_store_data[item]


tracker = DataTracker.remote()  # <3>

data_references = []  # <4>
for item in range(8):
    # Every task receives the same actor handle.
    data_references.append(retrieve_tracker_task.remote(item, tracker))
data = ray.get(data_references)
total_count = ray.get(tracker.counts.remote())
print(total_count)  # <5>
# end::actors_remote[]

8


In [18]:
tracker

Actor(DataTracker, 47efc962b23479b86e855a9701000000)

In [23]:
# tag::ownership[]
@ray.remote
def task_owned():
    """Ray task with no work to do; it returns None."""
    return None


@ray.remote
def task(dependency):
    """Ray task that launches ``task_owned`` from inside itself and returns None.

    *dependency* is accepted but not read in the body.
    """
    # The reference returned here is not used further in this function.
    res_owned = task_owned.remote()
    return None


val = ray.put("value")  # store the string via ray.put; `val` is an ObjectRef
res = task.remote(dependency=val)  # pass that reference to the task as its argument
res  # displays the ObjectRef for the task's result
# end::ownership[]

ObjectRef(cae5e964086715a4ffffffffffffffffffffffff0100000001000000)

In [24]:
val

ObjectRef(00ffffffffffffffffffffffffffffffffffffff0100000005000000)

# ![Task dependency](https://raw.githubusercontent.com/maxpumperla/learning_ray/main/notebooks/images/chapter_02/task_dependency.png)

# ![Worker Node](https://raw.githubusercontent.com/maxpumperla/learning_ray/main/notebooks/images/chapter_02/worker_node.png)

# ![Ray architecture](https://raw.githubusercontent.com/maxpumperla/learning_ray/main/notebooks/images/chapter_02/architecture.png)

In [17]:

# NOTE(review): this rebinds the name ``sum`` at module level, hiding the
# builtin of the same name for the cells that follow.
def sum(a, b):
    """Generator that yields ``a + b`` exactly once."""
    total = a + b
    yield total

a = sum(2,3)
a

<generator object sum at 0x7fea4c72ecf0>

In [21]:
b = sum(2,4)
b

<generator object sum at 0x7fea4c72e200>

In [29]:
def gen1(a):
    """Lazily yield the cube of every element of the iterable *a*, in order."""
    yield from (value ** 3 for value in a)

num = [1,2,3]
cubes = gen1(num)  # builds the generator object; values are produced on demand

# cubes

next(cubes)  # pull the first value out of the generator

1

In [30]:
next(cubes)

8

In [31]:
next(cubes)

27

In [32]:
next(cubes)

StopIteration: 

In [29]:


# tag::map_fct[]
def map_function(document):
    """Yield a ``(word, 1)`` pair for each whitespace-separated word of *document*.

    The document is lower-cased before it is split, so words differing only in
    case produce the same key.
    """
    tokens = document.lower().split()
    yield from ((token, 1) for token in tokens)
# end::map_fct[]


# tag::map[]
import ray


@ray.remote
def apply_map(corpus):
    """Ray task that runs ``map_function`` over every document in *corpus*.

    Returns one flat list with all pairs produced, in document order.
    """
    return [
        pair
        for document in corpus
        for pair in map_function(document)
    ]
# end::map[]


# tag::shuffle[]
@ray.remote
def apply_shuffle(map_results):
    """Ray task that groups ``(key, value)`` pairs by key.

    Returns a dict mapping each key to the list of its values, in the order
    the pairs appear in *map_results*.
    """
    grouped = {}
    for word, occurrence in map_results:
        # setdefault creates the empty list the first time a key is seen.
        grouped.setdefault(word, []).append(occurrence)
    return grouped
# end::shuffle[]


# tag::reduce[]
def reduce_function(key, values):
    """Add up *values* and return the pair ``(key, total)``.

    An empty *values* gives a total of 0.
    """
    # Accumulate by hand: earlier cells of this notebook rebind the name
    # ``sum``, so the builtin cannot be relied on here.
    running_total = 0
    for occurrence in values:
        running_total += occurrence
    return key, running_total


@ray.remote
def apply_reduce(shuffle_results):
    """Ray task that collapses each key's value list with ``reduce_function``.

    Returns a dict mapping every key of *shuffle_results* to its total.
    """
    return {
        key: reduce_function(key, values)[1]
        for key, values in shuffle_results.items()
    }
# end::reduce[]


# tag::apply_mr[]
import subprocess
import sys

# Run "import this" with the interpreter that is executing this code instead
# of whatever "python" happens to resolve to on PATH (it may be missing or
# belong to a different installation).
zen_of_python = subprocess.check_output([sys.executable, "-c", "import this"])
# check_output returns bytes, so the tokens -- and the keys of the final
# counts -- are bytes objects.
text_data = zen_of_python.split()


# Chain the three stages by passing each stage's reference straight into the
# next one; only the final result is fetched.
map_results = apply_map.remote(text_data)
group_results = apply_shuffle.remote(map_results)
reduce_results = apply_reduce.remote(group_results)
counts = ray.get(reduce_results)

# Most frequent words first.
sorted_counts = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_counts)
# end::apply_mr[]

[(b'is', 10), (b'better', 8), (b'than', 8), (b'the', 6), (b'to', 5), (b'of', 3), (b'although', 3), (b'be', 3), (b'special', 2), (b'should', 2), (b'never', 2), (b'unless', 2), (b'one', 2), (b'way', 2), (b'do', 2), (b'may', 2), (b'if', 2), (b'implementation', 2), (b'explain,', 2), (b'a', 2), (b'idea.', 2), (b'zen', 1), (b'python,', 1), (b'by', 1), (b'tim', 1), (b'peters', 1), (b'beautiful', 1), (b'ugly.', 1), (b'explicit', 1), (b'implicit.', 1), (b'simple', 1), (b'complex.', 1), (b'complex', 1), (b'complicated.', 1), (b'flat', 1), (b'nested.', 1), (b'sparse', 1), (b'dense.', 1), (b'readability', 1), (b'counts.', 1), (b'cases', 1), (b"aren't", 1), (b'enough', 1), (b'break', 1), (b'rules.', 1), (b'practicality', 1), (b'beats', 1), (b'purity.', 1), (b'errors', 1), (b'pass', 1), (b'silently.', 1), (b'explicitly', 1), (b'silenced.', 1), (b'in', 1), (b'face', 1), (b'ambiguity,', 1), (b'refuse', 1), (b'temptation', 1), (b'guess.', 1), (b'there', 1), (b'one--', 1), (b'and', 1), (b'preferably', 1