In [None]:
import os
import torch
import torch.distributed as dist
from torch.multiprocessing import Process


def print_rank():
    """Print a greeting naming this process's rank and the world size."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f'Hello from process {rank} (out of {world_size})!')

def init_process(rank, size, fn, backend='gloo', master_addr='127.0.0.1', master_port=29500):
    """Join the default process group, run ``fn``, then tear the group down.

    Args:
        rank: Rank of the calling process within the job.
        size: Total number of processes in the job (the world size).
        fn: Zero-argument callable executed once the group is initialized.
        backend: Name of the ``torch.distributed`` backend to use.
        master_addr: Address of the rendezvous host, exported as MASTER_ADDR.
        master_port: Port of the rendezvous host, exported as MASTER_PORT.
            The default is an unprivileged port; ports below 1024 normally
            require root privileges to bind.
    """
    os.environ['MASTER_ADDR'] = str(master_addr)
    os.environ['MASTER_PORT'] = str(master_port)
    dist.init_process_group(backend, rank=rank, world_size=size)
    try:
        fn()
    finally:
        # Release the group's resources even when ``fn`` raises.
        dist.destroy_process_group()

def main(fn, size=4):
    """Run ``fn`` in ``size`` worker processes and wait for all of them.

    Each worker is started with ``init_process(rank, size, fn)`` as its target,
    with ranks assigned from 0 to ``size - 1``.
    """
    workers = [
        Process(target=init_process, args=(rank, size, fn))
        for rank in range(size)
    ]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()


# Start 4 local processes; each one initializes the process group and runs print_rank.
main(print_rank, size=4)

Hello from process 0 (out of 4)!
Hello from process 2 (out of 4)!
Hello from process 3 (out of 4)!
Hello from process 1 (out of 4)!


In [None]:
#Answer for Task 1
import random

def broadcast_random(seed=1234):
    """Broadcast rank 0's tensor to a seeded random subgroup of ranks.

    Every rank derives the same subgroup from ``seed``: up to two ranks drawn
    from ``1 .. world_size - 1``, plus rank 0 as the broadcast source. Ranks
    outside the subgroup keep their own tensor.

    Args:
        seed: Seed for the subgroup selection; must be identical on all ranks
            so that every process builds the same group.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank)

    # A private generator gives the same draw as seeding the global one,
    # without disturbing other users of the ``random`` module.
    rng = random.Random(seed)
    # Never ask for more receivers than there are non-zero ranks.
    num_receivers = min(2, size - 1)
    random_group = rng.sample(range(1, size), num_receivers) + [0]
    print(f"Rank 0 broadcasts to the group {random_group}")
    # new_group has to be entered by every rank, member or not.
    group = dist.new_group(random_group)

    if rank == 0:
        print("**********\nStarting Communication\n************")
    # Only members may run a collective on the subgroup.
    if rank in random_group:
        dist.broadcast(tensor=tensor, src=0, group=group)
    print('Rank ', rank, ' has data ', tensor)

# Run the Task 1 broadcast across 4 local processes.
main(broadcast_random, size=4)

Rank 0 broadcasts to the group [2, 1, 0]
Rank 0 broadcasts to the group [2, 1, 0]
Rank 0 broadcasts to the group [2, 1, 0]
Rank 0 broadcasts to the group [2, 1, 0]
**********
Starting Communication
************
Rank  3  has data  tensor(3)
Rank  1  has data  tensor(0)
Rank  2  has data  tensor(0)
Rank  0  has data  tensor(0)


In [None]:
#Answer for Task 2

def server_client_communication(iterations=10, seed=0):
    """Simulate rounds of server/client averaging with rank 0 as the server.

    In each round every rank derives the same random set of clients (up to
    two ranks from ``1 .. world_size - 1``). Rank 0 broadcasts its value to
    them, each client adds 1, and the values are sum-reduced back to rank 0,
    which removes its own contribution and averages over the clients.

    Args:
        iterations: Number of rounds to run.
        seed: Seed for the per-round seeds; must be identical on all ranks so
            every process builds the same groups in the same order.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    # Float from the start so averaging never changes the dtype and all
    # members of a group always exchange tensors of the same type.
    tensor = torch.tensor(float(rank))
    # Never ask for more clients than there are non-zero ranks.
    num_clients = min(2, size - 1)

    seed_rng = random.Random(seed)
    seeds = [seed_rng.randint(0, 10000) for _ in range(iterations)]
    for step, sd in enumerate(seeds):
        # Step 1: pick this round's clients and send them the server value.
        clients = random.Random(sd).sample(range(1, size), num_clients)
        random_group = clients + [0]
        if rank == 0:
            print(f"Iter {step}: Rank 0 broadcasts to the group {random_group}")
        # new_group has to be entered by every rank, member or not.
        random_group_dist = dist.new_group(random_group)
        if rank not in random_group:
            # Bystanders must not run collectives on a group they are not in.
            continue
        dist.broadcast(tensor=tensor, src=0, group=random_group_dist)

        # Step 2: each client updates the value it received.
        if rank != 0:
            tensor += 1

        # Step 3: sum onto the server, then drop the server's own share and
        # average over the clients.
        if rank == 0:
            tensor_old = tensor.clone()
        dist.reduce(tensor=tensor, dst=0, op=dist.ReduceOp.SUM, group=random_group_dist)
        if rank == 0:
            tensor -= tensor_old
            if num_clients:
                tensor /= num_clients

    if rank == 0:
        print(f"The final value of Rank {0} is {tensor}")

# Run the Task 2 server/client rounds across 4 local processes.
main(server_client_communication, size=4)

Iter 0: Rank 0 broadcasts to the group [2, 1, 0]
Iter 1: Rank 0 broadcasts to the group [3, 2, 0]
Iter 2: Rank 0 broadcasts to the group [3, 1, 0]
Iter 3: Rank 0 broadcasts to the group [2, 1, 0]
Iter 4: Rank 0 broadcasts to the group [3, 1, 0]
Iter 5: Rank 0 broadcasts to the group [3, 2, 0]
Iter 6: Rank 0 broadcasts to the group [3, 2, 0]
Iter 7: Rank 0 broadcasts to the group [3, 1, 0]
Iter 8: Rank 0 broadcasts to the group [2, 3, 0]
Iter 9: Rank 0 broadcasts to the group [2, 1, 0]
The final value of Rank 0 is 10.0


In [None]:
# Task 3: Use point-to-point "send" and "recv" to emulate a reduce, so that rank 0 ends up with the sum of the values held by ranks 1 to 3.
def collect_sum():
    """Sum the tensors of all non-zero ranks onto rank 0 using send/recv.

    Every rank starts with the scalar tensor ``rank + 1``. Ranks other than 0
    send their tensor to rank 0; rank 0 receives one tensor from each peer and
    ends up holding their sum (its own starting value is not included).
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank + 1)
    print(f"I am {rank} of {size} with a tensor {tensor}")
    if rank == 0:
        print("**********\nStarting Communication\n************")
        # Tensor accumulator so the result is a tensor even with no peers.
        tensor_sum = torch.zeros_like(tensor)
        # Dedicated receive buffer; recv overwrites it on every call.
        buffer = torch.empty_like(tensor)
        # Receive from each peer explicitly rather than from any source.
        for src in range(1, size):
            dist.recv(buffer, src=src)
            tensor_sum += buffer
        tensor = tensor_sum
    else:
        dist.send(tensor, dst=0)
    print('Rank ', rank, ' has data ', tensor.item())

# Run the Task 3 send/recv sum across 4 local processes.
main(collect_sum, size=4)

I am 3 of 4 with a tensor 4
I am 2 of 4 with a tensor 3
I am 0 of 4 with a tensor 1
I am 1 of 4 with a tensor 2
**********
Starting Communication
************
Rank  3  has data  4
Rank  2  has data  3
Rank  1  has data  2
Rank  0  has data  9
