# ML Optimization - TD1 - Homework

In [1]:
import torch
# Report the PyTorch/CUDA environment, one value per line.
print(
    torch.cuda.is_available(),       # can PyTorch use a CUDA device?
    torch.__version__,               # PyTorch version
    torch.version.cuda,              # CUDA version reported by PyTorch
    torch.backends.cudnn.version(),  # cuDNN version reported by PyTorch
    sep='\n',
)

True
1.8.0
11.1
8005


## Local GPU Property

In [2]:
# If the notebook runs on a hosted runtime, activate GPU usage first:
# Runtime -> Change Runtime Type -> Choose GPU type.
# Then query the driver for the GPU model and its current memory usage.
! nvidia-smi

Wed Mar 10 22:26:11 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 980 Ti  Off  | 00000000:01:00.0  On |                  N/A |
| 32%   38C    P5    34W / 260W |    886MiB /  6075MiB |     23%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
# Number of CUDA devices PyTorch reports.
print(torch.cuda.device_count())

1


In [4]:
# Index of the CUDA device PyTorch currently has selected.
print(torch.cuda.current_device())

0


In [5]:
# Create a 10,000,000-element random tensor directly on the GPU, then run
# nvidia-smi again to compare the reported memory usage with the previous call.
a = torch.randn(10000000,device='cuda')
! nvidia-smi

Wed Mar 10 22:26:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 980 Ti  Off  | 00000000:01:00.0  On |                  N/A |
| 32%   38C    P5    34W / 260W |   1509MiB /  6075MiB |     39%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [6]:
# Delete the tensor, ask PyTorch to release GPU memory it cached but no longer
# uses, and check with nvidia-smi how much memory is still reported as used.
del a 
torch.cuda.empty_cache()
!nvidia-smi

Wed Mar 10 22:26:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 980 Ti  Off  | 00000000:01:00.0  On |                  N/A |
| 32%   38C    P5    34W / 260W |   1469MiB /  6075MiB |     31%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Torch Distributed

In [7]:
import os
import torch
import torch.distributed as dist
from torch.multiprocessing import Process

def print_rank():
    """Print a greeting containing this process's rank and the world size."""
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    greeting = '\nHello from process {} (out of {})!'.format(rank, world_size)
    print(greeting)

def init_process(rank, size, fn, backend='gloo'): #gloo: anyone can send messages to anyone
    """Initialize the distributed environment, run ``fn``, then tear it down.

    Args:
        rank: rank of the calling process within the job.
        size: total number of processes (world size).
        fn: zero-argument callable executed once the process group exists.
        backend: ``torch.distributed`` backend name, ``'gloo'`` by default.

    Raises:
        Whatever ``fn`` raises; the process group is destroyed before the
        exception propagates.
    """
    # All processes rendezvous on the same local address and port.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29501'
    dist.init_process_group(backend, rank=rank, world_size=size)
    try:
        fn()
    finally:
        # Release the group's resources even when fn() fails.
        dist.destroy_process_group()

def main(fn, size=4):
    """Start ``size`` processes that each call ``init_process(rank, size, fn)`` and wait for all of them."""
    workers = [
        Process(target=init_process, args=(worker_rank, size, fn))
        for worker_rank in range(size)
    ]
    for worker in workers:
        worker.start()
    # Block until every worker has finished.
    for worker in workers:
        worker.join()

main(print_rank, size=4)


Hello from process 3 (out of 4)!
Hello from process 2 (out of 4)!

Hello from process 0 (out of 4)!
Hello from process 1 (out of 4)!




#### Q1: Which method is used to launch multiple processes?

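Processes are launched with ``torch.multiprocessing.Process``: ``main`` creates one ``Process`` per rank and calls ``start()`` on it. Each process then runs ``init_process``, which initializes the distributed environment before calling the target function.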

#### Q2: After initialization, the rank of the process and the world size can be obtained by which functions in torch.distributed?

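The rank and the world size are obtained with the ``torch.distributed`` functions ``dist.get_rank()`` and ``dist.get_world_size()`` (they are module-level functions, not methods of the ``Process`` object).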

## Communication

### Broadcast

In [8]:
def broadcast():
    """Broadcast rank 0's tensor to every process of the job.

    Each process starts with a scalar tensor holding its own rank; after the
    broadcast the tensor passed by each process holds rank 0's value. The group
    is built from the actual world size, so the function works for any number
    of processes and not only for four.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank)
    # Group containing every rank of the job.
    group = dist.new_group(list(range(size)))
    print(f"\nI am {rank} of {size} with a tensor {tensor}")

    if rank == 0:
        print("**********\nStarting Communication\n************")
    # In-place collective: `tensor` is overwritten with the value held by src.
    dist.broadcast(tensor=tensor, src=0, group=group)
    print(f"\nRank {rank} has data {tensor}")

main(broadcast, size=4)


I am 2 of 4 with a tensor 2

I am 3 of 4 with a tensor 3
I am 0 of 4 with a tensor 0
I am 1 of 4 with a tensor 1

**********
Starting Communication
************


Rank 2 has data 0
Rank 0 has data 0

Rank 1 has data 0

Rank 3 has data 0



#### Q3: In the above code, which rank is the one who broadcasts?

The rank that broadcasts to the others is `rank 0` (the `src=0` argument of `dist.broadcast`).

#### Task 1: If Rank 0 just wants to broadcast to a random subset of all the processes, please write down the new code to achieve that.

In [9]:
import random as rd
# Broadcast subset: rank 0 (the source) plus a random selection of zero to
# three of the ranks 1..3, kept sorted. broadcast_random reads this global.
# NOTE(review): presumably drawn here, once, rather than inside the worker so
# that all processes build the group from the same list -- confirm.
lst = [0]+sorted(rd.sample([1,2,3], rd.randint(0,3)))
print(f"subset: {lst}")

def broadcast_random():
    """Broadcast rank 0's tensor within the group built from the global ``lst``."""
    my_rank = dist.get_rank()
    world_size = dist.get_world_size()
    payload = torch.tensor(my_rank)
    subset_group = dist.new_group(lst)
    print(f"\nI am {my_rank} of {world_size} with a tensor {payload}")

    is_source = my_rank == 0
    if is_source:
        print("**********\nStarting Communication\n************")
    # Every process makes this call, whether or not its rank is listed in lst.
    dist.broadcast(tensor=payload, src=0, group=subset_group)
    print(f"\nRank {my_rank} has data {payload}")

main(broadcast_random, size=4)

subset: [0, 1, 2, 3]

I am 3 of 4 with a tensor 3

I am 0 of 4 with a tensor 0

I am 2 of 4 with a tensor 2
I am 1 of 4 with a tensor 1
**********
Starting Communication
************


Rank 2 has data 0
Rank 0 has data 0
Rank 3 has data 0

Rank 1 has data 0




### Reduce

In [10]:
def reduce():
    """Sum the tensors of all processes onto rank 0, excluding rank 0's own value.

    Every process contributes ``rank + 1``. After the SUM reduction rank 0
    subtracts the value it started with, so it ends up with the sum of the
    other ranks' values. The group is built from the actual world size, so the
    function works for any number of processes and not only for four.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank+1)
    # Remember the starting value; rank 0 removes it from the total afterwards.
    initial = tensor.clone()
    group = dist.new_group(list(range(size)))
    print(f"\nI am {rank} of {size} with a tensor {tensor}")
    if rank == 0:
        print("**********\nStarting Communication\n************")
    dist.reduce(tensor=tensor, dst=0, op=dist.ReduceOp.SUM, group=group)
    if rank == 0:
        tensor -= initial
    print(f"\nRank {rank} has data {tensor.item()}")

main(reduce, size=4)


I am 3 of 4 with a tensor 4

I am 0 of 4 with a tensor 1
**********
Starting Communication
************
I am 2 of 4 with a tensor 3
I am 1 of 4 with a tensor 2



Rank 2 has data 7
Rank 3 has data 4


Rank 1 has data 9
Rank 0 has data 9



#### Q4: What does the above code achieve?

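The code above performs a reduce operation over the tensors held by the set of nodes. The reducer is a sum (`ReduceOp.SUM`, with an implicit accumulator starting at 0) whose result is delivered to rank 0. Rank 0 then subtracts its own initial value, so it ends up with the sum of the values held by the other ranks.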

#### Q5: Check the values of every rank after "reduce", try to explain the reason.

The operation performed by the reduce is similar to the following code, albeit in a parallelized way:

```python
from functools import reduce
reduce(lambda x, y: x+y, map(lambda x:x+1 if x !=0 else x, [0,1,2,3]))
```

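`dist.reduce` is a collective operation: every rank calls it with its own tensor (`rank + 1`, i.e. 1, 2, 3 and 4), and the sum is delivered to the destination, rank 0. Only rank 0 is guaranteed to hold the final result; the values printed by the other ranks (9, 7 and 4 here) appear to be partial sums left over from the communication (4, then 3+4, then 2+3+4) and should not be relied upon. On rank 0 the total 1+2+3+4 = 10 has its own initial value 1 subtracted, leading to: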

```tensor of rank 0 is updated with value (1+1) + (2+1) + (3+1) = 9```

#### Task 2 [Server-Client communication]: Write a function which runs for 10 iterations. In each iteration:

1. rank 0 broadcasts to a random subset of all the processes
2. the processes in the subset update their states by adding one unit
3. rank 0 gets the average of the states from the processes in the subset.

In [11]:
def reduce_10_iterations():
    """
    Performs 10 successive SUM reductions onto rank 0 across all processes.

    Every process starts from the value ``rank + 1`` and calls ``dist.reduce``
    ten times on the same tensor, so each round's input is whatever the
    previous round left in that tensor. After the loop, rank 0 subtracts its
    own starting value once. The group is built from the actual world size, so
    the function works for any number of processes and not only for four.

    NOTE(review): the task statement asks for a broadcast to a random subset,
    a ``+1`` update on the subset members and an average on rank 0; none of
    these steps is implemented here. The tensors of non-destination ranks are
    also fed back into the next reduce although only the destination's result
    is meaningful -- the printed totals should be re-derived once the intended
    behaviour is settled.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank+1)
    group = dist.new_group(list(range(size)))

    # Labels are printed 1-based (rank + 1) in this function.
    print(f"\nI am {rank+1} of {size} with a tensor {tensor}")

    # Records rank 0's starting value
    if rank == 0:
        print("****\nStarting Communication\n****")
        tensor_old = tensor.clone()

    # Performs the 10-fold reduction
    for i in range(10):
        if rank == 0:
            print(f"\n#### ITERATION/EVALUATION {i}")
        dist.reduce(tensor=tensor, dst=0, op=dist.ReduceOp.SUM, group=group)

    # Removes rank 0's starting value (once) from the accumulated total
    if rank == 0:
        tensor -= tensor_old

    # Prints result
    print(f"\nRank {rank+1} has data {tensor.item()}")

In [12]:
main(reduce_10_iterations, size=4)


I am 2 of 4 with a tensor 2
I am 3 of 4 with a tensor 3
I am 4 of 4 with a tensor 4



I am 1 of 4 with a tensor 1
****
Starting Communication
****

#### ITERATION/EVALUATION 0

#### ITERATION/EVALUATION 1

#### ITERATION/EVALUATION 2

#### ITERATION/EVALUATION 3

#### ITERATION/EVALUATION 4

#### ITERATION/EVALUATION 5

#### ITERATION/EVALUATION 6

#### ITERATION/EVALUATION 7

Rank 4 has data 4
#### ITERATION/EVALUATION 8


Rank 3 has data 43

#### ITERATION/EVALUATION 9

Rank 2 has data 252

Rank 1 has data 1065


In [13]:
def send_receive():
    """Point-to-point exchange: rank 1 sends to rank 0 and rank 3 sends to rank 2.

    Each process starts with the value ``rank + 1``; the receiving ranks have
    their tensor overwritten by the value they receive.
    """
    me = dist.get_rank()
    world = dist.get_world_size()
    data = torch.tensor(me + 1)
    print(f"\nI am {me} of {world} with a tensor {data}")
    if me == 0:
        print("****\nStarting Communication\n****")
        dist.recv(data, src=1)
    elif me == 1:
        dist.send(data, dst=0)
    elif me == 2:
        # No src given for this receive.
        dist.recv(data)
    elif me == 3:
        dist.send(data, dst=2)
    print(f"\nRank {me} has data {data.item()}")

main(send_receive, size=4)


I am 2 of 4 with a tensor 3
I am 3 of 4 with a tensor 4
I am 1 of 4 with a tensor 2
I am 0 of 4 with a tensor 1



Rank 2 has data 4
Rank 3 has data 4
****
Starting Communication
****



Rank 0 has data 2

Rank 1 has data 2


#### Q6: What does the above code achieve?

Node w/ Rank 0 receives data from Node w/ Rank 1. Node w/ Rank 2 receives data from Node w/ Rank 3 (its `recv` does not name a source; the only process sending to rank 2 is rank 3).

#### Task 3: Use "send" and "receive" to achieve the goal of reduce that rank 0 has the sum of all the values from rank 1 to 3

In [18]:
def send_receive_reduce():
    """Reduce with send/recv only: rank 0 ends with the sum of all other ranks' values.

    Each process starts with ``rank + 1``. Values travel down a chain from the
    highest rank to rank 0: every rank except the last receives the partial sum
    from ``rank + 1``; intermediate ranks add their own value and forward the
    result to ``rank - 1``. Rank 0 only receives, so its own value is not part
    of the total. The chain is derived from the world size, so it works for any
    number of processes (with four processes the calls are exactly
    3 -> 2 -> 1 -> 0). With a single process no communication happens and the
    tensor keeps its initial value.
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    tensor = torch.tensor(rank+1)
    own_value = tensor.clone()

    # Labels are printed 1-based (rank + 1) in this function.
    print(f"\nI am {rank+1} of {size} with a tensor {tensor}")
    if rank == 0:
        print("\n****\nStarting Communication\n****")
    if rank < size - 1:
        # recv overwrites `tensor` with the partial sum of the higher ranks.
        dist.recv(tensor, src=rank+1)
        if rank > 0:
            tensor += own_value
    if rank > 0:
        dist.send(tensor, dst=rank-1)
    print(f"\nRank {rank+1} has data {tensor.item()}")
    
main(send_receive_reduce, size=4)


I am 2 of 4 with a tensor 2
I am 3 of 4 with a tensor 3
I am 4 of 4 with a tensor 4



I am 1 of 4 with a tensor 1

****
Starting Communication
****

Rank 4 has data 4

Rank 2 has data 9
Rank 3 has data 7


Rank 1 has data 9


## Spawn

In [19]:
%%writefile Spawn.py
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as multiprocess
import argparse


def parse():
    """Parse the command-line options of the script.

    Both options are required: without them the script would only fail later
    with an unrelated error (comparing the GPU count with ``None`` or looking
    up ``None`` in the function table).

    Returns:
        argparse.Namespace with ``size`` (int, number of processes) and
        ``func`` (str, name of the function to execute).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--size', type=int, required=True, help='the number of processes')
    parser.add_argument('--func', type=str, required=True, help='choose the function to execute')
    return parser.parse_args()

def print_rank(tensor):
    """Print a greeting with this process's rank, the world size and ``tensor``."""
    rank, world_size = dist.get_rank(), dist.get_world_size()
    message = 'Hello from process {} (out of {}) with tensor {}!'.format(rank, world_size, tensor)
    print(message)

def broadcast(tensor):
    """Broadcast rank 0's ``tensor`` to every process of the job.

    The group is built from the actual world size instead of the fixed ranks
    ``[0, 1]``, so the function also runs with ``--size 1`` and reaches every
    rank when more than two processes are started.

    Args:
        tensor: tensor to send (on rank 0) or to overwrite (on the other ranks).
    """
    rank = dist.get_rank()
    size = dist.get_world_size()
    group = dist.new_group(list(range(size)))
    if rank == 0:
        print("**********\nStarting Communication\n************")
    dist.broadcast(tensor=tensor, src=0, group=group)
    print('Rank ', rank, ' has data ', tensor)

def init_process(rank, size, fn, backend):
    """ Initialize the distributed environment, run ``fn`` on this rank's tensor, then tear it down.

    Args:
        rank: rank of the calling process; also used as the CUDA device index
            when CUDA is available.
        size: total number of processes (world size).
        fn: callable invoked as ``fn(tensor)`` once the process group exists.
        backend: ``torch.distributed`` backend name.

    Raises:
        Whatever ``fn`` raises; the process group is destroyed before the
        exception propagates.
    """
    # All processes rendezvous on the same local address and port.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29501'
    if torch.cuda.is_available():
        # One GPU per process: rank r uses device cuda:r.
        device = torch.device('cuda:'+str(rank))
    else:
        device = torch.device('cpu')
    tensor = torch.tensor(rank, device=device)
    # Use torch.Tensor.item() to get a Python number from a tensor containing a single value:
    print(f"I am {rank} with a tensor {tensor.item()}")
    # Get a numpy array from a tensor array: tensor.numpy() or tensor.cpu().numpy()
    dist.init_process_group(backend, rank=rank, world_size=size)
    try:
        fn(tensor)
    finally:
        # Release the group's resources even when fn() fails.
        dist.destroy_process_group()

if __name__ == '__main__':
    # Script entry point: pick a backend, then spawn one process per rank.
    args = parse()
    if not torch.cuda.is_available():
        backend = 'gloo'
    elif torch.cuda.device_count() >= args.size:
        backend = 'nccl'
    else:
        # With CUDA, every process needs its own GPU.
        raise ValueError('size should not larger than the number of GPUs')

    print(f"Backend: {backend}")
    function_mapping = {'print_rank':print_rank, 'broadcast':broadcast}
    # spawn calls init_process(i, *args) for i in range(nprocs).
    multiprocess.spawn(
        init_process,
        args=(args.size, function_mapping[args.func], backend),
        nprocs=args.size,
        join=True,
        daemon=False,
    )

Writing Spawn.py


In [20]:
! python Spawn.py --size 1 --func "print_rank"

Backend: nccl
I am 0 with a tensor 0
Hello from process 0 (out of 1) with tensor 0!


#### Q7: How to put the tensor to GPU?

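Create the tensor, e.g. `x = torch.ones(10)` (`x.device` shows where it currently lives), then move it with `x = x.to("cuda")`. A tensor can also be created directly on the GPU by passing `device="cuda"`, as in `torch.randn(10000000, device='cuda')` above.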

#### Q8: How to use "spawn" function offered by multiprocessing package?

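`Spawn.py` is a script to be launched via the command line that reuses the functions already seen before. Inside it, `torch.multiprocessing.spawn(init_process, args=(size, fn, backend), nprocs=size, join=True)` starts `nprocs` processes and calls `init_process(rank, size, fn, backend)` in each of them: the process index is passed automatically as the first argument, followed by the contents of `args`.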

#### Task 4 [After class]: Reserve two GPUs from NEF and change the script that rank 1 can broadcast to the rest. Then rerun to check.

see screenshots