# Multi-GPU Mean Calculation

In [1]:
from dask.distributed import Client

import cudf, dask_cudf
from dask_cuml import mean

There are a couple of ways to get data into cuML, both of which will need to be tested:
1. A large cudf object could be created and then passed to dask_cudf
2. The workers are asked to fetch the data directly

Since this will likely run in a single-worker-per-GPU mode, it is important that cuDF objects are able to work across GPUs. For example, when a very large cuDF is partitioned across the workers, the GPU memory must be re-allocated on the new worker's local device and de-allocated on the cuDF's old device.

__Example workflow__:
- User allocates a dask_cudf (or, eventually, a dask_cuml_array) and distributes it across the cluster
- User calls `MGMean().calculate(dask_cudf)` after the dask_cudf has been distributed
- MGMean performs redistribution / preprocessing
- MGMean gathers allocations (hostname/device/key triplets) from Dask workers
- MGMean C++ code is executed with the allocation information as its argument


In [2]:
# Connect to an already-running Dask scheduler.
# NOTE(review): the scheduler address is hard-coded; change it to match the
# cluster this notebook is run against.
client = Client("10.31.241.47:8786")

In [3]:
client

0,1
Client  Scheduler: tcp://10.31.241.47:8786,Cluster  Workers: 7  Cores: 7  Memory: 47.33 GB


In [4]:
def create_cudf(dev):
    """Build a small two-column float32 cuDF DataFrame on GPU ``dev``.

    The CUDA device ``dev`` is selected through numba before the frame is
    created. Columns ``'a'`` and ``'b'`` each hold six float32 values.
    """
    import numba.cuda
    import numpy as np

    numba.cuda.select_device(dev)
    print("Creating dataframe on device " + str(dev))

    column_a = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).astype(np.float32)
    column_b = np.array([2.0, 3.0, 4.0, 5.0, 6.0, 7.0]).astype(np.float32)
    return cudf.DataFrame([('a', column_a), ('b', column_b)])


In [5]:
# Addresses of the workers currently known to the scheduler
# (the keys of the has_what() mapping).
workers = list(client.has_what().keys())
workers

['tcp://10.31.241.47:40995',
 'tcp://10.31.241.47:37008',
 'tcp://10.31.241.47:40151',
 'tcp://10.31.241.47:36847',
 'tcp://10.31.241.47:42678',
 'tcp://10.31.241.47:44276',
 'tcp://10.31.241.47:33084']

In [15]:
# Copyright (c) 2018, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from cuml import MGMean as cumlMGMean

from tornado import gen
import dask_cudf, cudf

import time

from dask.distributed import get_worker, get_client

from dask import delayed
from collections import defaultdict
from dask.distributed import wait, default_client
import dask.dataframe as dd
import dask.array as da

from toolz import first, assoc
from distributed import Client


def parse_host_port(address):
    """Split a worker address into a ``(host, port)`` pair.

    Parameters
    ----------
    address : str
        An address such as ``'tcp://10.31.241.47:8786'`` or
        ``'10.31.241.47:8786'``. Any ``scheme://`` prefix is discarded.

    Returns
    -------
    tuple of (str, int)
        The host text (returned as written, brackets included for IPv6
        literals) and the port as an integer.

    Raises
    ------
    ValueError
        If the address has no ``:port`` suffix or the port is not an integer.
    """
    if '://' in address:
        address = address.rsplit('://', 1)[1]
    # Split on the LAST colon only, so a host that itself contains colons
    # (e.g. '[::1]:8786') no longer fails tuple unpacking.
    host, port = address.rsplit(':', 1)
    return host, int(port)

def to_gpu_matrix(df):
    """Convert ``df`` to a column-major GPU matrix plus an output buffer.

    Returns a 3-tuple ``(gpu_matrix, output_series, output_gpu_array)``:
    the matrix from ``df.as_gpu_matrix(order='F')``, a zero-filled series
    with one slot per matrix column, and that series' GPU array.
    """
    gpu_matrix = df.as_gpu_matrix(order='F')

    # Debug output: the matrix object and its device pointer.
    print("GPU: " + str(gpu_matrix))
    print("CTYPES: "+ str(gpu_matrix.device_ctypes_pointer))

    output_series = build_output_series(gpu_matrix)
    output_array = to_gpu_array(output_series)
    return (gpu_matrix, output_series, output_array)

def build_output_series(gpu_matrix):
    """Create a zero-filled cuDF Series with one entry per matrix column.

    The series length is ``gpu_matrix.shape[1]`` and its dtype matches the
    matrix dtype.
    """
    import numpy as np

    column_count = gpu_matrix.shape[1]
    zeros = np.zeros(column_count, dtype=gpu_matrix.dtype)
    return cudf.Series(zeros)

def to_gpu_array(mean_):
    """Return the GPU array obtained from the data buffer of ``mean_``'s column."""
    column_buffer = mean_._column._data
    return column_buffer.to_gpu_array()
    
def alloc_dict(ipcs):
    """Describe an (input, output) pair of device arrays as a plain dict.

    ``ipcs`` is a 2-tuple ``(input_array, output_array)``. The result holds
    the raw device pointer of each array (``"ptr"``, ``"out_ptr"``) and the
    input's ``typestr`` and ``shape`` from its ``__cuda_array_interface__``.
    """
    source, target = ipcs
    interface = source.__cuda_array_interface__

    allocation = dict(
        ptr=source.device_ctypes_pointer.value,
        out_ptr=target.device_ctypes_pointer.value,
        dtype=interface["typestr"],
        shape=interface["shape"],
    )
    return allocation

def get_ipc_handles(data):
    """Return IPC handles for the input matrix and output array in ``data``.

    ``data`` is the 3-tuple ``(gpu_matrix, series, gpu_array)``; the middle
    element is not used here. The result is ``(matrix_handle, array_handle)``.
    """
    matrix, _series, out_array = data
    matrix_handle = matrix.get_ipc_handle()
    array_handle = out_array.get_ipc_handle()
    return (matrix_handle, array_handle)


class MGMean(object):
    """Dask-side driver for the cuML multi-GPU mean.

    ``calculate`` converts every partition to a GPU matrix on the worker
    that holds it, collects CUDA IPC handles for those matrices, and then
    runs ``cumlMGMean`` on one worker per host using the handles of the
    other workers on that host.
    """

    def calculate(self, dask_df):
        """Run the multi-GPU mean over the partitions of ``dask_df``.

        Parameters
        ----------
        dask_df : dask DataFrame or iterable of futures
            Either a Dask DataFrame or an iterable of futures, one per
            partition (see ``_get_mg_info``).

        Returns
        -------
        tuple
            ``(f, gpu_futures)``: ``f`` is the list of result futures, one
            per host; ``gpu_futures`` is the list of ``[worker, future]``
            pairs that keep the GPU allocations alive on the workers.
        """
        # Imported locally so this method does not depend on some other
        # notebook cell having imported ``random`` into the kernel namespace.
        import random

        client = default_client()

        # Keep the futures around so the GPU memory doesn't get
        # deallocated on the workers.
        gpu_futures, ipc_futures = client.sync(self._get_mg_info, dask_df)
        wait(ipc_futures)

        # Run on a single worker on each unique host
        def calc_mean(ipcs):
            opened = []      # handles that were successfully opened
            open_ipcs = []   # (input_array, output_array) pairs
            try:
                for in_handle, out_handle in ipcs:
                    in_array = in_handle.open()
                    opened.append(in_handle)
                    out_array = out_handle.open()
                    opened.append(out_handle)
                    open_ipcs.append((in_array, out_array))
                m = cumlMGMean()
                return m.calculate(list(map(alloc_dict, open_ipcs)))
            finally:
                # Always release the handles that were opened, even if the
                # computation (or a later open) raised.
                for handle in opened:
                    handle.close()

        who_has = client.who_has(ipc_futures)

        print(str(who_has))

        # IPC-handle key -> (host, port) of the first worker holding it.
        key_to_host_dict = {}
        for key, holders in who_has.items():
            key_to_host_dict[key] = parse_host_port(holders[0])

        # (host, port) -> set of IPC-handle keys held by that worker.
        hosts_to_key_dict = {}
        for key, host in key_to_host_dict.items():
            hosts_to_key_dict.setdefault(host, set()).add(key)

        workers = [holders[0] for holders in who_has.values()]
        hosts_dict = build_host_dict(workers)
        f = []
        for host, ports in hosts_dict.items():
            # ``ports`` is a set; random.sample() no longer accepts sets on
            # Python 3.11+, so choose from a sorted list instead.
            exec_node = (host, random.choice(sorted(ports)))

            # remove exec node from ipc_futures
            # NOTE(review): this leaves the exec node's own partition out of
            # the handles passed to calc_mean - presumably because a process
            # cannot open its own IPC handle; confirm the result still
            # accounts for that partition.
            keys = set(hosts_to_key_dict[exec_node])

            final_ipc_futures = [
                fut for fut in ipc_futures
                if fut.key not in keys and key_to_host_dict[fut.key][0] == host
            ]

            f.append(client.submit(calc_mean, final_ipc_futures, workers=[exec_node]))

        wait(f)

        return f, gpu_futures

    @gen.coroutine
    def _get_mg_info(self, dask_df):
        """Coroutine: build GPU matrices and IPC handles on each worker.

        For a Dask DataFrame the partitions are first computed in the
        background; otherwise ``dask_df`` is used directly as the iterable
        of partitions.

        Returns (via ``gen.Return``) a tuple ``(gpu_data, input_ipc_handles)``
        where ``gpu_data`` is a list of ``[worker, future]`` pairs from
        ``to_gpu_matrix`` and ``input_ipc_handles`` is a list of futures from
        ``get_ipc_handles``, each pinned to the same worker as its data.
        """

        client = default_client()

        if isinstance(dask_df, dd.DataFrame):
            data_parts = dask_df.to_delayed()
            parts = list(map(delayed, data_parts))
            parts = client.compute(parts)  # Start computation in the background
            yield wait(parts)
            for part in parts:
                if part.status == 'error':
                    yield part  # trigger error locally
        else:
            data_parts = dask_df

        # NOTE(review): in the DataFrame branch the lookups below use the
        # delayed objects in ``data_parts`` rather than the computed futures
        # in ``parts`` - confirm this is intended.
        key_to_part_dict = dict([(str(part.key), part) for part in data_parts])

        who_has = yield client.who_has(data_parts)
        worker_map = []

        # Pair every partition with the first worker that holds it.
        for key, workers in who_has.items():
            worker_map.append((first(workers), key_to_part_dict[key]))

        gpu_data = [[worker, client.submit(to_gpu_matrix, part, workers=worker)]
                    for worker, part in worker_map]

        input_ipc_handles = [client.submit(get_ipc_handles, future, workers=worker) for worker, future in gpu_data]

        raise gen.Return((gpu_data, input_ipc_handles))

In [16]:
from dask.distributed import wait
import random

def parse_host_port(address):
    """Split a worker address into a ``(host, port)`` pair.

    This cell re-defines the helper of the same name from the MGMean cell,
    so the two definitions must behave identically.

    Parameters
    ----------
    address : str
        An address such as ``'tcp://10.31.241.47:8786'`` or
        ``'10.31.241.47:8786'``. Any ``scheme://`` prefix is discarded.

    Returns
    -------
    tuple of (str, int)
        The host text (returned as written, brackets included for IPv6
        literals) and the port as an integer.

    Raises
    ------
    ValueError
        If the address has no ``:port`` suffix or the port is not an integer.
    """
    if '://' in address:
        address = address.rsplit('://', 1)[1]
    # Split on the LAST colon only, so a host that itself contains colons
    # (e.g. '[::1]:8786') no longer fails tuple unpacking.
    host, port = address.rsplit(':', 1)
    return host, int(port)

def build_host_dict(workers):
    """Group worker addresses by host.

    Each address in ``workers`` is parsed with ``parse_host_port``; the
    result maps every host to the set of ports seen for it.
    """
    unique_endpoints = {parse_host_port(worker) for worker in workers}

    ports_by_host = {}
    for host, port in unique_endpoints:
        ports_by_host.setdefault(host, set()).add(port)

    return ports_by_host
    

def assign_gpus(client):
    """
    Supports a multi-GPU & multi-Node environment by assigning a single local GPU
    to each worker in the cluster. This is necessary due to Numba's restriction
    that only a single CUDA context (and thus a single device) can be active on a
    thread at a time.

    The GPU assignments are valid as long as the future returned from this function
    is held in scope. This allows any functions that need to allocate GPU data to
    utilize the CUDA context on the same device, otherwise data could be lost.

    Parameters
    ----------
    client : distributed.Client
        Client connected to the cluster whose workers get device ids.

    Returns
    -------
    list
        Futures, one per assigned worker, each holding a scattered GPU
        device id. At most ``min(#GPUs, #workers)`` are created per host.
    """

    workers = list(client.has_what().keys())
    hosts_dict = build_host_dict(workers)

    def get_gpu_info():
        import numba.cuda
        return [x.id for x in numba.cuda.gpus]

    # Ask one randomly chosen worker per host which GPU ids it can see.
    # - The port sets are turned into sorted lists because random.sample()
    #   no longer accepts sets on Python 3.11+.
    # - pure=False gives every submission its own key; with the default
    #   deterministic key the identical no-argument calls would collapse into
    #   a single task and every host would share one worker's answer.
    gpu_info = {
        host: client.submit(get_gpu_info,
                            workers=[(host, random.choice(sorted(ports)))],
                            pure=False)
        for host, ports in hosts_dict.items()
    }
    wait(list(gpu_info.values()))

    # Scatter out a GPU device ID to workers
    f = []
    for host, future in gpu_info.items():
        gpu_ids = future.result()
        candidate_ports = sorted(hosts_dict[host])
        # Never pick more workers than there are GPUs (or workers) on the host.
        ports = random.sample(candidate_ports, min(len(gpu_ids), len(candidate_ports)))

        f.extend([client.scatter(device_id, workers=[(host, port)])
                  for device_id, port in zip(gpu_ids, ports)])
    wait(f)

    return f

In [17]:
# Keep a reference to the returned futures: per the assign_gpus docstring the
# GPU assignments are only valid while these futures are held in scope.
assignments = assign_gpus(client)

In [18]:
client.who_has()

{'create_cudf-179c5d9da0a793490e0f86206758d243': ('tcp://10.31.241.47:36847',),
 'create_cudf-27d807310733824947dcbbb642b78ed2': ('tcp://10.31.241.47:40995',),
 'create_cudf-35124cbc4008b9274989a37db6f59f46': ('tcp://10.31.241.47:37008',),
 'create_cudf-590c28dd03d79f0cc5b1751bbacf2ea7': ('tcp://10.31.241.47:33084',),
 'create_cudf-61c4a99828ec0f1c1bc586976a8941f2': ('tcp://10.31.241.47:44276',),
 'create_cudf-7260dc5a79c4ba5e24f5b76846103266': ('tcp://10.31.241.47:42678',),
 'create_cudf-7b059e2b58d6f4010f5ab98474a70df1': ('tcp://10.31.241.47:40151',),
 'get_ipc_handles-2b068aeeb0819e54862746f15dcb442e': ('tcp://10.31.241.47:40995',),
 'get_ipc_handles-49172f967eeaac7fe3b6719bb15ccfdc': ('tcp://10.31.241.47:44276',),
 'get_ipc_handles-80b9f47e081c4cd701dc1f28b029fc4c': ('tcp://10.31.241.47:40151',),
 'get_ipc_handles-86d26af010124bf2eb5b5f3c487b702d': ('tcp://10.31.241.47:37008',),
 'get_ipc_handles-a26dbd5ce1204f917ac21f4fa628dd41': ('tcp://10.31.241.47:33084',),
 'get_ipc_handles-c2

In [19]:
# Create one small DataFrame per worker: each device-id future from
# `assignments` is zipped with a worker address and the task is pinned there.
# NOTE(review): zip() pairs by position, while assign_gpus picks ports in
# random order - confirm the device id scattered to a worker is the one that
# ends up paired with that same worker here.
res = [client.submit(create_cudf, x[0], workers = [x[1]]) for x in zip(assignments, workers)]
wait(res)

DoneAndNotDoneFutures(done={<Future: status: finished, type: DataFrame, key: create_cudf-27d807310733824947dcbbb642b78ed2>, <Future: status: finished, type: DataFrame, key: create_cudf-179c5d9da0a793490e0f86206758d243>, <Future: status: finished, type: DataFrame, key: create_cudf-7b059e2b58d6f4010f5ab98474a70df1>, <Future: status: finished, type: DataFrame, key: create_cudf-590c28dd03d79f0cc5b1751bbacf2ea7>, <Future: status: finished, type: DataFrame, key: create_cudf-7260dc5a79c4ba5e24f5b76846103266>, <Future: status: finished, type: DataFrame, key: create_cudf-35124cbc4008b9274989a37db6f59f46>, <Future: status: finished, type: DataFrame, key: create_cudf-61c4a99828ec0f1c1bc586976a8941f2>}, not_done=set())

In [20]:
client.who_has()

{'create_cudf-179c5d9da0a793490e0f86206758d243': ('tcp://10.31.241.47:36847',),
 'create_cudf-27d807310733824947dcbbb642b78ed2': ('tcp://10.31.241.47:40995',),
 'create_cudf-35124cbc4008b9274989a37db6f59f46': ('tcp://10.31.241.47:37008',),
 'create_cudf-590c28dd03d79f0cc5b1751bbacf2ea7': ('tcp://10.31.241.47:33084',),
 'create_cudf-61c4a99828ec0f1c1bc586976a8941f2': ('tcp://10.31.241.47:44276',),
 'create_cudf-7260dc5a79c4ba5e24f5b76846103266': ('tcp://10.31.241.47:42678',),
 'create_cudf-7b059e2b58d6f4010f5ab98474a70df1': ('tcp://10.31.241.47:40151',),
 'get_ipc_handles-2b068aeeb0819e54862746f15dcb442e': ('tcp://10.31.241.47:40995',),
 'get_ipc_handles-49172f967eeaac7fe3b6719bb15ccfdc': ('tcp://10.31.241.47:44276',),
 'get_ipc_handles-80b9f47e081c4cd701dc1f28b029fc4c': ('tcp://10.31.241.47:40151',),
 'get_ipc_handles-86d26af010124bf2eb5b5f3c487b702d': ('tcp://10.31.241.47:37008',),
 'get_ipc_handles-a26dbd5ce1204f917ac21f4fa628dd41': ('tcp://10.31.241.47:33084',),
 'get_ipc_handles-c2

In [21]:
import numpy as np
# NOTE(review): `df` is not defined by any earlier cell in this notebook,
# which matches the NameError recorded as this cell's output.
dask_df = dask_cudf.from_cudf(df, chunksize = 2)

NameError: name 'df' is not defined

### Persist the Dataframe to scatter it out to the workers

In [None]:
dask_df = client.persist(dask_df)

In [None]:
res

In [22]:
client.who_has()

{'create_cudf-179c5d9da0a793490e0f86206758d243': ('tcp://10.31.241.47:36847',),
 'create_cudf-27d807310733824947dcbbb642b78ed2': ('tcp://10.31.241.47:40995',),
 'create_cudf-35124cbc4008b9274989a37db6f59f46': ('tcp://10.31.241.47:37008',),
 'create_cudf-590c28dd03d79f0cc5b1751bbacf2ea7': ('tcp://10.31.241.47:33084',),
 'create_cudf-61c4a99828ec0f1c1bc586976a8941f2': ('tcp://10.31.241.47:44276',),
 'create_cudf-7260dc5a79c4ba5e24f5b76846103266': ('tcp://10.31.241.47:42678',),
 'create_cudf-7b059e2b58d6f4010f5ab98474a70df1': ('tcp://10.31.241.47:40151',),
 'get_ipc_handles-2b068aeeb0819e54862746f15dcb442e': ('tcp://10.31.241.47:40995',),
 'get_ipc_handles-49172f967eeaac7fe3b6719bb15ccfdc': ('tcp://10.31.241.47:44276',),
 'get_ipc_handles-80b9f47e081c4cd701dc1f28b029fc4c': ('tcp://10.31.241.47:40151',),
 'get_ipc_handles-86d26af010124bf2eb5b5f3c487b702d': ('tcp://10.31.241.47:37008',),
 'get_ipc_handles-a26dbd5ce1204f917ac21f4fa628dd41': ('tcp://10.31.241.47:33084',),
 'get_ipc_handles-c2

In [23]:
m = MGMean()

In [24]:
# `res` is a list of create_cudf futures rather than a Dask DataFrame, so
# calculate() uses them directly as the per-worker partitions.
result = m.calculate(res)

{'get_ipc_handles-dc612b62f6fe257a311baa56b6b43b8b': ('tcp://10.31.241.47:40151',), 'get_ipc_handles-b1d8ee8a0dec64461efb582c51f45707': ('tcp://10.31.241.47:37008',), 'get_ipc_handles-483a462d5eb89b76340861290fbc6389': ('tcp://10.31.241.47:36847',), 'get_ipc_handles-fadeefdd9e839ca8b6efe1f582665db7': ('tcp://10.31.241.47:33084',), 'get_ipc_handles-35c59eab8103f069cc913dff867bbef9': ('tcp://10.31.241.47:42678',), 'get_ipc_handles-b9656219efc0baf053b48ef5bb6a364c': ('tcp://10.31.241.47:40995',), 'get_ipc_handles-96a5368da8ad05804c74af57d9eee13d': ('tcp://10.31.241.47:44276',)}


In [31]:
result[0][0].result()

<cudf.Series nrows=2 >

AttributeError: 'Future' object has no attribute 'value'

In [9]:
print(str(result))

           
0 4.1666665
1       4.5


In [15]:
client.who_has()

{'create_cudf-1e63289d8f2e2d479b822171ff82d577': ('tcp://10.31.241.47:39491',),
 'create_cudf-204083a67d003680231ed385b47d33f4': ('tcp://10.31.241.47:44195',),
 'create_cudf-37d193b5ba47dd60656b3abcc3f6574e': ('tcp://10.31.241.47:33171',),
 'create_cudf-6a87fca58b6a4529e0a2bf85fa39633e': ('tcp://10.31.241.47:43476',),
 'create_cudf-d19cf5f63f21f6f4ab4fc7ee04d3e4a2': ('tcp://10.31.241.47:39894',),
 'create_cudf-d29a24a6713817ed09b1d096644fc07c': ('tcp://10.31.241.47:39807',),
 'create_cudf-e4c4e25e1f0f308e316b98cf348c2ea1': ('tcp://10.31.241.47:46100',),
 'create_cudf-fa8ea9204c42d85b601fd07cb8a5e5ef': ('tcp://10.31.241.47:34391',),
 'get_ipc_handles-017b01e8906595d9cc15bf341259597b': ('tcp://10.31.241.47:33171',),
 'get_ipc_handles-08a43b6806ac67055e27037bc9b24184': ('tcp://10.31.241.47:43476',),
 'get_ipc_handles-3ac519d1ea0ba8e89c5a30f2d4426e69': ('tcp://10.31.241.47:34391',),
 'get_ipc_handles-48399c980bc59f161512322c5c57059d': ('tcp://10.31.241.47:39807',),
 'get_ipc_handles-54d0d0