Test MapD->PyCUDF->matrix

In [1]:
# IPython shell capture: the command's output comes back as a list of lines,
# so PWD[0] (used in the cells below) is the notebook's working directory.
PWD = !pwd

In [2]:
import sys
import os.path

Add import path to MapD Thrift binding

In [3]:
# The MapD Thrift bindings are expected in ./gen-py under the working directory.
mapd_thrift_path = os.path.join(PWD[0], 'gen-py')
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if mapd_thrift_path not in sys.path:
    sys.path.append(mapd_thrift_path)

Add import path to Arrow Schema

In [4]:
# The Arrow schema modules are expected in ./arrow_schema under the working
# directory.
arrow_schema_path = os.path.join(PWD[0], 'arrow_schema')
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if arrow_schema_path not in sys.path:
    sys.path.append(arrow_schema_path)

In [5]:
from thrift.protocol import TBinaryProtocol
from thrift.protocol import TJSONProtocol
from thrift.transport import TSocket
from thrift.transport import THttpClient
from thrift.transport import TTransport

In [6]:
from mapd import MapD
from mapd import ttypes

MapD connection

In [7]:
def get_client(host_or_uri, port, http):
    """Create a MapD Thrift client and open its transport.

    Args:
        host_or_uri: Host name for the socket transport, or the URI handed to
            the HTTP transport when ``http`` is truthy.
        port: Port for the socket transport; not used when ``http`` is truthy.
        http: Truthy selects HTTP transport with the JSON protocol; falsy
            selects a buffered socket transport with the binary protocol.

    Returns:
        A ``MapD.Client`` bound to the chosen protocol. The transport has
        already been opened when this function returns.
    """
    if not http:
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket(host_or_uri, port))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
    else:
        transport = THttpClient.THttpClient(host_or_uri)
        protocol = TJSONProtocol.TJSONProtocol(transport)

    # The client is built first and the transport opened afterwards, matching
    # the original call order.
    client = MapD.Client(protocol)
    transport.open()
    return client

In [8]:
# Connection settings. Each value can be overridden through an environment
# variable so that real credentials never have to be saved in the notebook;
# the fallbacks are the values this notebook originally hardcoded.
db_name = os.environ.get('MAPD_DB', 'mapd')
user_name = os.environ.get('MAPD_USER', 'mapd')
passwd = os.environ.get('MAPD_PASSWORD', 'HyperInteractive')
hostname = os.environ.get('MAPD_HOST', 'localhost')
portno = int(os.environ.get('MAPD_PORT', '9091'))

# http=False selects the buffered binary socket transport in get_client.
client = get_client(hostname, portno, False)
session = client.connect(user_name, passwd, db_name)
print('Connection complete')

Connection complete


The Query

In [9]:
query = 'select dest_lat, dest_lon from flights_2008_7M limit 23;'
print('Query is : ' + query)

# Run the query through the cuDF endpoint: the response carries a handle to
# GPU memory holding the result rather than the rows themselves.
# NOTE(review): device_id=0 presumably selects the first GPU and first_n=-1
# presumably disables the row cap — confirm against the MapD Thrift API.
results = client.sql_execute_cudf(session, query, device_id=0, first_n=-1)

Query is : select dest_lat, dest_lon from flights_2008_7M limit 23;


In [10]:
# Inspect the raw response: the handle bytes (df_handle), the buffer size
# (df_size) and the serialized schema.
results

TCuDataFrame(df_handle=b'\xb0hT\x04\x00\x00\x00\x00\x82F\x00\x00\x00\x00\x00\x00\xe0\x02\x00\x00\x00\x00\x00\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x00\xb0\x00\xd0\xc1\xaa\x00\x00\\', df_size=736, schema=b'\x10\x00\x00\x00\x0c\x00\x0e\x00\x06\x00\x05\x00\x08\x00\x00\x00\x0c\x00\x00\x00\x00\x01\x01\x00\x10\x00\x00\x00\x00\x00\n\x00\x08\x00\x00\x00\x04\x00\x00\x00\n\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00l\x00\x00\x00\x04\x00\x00\x00\xb0\xff\xff\xff\x00\x00\x01\x038\x00\x00\x00\x1c\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00\x1c\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x9a\xff\xff\xff\x00\x00\x01\x00\x8c\xff\xff\xff \x00\x01\x00\x94\xff\xff\xff\x01\x00\x02\x00\x08\x00\x00\x00dest_lon\x00\x00\x00\x00\x14\x00\x18\x00\x08\x00\x06\x00\x07\x00\x0c\x00\x00\x00\x10\x00\x14\x00\x00\x00\x14\x00\x00\x00\x00\x00\x01\x03H\x00\x00\x00$\x00\x00\x00\x14\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00,\

Use Numba to access the IPC memory handle

Note: this requires numba 0.32.0 + PR #2023

```bash
git clone https://github.com/numba/numba
cd numba
git fetch origin pull/2023/merge:pr/2023
git checkout pr/2023
```

In [11]:
from numba import cuda
from numba.cuda.cudadrv import drvapi

In [12]:
# results.df_handle is a bytes object; unpacking it passes each byte as one
# positional argument to drvapi.cu_ipc_mem_handle.
ipc_handle = drvapi.cu_ipc_mem_handle(*results.df_handle)

In [13]:
# Wrap the raw handle together with the buffer size reported by the server.
# NOTE(review): the first argument (None) is presumably the owning base
# object — confirm against numba's IpcHandle signature.
ipch = cuda.driver.IpcHandle(None, ipc_handle, size=results.df_size)

In [14]:
# Current CUDA context; the IPC handle is opened in it in the next cell.
ctx = cuda.current_context()

In [15]:
# Open the IPC handle in this context. The returned pointer is released at
# the end of the notebook with ipch.close().
dptr = ipch.open(ctx)

In [16]:
# Show the pointer object returned by ipch.open(ctx).
dptr

<numba.cuda.cudadrv.driver.OwnedPointer at 0x7fb084a02b38>

`dptr` is GPU memory containing the query result

Convert `dptr` into a GPU device ndarray (a NumPy-array-like object that lives on the GPU)

In [17]:
import numpy as np

In [18]:
# View the shared buffer as a flat array of raw bytes: the shape is the buffer
# size and the stride is the size of one element.
dtype = np.dtype(np.byte)
darr = cuda.devicearray.DeviceNDArray(
    shape=dptr.size,
    strides=dtype.itemsize,
    dtype=dtype,
    gpu_data=dptr,
)

Use PyCUDF to read the Arrow metadata from the query result

In [19]:
from pycudf.gpuarrow import GpuArrowReader

In [20]:
# Hand the raw byte array to PyCUDF's reader, which exposes the columns it
# describes (see reader.to_dict() in the next cell).
reader = GpuArrowReader(darr)

In [21]:
# Each column comes back as its own GPU device array, keyed by column name.
reader.to_dict()

OrderedDict([('dest_lat',
              <numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7fb07d1d06d8>),
             ('dest_lon',
              <numba.cuda.cudadrv.devicearray.DeviceNDArray at 0x7fb07d1c5d68>)])

Wrap result in a Python CUDA DataFrame

In [22]:
from pycudf.dataframe import DataFrame

In [23]:
# Start from an empty DataFrame and attach each column from the reader under
# its column name.
df = DataFrame()
for k, v in reader.to_dict().items():
    df[k] = v

In [24]:
# Sanity check: expect the two selected columns and the 23 rows requested by
# the query's LIMIT.
df.columns, len(df)

(('dest_lat', 'dest_lon'), 23)

Turn the dataframe into a matrix

In [25]:
# Combine all columns into one 2-D array; the output below is a NumPy array
# with one row per record.
df.as_matrix()

array([[  37.36186218, -121.92900848],
       [  37.36186218, -121.92900848],
       [  38.69542313, -121.59076691],
       [  38.69542313, -121.59076691],
       [  38.69542313, -121.59076691],
       [  35.04022217, -106.60919189],
       [  33.5629425 ,  -86.75354767],
       [  39.17539978,  -76.66819763],
       [  32.84711456,  -96.85176849],
       [  32.84711456,  -96.85176849],
       [  30.49405479,  -81.68785858],
       [  32.84711456,  -96.85176849],
       [  30.49405479,  -81.68785858],
       [  32.84711456,  -96.85176849],
       [  28.42888832,  -81.31602478],
       [  26.07258415,  -80.15274811],
       [  28.42888832,  -81.31602478],
       [  29.64541817,  -95.27889252],
       [  41.78598404,  -87.75242615],
       [  29.64541817,  -95.27889252],
       [  41.78598404,  -87.75242615],
       [  29.64541817,  -95.27889252],
       [  41.78598404,  -87.75242615]], dtype=float32)

Clean up the IPC handle

In [26]:
# Release the IPC mapping opened earlier with ipch.open(ctx).
# NOTE(review): dptr, darr and the DataFrame columns presumably alias this
# memory and should not be used after this call — confirm.
ipch.close()