Merge pull request #63 from sklam/fix/masked_one_hot
Fix #60: one hot does not support column with null values
seibert committed Aug 23, 2017
2 parents 89eed51 + 5b5a956 commit f561e20
Showing 3 changed files with 63 additions and 6 deletions.
35 changes: 32 additions & 3 deletions pygdf/cudautils.py
@@ -319,17 +319,46 @@ def fillna(data, mask, value):
# Binary kernels
#

+@cuda.jit
+def gpu_equal_constant_masked(arr, mask, val, out):
+    i = cuda.grid(1)
+    if i < out.size:
+        res = (arr[i] == val) if mask_get(mask, i) else False
+        out[i] = res
+
+
@cuda.jit
def gpu_equal_constant(arr, val, out):
    i = cuda.grid(1)
    if i < out.size:
        out[i] = (arr[i] == val)


-def apply_equal_constant(arr, val, dtype):
+def apply_equal_constant(arr, mask, val, dtype):
+    """Compute ``arr[mask] == val``
+
+    Parameters
+    ----------
+    arr : device array
+        data
+    mask : device array
+        validity mask
+    val : scalar
+        value to compare against
+    dtype : np.dtype
+        output array dtype
+
+    Returns
+    -------
+    result : device array
+    """
    out = cuda.device_array(shape=arr.size, dtype=dtype)
-    configured = gpu_equal_constant.forall(out.size)
-    configured(arr, val, out)
+    if mask is not None:
+        configured = gpu_equal_constant_masked.forall(out.size)
+        configured(arr, mask, val, out)
+    else:
+        configured = gpu_equal_constant.forall(out.size)
+        configured(arr, val, out)
    return out
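For reference, the new kernel treats a cleared validity bit as "no match", so null rows never produce a 1 in any one-hot column. Below is a minimal CPU sketch of the same dispatch; it assumes mask_get (not shown in this diff) reads a byte-packed, LSB-first validity bitmask, and the helper name apply_equal_constant_cpu is made up for illustration.

import numpy as np

def apply_equal_constant_cpu(arr, mask, val, dtype):
    # CPU reference for gpu_equal_constant / gpu_equal_constant_masked.
    # Assumption: bit (i % 8) of mask[i // 8], LSB-first, marks row i valid.
    out = np.zeros(arr.size, dtype=dtype)
    for i in range(arr.size):
        if mask is None:
            out[i] = (arr[i] == val)
        else:
            valid = (mask[i // 8] >> (i % 8)) & 1
            out[i] = (arr[i] == val) if valid else False
    return out

# Row 1 is null (bit 1 cleared), so it does not match even though arr[1] == 3.
data = np.array([3, 3, 5], dtype=np.int64)
validity = np.array([0b101], dtype=np.uint8)
print(apply_equal_constant_cpu(data, validity, 3, np.float64))  # [1. 0. 0.]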


7 changes: 5 additions & 2 deletions pygdf/series.py
@@ -506,8 +506,11 @@ def one_hot_encoding(self, cats, dtype='float64'):
        dtype = np.dtype(dtype)
        out = []
        for cat in cats:
-            buf = cudautils.apply_equal_constant(arr=self.to_gpu_array(),
-                                                 val=cat, dtype=dtype)
+            mask = None  # self.nullmask.to_gpu_array()
+            buf = cudautils.apply_equal_constant(
+                arr=self.data.to_gpu_array(),
+                mask=mask,
+                val=cat, dtype=dtype)
            out.append(Series(buf))
        return out
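Series.one_hot_encoding now passes the raw data buffer and an explicit mask argument down to cudautils; with mask=None, null slots only stay out of the result because of whatever sentinel value sits in the data buffer (the test below uses -1). A small usage sketch with made-up data, using only the API visible in this diff; the expected outputs in the comments assume that behaviour.

import numpy as np
from pygdf.dataframe import Series

# Hypothetical column with categories 0..2; -1 stands in for null slots.
sr = Series(np.array([0, 2, -1, 1], dtype=np.int64))

# One indicator Series per category, in the same order as `cats`.
cols = sr.one_hot_encoding(cats=[0, 1, 2], dtype=np.float64)
# cols[0] -> 1 where sr == 0: [1, 0, 0, 0]
# cols[1] -> 1 where sr == 1: [0, 0, 0, 1]
# cols[2] -> 1 where sr == 2: [0, 1, 0, 0]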

27 changes: 26 additions & 1 deletion pygdf/tests/test_onehot.py
@@ -4,7 +4,8 @@

from numba import cuda

-from pygdf.dataframe import DataFrame
+from pygdf.dataframe import DataFrame, Series
+from . import utils


def test_onehot_simple():
@@ -49,6 +50,30 @@ def test_onehot_random():
    np.testing.assert_equal(arr, mask)


+def test_onehot_masked():
+    np.random.seed(0)
+    high = 5
+    size = 100
+    arr = np.random.randint(low=0, high=high, size=size)
+    bitmask = utils.random_bitmask(size)
+    bytemask = np.asarray(utils.expand_bits_to_bytes(bitmask)[:size],
+                          dtype=np.bool_)
+    arr[~bytemask] = -1
+
+    df = DataFrame()
+    df['a'] = Series(arr).set_mask(bitmask)
+
+    out = df.one_hot_encoding('a', cats=list(range(high)),
+                              prefix='a', dtype=np.int32)
+
+    assert out.columns == tuple(['a', 'a_0', 'a_1', 'a_2', 'a_3', 'a_4'])
+    np.testing.assert_array_equal(out['a_0'] == 1, arr == 0)
+    np.testing.assert_array_equal(out['a_1'] == 1, arr == 1)
+    np.testing.assert_array_equal(out['a_2'] == 1, arr == 2)
+    np.testing.assert_array_equal(out['a_3'] == 1, arr == 3)
+    np.testing.assert_array_equal(out['a_4'] == 1, arr == 4)


if __name__ == '__main__':
    test_onehot_random()
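test_onehot_masked relies on two helpers from the test suite's utils module that are not part of this diff. The sketch below shows what they are assumed to provide, a byte-packed, LSB-first validity bitmask and its per-row expansion, matching the mask convention assumed for mask_get above; the real implementations may differ.

import numpy as np

def random_bitmask(size, seed=1):
    # Assumed equivalent of utils.random_bitmask: one random byte per
    # group of 8 rows; each bit marks a row as valid (1) or null (0).
    rng = np.random.RandomState(seed)
    nbytes = (size + 7) // 8
    return rng.randint(0, 256, size=nbytes).astype(np.uint8)

def expand_bits_to_bytes(bitmask):
    # Assumed equivalent of utils.expand_bits_to_bytes: bit i of
    # bitmask[i // 8] (least-significant bit first) becomes byte i.
    # Requires NumPy 1.17+ for the bitorder argument.
    return np.unpackbits(bitmask, bitorder='little')

mask = random_bitmask(10)
valid = expand_bits_to_bytes(mask)[:10].astype(np.bool_)
print(valid)  # True where the row is valid, False where it is null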
