Merge 6bb31b2 into 5160b11

ssanderson committed Jun 7, 2017
2 parents 5160b11 + 6bb31b2 commit b8e3f142ed474a6090c2283c29c068785b4ffb52
@@ -468,6 +468,70 @@ class C(Classifier):
)
self.assertEqual(errmsg, expected)

@parameter_space(
__fail_fast=True,
labelarray_dtype=(categorical_dtype, bytes_dtype, unicode_dtype),
relabel_func=[
lambda s: s[0],
lambda s: str(len(s)),
lambda s: str(len([c for c in s if c == 'a'])),
lambda s: None,
]
)
def test_relabel_strings(self, relabel_func, labelarray_dtype):

class C(Classifier):
inputs = ()
dtype = categorical_dtype
missing_value = None
window_length = 0

c = C()

raw = np.asarray(
[['a', 'aa', 'aaa', 'abab'],
['bab', 'aba', 'aa', 'bb'],
['a', 'aba', 'abaa', 'abaab'],
['a', 'aa', 'aaa', 'aaaa']],
dtype=labelarray_dtype,
)
raw_relabeled = np.vectorize(relabel_func, otypes=[object])(raw)

data = LabelArray(raw, missing_value=None)

terms = {
'relabeled': c.relabel(relabel_func),
}
expected_results = {
'relabeled': LabelArray(raw_relabeled, missing_value=None),
}

self.check_terms(
terms,
expected_results,
initial_workspace={c: data},
mask=self.build_mask(self.ones_mask(shape=data.shape)),
)
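
(A hedged aside, not part of this commit: the expected values above are built the same way a standalone numpy model of relabel(f) would be, by mapping the function element-wise over the labels.)

import numpy as np

# Standalone model of the semantics this test pins down: relabel(f)
# should behave like f applied to each non-missing label.
raw = np.asarray([['a', 'aa'], ['bab', 'aba']], dtype=object)
first_chars = np.vectorize(lambda s: s[0], otypes=[object])(raw)
# first_chars == [['a', 'a'], ['b', 'a']]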

def test_relabel_int_classifier_not_yet_supported(self):
class C(Classifier):
inputs = ()
dtype = int64_dtype
missing_value = -1
window_length = 0

c = C()

with self.assertRaises(TypeError) as e:
c.relabel(lambda x: 0 / 0) # Function should never be called.

result = str(e.exception)
expected = (
"relabel() is only defined on Classifiers producing strings "
"but it was called on a Classifier of dtype int64."
)
self.assertEqual(result, expected)


class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
def test_reversability_categorical(self):
@@ -109,6 +109,66 @@ def test_compare_to_str(self,
np_contains(strs) & notmissing,
)

@parameter_space(
__fail_fast=True,
f=[
lambda s: str(len(s)),
lambda s: s[0],
lambda s: ''.join(reversed(s)),
lambda s: '',
]
)
def test_map(self, f):
data = np.array(
[['E', 'GHIJ', 'HIJKLMNOP', 'DEFGHIJ'],
['CDE', 'ABCDEFGHIJKLMNOPQ', 'DEFGHIJKLMNOPQRS', 'ABCDEFGHIJK'],
['DEFGHIJKLMNOPQR', 'DEFGHI', 'DEFGHIJ', 'FGHIJK'],
['EFGHIJKLM', 'EFGHIJKLMNOPQRS', 'ABCDEFGHI', 'DEFGHIJ']],
dtype=object,
)
la = LabelArray(data, missing_value=None)

numpy_transformed = np.vectorize(f)(data)
la_transformed = la.map(f).as_string_array()

assert_equal(numpy_transformed, la_transformed)

@parameter_space(missing=['A', None])
def test_map_ignores_missing_value(self, missing):
data = np.array([missing, 'B', 'C'], dtype=object)
la = LabelArray(data, missing_value=missing)

def increment_char(c):
return chr(ord(c) + 1)

result = la.map(increment_char)
expected = LabelArray([missing, 'C', 'D'], missing_value=missing)
assert_equal(result.as_string_array(), expected.as_string_array())

@parameter_space(
__fail_fast=True,
f=[
lambda s: 0,
lambda s: 0.0,
lambda s: object(),
]
)
def test_map_requires_f_to_return_a_string(self, f):
la = LabelArray(self.strs, missing_value=None)

with self.assertRaises(TypeError):
la.map(f)

def test_map_can_only_return_none_if_missing_value_is_none(self):

# Should work.
la = LabelArray(self.strs, missing_value=None)
la.map(lambda x: None)

la = LabelArray(self.strs, missing_value="__MISSING__")
with self.assertRaises(TypeError):
la.map(lambda x: None)

@parameter_space(
__fail_fast=True,
missing_value=('', 'a', 'not in the array', None),
@@ -436,6 +496,73 @@ def test_narrow_condense_back_to_valid_size(self):
assert_equal(arr.itemsize, 2)
self.check_roundtrip(arr)

def test_map_shrinks_code_storage_if_possible(self):
arr = LabelArray(
# Drop the last value so we fit in a uint16 with None as a missing
# value.
self.create_categories(16, plus_one=False)[:-1],
missing_value=None,
)
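# 2 ** 16 - 1 categories plus the None missing value is exactly 2 ** 16
# codes, the most a uint16 can index.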

self.assertEqual(arr.itemsize, 2)

def either_A_or_B(s):
return ('A', 'B')[sum(ord(c) for c in s) % 2]

result = arr.map(either_A_or_B)

self.assertEqual(set(result.categories), {'A', 'B', None})
self.assertEqual(result.itemsize, 1)

assert_equal(
np.vectorize(either_A_or_B)(arr.as_string_array()),
result.as_string_array(),
)

def test_map_never_increases_code_storage_size(self):
# This tests a pathological case where a user maps an impure function
# that returns a different label on every invocation, which in a naive
# implementation could cause us to need to **increase** the size of our
# codes after a map.
#
# This doesn't happen, however, because we guarantee that the user's
# mapping function will be called on each unique category exactly once,
# which means we can never increase the number of categories in the
# LabelArray after mapping.

# Using all but one of the categories so that we still fit in a uint8
# with an extra category for None as a missing value.
categories = self.create_categories(8, plus_one=False)[:-1]

larger_categories = self.create_categories(16, plus_one=False)

# Double the length of the categories so that a naive implementation,
# calling the function once per element, would need to increase the
# storage size after our map.
categories_twice = categories + categories

arr = LabelArray(categories_twice, missing_value=None)
assert_equal(arr.itemsize, 1)

gen_unique_categories = iter(larger_categories)

def new_string_every_time(c):
# Return a new unique category every time so that every result is
# different.
return next(gen_unique_categories)

result = arr.map(new_string_every_time)

# Result should still be of size 1.
assert_equal(result.itemsize, 1)

# Result should be the first `len(categories)` entries from the larger
# categories, repeated twice.
expected = LabelArray(
larger_categories[:len(categories)] * 2,
missing_value=None,
)
assert_equal(result.as_string_array(), expected.as_string_array())
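
(A hedged sketch, not part of this commit, of the mechanism described in the comment above: the category-level mapping can be modeled in plain numpy as follows.)

import numpy as np

# Hypothetical model: call f once per unique category, then broadcast
# the results through the integer codes. An impure f therefore produces
# at most one new label per *category*, never per element.
def map_by_category(values, f):
    categories, codes = np.unique(
        np.asarray(values, dtype=object), return_inverse=True,
    )
    return np.array([f(c) for c in categories], dtype=object)[codes]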

def manual_narrow_condense_back_to_valid_size_slow(self):
"""This test is really slow so we don't want it run by default.
"""
@@ -14,6 +14,23 @@ cdef inline double log2(double d):
return log(d) / log(2)


cpdef inline smallest_uint_that_can_hold(Py_ssize_t maxval):
"""Choose the smallest numpy unsigned int dtype that can hold ``size``.
"""
if maxval < 1:
# lim x -> 0 log2(x) == -infinity so we floor at uint8
return np.uint8
else:
# The number of bits required to hold the codes up to ``maxval`` is
# log2(maxval). The number of bits per byte is 8. We cannot have
# fractional bytes, so we need to round up. Finally, we can only have
# integers with widths of 1, 2, 4, or 8 bytes, so we need to round up to
# the next supported size by looking it up in ``_int_sizes``.
return unsigned_int_dtype_with_size_in_bytes(
_int_sizes[int(np.ceil(log2(maxval) / 8))]
)
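
(For intuition, a hedged pure-Python rendering of the computation above. ``_int_sizes`` is not shown in this diff; the table below is an assumption mapping a rounded-up byte count to the next supported integer width.)

import numpy as np

_int_sizes = [1, 1, 2, 4, 4, 8, 8, 8, 8]  # assumed contents
_dtypes = {1: np.uint8, 2: np.uint16, 4: np.uint32, 8: np.uint64}

def smallest_uint_py(maxval):
    if maxval < 1:
        return np.uint8  # log2 is undefined at 0, so floor at uint8
    return _dtypes[_int_sizes[int(np.ceil(np.log2(maxval) / 8))]]

# smallest_uint_py(200) -> uint8; smallest_uint_py(70000) -> uint32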


ctypedef fused unsigned_integral:
np.uint8_t
np.uint16_t
@@ -213,19 +230,7 @@ cpdef factorize_strings(np.ndarray[object] values,
raise ValueError('nvalues larger than uint64')

length = len(categories_array)
if length < 1:
# lim x -> 0 log2(x) == -infinity so we floor at uint8
narrowest_dtype = np.uint8
else:
# The number of bits required to hold the codes up to ``length`` is
# log2(length). The number of bits per byte is 8. We cannot have
# fractional bytes, so we need to round up. Finally, we can only have
# integers with widths of 1, 2, 4, or 8 bytes, so we need to round up to
# the next value by looking up the next largest size in ``_int_sizes``.
narrowest_dtype = unsigned_int_dtype_with_size_in_bytes(
_int_sizes[int(np.ceil(log2(length) / 8))]
)

narrowest_dtype = smallest_uint_that_can_hold(length)
if codes.dtype != narrowest_dtype:
# condense the codes down to the narrowest dtype possible
codes = codes.astype(narrowest_dtype)
@@ -29,6 +29,7 @@
from ._factorize import (
factorize_strings,
factorize_strings_known_categories,
smallest_uint_that_can_hold,
)


@@ -136,6 +137,7 @@ class LabelArray(ndarray):
http://docs.scipy.org/doc/numpy-1.10.0/user/basics.subclassing.html
"""
SUPPORTED_SCALAR_TYPES = (bytes, unicode, type(None))
SUPPORTED_NON_NONE_SCALAR_TYPES = (bytes, unicode)

@preprocess(
values=coerce(list, partial(np.asarray, dtype=object)),
@@ -565,6 +567,64 @@ def f_to_use(x):
# locations in our indices.
return results[self.as_int_array()]

def map(self, f):
"""
Map a function from str -> str element-wise over ``self``.
``f`` will be applied exactly once to each non-missing unique value in
``self``. Missing values will always map to ``self.missing_value``.
"""
# f() should only return None if None is our missing value.
if self.missing_value is None:
allowed_outtypes = self.SUPPORTED_SCALAR_TYPES
else:
allowed_outtypes = self.SUPPORTED_NON_NONE_SCALAR_TYPES

def f_to_use(x,
missing_value=self.missing_value,
otypes=allowed_outtypes):

if x == missing_value:
return x

ret = f(x)

if not isinstance(ret, otypes):
raise TypeError(
"Expected f() to return a string. Got %s." % (
type(ret).__name__
)
)

return ret

new_categories_with_duplicates = (
np.vectorize(f_to_use, otypes=[object])(self.categories)
)

# If f() maps multiple inputs to the same output, then we can end up
# with the same category appearing multiple times. Compress the
# categories by running them through np.unique, and then use the
# reverse index to remap the codes as well.
new_categories, bloated_reverse_index = np.unique(
new_categories_with_duplicates,
return_inverse=True
)

# np.unique returns ``bloated_reverse_index`` as a 64-bit integer array
# even when the codes fit in a smaller dtype, so narrow it first.
reverse_index = bloated_reverse_index.astype(
smallest_uint_that_can_hold(len(new_categories))
)
new_codes = np.take(reverse_index, self.as_int_array())

return self.from_codes_and_metadata(
new_codes,
new_categories,
dict(zip(new_categories, range(len(new_categories)))),
missing_value=self.missing_value,
)
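
(A hedged usage sketch, relying only on the behavior documented above.)

arr = LabelArray(['a', 'bb', None, 'bb'], missing_value=None)
result = arr.map(lambda s: str(len(s)))
# result holds ['1', '2', None, '2']: the lambda ran once for 'a' and
# once for 'bb', and the missing entry passed through unchanged.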

def startswith(self, prefix):
"""
Element-wise startswith.
