In [1]:
# Import NumPy explicitly instead of relying on a user-defined ``%numpy`` magic,
# so the notebook also runs on a fresh kernel without that custom magic.
# NOTE(review): assumes ``%numpy`` only imported numpy as ``np`` and printed
# its version -- confirm against the magic's definition.
import numpy as np
print('Numpy', np.__version__)

Numpy 1.14.0rc1


In [2]:
# Sample strings made of 0 to 4 repeated 'a' characters.
a = ['a' * length for length in range(5)]
a

['', 'a', 'aa', 'aaa', 'aaaa']

In [3]:
# Convert the list to a NumPy unicode array; the resulting dtype is shown in
# the output below.
b = np.array(a)
b

array(['', 'a', 'aa', 'aaa', 'aaaa'], dtype='<U4')

In [4]:
# Left-justify every string to a fixed width of 4, padding with spaces on the
# right. This is the padded input that the stripping routines are tried on.
c = np.char.ljust(a, 4)
c

array(['    ', 'a   ', 'aa  ', 'aaa ', 'aaaa'], dtype='<U4')

In [5]:
# Viewed as a chararray, the same data is displayed without its trailing
# spaces (compare with the repr of ``c``).
c.view(np.chararray)

chararray(['', 'a', 'aa', 'aaa', 'aaaa'], dtype='<U4')

In [6]:
# Reference result: NumPy's element-wise rstrip applied to the padded array.
np.char.rstrip(c)

array(['', 'a', 'aa', 'aaa', 'aaaa'], dtype='<U4')

In [7]:
# Enable the %%cython cell magic used to compile the stripping kernels.
%load_ext cython

In [32]:
def rstrip(array):
    """Strip trailing spaces from every element of a string array, in place.

    The character data is reinterpreted as a 2-D array of unsigned integers
    (one row per string, one column per character) and handed to the compiled
    kernels ``rstrip_fast_char`` (byte strings) or ``rstrip_fast_int``
    (unicode strings), which overwrite trailing spaces with NUL.

    Parameters
    ----------
    array : numpy.ndarray
        Array of dtype kind ``'S'`` or ``'U'``, of any dimensionality.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``array`` is not a byte-string or unicode array.

    Notes
    -----
    The kernels require a C-contiguous, native-byte-order buffer. Arrays that
    do not satisfy this (sliced/strided views, byte-swapped unicode) are
    stripped on a temporary copy whose contents are then written back, so the
    caller still observes an in-place update.
    """
    dt = array.dtype

    if dt.kind not in 'SU':
        raise TypeError("This function can only be used on string arrays")

    # Bytes per character: 1 for byte strings, 4 for unicode (UCS-4).
    bpc = 1 if dt.kind == 'S' else 4
    n_chars = dt.itemsize // bpc
    if array.size == 0 or n_chars == 0:
        return  # nothing to strip

    # Drop any ndarray subclass (e.g. chararray) without copying the data.
    plain = np.asarray(array)

    # Can the kernels operate directly on the caller's memory?
    direct = plain.flags.c_contiguous and dt.isnative
    if direct:
        work = plain
    else:
        # Contiguous, native-endian scratch copy; written back at the end.
        work = plain.astype(dt.newbyteorder('='), order='C')

    # Flatten first (always a view for C-contiguous data, and it also turns a
    # 0-d array into 1-D), reinterpret as plain integers, then fold into one
    # row per string. This avoids sub-array dtype strings such as "1<u4",
    # which NumPy may not interpret as a sub-array.
    codes = work.reshape(-1).view(np.uint8 if bpc == 1 else np.uint32)
    codes = codes.reshape(-1, n_chars)

    if bpc == 1:
        rstrip_fast_char(codes)
    else:
        rstrip_fast_int(codes)

    if not direct:
        plain[...] = work

In [31]:
%%cython -a 

import numpy as np
cimport numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def rstrip_fast_int(unsigned int [:, ::1] arr):
    """Replace trailing spaces in each row of ``arr`` with NUL, in place.

    ``arr`` is a writable, C-contiguous 2-D buffer of unsigned ints with one
    row per string and one column per character code. Scanning from the end
    of each row, NUL (0) entries are skipped, spaces (32) are overwritten with
    0, and the scan of that row stops at the first other value.
    """
    # Signed indices: the inner loop counts down towards -1, which an unsigned
    # type such as size_t cannot represent, and ``n_chars - 1`` must not
    # underflow when the rows are empty.
    cdef Py_ssize_t n_rows = arr.shape[0]
    cdef Py_ssize_t n_chars = arr.shape[1]
    cdef Py_ssize_t i, j
    cdef unsigned int code

    for i in range(n_rows):
        for j in range(n_chars - 1, -1, -1):
            code = arr[i, j]
            if code == 32:
                arr[i, j] = 0
            elif code != 0:
                # First real character from the right: this row is done.
                break
                
@cython.boundscheck(False)
@cython.wraparound(False)
def rstrip_fast_char(unsigned char [:, ::1] arr):
    """Replace trailing spaces in each row of ``arr`` with NUL, in place.

    ``arr`` is a writable, C-contiguous 2-D buffer of unsigned chars with one
    row per string and one column per byte. Scanning from the end of each
    row, NUL (0) bytes are skipped, spaces (32) are overwritten with 0, and
    the scan of that row stops at the first other value.
    """
    # Signed indices: the inner loop counts down towards -1, which an unsigned
    # type such as size_t cannot represent, and ``n_chars - 1`` must not
    # underflow when the rows are empty.
    cdef Py_ssize_t n_rows = arr.shape[0]
    cdef Py_ssize_t n_chars = arr.shape[1]
    cdef Py_ssize_t i, j
    cdef unsigned char code

    for i in range(n_rows):
        for j in range(n_chars - 1, -1, -1):
            code = arr[i, j]
            if code == 32:
                arr[i, j] = 0
            elif code != 0:
                # First real character from the right: this row is done.
                break

In [10]:
# Work on a copy so the padded array ``c`` is not altered by the in-place strip.
arr = c.copy()
arr

array(['    ', 'a   ', 'aa  ', 'aaa ', 'aaaa'], dtype='<U4')

In [11]:
# rstrip() has no return value and modifies ``arr`` directly, so display
# ``arr`` afterwards to see the result.
rstrip(arr)
arr

array(['', 'a', 'aa', 'aaa', 'aaaa'], dtype='<U4')

In [12]:
import pytest
from astropy.io.fits.util import _rstrip_inplace
from numpy.testing import assert_equal

def test_rstrip_inplace(method, repeat=100000):
    """Check that ``method`` strips trailing spaces from string arrays in place.

    Parameters
    ----------
    method : callable
        In-place stripping function taking a single array argument.
    repeat : int, optional
        How many times the three sample strings are repeated in the 1-D
        byte-string and unicode cases.
    """
    # A non-string array must be rejected with the expected message.
    numeric = np.array([1, 2, 3])
    with pytest.raises(TypeError) as exc:
        method(numeric)
    assert exc.value.args[0] == 'This function can only be used on string arrays'

    # 1-D arrays: byte strings first, then unicode.
    padded = ['a ', ' b', ' c c   '] * repeat
    expected = ['a', ' b', ' c c'] * repeat
    for dtype in ('S6', 'U6'):
        s = np.array(padded, dtype=dtype)
        method(s)
        assert_equal(s, np.array(expected, dtype=dtype))

    # 2-D byte-string array
    s = np.array([['a ', ' b'], [' c c   ', ' a ']], dtype='S6')
    method(s)
    assert_equal(s, np.array([['a', ' b'], [' c c', ' a']], dtype='S6'))

    # 3-D (contiguous) unicode array
    s = np.repeat(' a a ', 24).reshape((2, 3, 4))
    method(s)
    assert_equal(s, ' a a')

    # The 3-D non-contiguous case is currently disabled:
    #     s = np.repeat(' a a ', 1000).reshape((10, 10, 10))[:2, :3, :4]
    #     method(s)
    #     assert_equal(s, ' a a')

In [15]:
# Baseline: astropy's _rstrip_inplace. The timing covers the whole test
# function (array construction and assert_equal included), not only the strip.
%timeit test_rstrip_inplace(_rstrip_inplace)

85.6 ms ± 3.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Same test driven by the Cython-backed rstrip(). The timing again covers the
# whole test function (array construction and assert_equal included).
%timeit test_rstrip_inplace(rstrip)

75.4 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
# Benchmark input: three sample strings repeated 1,000,000 times. dtype='U6'
# truncates every literal to at most 6 characters.
s = np.array(['aaa    ', '  bbb', ' c c       ']*1000000, dtype='U6')

In [29]:
# s.copy() gives every run fresh, unstripped input; the cost of the copy is
# part of the measured time.
%timeit _rstrip_inplace(s.copy())

117 ms ± 3.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
# Same measurement for the Cython-backed rstrip(); the cost of s.copy() is
# again part of the measured time.
%timeit rstrip(s.copy())

46.2 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
