Permalink
Browse files

ENH: default value for NaN for contains,startswith,endswith #1689

  • Loading branch information...
1 parent 5a4a49d commit 7a1ea0a3d189b1be7cdd23a7b98f79effd381430 @changhiskhan changhiskhan committed with wesm Aug 19, 2012
Showing with 38 additions and 12 deletions.
  1. +8 −0 doc/source/basics.rst
  2. +20 −12 pandas/core/strings.py
  3. +10 −0 pandas/tests/test_strings.py
View
8 doc/source/basics.rst
@@ -876,6 +876,14 @@ Methods like ``replace`` and ``findall`` take regular expressions, too:
s3
s3.str.replace('^.a|dog', 'XX-XX ', case=False)
+Methods like ``contains``, ``startswith``, and ``endswith`` take an extra
+``na`` argument so missing values can be considered True or False:
+
+.. ipython:: python
+
+ s4 = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
+ s4.str.contains('A', na=False)
+
.. csv-table::
:header: "Method", "Description"
:widths: 20, 80
View
32 pandas/core/strings.py
@@ -210,7 +210,7 @@ def str_count(arr, pat, flags=0):
return _na_map(f, arr)
-def str_contains(arr, pat, case=True, flags=0):
+def str_contains(arr, pat, case=True, flags=0, na=np.nan):
"""
Check whether given pattern is contained in each string in the array
@@ -222,6 +222,7 @@ def str_contains(arr, pat, case=True, flags=0):
If True, case sensitive
flags : int, default 0 (no flags)
re module flags, e.g. re.IGNORECASE
+ na : bool, default NaN
Returns
-------
@@ -233,10 +234,10 @@ def str_contains(arr, pat, case=True, flags=0):
regex = re.compile(pat, flags=flags)
f = lambda x: bool(regex.search(x))
- return _na_map(f, arr)
+ return _na_map(f, arr, na)
-def str_startswith(arr, pat):
+def str_startswith(arr, pat, na=np.nan):
"""
Return boolean array indicating whether each string starts with passed
pattern
@@ -245,16 +246,17 @@ def str_startswith(arr, pat):
----------
pat : string
Character sequence
+ na : bool, default NaN
Returns
-------
startswith : array (boolean)
"""
f = lambda x: x.startswith(pat)
- return _na_map(f, arr)
+ return _na_map(f, arr, na)
-def str_endswith(arr, pat):
+def str_endswith(arr, pat, na=np.nan):
"""
Return boolean array indicating whether each string ends with passed
pattern
@@ -263,13 +265,14 @@ def str_endswith(arr, pat):
----------
pat : string
Character sequence
+ na : bool, default NaN
Returns
-------
endswith : array (boolean)
"""
f = lambda x: x.endswith(pat)
- return _na_map(f, arr)
+ return _na_map(f, arr, na)
def str_lower(arr):
@@ -637,7 +640,7 @@ def wrapper(self):
return wrapper
-def _pat_wrapper(f, flags=False):
+def _pat_wrapper(f, flags=False, na=False):
def wrapper1(self, pat):
result = f(self.series, pat)
return self._wrap_result(result)
@@ -646,7 +649,11 @@ def wrapper2(self, pat, flags=0):
result = f(self.series, pat, flags=flags)
return self._wrap_result(result)
- wrapper = wrapper2 if flags else wrapper1
+ def wrapper3(self, pat, na=np.nan):
+ result = f(self.series, pat, na=na)
+ return self._wrap_result(result)
+
+ wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
wrapper.__name__ = f.__name__
if f.__doc__:
@@ -709,8 +716,9 @@ def join(self, sep):
return self._wrap_result(result)
@copy(str_contains)
- def contains(self, pat, case=True, flags=0):
- result = str_contains(self.series, pat, case=case, flags=flags)
+ def contains(self, pat, case=True, flags=0, na=np.nan):
+ result = str_contains(self.series, pat, case=case, flags=flags,
+ na=np.nan)
return self._wrap_result(result)
@copy(str_replace)
@@ -753,8 +761,8 @@ def encode(self, encoding):
return self._wrap_result(result)
count = _pat_wrapper(str_count, flags=True)
- startswith = _pat_wrapper(str_startswith)
- endswith = _pat_wrapper(str_endswith)
+ startswith = _pat_wrapper(str_startswith, na=True)
+ endswith = _pat_wrapper(str_endswith, na=True)
findall = _pat_wrapper(str_findall, flags=True)
match = _pat_wrapper(str_match, flags=True)
View
10 pandas/tests/test_strings.py
@@ -115,6 +115,10 @@ def test_contains(self):
expected = [False, np.nan, True, True]
tm.assert_almost_equal(result, expected)
+ result = strings.str_contains(values, pat, na=False)
+ expected = [False, False, True, True]
+ tm.assert_almost_equal(result, expected)
+
values = ['foo', 'xyz', 'fooommm__foo', 'mmm_']
result = strings.str_contains(values, pat)
expected = [False, False, True, True]
@@ -146,6 +150,9 @@ def test_startswith(self):
exp = Series([False, NA, True, False, False, NA, True])
tm.assert_series_equal(result, exp)
+ result = values.str.startswith('foo', na=True)
+ tm.assert_series_equal(result, exp.fillna(True).astype(bool))
+
def test_endswith(self):
values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
@@ -171,6 +178,9 @@ def test_endswith(self):
exp = Series([False, NA, False, False, True, NA, True])
tm.assert_series_equal(result, exp)
+ result = values.str.endswith('foo', na=False)
+ tm.assert_series_equal(result, exp.fillna(False).astype(bool))
+
def test_lower_upper(self):
values = Series(['om', NA, 'nom', 'nom'])

0 comments on commit 7a1ea0a

Please sign in to comment.