Squash me, pr feedback

quantopian · Mar 7, 2017 · 0d9ca7b · 0d9ca7b
1 parent 72ef80f
commit 0d9ca7b
Show file tree

Hide file tree

Showing 3 changed files with 150 additions and 74 deletions.
diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
@@ -24,7 +24,7 @@
 import pandas as pd
 from scipy.stats.mstats import winsorize as scipy_winsorize
 
-from zipline.errors import UnknownRankMethod
+from zipline.errors import BadPercentileBounds, UnknownRankMethod
 from zipline.lib.labelarray import LabelArray
 from zipline.lib.rank import masked_rankdata_2d
 from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply
@@ -742,15 +742,45 @@ def test_winsorize_hand_computed(self):
         )
 
         terms = {
-            'winsor_1': f.winsorize(limits=33),
-            'winsor_2': f.winsorize(limits=(49, None)),
-            'winsor_3': f.winsorize(limits=33, inclusive=(False, False)),
-            'winsor_4': f.winsorize(limits=33, inclusive=(True, False)),
-            'masked': f.winsorize(limits=33, mask=m),
-            'grouped': f.winsorize(limits=34, groupby=c),
-            'grouped_str': f.winsorize(limits=34, groupby=str_c),
-            'grouped_masked': f.winsorize(limits=34, mask=m, groupby=c),
-            'grouped_masked_str': f.winsorize(limits=34, mask=m, groupby=str_c)
+            'winsor_1': f.winsorize(
+                min_percentile=0.33,
+                max_percentile=0.67
+            ),
+            'winsor_2': f.winsorize(
+                min_percentile=0.49,
+                max_percentile=1
+            ),
+            'winsor_3': f.winsorize(
+                min_percentile=0,
+                max_percentile=.67
+            ),
+            'masked': f.winsorize(
+                min_percentile=0.33,
+                max_percentile=0.67,
+                mask=m
+            ),
+            'grouped': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                groupby=c
+            ),
+            'grouped_str': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                groupby=str_c
+            ),
+            'grouped_masked': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                mask=m,
+                groupby=c
+            ),
+            'grouped_masked_str': f.winsorize(
+                min_percentile=0.34,
+                max_percentile=0.66,
+                mask=m,
+                groupby=str_c
+            ),
         }
         expected = {
             'winsor_1': array([
@@ -764,14 +794,9 @@ def test_winsorize_hand_computed(self):
                 [6.0,    5.,    4.,    3.,    3.,    3.]
             ]),
             'winsor_3': array([
-                [3.0,   3.,   3.,   4.,   4.,   4.],
-                [27.,  27.,  27.,  64.,  64.,  64.],
-                [4.0,   4.,   4.,   3.,   3.,   3.]
-            ]),
-            'winsor_4': array([
-                [2.,   2.,   3.,   4.,   4.,   4.],
-                [8.,   8.,  27.,  64.,  64.,  64.],
-                [4.,   4.,   4.,   3.,   2.,   2.]
+                [1.,    2.,    3.,    4.,    5.,    5.],
+                [1.,    8.,   27.,   64.,  125.,  125.],
+                [5.,    5.,    4.,    3.,    2.,    1.]
             ]),
             'masked': array([
                 [nan,    3.,    3.,    4.,    5.,    5.],
@@ -787,7 +812,7 @@ def test_winsorize_hand_computed(self):
                 [nan,    2.,    3.,    5.,    5.,    5.],
                 [1.0,   nan,   27.,  125.,  125.,  125.],
                 [6.0,    5.,    nan,    2.,    2.,   2.]
-            ])
+            ]),
         }
         # Changing the classifier dtype shouldn't affect anything.
         expected['grouped_str'] = expected['grouped']
@@ -806,15 +831,34 @@ def test_winsorize_hand_computed(self):
             check=partial(check_allclose, atol=0.001),
         )
 
+    def test_winsorize_bad_bounds(self):
+        """
+        Test that factor.winsorize rejects invalid percentile bounds.
+        """
+        bad_percentiles = [
+            (-.1, 1),    # min below 0
+            (0, 95),     # max above 1 (old 0-100 scale)
+            (5, 95),     # both above 1
+            (5, 5),      # both above 1, and min == max
+            (.5, .5),    # in range, but min == max
+            (.9, .1),    # in range, but min > max
+        ]
+        for min_, max_ in bad_percentiles:
+            with self.assertRaises(BadPercentileBounds):
+                self.f.winsorize(min_percentile=min_, max_percentile=max_)
+
     @parameter_space(
         seed_value=range(1, 2),
         normalizer_name_and_func=[
             ('demean', {}, lambda row: row - nanmean(row)),
             ('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
             (
                 'winsorize',
-                {"limits": 25.},
-                lambda row: scipy_winsorize(row, limits=0.25)
+                {"min_percentile": 0.25, "max_percentile": 0.75},
+                lambda row: scipy_winsorize(
+                    row,
+                    limits=0.25,
+                )
             ),
         ],
         add_nulls_to_factor=(False, True,),
@@ -1154,7 +1198,7 @@ def test_zscore(self):
         self.assertEqual(r, "GroupedRowTransform('zscore')")
 
     def test_winsorize(self):
-        r = F().winsorize(limits=5).short_repr()
+        r = F().winsorize(min_percentile=.05, max_percentile=.95).short_repr()
         self.assertEqual(r, "GroupedRowTransform('winsorize')")
 
 
@@ -1168,10 +1212,22 @@ def test_demean_is_window_safe_if_input_is_window_safe(self):
         self.assertFalse(F(window_safe=False).demean().window_safe)
         self.assertTrue(F(window_safe=True).demean().window_safe)
 
-    def test_winsorize_is_window_safe(self):
-        self.assertFalse(F().winsorize(limits=5).window_safe)
-        self.assertFalse(F(window_safe=False).winsorize(limits=5).window_safe)
-        self.assertTrue(F(window_safe=True).winsorize(limits=5).window_safe)
+    def test_winsorize_is_window_safe_if_input_is_window_safe(self):
+        self.assertFalse(  # F() is built without a window_safe argument.
+            F().winsorize(min_percentile=.05, max_percentile=.95).window_safe
+        )
+        self.assertFalse(  # Input is explicitly not window safe.
+            F(window_safe=False).winsorize(
+                min_percentile=.05,
+                max_percentile=.95
+            ).window_safe
+        )
+        self.assertTrue(  # Window-safe input: the result must be too.
+            F(window_safe=True).winsorize(
+                min_percentile=.05,
+                max_percentile=.95
+            ).window_safe
+        )
 
 
 class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):

diff --git a/zipline/errors.py b/zipline/errors.py
@@ -569,9 +569,15 @@ class BadPercentileBounds(ZiplineError):
     Raised by API functions accepting percentile bounds when the passed bounds
     are invalid.
     """
+
+    def __init__(self, **kwargs):
+        # Default to the 0-100 scale when the caller gives no upper bound.
+        kwargs.setdefault("upper_bound", 100.0)
+        self.kwargs = kwargs
+
     msg = (
-        "Percentile bounds must fall between 0.0 and 100.0, and min must be "
-        "less than max."
+        "Percentile bounds must fall between 0.0 and {upper_bound}, and min "
+        "must be less than max."
         "\nInputs were min={min_percentile}, max={max_percentile}."
     )
 

diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
@@ -4,12 +4,12 @@
 from functools import wraps
 from operator import attrgetter
 from numbers import Number
+from math import ceil
 
 from numpy import empty_like, inf, nan, where
 from scipy.stats import rankdata
-from scipy.stats.mstats import winsorize as scipy_winsorize
 
-from zipline.errors import UnknownRankMethod
+from zipline.errors import BadPercentileBounds, UnknownRankMethod
 from zipline.lib.normalize import naive_grouped_rowwise_apply
 from zipline.lib.rank import masked_rankdata_2d, rankdata_1d_descending
 from zipline.pipeline.api_utils import restrict_to_dtype
@@ -834,19 +834,24 @@ def linear_regression(self, target, regression_length, mask=NotSpecified):
             mask=mask,
         )
 
+    @expect_types(
+        min_percentile=(int, float),
+        max_percentile=(int, float),
+        mask=(Filter, NotSpecifiedType),
+        groupby=(Classifier, NotSpecifiedType),
+    )
     @float64_only
     def winsorize(self,
-                  limits,
-                  inclusive=(True, True),
+                  min_percentile,
+                  max_percentile,
                   mask=NotSpecified,
                   groupby=NotSpecified):
         """
-        Construct a Factor returns a winsorized row for results. Winsorizing
-        clips the input values to fixed percentiles. The (limits[0])th lowest
-        values are set to the value at the (limits[0])th percentile. The values
-        above the (limits[1])th percentiles are set to the value at the
-        (limits[1])th percentile. This is useful when limiting the impact of
-        extreme values.
+        Construct a Factor that returns a winsorized row. Winsorizing changes
+        values ranked below the minimum percentile to the value at the minimum
+        percentile. Similarly, values ranking above the maximum percentile will
+        be changed to the value at the maximum percentile. This is useful
+        when limiting the impact of extreme values.
 
         If ``mask`` is supplied, ignore values where ``mask`` returns False
         when computing row means and standard deviations, and output NaN
@@ -858,14 +863,14 @@ def winsorize(self,
 
         Parameters
         ----------
-        limits : None, tuple of float, optional
-            A tuple of two values between 0 and 100 inclusive. This is the
-            percentage to cut from each tail of the array. A value of None
-            can be used to indicate an open limit.
-        inclusive : a tuple of bool, optional
-            A bool indicating whether the data on each side should be
-            rounded(True) or truncated(False). A value of None can be used if
-            one side is not being winsorized. Default is (False, False).
+        min_percentile: float, int
+            Entries with values at or below this percentile will be replaced
+            with the (len(inp) * min_percentile)th lowest value. If low values
+            should not be clipped, use 0.
+        max_percentile: float, int
+            Entries with values at or above this percentile will be replaced
+            with the (len(inp) * max_percentile)th lowest value. If high
+            values should not be clipped, use 1.
         mask : zipline.pipeline.Filter, optional
             A Filter defining values to ignore when winsorizing.
         groupby : zipline.pipeline.Classifier, optional
@@ -882,34 +887,43 @@ def winsorize(self,
         price = USEquityPricing.close.latest
         columns={
             'PRICE': price,
-            'WINSOR_1: price.winsorize(limits=25),
-            'WINSOR_2': price.winsorize(limits=(50, None)),
+            'WINSOR_1': price.winsorize(
+                min_percentile=0.25, max_percentile=0.75
+            ),
+            'WINSOR_2': price.winsorize(
+                min_percentile=0.50, max_percentile=1.0
+            ),
             'WINSOR_3': price.winsorize(
-                limits=25, inclusive=(False, False)
+                min_percentile=0.0, max_percentile=0.5
             ),
-            'WINSOR_4': price.winsorize(limits=25, inclusive=(True, False)),
-            'WINSOR_5': price.winsorize(limits=(20, 40)),
+
         }
 
         Given a pipeline with columns, defined above, the result for a
         given day could look like:
 
-                'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' 'WINSOR_4' 'WINSOR_5'
-        Asset_1    1        2          4          3          2          2
-        Asset_2    2        2          4          3          2          2
-        Asset_3    3        3          4          3          3          2
-        Asset_4    4        4          4          4          4          4
-        Asset_5    5        5          5          4          4          4
-        Asset_6    6        5          5          4          4          4
+                'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3'
+        Asset_1    1        2          4          1
+        Asset_2    2        2          4          2
+        Asset_3    3        3          4          3
+        Asset_4    4        4          4          3
+        Asset_5    5        5          5          3
+        Asset_6    6        5          6          3
 
         See Also
         --------
         :func:`scipy.stats.mstats.winsorize`
         :meth:`pandas.DataFrame.groupby`
         """
+        if not 0.0 <= min_percentile < max_percentile <= 1.0:
+            raise BadPercentileBounds(
+                min_percentile=min_percentile,
+                max_percentile=max_percentile,
+                upper_bound=1.0,
+            )
         return GroupedRowTransform(
             transform=winsorize,
-            transform_args=(limits, inclusive),
+            transform_args=(min_percentile, max_percentile),
             factor=self,
             groupby=groupby,
             dtype=self.dtype,
@@ -1617,18 +1631,18 @@ def zscore(row):
     return (row - nanmean(row)) / nanstd(row)
 
 
-def winsorize(row, limits, inclusive):
-    if isinstance(limits, int) or isinstance(limits, float):
-        limits = limits / 100.
-    if isinstance(limits, tuple):
-        if limits[0] is not None:
-            limit_0 = limits[0] / 100.
-        else:
-            limit_0 = None
-        if limits[1] is not None:
-            limit_1 = limits[1] / 100
-        else:
-            limit_1 = None
-        limits = (limit_0, limit_1)
-
-    return scipy_winsorize(row, limits=limits, inclusive=inclusive)
+def winsorize(row, min_percentile, max_percentile):
+    """Return a copy of ``row`` with both tails clipped at the bounds."""
+    a = row.copy()
+    num = a.size
+    idx = a.argsort()
+    if num and min_percentile > 0:  # an empty row has nothing to clip
+        lowidx = int(min_percentile * num)
+        a[idx[:lowidx]] = a[idx[lowidx]]
+    if max_percentile < 1:
+        # int(): math.ceil returns a float on Python 2, which cannot slice.
+        upidx = int(ceil(num * max_percentile))
+        # upidx == num means the right tail needs no modification.
+        if upidx < num:
+            a[idx[upidx:]] = a[idx[upidx - 1]]
+    return a