Merge 1cd8de1 into f90cd1c

quantopian · Mar 7, 2017 · 6da1b8c · 6da1b8c
2 parents f90cd1c + 1cd8de1
commit 6da1b8c
Show file tree

Hide file tree

Showing 2 changed files with 217 additions and 5 deletions.
diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
@@ -22,6 +22,7 @@
 )
 from numpy.random import randn, seed
 import pandas as pd
+from scipy.stats.mstats import winsorize as scipy_winsorize
 
 from zipline.errors import UnknownRankMethod
 from zipline.lib.labelarray import LabelArray
@@ -709,11 +710,112 @@ def test_normalizations_hand_computed(self):
             check=partial(check_allclose, atol=0.001),
         )
 
+    def test_winsorize_hand_computed(self):
+        """
+        Check ``Factor.winsorize`` against results worked out by hand.
+
+        Exercises plain, masked, and grouped winsorization, grouping with
+        both an integer classifier and an equivalent string classifier.
+        """
+        factor = self.f
+        mask_filter = Mask()
+        int_classifier = C()
+        str_classifier = C(dtype=categorical_dtype, missing_value=None)
+
+        # Raw inputs fed to the engine through the initial workspace.
+        factor_values = array([
+            [1., 2., 3., 4., 5., 6.],
+            [1., 8., 27., 64., 125., 216.],
+            [6., 5., 4., 3., 2., 1.],
+        ])
+        # Each row masks out exactly one asset (the False entry).
+        mask_values = array(
+            [
+                [False, True, True, True, True, True],
+                [True, False, True, True, True, True],
+                [True, True, False, True, True, True],
+            ],
+            dtype=bool,
+        )
+        # First three assets are in group 1, last three in group 2.
+        int_labels = array([[1, 1, 1, 2, 2, 2]] * 3, dtype=int64_dtype)
+        str_labels = LabelArray(
+            int_labels.astype(str).astype(object),
+            missing_value=None,
+        )
+
+        winsor = factor.winsorize
+        terms = {
+            'winsor_1': winsor(limits=33),
+            'winsor_2': winsor(limits=(49, None)),
+            'winsor_3': winsor(limits=33, inclusive=(False, False)),
+            'winsor_4': winsor(limits=33, inclusive=(True, False)),
+            'masked': winsor(limits=33, mask=mask_filter),
+            'grouped': winsor(limits=34, groupby=int_classifier),
+            'grouped_str': winsor(limits=34, groupby=str_classifier),
+            'grouped_masked': winsor(
+                limits=34, mask=mask_filter, groupby=int_classifier
+            ),
+            'grouped_masked_str': winsor(
+                limits=34, mask=mask_filter, groupby=str_classifier
+            ),
+        }
+
+        # The grouped results are shared between the int- and str-classifier
+        # terms: changing the classifier dtype shouldn't affect anything.
+        grouped_result = array([
+            [2., 2., 2., 5., 5., 5.],
+            [8., 8., 8., 125., 125., 125.],
+            [5., 5., 5., 2., 2., 2.],
+        ])
+        grouped_masked_result = array([
+            [nan, 2., 3., 5., 5., 5.],
+            [1., nan, 27., 125., 125., 125.],
+            [6., 5., nan, 2., 2., 2.],
+        ])
+        expected = {
+            'winsor_1': array([
+                [2., 2., 3., 4., 5., 5.],
+                [8., 8., 27., 64., 125., 125.],
+                [5., 5., 4., 3., 2., 2.],
+            ]),
+            'winsor_2': array([
+                [3., 3., 3., 4., 5., 6.],
+                [27., 27., 27., 64., 125., 216.],
+                [6., 5., 4., 3., 3., 3.],
+            ]),
+            'winsor_3': array([
+                [3., 3., 3., 4., 4., 4.],
+                [27., 27., 27., 64., 64., 64.],
+                [4., 4., 4., 3., 3., 3.],
+            ]),
+            'winsor_4': array([
+                [2., 2., 3., 4., 4., 4.],
+                [8., 8., 27., 64., 64., 64.],
+                [4., 4., 4., 3., 2., 2.],
+            ]),
+            'masked': array([
+                [nan, 3., 3., 4., 5., 5.],
+                [27., nan, 27., 64., 125., 125.],
+                [5., 5., nan, 3., 2., 2.],
+            ]),
+            'grouped': grouped_result,
+            'grouped_str': grouped_result,
+            'grouped_masked': grouped_masked_result,
+            'grouped_masked_str': grouped_masked_result,
+        }
+
+        workspace = {
+            factor: factor_values,
+            int_classifier: int_labels,
+            str_classifier: str_labels,
+            mask_filter: mask_values,
+        }
+        self.check_terms(
+            terms,
+            expected,
+            initial_workspace=workspace,
+            mask=self.build_mask(self.ones_mask(shape=factor_values.shape)),
+            check=partial(check_allclose, atol=0.001),
+        )
+
     @parameter_space(
         seed_value=range(1, 2),
         normalizer_name_and_func=[
-            ('demean', lambda row: row - nanmean(row)),
-            ('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
+            ('demean', {}, lambda row: row - nanmean(row)),
+            ('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
+            (
+                'winsorize',
+                {"limits": 25.},
+                lambda row: scipy_winsorize(row, limits=0.25)
+            ),
         ],
         add_nulls_to_factor=(False, True,),
     )
@@ -722,9 +824,9 @@ def test_normalizations_randomized(self,
                                        normalizer_name_and_func,
                                        add_nulls_to_factor):
 
-        name, func = normalizer_name_and_func
+        name, kwargs, func = normalizer_name_and_func
 
-        shape = (7, 7)
+        shape = (20, 20)
 
         # All Trues.
         nomask = self.ones_mask(shape=shape)
@@ -755,7 +857,7 @@ def test_normalizations_randomized(self,
         c = C()
         c_with_nulls = OtherC()
         m = Mask()
-        method = getattr(f, name)
+        method = partial(getattr(f, name), **kwargs)
         terms = {
             'vanilla': method(),
             'masked': method(mask=m),
@@ -1051,6 +1153,10 @@ def test_zscore(self):
         r = F().zscore().short_repr()
         self.assertEqual(r, "GroupedRowTransform('zscore')")
 
+    def test_winsorize(self):
+        """A winsorized factor's short repr names the 'winsorize' transform."""
+        winsorized = F().winsorize(limits=5)
+        self.assertEqual(
+            winsorized.short_repr(),
+            "GroupedRowTransform('winsorize')",
+        )
+
 
 class TestWindowSafety(TestCase):
 
@@ -1062,6 +1168,11 @@ def test_demean_is_window_safe_if_input_is_window_safe(self):
         self.assertFalse(F(window_safe=False).demean().window_safe)
         self.assertTrue(F(window_safe=True).demean().window_safe)
 
+    def test_winsorize_is_window_safe_if_input_is_window_safe(self):
+        """Winsorizing should carry over the window-safety of its input."""
+        from_default = F().winsorize(limits=5)
+        self.assertFalse(from_default.window_safe)
+
+        from_unsafe = F(window_safe=False).winsorize(limits=5)
+        self.assertFalse(from_unsafe.window_safe)
+
+        from_safe = F(window_safe=True).winsorize(limits=5)
+        self.assertTrue(from_safe.window_safe)
+
 
 class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
     @parameter_space(dtype_=(float64_dtype, datetime64ns_dtype))

diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
@@ -7,6 +7,7 @@
 
 from numpy import empty_like, inf, nan, where
 from scipy.stats import rankdata
+from scipy.stats.mstats import winsorize as scipy_winsorize
 
 from zipline.errors import UnknownRankMethod
 from zipline.lib.normalize import naive_grouped_rowwise_apply
@@ -833,6 +834,102 @@ def linear_regression(self, target, regression_length, mask=NotSpecified):
             mask=mask,
         )
 
+    @float64_only
+    def winsorize(self,
+                  limits,
+                  inclusive=(True, True),
+                  mask=NotSpecified,
+                  groupby=NotSpecified):
+        """
+        Construct a new Factor that winsorizes the results of ``self``.
+
+        Winsorizing clips the extreme values of each row: the lowest
+        ``limits[0]`` percent of values are set to the smallest value that is
+        not clipped, and the highest ``limits[1]`` percent of values are set
+        to the largest value that is not clipped. This is useful for limiting
+        the impact of extreme values.
+
+        If ``mask`` is supplied, ignore values where ``mask`` returns False
+        when winsorizing, and output NaN anywhere the mask is False.
+
+        If ``groupby`` is supplied, compute by partitioning each row based on
+        the values produced by ``groupby``, winsorizing the partitioned arrays,
+        and stitching the sub-results back together.
+
+        Parameters
+        ----------
+        limits : int, float, or tuple of (int, float, or None)
+            The percentage, between 0 and 100 inclusive, to cut from each
+            tail of the row. A single number is applied to both tails. A
+            tuple is interpreted as ``(lower, upper)``; a value of None in
+            either position leaves that tail open (unclipped).
+        inclusive : tuple of bool, optional
+            For each tail, whether the number of values to clip should be
+            truncated (True) or rounded (False). Default is (True, True).
+        mask : zipline.pipeline.Filter, optional
+            A Filter defining values to ignore when winsorizing.
+        groupby : zipline.pipeline.Classifier, optional
+            A classifier defining partitions over which to winsorize.
+
+        Returns
+        -------
+        winsorized : zipline.pipeline.Factor
+            A Factor producing a winsorized version of self.
+
+        Example
+        -------
+
+        price = USEquityPricing.close.latest
+        columns={
+            'PRICE': price,
+            'WINSOR_1': price.winsorize(limits=25),
+            'WINSOR_2': price.winsorize(limits=(50, None)),
+            'WINSOR_3': price.winsorize(
+                limits=25, inclusive=(False, False)
+            ),
+            'WINSOR_4': price.winsorize(limits=25, inclusive=(True, False)),
+            'WINSOR_5': price.winsorize(limits=(20, 40)),
+        }
+
+        Given a pipeline with columns, defined above, the result for a
+        given day could look like:
+
+                'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' 'WINSOR_4' 'WINSOR_5'
+        Asset_1    1        2          4          3          2          2
+        Asset_2    2        2          4          3          2          2
+        Asset_3    3        3          4          3          3          3
+        Asset_4    4        4          4          4          4          4
+        Asset_5    5        5          5          4          4          4
+        Asset_6    6        5          6          4          4          4
+
+        See Also
+        --------
+        :func:`scipy.stats.mstats.winsorize`
+        :meth:`pandas.DataFrame.groupby`
+        """
+        def as_fraction(percent):
+            # scipy's winsorize takes fractions in [0, 1] rather than
+            # percentages, and uses None for an open tail. The float divisor
+            # keeps integer percentages from being floor-divided to 0 on
+            # interpreters where ``/`` is integer division (Python 2).
+            return None if percent is None else percent / 100.
+
+        if isinstance(limits, tuple):
+            limits = (as_fraction(limits[0]), as_fraction(limits[1]))
+        elif isinstance(limits, (int, float)):
+            limits = as_fraction(limits)
+
+        # ``winsorize`` here resolves to the module-level row transform, not
+        # this method.
+        return GroupedRowTransform(
+            transform=winsorize,
+            transform_args=(limits, inclusive),
+            factor=self,
+            groupby=groupby,
+            dtype=self.dtype,
+            missing_value=self.missing_value,
+            mask=mask,
+            window_safe=self.window_safe,
+        )
+
     @expect_types(bins=int, mask=(Filter, NotSpecifiedType))
     def quantiles(self, bins, mask=NotSpecified):
         """
@@ -1530,3 +1627,7 @@ def demean(row):
 
 def zscore(row):
     return (row - nanmean(row)) / nanstd(row)
+
+
+def winsorize(row, limits, inclusive):
+    """
+    Winsorize a single row by delegating to scipy.stats.mstats.winsorize.
+
+    ``limits`` and ``inclusive`` are forwarded to scipy unchanged.
+    """
+    options = {'limits': limits, 'inclusive': inclusive}
+    return scipy_winsorize(row, **options)