Merge 6c88762 into 52b3329

quantopian · Apr 5, 2018 · 5d5a823 · 5d5a823
2 parents 52b3329 + 6c88762
commit 5d5a823
Show file tree

Hide file tree

Showing 2 changed files with 201 additions and 52 deletions.
diff --git a/tests/pipeline/test_factor.py b/tests/pipeline/test_factor.py
@@ -7,6 +7,7 @@
 from unittest import TestCase
 
 from toolz import compose
+import numpy as np
 from numpy import (
     apply_along_axis,
     arange,
@@ -20,7 +21,7 @@
     rot90,
     where,
 )
-from numpy.random import randn, seed
+from numpy.random import randn, seed, RandomState
 import pandas as pd
 from scipy.stats.mstats import winsorize as scipy_winsorize
 
@@ -35,6 +36,7 @@
     DailyReturns,
     Returns,
 )
+from zipline.pipeline.factors.factor import winsorize as zp_winsorize
 from zipline.testing import (
     check_allclose,
     check_arrays,
@@ -95,6 +97,39 @@ class Mask(Filter):
 ])
 
 
+def scipy_winsorize_with_nan_handling(array, limits):
+    """
+    Wrapper around scipy.stats.mstats.winsorize that handles NaNs correctly.
+
+    scipy's winsorize sorts NaNs to the end of the array when calculating
+    percentiles, so we winsorize only the non-NaN values and leave NaNs as-is.
+    """
+    # The basic idea of this function is to do the following:
+    # 1. Sort the input, sorting nans to the end of the array.
+    # 2. Call scipy winsorize on the non-nan portion of the input.
+    # 3. Undo the sorting to put the winsorized values back in their original
+    #    locations.
+
+    nancount = np.isnan(array).sum()
+    if nancount == len(array):
+        return array.copy()
+
+    sorter = array.argsort()
+    unsorter = sorter.argsort()  # argsorting a permutation gives its inverse!
+
+    if nancount:
+        sorted_non_nans = array[sorter][:-nancount]
+    else:
+        sorted_non_nans = array[sorter]
+
+    sorted_winsorized = np.hstack([
+        scipy_winsorize(sorted_non_nans, limits).data,
+        np.full(nancount, np.nan),
+    ])
+
+    return sorted_winsorized[unsorter]
+
+
 class FactorTestCase(BasePipelineTestCase):
 
     def init_instance_fixtures(self):
@@ -698,20 +733,26 @@ def test_winsorize_hand_computed(self):
         str_c = C(dtype=categorical_dtype, missing_value=None)
 
         factor_data = array([
-            [1.,     2.,  3.,  4.,   5.,   6.],
-            [1.,     8., 27., 64., 125., 216.],
-            [6.,     5.,  4.,  3.,   2.,   1.]
+            [1.,     2.,  3.,  4.,   5.,   6.,  7.,  8.,  9.],
+            [1.,     2.,  3.,  4.,   5.,   6., nan, nan, nan],
+            [1.,     8., 27., 64., 125., 216., nan, nan, nan],
+            [6.,     5.,  4.,  3.,   2.,   1., nan, nan, nan],
+            [nan,   nan, nan, nan,  nan,  nan, nan, nan, nan],
         ])
         filter_data = array(
-            [[False, True, True, True, True, True],
-             [True, False, True, True, True, True],
-             [True, True, False, True, True, True]],
+            [[1, 1, 1, 1, 1, 1, 1, 1, 1],
+             [0, 1, 1, 1, 1, 1, 1, 1, 1],
+             [1, 0, 1, 1, 1, 1, 1, 1, 1],
+             [1, 1, 0, 1, 1, 1, 1, 1, 1],
+             [1, 1, 1, 0, 1, 1, 1, 1, 1]],
             dtype=bool,
         )
         classifier_data = array(
-            [[1, 1, 1, 2, 2, 2],
-             [1, 1, 1, 2, 2, 2],
-             [1, 1, 1, 2, 2, 2]],
+            [[1, 1, 1, 2, 2, 2, 1, 1, 1],
+             [1, 1, 1, 2, 2, 2, 1, 1, 1],
+             [1, 1, 1, 2, 2, 2, 1, 1, 1],
+             [1, 1, 1, 2, 2, 2, 1, 1, 1],
+             [1, 1, 1, 2, 2, 2, 1, 1, 1]],
             dtype=int64_dtype,
         )
         string_classifier_data = LabelArray(
@@ -762,34 +803,47 @@ def test_winsorize_hand_computed(self):
         }
         expected = {
             'winsor_1': array([
-                [2.,    2.,    3.,    4.,    5.,    5.],
-                [8.,    8.,   27.,   64.,  125.,  125.],
-                [5.,    5.,    4.,    3.,    2.,    2.]
+                [3.,    3.,    3.,    4.,    5.,    6.,  7.,  7.,  7.],
+                [2.,    2.,    3.,    4.,    5.,    5., nan, nan, nan],
+                [8.,    8.,   27.,   64.,  125.,  125., nan, nan, nan],
+                [5.,    5.,    4.,    3.,    2.,    2., nan, nan, nan],
+                [nan,  nan,   nan,   nan,   nan,   nan, nan, nan, nan],
             ]),
             'winsor_2': array([
-                [3.0,    3.,    3.,    4.,    5.,    6.],
-                [27.,   27.,   27.,   64.,  125.,  216.],
-                [6.0,    5.,    4.,    3.,    3.,    3.]
+                [5.,     5.,    5.,    5.,    5.,    6.,  7.,  8.,  9.],
+                [3.0,    3.,    3.,    4.,    5.,    6., nan, nan, nan],
+                [27.,   27.,   27.,   64.,  125.,  216., nan, nan, nan],
+                [6.0,    5.,    4.,    3.,    3.,    3., nan, nan, nan],
+                [nan,   nan,   nan,   nan,   nan,   nan, nan, nan, nan],
             ]),
             'winsor_3': array([
-                [1.,    2.,    3.,    4.,    5.,    5.],
-                [1.,    8.,   27.,   64.,  125.,  125.],
-                [5.,    5.,    4.,    3.,    2.,    1.]
+                [1.,    2.,    3.,    4.,    5.,    6.,  7.,  7.,  7.],
+                [1.,    2.,    3.,    4.,    5.,    5., nan, nan, nan],
+                [1.,    8.,   27.,   64.,  125.,  125., nan, nan, nan],
+                [5.,    5.,    4.,    3.,    2.,    1., nan, nan, nan],
+                [nan,  nan,   nan,   nan,   nan,   nan, nan, nan, nan],
             ]),
             'masked': array([
-                [nan,    3.,    3.,    4.,    5.,    5.],
-                [27.,   nan,   27.,   64.,  125.,  125.],
-                [5.0,    5.,    nan,    3.,    2.,   2.]
+                # no mask on first row
+                [3.,     3.,    3.,    4.,    5.,    6.,  7.,  7.,  7.],
+                [nan,    3.,    3.,    4.,    5.,    5., nan, nan, nan],
+                [27.,   nan,   27.,   64.,  125.,  125., nan, nan, nan],
+                [5.0,    5.,    nan,   3.,    2.,    2., nan, nan, nan],
+                [nan,   nan,   nan,   nan,   nan,   nan, nan, nan, nan],
             ]),
             'grouped': array([
-                [2.,    2.,    2.,    5.,    5.,    5.],
-                [8.,    8.,    8.,  125.,  125.,  125.],
-                [5.,    5.,    5.,    2.,    2.,    2.]
+                [3.,    3.,    3.,    5.,    5.,    5.,  7.,  7.,  7.],
+                [2.,    2.,    2.,    5.,    5.,    5., nan, nan, nan],
+                [8.,    8.,    8.,  125.,  125.,  125., nan, nan, nan],
+                [5.,    5.,    5.,    2.,    2.,    2., nan, nan, nan],
+                [nan,  nan,   nan,   nan,   nan,   nan, nan, nan, nan],
             ]),
             'grouped_masked': array([
-                [nan,    2.,    3.,    5.,    5.,    5.],
-                [1.0,   nan,   27.,  125.,  125.,  125.],
-                [6.0,    5.,    nan,    2.,    2.,   2.]
+                [3.,     3.,    3.,    5.,    5.,    5.,  7.,  7.,  7.],
+                [nan,    2.,    3.,    5.,    5.,    5., nan, nan, nan],
+                [1.0,   nan,   27.,  125.,  125.,  125., nan, nan, nan],
+                [6.0,    5.,   nan,    2.,    2.,    2., nan, nan, nan],
+                [nan,   nan,   nan,   nan,   nan,   nan, nan, nan, nan],
             ]),
         }
         # Changing the classifier dtype shouldn't affect anything.
@@ -809,6 +863,88 @@ def test_winsorize_hand_computed(self):
             check=partial(check_allclose, atol=0.001),
         )
 
+    def test_winsorize_no_nans(self):
+        data = array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
+        permutation = array([2, 1, 6, 8, 7, 5, 3, 9, 4, 0])
+
+        for perm in slice(None), permutation:
+            # Winsorize both tails at 10% (10th and 90th percentiles).
+            result = zp_winsorize(data[perm], 0.1, 0.9)
+            expected = array([1., 1., 2., 3., 4., 5., 6., 7., 8., 8.])[perm]
+            assert_equal(result, expected)
+
+            # Winsorize both tails at 20% (20th and 80th percentiles).
+            result = zp_winsorize(data[perm], 0.2, 0.8)
+            expected = array([2., 2., 2., 3., 4., 5., 6., 7., 7., 7.])[perm]
+            assert_equal(result, expected)
+
+            # Winsorize just the upper tail.
+            result = zp_winsorize(data[perm], 0.0, 0.8)
+            expected = array([0., 1., 2., 3., 4., 5., 6., 7., 7., 7.])[perm]
+            assert_equal(result, expected)
+
+            # Winsorize just the lower tail.
+            result = zp_winsorize(data[perm], 0.2, 1.0)
+            expected = array([2., 2., 2., 3., 4., 5., 6., 7., 8., 9.])[perm]
+            assert_equal(result, expected)
+
+            # Don't winsorize.
+            result = zp_winsorize(data[perm], 0.0, 1.0)
+            expected = array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])[perm]
+            assert_equal(result, expected)
+
+    def test_winsorize_nans(self):
+        # 5 low non-nan values, then some nans, then 5 high non-nans.
+        data = array([4.0, 3.0, 0.0, 1.0, 2.0,
+                      nan, nan, nan,
+                      9.0, 5.0, 6.0, 8.0, 7.0])
+
+        # Winsorize both tails at 10%.
+        # 0.0 -> 1.0
+        # 9.0 -> 8.0
+        result = zp_winsorize(data, 0.10, 0.90)
+        expected = array([4.0, 3.0, 1.0, 1.0, 2.0,
+                          nan, nan, nan,
+                          8.0, 5.0, 6.0, 8.0, 7.0])
+        assert_equal(result, expected)
+
+        # Winsorize both tails at 20%.
+        # 0.0 and 1.0 -> 2.0
+        # 9.0 and 8.0 -> 7.0
+        result = zp_winsorize(data, 0.20, 0.80)
+        expected = array([4.0, 3.0, 2.0, 2.0, 2.0,
+                          nan, nan, nan,
+                          7.0, 5.0, 6.0, 7.0, 7.0])
+        assert_equal(result, expected)
+
+        # Winsorize just the upper tail.
+        result = zp_winsorize(data, 0, 0.8)
+        expected = array([4.0, 3.0, 0.0, 1.0, 2.0,
+                          nan, nan, nan,
+                          7.0, 5.0, 6.0, 7.0, 7.0])
+        assert_equal(result, expected)
+
+        # Winsorize just the lower tail.
+        result = zp_winsorize(data, 0.2, 1.0)
+        expected = array([4.0, 3.0, 2.0, 2.0, 2.0,
+                          nan, nan, nan,
+                          9.0, 5.0, 6.0, 8.0, 7.0])
+        assert_equal(result, expected)
+
+    @parameter_space(seed=[0, 1, 2], __fail_fast=True)
+    def test_winsorize_randomized(self, seed):
+        state = RandomState(seed)
+        data = state.randn(50)
+        data[:5] = nan
+
+        # Permuting and then winsorizing should be the same as winsorizing and
+        # then permuting.
+        permutation = state.permutation(50)
+        assert_equal(
+            zp_winsorize(data[permutation], 0.1, 0.9),
+            zp_winsorize(data, 0.1, 0.9)[permutation],
+        )
+
     def test_winsorize_bad_bounds(self):
         """
         Test out of bounds input for factor.winsorize.
@@ -827,14 +963,14 @@ def test_winsorize_bad_bounds(self):
                 f.winsorize(min_percentile=min_, max_percentile=max_)
 
     @parameter_space(
-        seed_value=range(1, 2),
+        seed_value=[1, 2],
         normalizer_name_and_func=[
             ('demean', {}, lambda row: row - nanmean(row)),
             ('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
             (
                 'winsorize',
                 {"min_percentile": 0.25, "max_percentile": 0.75},
-                lambda row: scipy_winsorize(
+                lambda row: scipy_winsorize_with_nan_handling(
                     row,
                     limits=0.25,
                 )

diff --git a/zipline/pipeline/factors/factor.py b/zipline/pipeline/factors/factor.py
@@ -5,7 +5,7 @@
 from numbers import Number
 from math import ceil
 
-from numpy import empty_like, inf, nan, where
+from numpy import empty_like, inf, isnan, nan, where
 from scipy.stats import rankdata
 
 from zipline.utils.compat import wraps
@@ -855,29 +855,32 @@ def winsorize(self,
                   mask=NotSpecified,
                   groupby=NotSpecified):
         """
-        Construct a Factor returns a winsorized row. Winsorizing changes values
-        ranked less than the minimum percentile to to value at the minimum
-        percentile. Similarly, values ranking above the maximum percentile will
-        be changed to the value at the maximum percentile. This is useful
-        when limiting the impact of extreme values.
+        Construct a new factor that winsorizes the result of this factor.
+
+        Winsorizing changes values ranked less than the minimum percentile to
+        the value at the minimum percentile. Similarly, values ranking above
+        the maximum percentile are changed to the value at the maximum
+        percentile.
+
+        Winsorizing is useful for limiting the impact of extreme data points
+        without completely removing those points.
 
         If ``mask`` is supplied, ignore values where ``mask`` returns False
-        when computing row means and standard deviations, and output NaN
-        anywhere the mask is False.
+        when computing percentile cutoffs, and output NaN anywhere the mask is
+        False.
 
-        If ``groupby`` is supplied, compute by partitioning each row based on
-        the values produced by ``groupby``, winsorizing the partitioned arrays,
-        and stitching the sub-results back together.
+        If ``groupby`` is supplied, winsorization is applied separately to
+        each group defined by ``groupby``.
 
         Parameters
         ----------
         min_percentile: float, int
             Entries with values at or below this percentile will be replaced
-            with the (len(inp) * min_percentile)th lowest value. If low values
-            should not be clipped, use 0.
+            with the (len(input) * min_percentile)th lowest value. If low
+            values should not be clipped, use 0.
         max_percentile: float, int
             Entries with values at or above this percentile will be replaced
-            with the (len(inp) * max_percentile)th lowest value. If high
+            with the (len(input) * max_percentile)th lowest value. If high
             values should not be clipped, use 1.
         mask : zipline.pipeline.Filter, optional
             A Filter defining values to ignore when winsorizing.
@@ -1663,16 +1666,26 @@ def winsorize(row, min_percentile, max_percentile):
     This implementation is based on scipy.stats.mstats.winsorize
     """
     a = row.copy()
-    num = a.size
+    nan_count = isnan(row).sum()
+    nonnan_count = a.size - nan_count
+
+    # NOTE: argsort() sorts nans to the end of the array.
     idx = a.argsort()
+
+    # Set values at indices below the min percentile to the value of the entry
+    # at the cutoff.
     if min_percentile > 0:
-        lowidx = int(min_percentile * num)
-        a[idx[:lowidx]] = a[idx[lowidx]]
+        lower_cutoff = int(min_percentile * nonnan_count)
+        a[idx[:lower_cutoff]] = a[idx[lower_cutoff]]
+
+    # Set values at indices above the max percentile to the value of the entry
+    # at the cutoff.
     if max_percentile < 1:
-        upidx = int(ceil(num * max_percentile))
-        # upidx could return as the length of the array, in this case
-        # no modification to the right tail is necessary.
-        if upidx < num:
-            a[idx[upidx:]] = a[idx[upidx - 1]]
+        upper_cutoff = int(ceil(nonnan_count * max_percentile))
+        # if max_percentile is close to 1, upper_cutoff can equal
+        # nonnan_count, in which case no values need to be clipped.
+        if upper_cutoff < nonnan_count:
+            start_of_nans = (-nan_count) if nan_count else None
+            a[idx[upper_cutoff:start_of_nans]] = a[idx[upper_cutoff - 1]]
 
     return a