Skip to content

Commit

Permalink
Merge 6c88762 into 52b3329
Browse files Browse the repository at this point in the history
  • Loading branch information
Scott Sanderson committed Apr 5, 2018
2 parents 52b3329 + 6c88762 commit 5d5a823
Show file tree
Hide file tree
Showing 2 changed files with 201 additions and 52 deletions.
196 changes: 166 additions & 30 deletions tests/pipeline/test_factor.py
Expand Up @@ -7,6 +7,7 @@
from unittest import TestCase

from toolz import compose
import numpy as np
from numpy import (
apply_along_axis,
arange,
Expand All @@ -20,7 +21,7 @@
rot90,
where,
)
from numpy.random import randn, seed
from numpy.random import randn, seed, RandomState
import pandas as pd
from scipy.stats.mstats import winsorize as scipy_winsorize

Expand All @@ -35,6 +36,7 @@
DailyReturns,
Returns,
)
from zipline.pipeline.factors.factor import winsorize as zp_winsorize
from zipline.testing import (
check_allclose,
check_arrays,
Expand Down Expand Up @@ -95,6 +97,39 @@ class Mask(Filter):
])


def scipy_winsorize_with_nan_handling(array, limits):
    """
    Apply scipy.stats.mstats.winsorize to the non-NaN entries of ``array``.

    NaN entries stay NaN and keep their original positions; every other
    entry is winsorized by scipy using ``limits``.  An input made up
    entirely of NaNs is returned as an unmodified copy.
    """
    nan_total = np.isnan(array).sum()
    if nan_total == len(array):
        # Nothing but NaNs: there is nothing for scipy to winsorize.
        return array.copy()

    # argsort() moves NaNs to the end, so after sorting the non-NaN values
    # form a contiguous prefix that can be handed to scipy on its own.
    order = array.argsort()
    # The argsort of a permutation is its inverse, which lets us restore the
    # caller's original ordering once winsorizing is done.
    inverse_order = order.argsort()

    in_order = array[order]
    # A ``[:-0]`` slice would be empty, so only trim when NaNs are present.
    finite_part = in_order[:-nan_total] if nan_total else in_order

    clipped = scipy_winsorize(finite_part, limits).data
    nan_tail = np.full(nan_total, np.nan)

    return np.hstack([clipped, nan_tail])[inverse_order]


class FactorTestCase(BasePipelineTestCase):

def init_instance_fixtures(self):
Expand Down Expand Up @@ -698,20 +733,26 @@ def test_winsorize_hand_computed(self):
str_c = C(dtype=categorical_dtype, missing_value=None)

factor_data = array([
[1., 2., 3., 4., 5., 6.],
[1., 8., 27., 64., 125., 216.],
[6., 5., 4., 3., 2., 1.]
[1., 2., 3., 4., 5., 6., 7., 8., 9.],
[1., 2., 3., 4., 5., 6., nan, nan, nan],
[1., 8., 27., 64., 125., 216., nan, nan, nan],
[6., 5., 4., 3., 2., 1., nan, nan, nan],
[nan, nan, nan, nan, nan, nan, nan, nan, nan],
])
filter_data = array(
[[False, True, True, True, True, True],
[True, False, True, True, True, True],
[True, True, False, True, True, True]],
[[1, 1, 1, 1, 1, 1, 1, 1, 1],
[0, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 0, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 0, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 0, 1, 1, 1, 1, 1]],
dtype=bool,
)
classifier_data = array(
[[1, 1, 1, 2, 2, 2],
[1, 1, 1, 2, 2, 2],
[1, 1, 1, 2, 2, 2]],
[[1, 1, 1, 2, 2, 2, 1, 1, 1],
[1, 1, 1, 2, 2, 2, 1, 1, 1],
[1, 1, 1, 2, 2, 2, 1, 1, 1],
[1, 1, 1, 2, 2, 2, 1, 1, 1],
[1, 1, 1, 2, 2, 2, 1, 1, 1]],
dtype=int64_dtype,
)
string_classifier_data = LabelArray(
Expand Down Expand Up @@ -762,34 +803,47 @@ def test_winsorize_hand_computed(self):
}
expected = {
'winsor_1': array([
[2., 2., 3., 4., 5., 5.],
[8., 8., 27., 64., 125., 125.],
[5., 5., 4., 3., 2., 2.]
[3., 3., 3., 4., 5., 6., 7., 7., 7.],
[2., 2., 3., 4., 5., 5., nan, nan, nan],
[8., 8., 27., 64., 125., 125., nan, nan, nan],
[5., 5., 4., 3., 2., 2., nan, nan, nan],
[nan, nan, nan, nan, nan, nan, nan, nan, nan],
]),
'winsor_2': array([
[3.0, 3., 3., 4., 5., 6.],
[27., 27., 27., 64., 125., 216.],
[6.0, 5., 4., 3., 3., 3.]
[5., 5., 5., 5., 5., 6., 7., 8., 9.],
[3.0, 3., 3., 4., 5., 6., nan, nan, nan],
[27., 27., 27., 64., 125., 216., nan, nan, nan],
[6.0, 5., 4., 3., 3., 3., nan, nan, nan],
[nan, nan, nan, nan, nan, nan, nan, nan, nan],
]),
'winsor_3': array([
[1., 2., 3., 4., 5., 5.],
[1., 8., 27., 64., 125., 125.],
[5., 5., 4., 3., 2., 1.]
[1., 2., 3., 4., 5., 6., 7., 7., 7.],
[1., 2., 3., 4., 5., 5., nan, nan, nan],
[1., 8., 27., 64., 125., 125., nan, nan, nan],
[5., 5., 4., 3., 2., 1., nan, nan, nan],
[nan, nan, nan, nan, nan, nan, nan, nan, nan],
]),
'masked': array([
[nan, 3., 3., 4., 5., 5.],
[27., nan, 27., 64., 125., 125.],
[5.0, 5., nan, 3., 2., 2.]
# no mask on first row
[3., 3., 3., 4., 5., 6., 7., 7., 7.],
[nan, 3., 3., 4., 5., 5., nan, nan, nan],
[27., nan, 27., 64., 125., 125., nan, nan, nan],
[5.0, 5., nan, 3., 2., 2., nan, nan, nan],
[nan, nan, nan, nan, nan, nan, nan, nan, nan],
]),
'grouped': array([
[2., 2., 2., 5., 5., 5.],
[8., 8., 8., 125., 125., 125.],
[5., 5., 5., 2., 2., 2.]
[3., 3., 3., 5., 5., 5., 7., 7., 7.],
[2., 2., 2., 5., 5., 5., nan, nan, nan],
[8., 8., 8., 125., 125., 125., nan, nan, nan],
[5., 5., 5., 2., 2., 2., nan, nan, nan],
[nan, nan, nan, nan, nan, nan, nan, nan, nan],
]),
'grouped_masked': array([
[nan, 2., 3., 5., 5., 5.],
[1.0, nan, 27., 125., 125., 125.],
[6.0, 5., nan, 2., 2., 2.]
[3., 3., 3., 5., 5., 5., 7., 7., 7.],
[nan, 2., 3., 5., 5., 5., nan, nan, nan],
[1.0, nan, 27., 125., 125., 125., nan, nan, nan],
[6.0, 5., nan, 2., 2., 2., nan, nan, nan],
[nan, nan, nan, nan, nan, nan, nan, nan, nan],
]),
}
# Changing the classifier dtype shouldn't affect anything.
Expand All @@ -809,6 +863,88 @@ def test_winsorize_hand_computed(self):
check=partial(check_allclose, atol=0.001),
)

def test_winsorize_no_nans(self):
    """
    Check ``zp_winsorize`` on NaN-free data, both sorted and shuffled.

    Each case is run once on the ascending input and once on a fixed
    permutation of it; the expected output is permuted the same way.
    """
    data = array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
    permutation = array([2, 1, 6, 8, 7, 5, 3, 9, 4, 0])

    # (min_percentile, max_percentile, expected output for ascending input)
    cases = [
        # Clip the lowest and highest value.
        (0.1, 0.9, [1., 1., 2., 3., 4., 5., 6., 7., 8., 8.]),
        # Clip the two lowest and two highest values.
        (0.2, 0.8, [2., 2., 2., 3., 4., 5., 6., 7., 7., 7.]),
        # Clip only the upper tail.
        (0.0, 0.8, [0., 1., 2., 3., 4., 5., 6., 7., 7., 7.]),
        # Clip only the lower tail.
        (0.2, 1.0, [2., 2., 2., 3., 4., 5., 6., 7., 8., 9.]),
        # Clip nothing at all.
        (0.0, 1.0, [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]),
    ]

    for indexer in (slice(None), permutation):
        for lower, upper, ascending_expected in cases:
            result = zp_winsorize(data[indexer], lower, upper)
            expected = array(ascending_expected)[indexer]
            assert_equal(result, expected)

def test_winsorize_nans(self):
    """
    Check ``zp_winsorize`` on data that contains NaNs.

    In every expected array the NaN entries are unchanged and only the
    non-NaN values are clipped.
    """
    # 5 low non-nan values, then some nans, then 5 high non-nans.
    data = array([4.0, 3.0, 0.0, 1.0, 2.0,
                  nan, nan, nan,
                  9.0, 5.0, 6.0, 8.0, 7.0])

    # (min_percentile, max_percentile, expected output)
    cases = [
        # Both tails at 10%: 0.0 -> 1.0 and 9.0 -> 8.0.
        (0.10, 0.90, [4.0, 3.0, 1.0, 1.0, 2.0,
                      nan, nan, nan,
                      8.0, 5.0, 6.0, 8.0, 7.0]),
        # Both tails at 20%: 0.0 and 1.0 -> 2.0; 9.0 and 8.0 -> 7.0.
        (0.20, 0.80, [4.0, 3.0, 2.0, 2.0, 2.0,
                      nan, nan, nan,
                      7.0, 5.0, 6.0, 7.0, 7.0]),
        # Upper tail only.
        (0, 0.8, [4.0, 3.0, 0.0, 1.0, 2.0,
                  nan, nan, nan,
                  7.0, 5.0, 6.0, 7.0, 7.0]),
        # Lower tail only.
        (0.2, 1.0, [4.0, 3.0, 2.0, 2.0, 2.0,
                    nan, nan, nan,
                    9.0, 5.0, 6.0, 8.0, 7.0]),
    ]

    for lower, upper, expected_values in cases:
        result = zp_winsorize(data, lower, upper)
        assert_equal(result, array(expected_values))

@parameter_space(seed=[0, 1, 2], __fail_fast=True)
def test_winsorize_randomized(self, seed):
    """
    Check that ``zp_winsorize`` commutes with permuting its input.

    The input is 50 seeded random normal draws whose first five entries
    are overwritten with NaN.
    """
    rng = RandomState(seed)
    values = rng.randn(50)
    values[:5] = nan
    shuffle = rng.permutation(50)

    # Shuffling before winsorizing must give the same answer as winsorizing
    # first and shuffling the result afterwards.
    permuted_then_winsorized = zp_winsorize(values[shuffle], 0.1, 0.9)
    winsorized_then_permuted = zp_winsorize(values, 0.1, 0.9)[shuffle]
    assert_equal(permuted_then_winsorized, winsorized_then_permuted)

def test_winsorize_bad_bounds(self):
"""
Test out of bounds input for factor.winsorize.
Expand All @@ -827,14 +963,14 @@ def test_winsorize_bad_bounds(self):
f.winsorize(min_percentile=min_, max_percentile=max_)

@parameter_space(
seed_value=range(1, 2),
seed_value=[1, 2],
normalizer_name_and_func=[
('demean', {}, lambda row: row - nanmean(row)),
('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
(
'winsorize',
{"min_percentile": 0.25, "max_percentile": 0.75},
lambda row: scipy_winsorize(
lambda row: scipy_winsorize_with_nan_handling(
row,
limits=0.25,
)
Expand Down
57 changes: 35 additions & 22 deletions zipline/pipeline/factors/factor.py
Expand Up @@ -5,7 +5,7 @@
from numbers import Number
from math import ceil

from numpy import empty_like, inf, nan, where
from numpy import empty_like, inf, isnan, nan, where
from scipy.stats import rankdata

from zipline.utils.compat import wraps
Expand Down Expand Up @@ -855,29 +855,32 @@ def winsorize(self,
mask=NotSpecified,
groupby=NotSpecified):
"""
Construct a Factor returns a winsorized row. Winsorizing changes values
ranked less than the minimum percentile to to value at the minimum
percentile. Similarly, values ranking above the maximum percentile will
be changed to the value at the maximum percentile. This is useful
when limiting the impact of extreme values.
Construct a new factor that winsorizes the result of this factor.
Winsorizing changes values ranked less than the minimum percentile to
the value at the minimum percentile. Similarly, values ranking above
the maximum percentile are changed to the value at the maximum
percentile.
Winsorizing is useful for limiting the impact of extreme data points
without completely removing those points.
If ``mask`` is supplied, ignore values where ``mask`` returns False
when computing row means and standard deviations, and output NaN
anywhere the mask is False.
when computing percentile cutoffs, and output NaN anywhere the mask is
False.
If ``groupby`` is supplied, compute by partitioning each row based on
the values produced by ``groupby``, winsorizing the partitioned arrays,
and stitching the sub-results back together.
If ``groupby`` is supplied, winsorization is applied separately to each
group defined by ``groupby``.
Parameters
----------
min_percentile: float, int
Entries with values at or below this percentile will be replaced
with the (len(inp) * min_percentile)th lowest value. If low values
should not be clipped, use 0.
with the (len(input) * min_percentile)th lowest value. If low
values should not be clipped, use 0.
max_percentile: float, int
Entries with values at or above this percentile will be replaced
with the (len(inp) * max_percentile)th lowest value. If high
with the (len(input) * max_percentile)th lowest value. If high
values should not be clipped, use 1.
mask : zipline.pipeline.Filter, optional
A Filter defining values to ignore when winsorizing.
Expand Down Expand Up @@ -1663,16 +1666,26 @@ def winsorize(row, min_percentile, max_percentile):
This implementation is based on scipy.stats.mstats.winsorize
"""
a = row.copy()
num = a.size
nan_count = isnan(row).sum()
nonnan_count = a.size - nan_count

# NOTE: argsort() sorts nans to the end of the array.
idx = a.argsort()

# Set values at indices below the min percentile to the value of the entry
# at the cutoff.
if min_percentile > 0:
lowidx = int(min_percentile * num)
a[idx[:lowidx]] = a[idx[lowidx]]
lower_cutoff = int(min_percentile * nonnan_count)
a[idx[:lower_cutoff]] = a[idx[lower_cutoff]]

# Set values at indices above the max percentile to the value of the entry
# at the cutoff.
if max_percentile < 1:
upidx = int(ceil(num * max_percentile))
# upidx could return as the length of the array, in this case
# no modification to the right tail is necessary.
if upidx < num:
a[idx[upidx:]] = a[idx[upidx - 1]]
upper_cutoff = int(ceil(nonnan_count * max_percentile))
# if max_percentile is close to 1, then upper_cutoff might not
# remove any values.
if upper_cutoff < nonnan_count:
start_of_nans = (-nan_count) if nan_count else None
a[idx[upper_cutoff:start_of_nans]] = a[idx[upper_cutoff - 1]]

return a

0 comments on commit 5d5a823

Please sign in to comment.