Skip to content

Commit

Permalink
Squash me, pr feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
Ana Ruelas committed Mar 7, 2017
1 parent 72ef80f commit 0d9ca7b
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 74 deletions.
108 changes: 82 additions & 26 deletions tests/pipeline/test_factor.py
Expand Up @@ -24,7 +24,7 @@
import pandas as pd
from scipy.stats.mstats import winsorize as scipy_winsorize

from zipline.errors import UnknownRankMethod
from zipline.errors import BadPercentileBounds, UnknownRankMethod
from zipline.lib.labelarray import LabelArray
from zipline.lib.rank import masked_rankdata_2d
from zipline.lib.normalize import naive_grouped_rowwise_apply as grouped_apply
Expand Down Expand Up @@ -742,15 +742,45 @@ def test_winsorize_hand_computed(self):
)

terms = {
'winsor_1': f.winsorize(limits=33),
'winsor_2': f.winsorize(limits=(49, None)),
'winsor_3': f.winsorize(limits=33, inclusive=(False, False)),
'winsor_4': f.winsorize(limits=33, inclusive=(True, False)),
'masked': f.winsorize(limits=33, mask=m),
'grouped': f.winsorize(limits=34, groupby=c),
'grouped_str': f.winsorize(limits=34, groupby=str_c),
'grouped_masked': f.winsorize(limits=34, mask=m, groupby=c),
'grouped_masked_str': f.winsorize(limits=34, mask=m, groupby=str_c)
'winsor_1': f.winsorize(
min_percentile=0.33,
max_percentile=0.67
),
'winsor_2': f.winsorize(
min_percentile=0.49,
max_percentile=1
),
'winsor_3': f.winsorize(
min_percentile=0,
max_percentile=.67
),
'masked': f.winsorize(
min_percentile=0.33,
max_percentile=0.67,
mask=m
),
'grouped': f.winsorize(
min_percentile=0.34,
max_percentile=0.66,
groupby=c
),
'grouped_str': f.winsorize(
min_percentile=0.34,
max_percentile=0.66,
groupby=str_c
),
'grouped_masked': f.winsorize(
min_percentile=0.34,
max_percentile=0.66,
mask=m,
groupby=c
),
'grouped_masked_str': f.winsorize(
min_percentile=0.34,
max_percentile=0.66,
mask=m,
groupby=str_c
),
}
expected = {
'winsor_1': array([
Expand All @@ -764,14 +794,9 @@ def test_winsorize_hand_computed(self):
[6.0, 5., 4., 3., 3., 3.]
]),
'winsor_3': array([
[3.0, 3., 3., 4., 4., 4.],
[27., 27., 27., 64., 64., 64.],
[4.0, 4., 4., 3., 3., 3.]
]),
'winsor_4': array([
[2., 2., 3., 4., 4., 4.],
[8., 8., 27., 64., 64., 64.],
[4., 4., 4., 3., 2., 2.]
[1., 2., 3., 4., 5., 5.],
[1., 8., 27., 64., 125., 125.],
[5., 5., 4., 3., 2., 1.]
]),
'masked': array([
[nan, 3., 3., 4., 5., 5.],
Expand All @@ -787,7 +812,7 @@ def test_winsorize_hand_computed(self):
[nan, 2., 3., 5., 5., 5.],
[1.0, nan, 27., 125., 125., 125.],
[6.0, 5., nan, 2., 2., 2.]
])
]),
}
# Changing the classifier dtype shouldn't affect anything.
expected['grouped_str'] = expected['grouped']
Expand All @@ -806,15 +831,34 @@ def test_winsorize_hand_computed(self):
check=partial(check_allclose, atol=0.001),
)

def test_winsorize_bad_bounds(self):
    """
    Test that factor.winsorize rejects invalid percentile bounds.

    Valid bounds satisfy ``0.0 <= min_percentile < max_percentile <= 1.0``.
    Every pair below violates that condition and is expected to raise
    BadPercentileBounds.
    """
    f = self.f

    bad_percentiles = [
        # At least one bound falls outside of [0.0, 1.0].
        (-.1, 1),
        (0, 95),
        (5, 95),
        (5, 5),
        # Both bounds are in range, but min is not strictly below max.
        (.5, .5),
        (.75, .25),
    ]
    for min_, max_ in bad_percentiles:
        with self.assertRaises(BadPercentileBounds):
            f.winsorize(min_percentile=min_, max_percentile=max_)

@parameter_space(
seed_value=range(1, 2),
normalizer_name_and_func=[
('demean', {}, lambda row: row - nanmean(row)),
('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
(
'winsorize',
{"limits": 25.},
lambda row: scipy_winsorize(row, limits=0.25)
{"min_percentile": 0.25, "max_percentile": 0.75},
lambda row: scipy_winsorize(
row,
limits=0.25,
)
),
],
add_nulls_to_factor=(False, True,),
Expand Down Expand Up @@ -1154,7 +1198,7 @@ def test_zscore(self):
self.assertEqual(r, "GroupedRowTransform('zscore')")

# Check that short_repr() of a winsorized factor names the 'winsorize'
# transform.
def test_winsorize(self):
# `r` is assigned twice; only the second value (the
# min_percentile/max_percentile call) reaches the assertion below.
r = F().winsorize(limits=5).short_repr()
r = F().winsorize(min_percentile=.05, max_percentile=.95).short_repr()
self.assertEqual(r, "GroupedRowTransform('winsorize')")


Expand All @@ -1168,10 +1212,22 @@ def test_demean_is_window_safe_if_input_is_window_safe(self):
self.assertFalse(F(window_safe=False).demean().window_safe)
self.assertTrue(F(window_safe=True).demean().window_safe)

def test_winsorize_is_window_safe(self):
    """
    A winsorized factor should report ``window_safe`` only when it was
    built from an input constructed with ``window_safe=True``.
    """
    from_default_input = F().winsorize(limits=5)
    self.assertFalse(from_default_input.window_safe)

    from_unsafe_input = F(window_safe=False).winsorize(limits=5)
    self.assertFalse(from_unsafe_input.window_safe)

    from_safe_input = F(window_safe=True).winsorize(limits=5)
    self.assertTrue(from_safe_input.window_safe)
def test_winsorize_is_window_safe_if_input_is_window_safe(self):
    """
    A winsorized factor should report ``window_safe`` only when it was
    built from an input constructed with ``window_safe=True``.
    """
    # The same bounds are used for every case; only the input varies.
    bounds = {'min_percentile': .05, 'max_percentile': .95}

    from_default_input = F().winsorize(**bounds)
    self.assertFalse(from_default_input.window_safe)

    from_unsafe_input = F(window_safe=False).winsorize(**bounds)
    self.assertFalse(from_unsafe_input.window_safe)

    from_safe_input = F(window_safe=True).winsorize(**bounds)
    self.assertTrue(from_safe_input.window_safe)


class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
Expand Down
10 changes: 8 additions & 2 deletions zipline/errors.py
Expand Up @@ -569,9 +569,15 @@ class BadPercentileBounds(ZiplineError):
Raised by API functions accepting percentile bounds when the passed bounds
are invalid.
"""

def __init__(self, **kwargs):
    """
    Store the keyword arguments that fill the placeholders in ``msg``.

    Parameters
    ----------
    **kwargs
        Expected to carry ``min_percentile`` and ``max_percentile``.
        ``upper_bound`` is optional and defaults to 100.0 when omitted;
        callers validating fractional percentiles pass it explicitly
        (e.g. ``upper_bound=1.0``).
    """
    # Install the default with an actual assignment. A bare
    # ``kwargs["upper_bound"] == 100.0`` comparison would raise KeyError
    # for exactly the callers that rely on the default.
    kwargs.setdefault("upper_bound", 100.0)
    self.kwargs = kwargs

msg = (
"Percentile bounds must fall between 0.0 and 100.0, and min must be "
"less than max."
"Percentile bounds must fall between 0.0 and {upper_bound}, and min "
"must be less than max."
"\nInputs were min={min_percentile}, max={max_percentile}."
)

Expand Down
106 changes: 60 additions & 46 deletions zipline/pipeline/factors/factor.py
Expand Up @@ -4,12 +4,12 @@
from functools import wraps
from operator import attrgetter
from numbers import Number
from math import ceil

from numpy import empty_like, inf, nan, where
from scipy.stats import rankdata
from scipy.stats.mstats import winsorize as scipy_winsorize

from zipline.errors import UnknownRankMethod
from zipline.errors import BadPercentileBounds, UnknownRankMethod
from zipline.lib.normalize import naive_grouped_rowwise_apply
from zipline.lib.rank import masked_rankdata_2d, rankdata_1d_descending
from zipline.pipeline.api_utils import restrict_to_dtype
Expand Down Expand Up @@ -834,19 +834,24 @@ def linear_regression(self, target, regression_length, mask=NotSpecified):
mask=mask,
)

@expect_types(
min_percentile=(int, float),
max_percentile=(int, float),
mask=(Filter, NotSpecifiedType),
groupby=(Classifier, NotSpecifiedType),
)
@float64_only
def winsorize(self,
limits,
inclusive=(True, True),
min_percentile,
max_percentile,
mask=NotSpecified,
groupby=NotSpecified):
"""
Construct a Factor returns a winsorized row for results. Winsorizing
clips the input values to fixed percentiles. The (limits[0])th lowest
values are set to the value at the (limits[0])th percentile. The values
above the (limits[1])th percentiles are set to the value at the
(limits[1])th percentile. This is useful when limiting the impact of
extreme values.
Construct a Factor that returns a winsorized row. Winsorizing changes
values ranked below the minimum percentile to the value at the minimum
percentile. Similarly, values ranked above the maximum percentile are
changed to the value at the maximum percentile. This is useful when
limiting the impact of extreme values.
If ``mask`` is supplied, ignore values where ``mask`` returns False
when computing percentile cutoffs, and output NaN
Expand All @@ -858,14 +863,14 @@ def winsorize(self,
Parameters
----------
limits : None, tuple of float, optional
A tuple of two values between 0 and 100 inclusive. This is the
percentage to cut from each tail of the array. A value of None
can be used to indicate an open limit.
inclusive : a tuple of bool, optional
A bool indicating whether the data on each side should be
rounded(True) or truncated(False). A value of None can be used if
one side is not being winsorized. Default is (False, False).
min_percentile: float, int
Entries with values at or below this percentile will be replaced
with the (len(inp) * min_percentile)th lowest value. If low values
should not be clipped, use 0.
max_percentile: float, int
Entries with values at or above this percentile will be replaced
with the (len(inp) * max_percentile)th lowest value. If high
values should not be clipped, use 1.
mask : zipline.pipeline.Filter, optional
A Filter defining values to ignore when winsorizing.
groupby : zipline.pipeline.Classifier, optional
Expand All @@ -882,34 +887,43 @@ def winsorize(self,
price = USEquityPricing.close.latest
columns={
'PRICE': price,
'WINSOR_1': price.winsorize(limits=25),
'WINSOR_2': price.winsorize(limits=(50, None)),
'WINSOR_1': price.winsorize(
min_percentile=0.25, max_percentile=0.75
),
'WINSOR_2': price.winsorize(
min_percentile=0.50, max_percentile=1.0
),
'WINSOR_3': price.winsorize(
limits=25, inclusive=(False, False)
min_percentile=0.0, max_percentile=0.5
),
'WINSOR_4': price.winsorize(limits=25, inclusive=(True, False)),
'WINSOR_5': price.winsorize(limits=(20, 40)),
}
Given a pipeline with columns, defined above, the result for a
given day could look like:
'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3' 'WINSOR_4' 'WINSOR_5'
Asset_1 1 2 4 3 2 2
Asset_2 2 2 4 3 2 2
Asset_3 3 3 4 3 3 2
Asset_4 4 4 4 4 4 4
Asset_5 5 5 5 4 4 4
Asset_6 6 5 5 4 4 4
'PRICE' 'WINSOR_1' 'WINSOR_2' 'WINSOR_3'
Asset_1 1 2 4 1
Asset_2 2 2 4 2
Asset_3 3 3 4 3
Asset_4 4 4 4 3
Asset_5 5 5 5 3
Asset_6 6 5 6 3
See Also
--------
:func:`scipy.stats.mstats.winsorize`
:meth:`pandas.DataFrame.groupby`
"""
if not 0.0 <= min_percentile < max_percentile <= 1.0:
raise BadPercentileBounds(
min_percentile=min_percentile,
max_percentile=max_percentile,
upper_bound=1.0,
)
return GroupedRowTransform(
transform=winsorize,
transform_args=(limits, inclusive),
transform_args=(min_percentile, max_percentile),
factor=self,
groupby=groupby,
dtype=self.dtype,
Expand Down Expand Up @@ -1617,18 +1631,18 @@ def zscore(row):
return (row - nanmean(row)) / nanstd(row)


def winsorize(row, limits, inclusive):
if isinstance(limits, int) or isinstance(limits, float):
limits = limits / 100.
if isinstance(limits, tuple):
if limits[0] is not None:
limit_0 = limits[0] / 100.
else:
limit_0 = None
if limits[1] is not None:
limit_1 = limits[1] / 100
else:
limit_1 = None
limits = (limit_0, limit_1)

return scipy_winsorize(row, limits=limits, inclusive=inclusive)
def winsorize(row, min_percentile, max_percentile):
    """
    Return a copy of ``row`` with its tails clipped at the given percentiles.

    Parameters
    ----------
    row : numpy.ndarray
        Values to winsorize (assumed one-dimensional). The input is copied
        and never modified.
    min_percentile : float or int
        Fraction of the row to clip on the low side. The
        ``int(min_percentile * row.size)`` lowest entries are replaced with
        the next-lowest value. ``0`` leaves the low tail untouched.
    max_percentile : float or int
        Entries ranked above ``ceil(row.size * max_percentile)`` are
        replaced with the value at that rank. ``1`` leaves the high tail
        untouched.

    Returns
    -------
    numpy.ndarray
        The winsorized copy of ``row``.
    """
    a = row.copy()
    num = a.size
    if num == 0:
        # Nothing to clip, and ``idx[lowidx]`` below would raise IndexError
        # on an empty row.
        return a

    idx = a.argsort()
    if min_percentile > 0:
        lowidx = int(min_percentile * num)
        a[idx[:lowidx]] = a[idx[lowidx]]
    if max_percentile < 1:
        # math.ceil returns a float on Python 2; slicing needs an int.
        upidx = int(ceil(num * max_percentile))
        # upidx can equal the length of the array, in which case no
        # modification to the right tail is necessary.
        if upidx < num:
            a[idx[upidx:]] = a[idx[upidx - 1]]

    return a

0 comments on commit 0d9ca7b

Please sign in to comment.