Skip to content

Commit

Permalink
Merge 1cd8de1 into f90cd1c
Browse files Browse the repository at this point in the history
  • Loading branch information
analicia committed Mar 7, 2017
2 parents f90cd1c + 1cd8de1 commit 6da1b8c
Show file tree
Hide file tree
Showing 2 changed files with 217 additions and 5 deletions.
121 changes: 116 additions & 5 deletions tests/pipeline/test_factor.py
Expand Up @@ -22,6 +22,7 @@
)
from numpy.random import randn, seed
import pandas as pd
from scipy.stats.mstats import winsorize as scipy_winsorize

from zipline.errors import UnknownRankMethod
from zipline.lib.labelarray import LabelArray
Expand Down Expand Up @@ -709,11 +710,112 @@ def test_normalizations_hand_computed(self):
check=partial(check_allclose, atol=0.001),
)

def test_winsorize_hand_computed(self):
    """
    Check ``Factor.winsorize`` against outputs worked out by hand.

    Exercises symmetric, one-sided and inclusive/exclusive limits, and
    the ``mask`` / ``groupby`` options with both an integer classifier
    and a string classifier.
    """
    factor = self.f
    mask_term = Mask()
    int_classifier = C()
    str_classifier = C(dtype=categorical_dtype, missing_value=None)

    # Inputs: one ascending row, one row of cubes, one descending row.
    factor_data = array([
        [1., 2., 3., 4., 5., 6.],
        [1., 8., 27., 64., 125., 216.],
        [6., 5., 4., 3., 2., 1.],
    ])
    # Exactly one asset is masked out per row (the diagonal).
    filter_data = array(
        [
            [False, True, True, True, True, True],
            [True, False, True, True, True, True],
            [True, True, False, True, True, True],
        ],
        dtype=bool,
    )
    # First three assets are in group 1, last three in group 2.
    classifier_data = array(
        [
            [1, 1, 1, 2, 2, 2],
            [1, 1, 1, 2, 2, 2],
            [1, 1, 1, 2, 2, 2],
        ],
        dtype=int64_dtype,
    )
    string_classifier_data = LabelArray(
        classifier_data.astype(str).astype(object),
        missing_value=None,
    )

    terms = dict(
        winsor_1=factor.winsorize(limits=33),
        winsor_2=factor.winsorize(limits=(49, None)),
        winsor_3=factor.winsorize(limits=33, inclusive=(False, False)),
        winsor_4=factor.winsorize(limits=33, inclusive=(True, False)),
        masked=factor.winsorize(limits=33, mask=mask_term),
        grouped=factor.winsorize(limits=34, groupby=int_classifier),
        grouped_str=factor.winsorize(limits=34, groupby=str_classifier),
        grouped_masked=factor.winsorize(
            limits=34, mask=mask_term, groupby=int_classifier,
        ),
        grouped_masked_str=factor.winsorize(
            limits=34, mask=mask_term, groupby=str_classifier,
        ),
    )

    grouped_result = array([
        [2., 2., 2., 5., 5., 5.],
        [8., 8., 8., 125., 125., 125.],
        [5., 5., 5., 2., 2., 2.],
    ])
    grouped_masked_result = array([
        [nan, 2., 3., 5., 5., 5.],
        [1.0, nan, 27., 125., 125., 125.],
        [6.0, 5., nan, 2., 2., 2.],
    ])
    expected = {
        'winsor_1': array([
            [2., 2., 3., 4., 5., 5.],
            [8., 8., 27., 64., 125., 125.],
            [5., 5., 4., 3., 2., 2.],
        ]),
        'winsor_2': array([
            [3.0, 3., 3., 4., 5., 6.],
            [27., 27., 27., 64., 125., 216.],
            [6.0, 5., 4., 3., 3., 3.],
        ]),
        'winsor_3': array([
            [3.0, 3., 3., 4., 4., 4.],
            [27., 27., 27., 64., 64., 64.],
            [4.0, 4., 4., 3., 3., 3.],
        ]),
        'winsor_4': array([
            [2., 2., 3., 4., 4., 4.],
            [8., 8., 27., 64., 64., 64.],
            [4., 4., 4., 3., 2., 2.],
        ]),
        'masked': array([
            [nan, 3., 3., 4., 5., 5.],
            [27., nan, 27., 64., 125., 125.],
            [5.0, 5., nan, 3., 2., 2.],
        ]),
        'grouped': grouped_result,
        'grouped_masked': grouped_masked_result,
        # Changing the classifier dtype shouldn't affect anything.
        'grouped_str': grouped_result,
        'grouped_masked_str': grouped_masked_result,
    }

    workspace = {
        factor: factor_data,
        int_classifier: classifier_data,
        str_classifier: string_classifier_data,
        mask_term: filter_data,
    }
    all_ones = self.ones_mask(shape=factor_data.shape)

    self.check_terms(
        terms,
        expected,
        initial_workspace=workspace,
        mask=self.build_mask(all_ones),
        check=partial(check_allclose, atol=0.001),
    )

@parameter_space(
seed_value=range(1, 2),
normalizer_name_and_func=[
('demean', lambda row: row - nanmean(row)),
('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
('demean', {}, lambda row: row - nanmean(row)),
('zscore', {}, lambda row: (row - nanmean(row)) / nanstd(row)),
(
'winsorize',
{"limits": 25.},
lambda row: scipy_winsorize(row, limits=0.25)
),
],
add_nulls_to_factor=(False, True,),
)
Expand All @@ -722,9 +824,9 @@ def test_normalizations_randomized(self,
normalizer_name_and_func,
add_nulls_to_factor):

name, func = normalizer_name_and_func
name, kwargs, func = normalizer_name_and_func

shape = (7, 7)
shape = (20, 20)

# All Trues.
nomask = self.ones_mask(shape=shape)
Expand Down Expand Up @@ -755,7 +857,7 @@ def test_normalizations_randomized(self,
c = C()
c_with_nulls = OtherC()
m = Mask()
method = getattr(f, name)
method = partial(getattr(f, name), **kwargs)
terms = {
'vanilla': method(),
'masked': method(mask=m),
Expand Down Expand Up @@ -1051,6 +1153,10 @@ def test_zscore(self):
r = F().zscore().short_repr()
self.assertEqual(r, "GroupedRowTransform('zscore')")

def test_winsorize(self):
    """The short repr of a winsorized factor names the transform."""
    winsorized = F().winsorize(limits=5)
    self.assertEqual(
        winsorized.short_repr(),
        "GroupedRowTransform('winsorize')",
    )


class TestWindowSafety(TestCase):

Expand All @@ -1062,6 +1168,11 @@ def test_demean_is_window_safe_if_input_is_window_safe(self):
self.assertFalse(F(window_safe=False).demean().window_safe)
self.assertTrue(F(window_safe=True).demean().window_safe)

def test_winsorize_is_window_safe_if_input_is_window_safe(self):
    """``winsorize`` output is window-safe only for a window-safe input."""
    def winsorized(factor):
        return factor.winsorize(limits=5)

    # Default construction and an explicit window_safe=False both
    # yield a result that is not window-safe.
    self.assertFalse(winsorized(F()).window_safe)
    self.assertFalse(winsorized(F(window_safe=False)).window_safe)
    self.assertTrue(winsorized(F(window_safe=True)).window_safe)


class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
@parameter_space(dtype_=(float64_dtype, datetime64ns_dtype))
Expand Down
101 changes: 101 additions & 0 deletions zipline/pipeline/factors/factor.py
Expand Up @@ -7,6 +7,7 @@

from numpy import empty_like, inf, nan, where
from scipy.stats import rankdata
from scipy.stats.mstats import winsorize as scipy_winsorize

from zipline.errors import UnknownRankMethod
from zipline.lib.normalize import naive_grouped_rowwise_apply
Expand Down Expand Up @@ -833,6 +834,102 @@ def linear_regression(self, target, regression_length, mask=NotSpecified):
mask=mask,
)

@float64_only
def winsorize(self,
              limits,
              inclusive=(True, True),
              mask=NotSpecified,
              groupby=NotSpecified):
    """
    Construct a Factor that winsorizes each row of ``self``.

    Winsorizing clips extreme values instead of discarding them: the
    lowest ``limits[0]`` percent of each row are replaced by the
    smallest value that is not cut, and the highest ``limits[1]``
    percent are replaced by the largest value that is not cut. This is
    useful for limiting the impact of extreme values.

    If ``mask`` is supplied, ignore values where ``mask`` returns False
    when winsorizing, and output NaN anywhere the mask is False.

    If ``groupby`` is supplied, compute by partitioning each row based
    on the values produced by ``groupby``, winsorizing the partitioned
    arrays, and stitching the sub-results back together.

    Parameters
    ----------
    limits : int, float, or tuple of (int, float or None)
        Percentage, between 0 and 100 inclusive, to cut from each tail
        of the row. A single number applies to both tails. In a tuple
        ``(lower, upper)``, a value of None leaves that tail open
        (not winsorized).
    inclusive : tuple of bool, optional
        For each tail, whether the number of values being cut should
        be truncated (True) or rounded (False). Default is
        (True, True).
    mask : zipline.pipeline.Filter, optional
        A Filter defining values to ignore when winsorizing.
    groupby : zipline.pipeline.Classifier, optional
        A classifier defining partitions over which to winsorize.

    Returns
    -------
    winsorized : zipline.pipeline.Factor
        A Factor producing a winsorized version of self.

    Example
    -------
    ::

        price = USEquityPricing.close.latest
        columns = {
            'PRICE': price,
            'WINSOR_1': price.winsorize(limits=25),
            'WINSOR_2': price.winsorize(limits=(50, None)),
            'WINSOR_3': price.winsorize(
                limits=25, inclusive=(False, False)
            ),
            'WINSOR_4': price.winsorize(
                limits=25, inclusive=(True, False)
            ),
            'WINSOR_5': price.winsorize(limits=(20, 40)),
        }

    Given a pipeline with the columns defined above, the result for a
    given day could look like::

                 PRICE  WINSOR_1  WINSOR_2  WINSOR_3  WINSOR_4  WINSOR_5
        Asset_1    1       2         4         3         2         2
        Asset_2    2       2         4         3         2         2
        Asset_3    3       3         4         3         3         3
        Asset_4    4       4         4         4         4         4
        Asset_5    5       5         5         4         4         4
        Asset_6    6       5         5         4         4         4

    See Also
    --------
    :func:`scipy.stats.mstats.winsorize`
    :meth:`pandas.DataFrame.groupby`
    """
    def as_fraction(percent):
        # scipy expects fractions, not percentages. Divide by a float
        # so that integer percentages are not floor-divided to zero on
        # Python 2. None (an open limit) is passed through untouched.
        if percent is None:
            return None
        return percent / 100.

    if isinstance(limits, (int, float)):
        limits = as_fraction(limits)
    elif isinstance(limits, tuple):
        limits = (as_fraction(limits[0]), as_fraction(limits[1]))

    return GroupedRowTransform(
        transform=winsorize,
        transform_args=(limits, inclusive),
        factor=self,
        groupby=groupby,
        dtype=self.dtype,
        missing_value=self.missing_value,
        mask=mask,
        window_safe=self.window_safe,
    )

@expect_types(bins=int, mask=(Filter, NotSpecifiedType))
def quantiles(self, bins, mask=NotSpecified):
"""
Expand Down Expand Up @@ -1530,3 +1627,7 @@ def demean(row):

def zscore(row):
    """
    Standardize ``row``: subtract its NaN-ignoring mean, then divide by
    its NaN-ignoring standard deviation.
    """
    centered = row - nanmean(row)
    return centered / nanstd(row)


def winsorize(row, limits, inclusive):
return scipy_winsorize(row, limits=limits, inclusive=inclusive)

0 comments on commit 6da1b8c

Please sign in to comment.