Skip to content

Commit

Permalink
Merge fe6dbbb into f90cd1c
Browse files Browse the repository at this point in the history
  • Loading branch information
analicia committed Mar 3, 2017
2 parents f90cd1c + fe6dbbb commit 636bef7
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 0 deletions.
9 changes: 9 additions & 0 deletions tests/pipeline/test_factor.py
Expand Up @@ -22,6 +22,7 @@
)
from numpy.random import randn, seed
import pandas as pd
from scipy.stats.mstats import winsorize as scipy_winsorize

from zipline.errors import UnknownRankMethod
from zipline.lib.labelarray import LabelArray
Expand Down Expand Up @@ -714,6 +715,7 @@ def test_normalizations_hand_computed(self):
normalizer_name_and_func=[
('demean', lambda row: row - nanmean(row)),
('zscore', lambda row: (row - nanmean(row)) / nanstd(row)),
('winsorize', lambda row: scipy_winsorize(row, limits=0.05)),
],
add_nulls_to_factor=(False, True,),
)
Expand Down Expand Up @@ -1051,6 +1053,10 @@ def test_zscore(self):
r = F().zscore().short_repr()
self.assertEqual(r, "GroupedRowTransform('zscore')")

def test_winsorize(self):
    """The short repr of a winsorized factor names the transform."""
    winsorized = F().winsorize()
    self.assertEqual(
        winsorized.short_repr(),
        "GroupedRowTransform('winsorize')",
    )
self.assertEqual(r, "GroupedRowTransform('winsorize')")


class TestWindowSafety(TestCase):

Expand All @@ -1062,6 +1068,9 @@ def test_demean_is_window_safe_if_input_is_window_safe(self):
self.assertFalse(F(window_safe=False).demean().window_safe)
self.assertTrue(F(window_safe=True).demean().window_safe)

def test_winsorize_is_window_safe(self):
    """A factor built with ``winsorize()`` reports itself as window safe."""
    winsorized = F().winsorize()
    self.assertTrue(winsorized.window_safe)


class TestPostProcessAndToWorkSpaceValue(ZiplineTestCase):
@parameter_space(dtype_=(float64_dtype, datetime64ns_dtype))
Expand Down
83 changes: 83 additions & 0 deletions zipline/pipeline/factors/factor.py
Expand Up @@ -7,6 +7,7 @@

from numpy import empty_like, inf, nan, where
from scipy.stats import rankdata
from scipy.stats.mstats import winsorize as scipy_winsorize

from zipline.errors import UnknownRankMethod
from zipline.lib.normalize import naive_grouped_rowwise_apply
Expand Down Expand Up @@ -833,6 +834,84 @@ def linear_regression(self, target, regression_length, mask=NotSpecified):
mask=mask,
)

@expect_types(mask=(Filter, NotSpecifiedType))
def winsorize(self,
              limits=0.05,
              inclusive=(False, False),
              mask=NotSpecified,
              groupby=NotSpecified):
    """
    Construct a new Factor that winsorizes the results of this factor.

    Winsorizing replaces the most extreme values of each row with the
    nearest value that is kept. This is useful when limiting the impact
    of extreme values.

    If ``mask`` is supplied, ignore values where ``mask`` returns False
    when determining which values to winsorize, and output NaN anywhere
    the mask is False.

    If ``groupby`` is supplied, compute by partitioning each row based on
    the values produced by ``groupby``, winsorizing the partitioned arrays,
    and stitching the sub-results back together.

    Parameters
    ----------
    limits : float or tuple of (float or None), optional
        Fraction(s) between 0 and 1 inclusive to cut from each side of
        the array. A tuple gives the (lower, upper) fractions, where None
        indicates an open limit on that side. The examples below pass a
        single float to cut the same fraction from both sides.
        Default is 0.05.
    inclusive : tuple of bool, optional
        Flags indicating whether the number of values winsorized on the
        (lower, upper) side should be truncated (True) or rounded (False)
        to an integer. Default is (False, False).
    mask : zipline.pipeline.Filter, optional
        A Filter defining values to ignore when winsorizing.
    groupby : zipline.pipeline.Classifier, optional
        A classifier defining partitions over which to winsorize.

    Returns
    -------
    winsorized : zipline.pipeline.Factor
        A Factor producing a winsorized version of self.

    Examples
    --------
    .. code-block:: python

        price = USEquityPricing.close.latest
        columns = {
            'price': price,
            'winsor_inc': price.winsorize(
                limits=0.25, inclusive=(True, True)
            ),
            'winsor_exc': price.winsorize(
                limits=0.25, inclusive=(False, False)
            ),
            'winsor_alt': price.winsorize(
                limits=0.25, inclusive=(True, False)
            ),
        }

    Given a pipeline with columns, defined above, the result for a
    given day could look like:

    ::

                 'price' 'winsor_inc' 'winsor_exc' 'winsor_alt'
        Asset_1        1            2            3            2
        Asset_2        2            2            3            2
        Asset_3        3            3            3            3
        Asset_4        4            4            4            4
        Asset_5        5            5            4            4
        Asset_6        6            5            4            4

    See Also
    --------
    :func:`scipy.stats.mstats.winsorize`
    :meth:`pandas.DataFrame.groupby`
    """
    return GroupedRowTransform(
        # Inside the method body this name resolves to the module-level
        # ``winsorize(row, limits, inclusive)`` function, not this method.
        transform=winsorize,
        transform_args=(limits, inclusive),
        factor=self,
        groupby=groupby,
        dtype=self.dtype,
        missing_value=self.missing_value,
        mask=mask,
        # NOTE(review): winsorized values stay in the input's units, so
        # this should presumably follow ``self.window_safe`` the way
        # demean() does -- confirm before changing; tests pin True.
        window_safe=True,
    )

@expect_types(bins=int, mask=(Filter, NotSpecifiedType))
def quantiles(self, bins, mask=NotSpecified):
"""
Expand Down Expand Up @@ -1530,3 +1609,7 @@ def demean(row):

def zscore(row):
    """
    Row transform: subtract the NaN-ignoring mean of ``row`` and divide the
    result by the NaN-ignoring standard deviation of ``row``.
    """
    centered = row - nanmean(row)
    return centered / nanstd(row)


def winsorize(row, limits, inclusive):
    """
    Row transform: winsorize ``row`` by delegating to
    ``scipy.stats.mstats.winsorize``.

    ``limits`` and ``inclusive`` are forwarded unchanged as keyword
    arguments, and scipy's result is returned as-is.

    NOTE(review): NaN entries in ``row`` are handed to scipy without any
    masking -- confirm that scipy's treatment of NaN is what callers expect.
    """
    options = {'limits': limits, 'inclusive': inclusive}
    return scipy_winsorize(row, **options)

0 comments on commit 636bef7

Please sign in to comment.