Skip to content

Commit

Permalink
ENH: vectorize dividend ratio calculation
Browse the repository at this point in the history
  • Loading branch information
Joe Jevnik committed Sep 24, 2018
1 parent a3739c3 commit fbdc5c9
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 81 deletions.
11 changes: 7 additions & 4 deletions tests/pipeline/test_us_equity_pricing_loader.py
Expand Up @@ -283,7 +283,9 @@ def make_dividends_data(cls):

@classmethod
def make_adjustment_writer_equity_daily_bar_reader(cls):
    """Build the daily bar reader handed to the adjustment writer.

    ``MockDailyBarReader`` now requires the session dates it should
    cover, so pass the full test date range explicitly.
    """
    return MockDailyBarReader(
        dates=cls.calendar_days_between(cls.START_DATE, cls.END_DATE),
    )

@classmethod
def make_equity_daily_bar_data(cls, country_code, sids):
Expand All @@ -310,14 +312,15 @@ def test_input_sanity(self):
self.assertGreaterEqual(eff_date, asset_start)
self.assertLessEqual(eff_date, asset_end)

@classmethod
def calendar_days_between(cls, start_date, end_date, shift=0):
    """Return the calendar sessions between two dates, inclusive.

    Parameters
    ----------
    start_date, end_date : pd.Timestamp
        Inclusive endpoints of the range to slice out of
        ``cls.equity_daily_bar_days``.
    shift : int, optional
        Offset (in sessions) applied to both ends of the slice.

    Returns
    -------
    pd.DatetimeIndex
        The selected sessions.

    Raises
    ------
    KeyError
        If the shifted slice would start before the first session.
    """
    slice_ = cls.equity_daily_bar_days.slice_indexer(start_date, end_date)
    start = slice_.start + shift
    stop = slice_.stop + shift
    if start < 0:
        # A negative start would silently wrap around to the end of the
        # index; reject it instead.
        raise KeyError(start_date, shift)

    return cls.equity_daily_bar_days[start:stop]

def expected_adjustments(self, start_date, end_date):
price_adjustments = {}
Expand Down
7 changes: 6 additions & 1 deletion tests/test_api_shim.py
Expand Up @@ -168,7 +168,12 @@ def make_splits_data(cls):

@classmethod
def make_adjustment_writer_equity_daily_bar_reader(cls):
    """Build the daily bar reader handed to the adjustment writer.

    ``MockDailyBarReader`` now requires the session dates it should
    cover, so pass the NYSE sessions spanning the test date range.
    """
    return MockDailyBarReader(
        dates=cls.nyse_calendar.sessions_in_range(
            cls.START_DATE,
            cls.END_DATE,
        ),
    )

@classmethod
def init_class_fixtures(cls):
Expand Down
7 changes: 6 additions & 1 deletion tests/test_bar_data.py
Expand Up @@ -950,7 +950,12 @@ def make_dividends_data(cls):

@classmethod
def make_adjustment_writer_equity_daily_bar_reader(cls):
    """Build the daily bar reader handed to the adjustment writer.

    ``MockDailyBarReader`` now requires the session dates it should
    cover, so pass the trading-calendar sessions spanning the test
    date range.
    """
    return MockDailyBarReader(
        dates=cls.trading_calendar.sessions_in_range(
            cls.START_DATE,
            cls.END_DATE,
        ),
    )

@classmethod
def make_equity_daily_bar_data(cls, country_code, sids):
Expand Down
7 changes: 6 additions & 1 deletion tests/test_benchmark.py
Expand Up @@ -75,7 +75,12 @@ def make_equity_info(cls):

@classmethod
def make_adjustment_writer_equity_daily_bar_reader(cls):
    """Build the daily bar reader handed to the adjustment writer.

    ``MockDailyBarReader`` now requires the session dates it should
    cover, so pass the trading-calendar sessions spanning the test
    date range.
    """
    return MockDailyBarReader(
        dates=cls.trading_calendar.sessions_in_range(
            cls.START_DATE,
            cls.END_DATE,
        ),
    )

@classmethod
def make_stock_dividends_data(cls):
Expand Down
7 changes: 6 additions & 1 deletion tests/test_history.py
Expand Up @@ -234,7 +234,12 @@ def make_dividends_data(cls):

@classmethod
def make_adjustment_writer_equity_daily_bar_reader(cls):
    """Build the daily bar reader handed to the adjustment writer.

    ``MockDailyBarReader`` now requires the session dates it should
    cover, so pass the trading-calendar sessions spanning the trading
    date range used by these tests.
    """
    return MockDailyBarReader(
        dates=cls.trading_calendar.sessions_in_range(
            cls.TRADING_START_DT,
            cls.TRADING_END_DT,
        ),
    )

def verify_regular_dt(self, idx, dt, mode, fields=None, assets=None):
if mode == 'daily':
Expand Down
111 changes: 51 additions & 60 deletions zipline/data/adjustments.py
Expand Up @@ -16,6 +16,7 @@
float64_dtype,
int64_dtype,
uint32_dtype,
uint64_dtype,
)
from zipline.utils.sqlite_utils import group_into_chunks, coerce_string_to_conn
from ._adjustments import load_adjustments_from_sqlite
Expand Down Expand Up @@ -349,72 +350,62 @@ def calc_dividend_ratios(self, dividends):
return pd.DataFrame(np.array(
[],
dtype=[
('sid', uint32_dtype),
('sid', uint64_dtype),
('effective_date', uint32_dtype),
('ratio', float64_dtype),
],
))
ex_dates = dividends.ex_date.values

sids = dividends.sid.values
amounts = dividends.amount.values

ratios = np.full(len(amounts), np.nan)

equity_daily_bar_reader = self._equity_daily_bar_reader

effective_dates = np.full(len(amounts), -1, dtype=int64_dtype)

calendar = self._calendar

# Calculate locs against a tz-naive cal, as the ex_dates are tz-
# naive.
#
# TODO: A better approach here would be to localize ex_date to
# the tz of the calendar, but currently get_indexer does not
# preserve tz of the target when method='bfill', which throws
# off the comparison.
tz_naive_calendar = calendar.tz_localize(None)
day_locs = tz_naive_calendar.get_indexer(ex_dates, method='bfill')

isnull = pd.isnull

for i, amount in enumerate(amounts):
sid = sids[i]
ex_date = ex_dates[i]
day_loc = day_locs[i]

prev_close_date = calendar[day_loc - 1]

try:
prev_close = equity_daily_bar_reader.get_value(
sid, prev_close_date, 'close')
if not isnull(prev_close):
ratio = 1.0 - amount / prev_close
ratios[i] = ratio
# only assign effective_date when data is found
effective_dates[i] = ex_date
except NoDataOnDate:
log.warn("Couldn't compute ratio for dividend %s" % {
'sid': sid,
'ex_date': ex_date,
'amount': amount,
})
continue

# Create a mask to filter out indices in the effective_date, sid, and
# ratio vectors for which a ratio was not calculable.
effective_mask = effective_dates != -1
effective_dates = effective_dates[effective_mask]
effective_dates = effective_dates.astype('datetime64[ns]').\
astype('datetime64[s]').astype(uint32_dtype)
sids = sids[effective_mask]
ratios = ratios[effective_mask]

pricing_reader = self._equity_daily_bar_reader
input_sids = dividends.sid.values
unique_sids, sids_ix = np.unique(input_sids, return_inverse=True)
dates = pricing_reader.sessions.values

close, = pricing_reader.load_raw_arrays(
['close'],
pd.Timestamp(dates[0], tz='UTC'),
pd.Timestamp(dates[-1], tz='UTC'),
unique_sids,
)
date_ix = np.searchsorted(dates, dividends.ex_date.values)
mask = date_ix > 0

date_ix = date_ix[mask]
sids_ix = sids_ix[mask]
input_dates = dividends.ex_date.values[mask]

# subtract one day to get the close on the day prior to the merger
previous_close = close[date_ix - 1, sids_ix]
input_sids = input_sids[mask]

amount = dividends.amount.values[mask]
ratio = 1.0 - amount / previous_close

non_nan_ratio_mask = ~np.isnan(ratio)
for ix in np.flatnonzero(~non_nan_ratio_mask):
log.warn(
"Couldn't compute ratio for dividend "
" sid={sid}, ex_date={ex_date!s}, amount={amount}",
sid=input_sids[ix],
ex_date=dates[date_ix[ix]],
amount=amount[ix],
)

positive_ratio_mask = ratio > 0
for ix in np.flatnonzero(~positive_ratio_mask):
log.warn(
"Negative dividend ratio for dividend "
" sid={sid}, ex_date={ex_date!s}, amount={amount}",
sid=input_sids[ix],
ex_date=dates[date_ix[ix]],
amount=amount[ix],
)

valid_ratio_mask = non_nan_ratio_mask & positive_ratio_mask
return pd.DataFrame({
'sid': sids,
'effective_date': effective_dates,
'ratio': ratios,
'sid': input_sids[valid_ratio_mask],
'effective_date': input_dates[valid_ratio_mask],
'ratio': ratio[valid_ratio_mask],
})

def _write_dividends(self, dividends):
Expand Down
3 changes: 1 addition & 2 deletions zipline/data/hdf5_daily_bars.py
Expand Up @@ -457,8 +457,7 @@ def dates(self):

@lazyval
def sids(self):
    # View the on-disk sid index as int64 for the rest of the pipeline.
    # ``copy=False`` lets numpy skip the copy when the stored dtype is
    # already int64 (see numpy.ndarray.astype).
    return self._country_group[INDEX][SID][:].astype('int64', copy=False)

@lazyval
def asset_start_dates(self):
Expand Down
1 change: 0 additions & 1 deletion zipline/data/session_bars.py
Expand Up @@ -33,4 +33,3 @@ def sessions(self):
All session labels (unioning the range for all assets) which the
reader can provide.
"""
pass
22 changes: 12 additions & 10 deletions zipline/testing/core.py
Expand Up @@ -981,8 +981,19 @@ def wrapped(*args, **kwargs):


class MockDailyBarReader(object):
    """Minimal stand-in for a daily bar reader used in tests.

    Every price it reports is a constant 100.0, which makes expected
    adjustment ratios trivial to compute in tests.

    Parameters
    ----------
    dates : iterable of datetime-like
        The sessions this reader claims to cover; exposed as
        ``self.sessions`` (a ``pd.DatetimeIndex``).
    """

    def __init__(self, dates):
        self.sessions = pd.DatetimeIndex(dates)

    def load_raw_arrays(self, columns, start, stop, sids):
        """Return one constant-valued array per requested column.

        Each array has shape (n_sessions_in_range, n_sids) and is
        filled with 100.0.
        """
        dates = self.sessions
        output_dates = dates[(dates >= start) & (dates <= stop)]
        return [
            np.full((len(output_dates), len(sids)), 100.0)
            for _ in columns
        ]

    def get_value(self, col, sid, dt):
        # Constant price; float to match load_raw_arrays' dtype.
        return 100.0


def create_mock_adjustment_data(splits=None, dividends=None, mergers=None):
Expand All @@ -1004,15 +1015,6 @@ def create_mock_adjustment_data(splits=None, dividends=None, mergers=None):
return splits, mergers, dividends


def create_mock_adjustments(tempdir, days, splits=None, dividends=None,
                            mergers=None):
    """Write mock adjustment data into a fresh SQLite db under ``tempdir``.

    Returns the path of the database that was written.
    """
    # NOTE(review): MockDailyBarReader now takes a required ``dates``
    # argument; confirm this helper's zero-argument call is still valid.
    db_path = tempdir.getpath("test_adjustments.db")
    adjustment_data = create_mock_adjustment_data(splits, dividends, mergers)
    writer = SQLiteAdjustmentWriter(db_path, MockDailyBarReader(), days)
    writer.write(*adjustment_data)
    return db_path


def assert_timestamp_equal(left, right, compare_nat_equal=True, msg=""):
"""
Assert that two pandas Timestamp objects are the same.
Expand Down
1 change: 1 addition & 0 deletions zipline/utils/numpy_utils.py
Expand Up @@ -33,6 +33,7 @@
# Canonical numpy dtype singletons shared across the codebase, so callers
# can compare against a single object instead of re-constructing dtypes.
bool_dtype = dtype('bool')

# uint64 was added so sid columns can hold any unsigned 64-bit id;
# presumably uint32 remains for more compact on-disk fields — confirm
# against zipline/data/adjustments.py usage.
uint32_dtype = dtype('uint32')
uint64_dtype = dtype('uint64')
int64_dtype = dtype('int64')

float32_dtype = dtype('float32')
Expand Down

0 comments on commit fbdc5c9

Please sign in to comment.