From d4d8db3e1add7ed07d32870bc021fead3cbfbdf6 Mon Sep 17 00:00:00 2001 From: Freddie Vargus Date: Wed, 17 May 2017 15:30:40 -0400 Subject: [PATCH 1/2] MAINT: Remove code that doesn't work with the new yahoo API DOC: Remove mentions of Yahoo data bundles MAINT: Remove yahoo bundle, test, and more mentions of yahoo --- docs/source/bundles.rst | 40 +----- tests/data/bundles/test_yahoo.py | 206 --------------------------- tests/resources/rebuild_example_data | 25 +--- zipline/data/__init__.py | 4 - zipline/data/bundles/__init__.py | 2 - zipline/data/bundles/yahoo.py | 203 -------------------------- zipline/data/loader.py | 59 -------- zipline/utils/factory.py | 6 - 8 files changed, 2 insertions(+), 543 deletions(-) delete mode 100644 tests/data/bundles/test_yahoo.py delete mode 100644 zipline/data/bundles/yahoo.py diff --git a/docs/source/bundles.rst b/docs/source/bundles.rst index 6fd8c1fb7d..a316fe6912 100644 --- a/docs/source/bundles.rst +++ b/docs/source/bundles.rst @@ -153,44 +153,6 @@ Quantopian provides a mirror of the quandl WIKI dataset with the data in the formats that zipline expects. This is available under the name: ``quantopian-quandl`` and is the default bundle for zipline. -Yahoo Bundle Factories -`````````````````````` - -Zipline also ships with a factory function for creating a data bundle out of a -set of tickers from yahoo: :func:`~zipline.data.bundles.yahoo_equities`. -:func:`~zipline.data.bundles.yahoo_equities` makes it easy to pre-download and -cache the data for a set of equities from yahoo. The yahoo bundles include daily -pricing data along with splits, cash dividends, and inferred asset metadata. To -create a bundle from a set of equities, add the following to your -``~/.zipline/extensions.py`` file: - -.. code-block:: python - - from zipline.data.bundles import register, yahoo_equities - - # these are the tickers you would like data for - equities = { - 'AAPL', - 'MSFT', - 'GOOG', - } - register( - 'my-yahoo-equities-bundle', # name this whatever you like - yahoo_equities(equities), - ) - - -This may now be used like: - -.. code-block:: bash - - $ zipline ingest -b my-yahoo-equities-bundle - $ zipline run -f algo.py --bundle my-yahoo-equities-bundle - - -More than one yahoo equities bundle may be registered as long as they use -different names. - Writing a New Bundle ~~~~~~~~~~~~~~~~~~~~ @@ -200,7 +162,7 @@ zipline. To add a new bundle, one must implement an ``ingest`` function. The ``ingest`` function is responsible for loading the data into memory and passing it to a set of writer objects provided by zipline to convert the data to zipline's internal format. The ingest function may work by downloading data from -a remote location like the ``quandl`` bundle or yahoo bundles or it may just +a remote location like the ``quandl`` bundle or it may just load files that are already on the machine. The function is provided with writers that will write the data to the correct location transactionally. 
If an ingestion fails part way through the bundle will not be written in an incomplete diff --git a/tests/data/bundles/test_yahoo.py b/tests/data/bundles/test_yahoo.py deleted file mode 100644 index e3d2580200..0000000000 --- a/tests/data/bundles/test_yahoo.py +++ /dev/null @@ -1,206 +0,0 @@ -from __future__ import division - -import numpy as np -import pandas as pd -from six.moves.urllib.parse import urlparse, parse_qs -from toolz import flip, identity -from toolz.curried import merge_with, operator as op - -from zipline.data.bundles.core import _make_bundle_core -from zipline.data.bundles import yahoo_equities -from zipline.lib.adjustment import Float64Multiply -from zipline.testing import test_resource_path, tmp_dir, read_compressed -from zipline.testing.fixtures import WithResponses, ZiplineTestCase -from zipline.testing.predicates import assert_equal -from zipline.utils.calendars import get_calendar - - -class YahooBundleTestCase(WithResponses, ZiplineTestCase): - symbols = 'AAPL', 'IBM', 'MSFT' - columns = 'open', 'high', 'low', 'close', 'volume' - asset_start = pd.Timestamp('2014-01-02', tz='utc') - asset_end = pd.Timestamp('2014-12-31', tz='utc') - calendar = get_calendar('NYSE') - sessions = calendar.sessions_in_range(asset_start, asset_end) - - @classmethod - def init_class_fixtures(cls): - super(YahooBundleTestCase, cls).init_class_fixtures() - (cls.bundles, - cls.register, - cls.unregister, - cls.ingest, - cls.load, - cls.clean) = map(staticmethod, _make_bundle_core()) - - def _expected_data(self): - sids = 0, 1, 2 - modifier = { - 'low': 0, - 'open': 1, - 'close': 2, - 'high': 3, - 'volume': 0, - } - pricing = [ - np.hstack(( - np.arange(252, dtype='float64')[:, np.newaxis] + - 1 + - sid * 10000 + - modifier[column] * 1000 - for sid in sorted(sids) - )) - for column in self.columns - ] - - # There are two dividends and 1 split for each company. 
- - def dividend_adjustment(sid, which): - """The dividends occur at indices 252 // 4 and 3 * 252 / 4 - with a cash amount of sid + 1 / 10 and sid + 2 / 10 - """ - if which == 'first': - idx = 252 // 4 - else: - idx = 3 * 252 // 4 - - return { - idx: [Float64Multiply( - first_row=0, - last_row=idx, - first_col=sid, - last_col=sid, - value=float( - 1 - - ((sid + 1 + (which == 'second')) / 10) / - (idx - 1 + sid * 10000 + 2000) - ), - )], - } - - def split_adjustment(sid, volume): - """The splits occur at index 252 // 2 with a ratio of (sid + 1):1 - """ - idx = 252 // 2 - return { - idx: [Float64Multiply( - first_row=0, - last_row=idx, - first_col=sid, - last_col=sid, - value=(identity if volume else op.truediv(1))(sid + 2), - )], - } - - merge_adjustments = merge_with(flip(sum, [])) - - adjustments = [ - # ohlc - merge_adjustments( - *tuple(dividend_adjustment(sid, 'first') for sid in sids) + - tuple(dividend_adjustment(sid, 'second') for sid in sids) + - tuple(split_adjustment(sid, volume=False) for sid in sids) - ) - ] * (len(self.columns) - 1) + [ - # volume - merge_adjustments( - split_adjustment(sid, volume=True) for sid in sids - ), - ] - - return pricing, adjustments - - def test_bundle(self): - - def get_symbol_from_url(url): - params = parse_qs(urlparse(url).query) - symbol, = params['s'] - return symbol - - def pricing_callback(request): - headers = { - 'content-encoding': 'gzip', - 'content-type': 'text/csv', - } - path = test_resource_path( - 'yahoo_samples', - get_symbol_from_url(request.url) + '.csv.gz', - ) - with open(path, 'rb') as f: - return ( - 200, - headers, - f.read(), - ) - - for _ in range(3): - self.responses.add_callback( - self.responses.GET, - 'http://ichart.finance.yahoo.com/table.csv', - pricing_callback, - ) - - def adjustments_callback(request): - path = test_resource_path( - 'yahoo_samples', - get_symbol_from_url(request.url) + '.adjustments.gz', - ) - return 200, {}, read_compressed(path) - - for _ in range(3): - self.responses.add_callback( - self.responses.GET, - 'http://ichart.finance.yahoo.com/x', - adjustments_callback, - ) - - self.register( - 'bundle', - yahoo_equities(self.symbols), - calendar_name='NYSE', - start_session=self.asset_start, - end_session=self.asset_end, - ) - - zipline_root = self.enter_instance_context(tmp_dir()).path - environ = { - 'ZIPLINE_ROOT': zipline_root, - } - - self.ingest('bundle', environ=environ, show_progress=False) - bundle = self.load('bundle', environ=environ) - - sids = 0, 1, 2 - equities = bundle.asset_finder.retrieve_all(sids) - for equity, expected_symbol in zip(equities, self.symbols): - assert_equal(equity.symbol, expected_symbol) - - for equity in bundle.asset_finder.retrieve_all(sids): - assert_equal(equity.start_date, self.asset_start, msg=equity) - assert_equal(equity.end_date, self.asset_end, msg=equity) - - sessions = self.sessions - actual = bundle.equity_daily_bar_reader.load_raw_arrays( - self.columns, - sessions[sessions.get_loc(self.asset_start, 'bfill')], - sessions[sessions.get_loc(self.asset_end, 'ffill')], - sids, - ) - expected_pricing, expected_adjustments = self._expected_data() - assert_equal(actual, expected_pricing, array_decimal=2) - - adjustments_for_cols = bundle.adjustment_reader.load_adjustments( - self.columns, - self.sessions, - pd.Index(sids), - ) - - for column, adjustments, expected in zip(self.columns, - adjustments_for_cols, - expected_adjustments): - assert_equal( - adjustments, - expected, - msg=column, - decimal=4, - ) diff --git a/tests/resources/rebuild_example_data 
b/tests/resources/rebuild_example_data index 1c306b624a..938a1b2a5d 100755 --- a/tests/resources/rebuild_example_data +++ b/tests/resources/rebuild_example_data @@ -10,7 +10,7 @@ import numpy as np import pandas as pd from zipline import examples -from zipline.data.bundles import clean, ingest, register, yahoo_equities +from zipline.data.bundles import clean, ingest from zipline.testing import test_resource_path, tmp_dir from zipline.utils.cache import dataframe_cache @@ -30,14 +30,6 @@ INPUT_DATA_SYMBOLS = ( 'AAPL', 'MSFT', ) -TEST_BUNDLE_NAME = 'test' -input_bundle = yahoo_equities( - INPUT_DATA_SYMBOLS, - INPUT_DATA_START_DATE, - INPUT_DATA_END_DATE, -) -register(TEST_BUNDLE_NAME, input_bundle) - banner = """ Please verify that the new performance is more correct than the old @@ -86,19 +78,6 @@ def changed_results(new, old): def eof(*args, **kwargs): raise EOFError() - -def rebuild_input_data(environ): - ingest(TEST_BUNDLE_NAME, environ=environ, show_progress=True) - clean(TEST_BUNDLE_NAME, keep_last=1, environ=environ) - - -@click.command() -@click.option( - '--rebuild-input', - is_flag=True, - default=False, - help="Should we rebuild the input data from Yahoo?", -) @click.pass_context def main(ctx, rebuild_input): """Rebuild the perf data for test_examples @@ -112,8 +91,6 @@ def main(ctx, rebuild_input): # The environ here should be the same (modulo the tempdir location) # as we use in test_examples.py. environ = {'ZIPLINE_ROOT': d.getpath('example_data/root')} - if rebuild_input: - rebuild_input_data(environ) new_perf_path = d.getpath( 'example_data/new_perf/%s' % pd.__version__.replace('.', '-'), diff --git a/zipline/data/__init__.py b/zipline/data/__init__.py index f3dd9b36be..0830dbfa1d 100644 --- a/zipline/data/__init__.py +++ b/zipline/data/__init__.py @@ -1,15 +1,11 @@ from . import loader from .loader import ( - load_from_yahoo, - load_bars_from_yahoo, load_prices_from_csv, load_prices_from_csv_folder, ) __all__ = [ - 'load_bars_from_yahoo', - 'load_from_yahoo', 'load_prices_from_csv', 'load_prices_from_csv_folder', 'loader', diff --git a/zipline/data/bundles/__init__.py b/zipline/data/bundles/__init__.py index 40746c6c20..088e8752f1 100644 --- a/zipline/data/bundles/__init__.py +++ b/zipline/data/bundles/__init__.py @@ -12,7 +12,6 @@ to_bundle_ingest_dirname, unregister, ) -from .yahoo import yahoo_equities __all__ = [ @@ -26,5 +25,4 @@ 'register', 'to_bundle_ingest_dirname', 'unregister', - 'yahoo_equities', ] diff --git a/zipline/data/bundles/yahoo.py b/zipline/data/bundles/yahoo.py deleted file mode 100644 index ace6a40539..0000000000 --- a/zipline/data/bundles/yahoo.py +++ /dev/null @@ -1,203 +0,0 @@ -import os - -import numpy as np -import pandas as pd -from pandas_datareader.data import DataReader -import requests - -from zipline.utils.calendars import register_calendar_alias -from zipline.utils.cli import maybe_show_progress -from .core import register - - -def _cachpath(symbol, type_): - return '-'.join((symbol.replace(os.path.sep, '_'), type_)) - - -def yahoo_equities(symbols, start=None, end=None): - """Create a data bundle ingest function from a set of symbols loaded from - yahoo. - - Parameters - ---------- - symbols : iterable[str] - The ticker symbols to load data for. - start : datetime, optional - The start date to query for. By default this pulls the full history - for the calendar. - end : datetime, optional - The end date to query for. By default this pulls the full history - for the calendar. 
- - Returns - ------- - ingest : callable - The bundle ingest function for the given set of symbols. - - Examples - -------- - This code should be added to ~/.zipline/extension.py - - .. code-block:: python - - from zipline.data.bundles import yahoo_equities, register - - symbols = ( - 'AAPL', - 'IBM', - 'MSFT', - ) - register('my_bundle', yahoo_equities(symbols)) - - Notes - ----- - The sids for each symbol will be the index into the symbols sequence. - """ - # strict this in memory so that we can reiterate over it - symbols = tuple(symbols) - - def ingest(environ, - asset_db_writer, - minute_bar_writer, # unused - daily_bar_writer, - adjustment_writer, - calendar, - start_session, - end_session, - cache, - show_progress, - output_dir, - # pass these as defaults to make them 'nonlocal' in py2 - start=start, - end=end): - if start is None: - start = start_session - if end is None: - end = None - - metadata = pd.DataFrame(np.empty(len(symbols), dtype=[ - ('start_date', 'datetime64[ns]'), - ('end_date', 'datetime64[ns]'), - ('auto_close_date', 'datetime64[ns]'), - ('symbol', 'object'), - ])) - - def _pricing_iter(): - sid = 0 - with maybe_show_progress( - symbols, - show_progress, - label='Downloading Yahoo pricing data: ') as it, \ - requests.Session() as session: - for symbol in it: - path = _cachpath(symbol, 'ohlcv') - try: - df = cache[path] - except KeyError: - df = cache[path] = DataReader( - symbol, - 'yahoo', - start, - end, - session=session, - ).sort_index() - - # the start date is the date of the first trade and - # the end date is the date of the last trade - start_date = df.index[0] - end_date = df.index[-1] - # The auto_close date is the day after the last trade. - ac_date = end_date + pd.Timedelta(days=1) - metadata.iloc[sid] = start_date, end_date, ac_date, symbol - - df.rename( - columns={ - 'Open': 'open', - 'High': 'high', - 'Low': 'low', - 'Close': 'close', - 'Volume': 'volume', - }, - inplace=True, - ) - yield sid, df - sid += 1 - - daily_bar_writer.write(_pricing_iter(), show_progress=show_progress) - - symbol_map = pd.Series(metadata.symbol.index, metadata.symbol) - - # Hardcode the exchange to "YAHOO" for all assets and (elsewhere) - # register "YAHOO" to resolve to the NYSE calendar, because these are - # all equities and thus can use the NYSE calendar. 
- metadata['exchange'] = "YAHOO" - asset_db_writer.write(equities=metadata) - - adjustments = [] - with maybe_show_progress( - symbols, - show_progress, - label='Downloading Yahoo adjustment data: ') as it, \ - requests.Session() as session: - for symbol in it: - path = _cachpath(symbol, 'adjustment') - try: - df = cache[path] - except KeyError: - df = cache[path] = DataReader( - symbol, - 'yahoo-actions', - start, - end, - session=session, - ).sort_index() - - df['sid'] = symbol_map[symbol] - adjustments.append(df) - - adj_df = pd.concat(adjustments) - adj_df.index.name = 'date' - adj_df.reset_index(inplace=True) - - splits = adj_df[adj_df.action == 'SPLIT'] - splits = splits.rename( - columns={'value': 'ratio', 'date': 'effective_date'}, - ) - splits.drop('action', axis=1, inplace=True) - - dividends = adj_df[adj_df.action == 'DIVIDEND'] - dividends = dividends.rename( - columns={'value': 'amount', 'date': 'ex_date'}, - ) - dividends.drop('action', axis=1, inplace=True) - # we do not have this data in the yahoo dataset - dividends['record_date'] = pd.NaT - dividends['declared_date'] = pd.NaT - dividends['pay_date'] = pd.NaT - - adjustment_writer.write(splits=splits, dividends=dividends) - - return ingest - - -# bundle used when creating test data -register( - '.test', - yahoo_equities( - ( - 'AMD', - 'CERN', - 'COST', - 'DELL', - 'GPS', - 'INTC', - 'MMM', - 'AAPL', - 'MSFT', - ), - pd.Timestamp('2004-01-02', tz='utc'), - pd.Timestamp('2015-01-01', tz='utc'), - ), -) - -register_calendar_alias("YAHOO", "NYSE") diff --git a/zipline/data/loader.py b/zipline/data/loader.py index f682f47e91..3510f55f00 100644 --- a/zipline/data/loader.py +++ b/zipline/data/loader.py @@ -28,7 +28,6 @@ cache_root, data_root, ) -from ..utils.deprecate import deprecated from zipline.utils.calendars import get_calendar @@ -202,11 +201,6 @@ def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day, We attempt to download data unless we already have data stored at the data cache for `symbol` whose first entry is before or on `first_date` and whose last entry is on or after `last_date`. - - If we perform a download and the cache criteria are not satisfied, we wait - at least one hour before attempting a redownload. This is determined by - comparing the current time to the result of os.path.getmtime on the cache - path. """ filename = get_benchmark_filename(symbol) data = _load_cached_data(filename, first_date, last_date, now, 'benchmark', @@ -426,59 +420,6 @@ def load_from_yahoo(indexes=None, return df -@deprecated( - 'load_bars_from_yahoo is deprecated, please register a' - ' yahoo_equities data bundle instead', -) -def load_bars_from_yahoo(indexes=None, - stocks=None, - start=None, - end=None, - adjusted=True): - """ - Loads data from Yahoo into a panel with the following - column names for each indicated security: - - - open - - high - - low - - close - - volume - - price - - Note that 'price' is Yahoo's 'Adjusted Close', which removes the - impact of splits and dividends. If the argument 'adjusted' is True, then - the open, high, low, and close values are adjusted as well. - - :param indexes: Financial indexes to load. - :type indexes: dict - :param stocks: Stock closing prices to load. - :type stocks: list - :param start: Retrieve prices from start date on. - :type start: datetime - :param end: Retrieve prices until end date. - :type end: datetime - :param adjusted: Adjust open/high/low/close for splits and dividends. - The 'price' field is always adjusted. 
- :type adjusted: bool - - """ - data = _load_raw_yahoo_data(indexes, stocks, start, end) - panel = pd.Panel(data) - # Rename columns - panel.minor_axis = ['open', 'high', 'low', 'close', 'volume', 'price'] - panel.major_axis = panel.major_axis.tz_localize(pytz.utc) - # Adjust data - if adjusted: - adj_cols = ['open', 'high', 'low', 'close'] - for ticker in panel.items: - ratio = (panel[ticker]['price'] / panel[ticker]['close']) - ratio_filtered = ratio.fillna(0).values - for col in adj_cols: - panel[ticker][col] *= ratio_filtered - return panel - - def load_prices_from_csv(filepath, identifier_col, tz='UTC'): data = pd.read_csv(filepath, index_col=identifier_col) data.index = pd.DatetimeIndex(data.index, tz=tz) diff --git a/zipline/utils/factory.py b/zipline/utils/factory.py index a529ed06a8..d545910dcc 100644 --- a/zipline/utils/factory.py +++ b/zipline/utils/factory.py @@ -27,15 +27,9 @@ from zipline.sources import SpecificEquityTrades from zipline.finance.trading import SimulationParameters from zipline.sources.test_source import create_trade -from zipline.data.loader import ( # For backwards compatibility - load_from_yahoo, - load_bars_from_yahoo, -) from zipline.utils.calendars import get_calendar from zipline.utils.input_validation import expect_types -__all__ = ['load_from_yahoo', 'load_bars_from_yahoo'] - def create_simulation_parameters(year=2006, start=None, end=None, capital_base=float("1.0e5"), From 492b22f0b16d6fb9ef81d88b8701395d82f5754a Mon Sep 17 00:00:00 2001 From: Freddie Vargus Date: Mon, 3 Jul 2017 11:07:45 -0400 Subject: [PATCH 2/2] TST: Add error message for --rebuild-input --- tests/resources/rebuild_example_data | 17 +++++++++++++++++ zipline/data/loader.py | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/tests/resources/rebuild_example_data b/tests/resources/rebuild_example_data index 938a1b2a5d..f1365c7461 100755 --- a/tests/resources/rebuild_example_data +++ b/tests/resources/rebuild_example_data @@ -78,6 +78,15 @@ def changed_results(new, old): def eof(*args, **kwargs): raise EOFError() + +@click.command() +@click.option( + '--rebuild-input', + is_flag=True, + default=False, + help="Should we rebuild the input data from Yahoo?", +) + @click.pass_context def main(ctx, rebuild_input): """Rebuild the perf data for test_examples @@ -92,6 +101,14 @@ def main(ctx, rebuild_input): # as we use in test_examples.py. environ = {'ZIPLINE_ROOT': d.getpath('example_data/root')} + if rebuild_input: + raise NotImplementedError( + ("We cannot rebuild input for Yahoo because of " + "changes Yahoo made to their API, so we cannot " + "use Yahoo data bundles anymore. This will be fixed in " + "a future release.") + ) + new_perf_path = d.getpath( 'example_data/new_perf/%s' % pd.__version__.replace('.', '-'), ) diff --git a/zipline/data/loader.py b/zipline/data/loader.py index 3510f55f00..80e94d4db0 100644 --- a/zipline/data/loader.py +++ b/zipline/data/loader.py @@ -201,6 +201,11 @@ def ensure_benchmark_data(symbol, first_date, last_date, now, trading_day, We attempt to download data unless we already have data stored at the data cache for `symbol` whose first entry is before or on `first_date` and whose last entry is on or after `last_date`. + + If we perform a download and the cache criteria are not satisfied, we wait + at least one hour before attempting a redownload. This is determined by + comparing the current time to the result of os.path.getmtime on the cache + path.
""" filename = get_benchmark_filename(symbol) data = _load_cached_data(filename, first_date, last_date, now, 'benchmark',