Skip to content

Commit

Permalink
PERF: Change asset finder to be backed by sqlite3.
Browse files Browse the repository at this point in the history
Attack the startup bottleneck of creating the asset finder's caches for a
large universe, which took between 1 and 2 seconds on development and
production machines.

Instead, allow the AssetFinder to be passed a sqlite3 file that has
already been populated and then hydrate asset objects only when an
equity is referenced for the first time.

To create the aforementioned sqlite3 file, create an AssetFinder with a
db_path and `create_table` set to True. If `create_table` is set to False,
the prepopulated data in the sqlite file found at db_path will be used.

Default behavior is to use an in-memory database.

Behavior that changes:

- Fuzzy lookup now only works on one character. That character needs to be
  specified at write/metadata consumption time, since the fuzzy lookup key
  is created by dropping the character from each symbol.

- Overwriting partially written metadata is no longer
  supported. For example, some unit tests inserted just the identifier,
  and then later updated the symbol, end_date, etc.

  Instead of building upsert behavior at this time, this patch
  changes the unit tests so that the data for each asset is only
  inserted once.

Other notes:

- populate_cache is now removed, since there is no longer a two-step
  process of inserting metadata and then realizing that metadata into
  assets. _spawn_asset is rolled into insert_metadata, so that a call to
  insert_metadata both converts the metadata and makes it available in
  the data store.
  • Loading branch information
ehebert authored and Eddie Hebert committed Jul 13, 2015
1 parent 2ab9f8a commit 11ae7a7
Show file tree
Hide file tree
Showing 5 changed files with 528 additions and 267 deletions.
21 changes: 15 additions & 6 deletions tests/test_algorithm.py
Expand Up @@ -23,6 +23,7 @@
import numpy as np
import pandas as pd

from zipline.assets import AssetFinder
from zipline.utils.test_utils import (
nullctx,
setup_logger,
Expand Down Expand Up @@ -1277,25 +1278,33 @@ def test_asset_date_bounds(self):
df_source, _ = factory.create_test_df_source(self.sim_params)
metadata = {0: {'start_date': '1990-01-01',
'end_date': '2020-01-01'}}
algo = SetAssetDateBoundsAlgorithm(asset_metadata=metadata,
sim_params=self.sim_params,)
asset_finder = AssetFinder()
algo = SetAssetDateBoundsAlgorithm(
asset_finder=asset_finder,
asset_metadata=metadata,
sim_params=self.sim_params,)
algo.run(df_source)

# Run the algorithm with a sid that has already ended
df_source, _ = factory.create_test_df_source(self.sim_params)
metadata = {0: {'start_date': '1989-01-01',
'end_date': '1990-01-01'}}
algo = SetAssetDateBoundsAlgorithm(asset_metadata=metadata,
sim_params=self.sim_params,)
asset_finder = AssetFinder()
algo = SetAssetDateBoundsAlgorithm(
asset_finder=asset_finder,
asset_metadata=metadata,
sim_params=self.sim_params,)
with self.assertRaises(TradingControlViolation):
algo.run(df_source)

# Run the algorithm with a sid that has not started
df_source, _ = factory.create_test_df_source(self.sim_params)
metadata = {0: {'start_date': '2020-01-01',
'end_date': '2021-01-01'}}
algo = SetAssetDateBoundsAlgorithm(asset_metadata=metadata,
sim_params=self.sim_params,)
algo = SetAssetDateBoundsAlgorithm(
asset_finder=asset_finder,
asset_metadata=metadata,
sim_params=self.sim_params,)
with self.assertRaises(TradingControlViolation):
algo.run(df_source)

Expand Down
60 changes: 24 additions & 36 deletions tests/test_assets.py
Expand Up @@ -20,14 +20,12 @@
import sys
from unittest import TestCase

from datetime import (
timedelta,
datetime
)
from datetime import datetime, timedelta
import pickle
import uuid
import warnings
import pandas as pd
from pandas.tseries.tools import normalize_date

from nose_parameterized import parameterized

Expand Down Expand Up @@ -289,7 +287,7 @@ def test_lookup_symbol_fuzzy(self):
for i in range(3)
]
)
finder = AssetFinder(frame)
finder = AssetFinder(frame, fuzzy_char='@')
asset_0, asset_1, asset_2 = (
finder.retrieve_asset(i) for i in range(3)
)
Expand All @@ -304,17 +302,15 @@ def test_lookup_symbol_fuzzy(self):
# Adding an unnecessary fuzzy shouldn't matter.
self.assertEqual(
asset_1,
finder.lookup_symbol('test@1', as_of, fuzzy='@')
finder.lookup_symbol('test@1', as_of, fuzzy=True)
)

# Shouldn't find this with no fuzzy_str passed.
self.assertIsNone(finder.lookup_symbol('test1', as_of))
# Shouldn't find this with an incorrect fuzzy_str.
self.assertIsNone(finder.lookup_symbol('test1', as_of, fuzzy='*'))
# Should find it with the correct fuzzy_str.
# Should find exact match.
self.assertEqual(
asset_1,
finder.lookup_symbol('test1', as_of, fuzzy='@'),
finder.lookup_symbol('test1', as_of, fuzzy=True),
)

def test_lookup_symbol_resolve_multiple(self):
Expand Down Expand Up @@ -434,35 +430,28 @@ def test_insert_metadata(self):
foo_data="FOO",)

# Test proper insertion
self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
self.assertEqual('2015-01-01', finder.metadata_cache[0]['end_date'])
equity = finder.retrieve_asset(0)
self.assertIsInstance(equity, Equity)
self.assertEqual('PLAY', equity.symbol)
self.assertEqual(pd.Timestamp('2015-01-01', tz='UTC'),
equity.end_date)

# Test invalid field
self.assertFalse('foo_data' in finder.metadata_cache[0])

# Test updating fields
finder.insert_metadata(0,
asset_type='equity',
start_date='2014-01-01',
end_date='2015-02-01',
symbol="PLAY",
exchange="NYSE",)
self.assertEqual('2015-02-01', finder.metadata_cache[0]['end_date'])
self.assertEqual('NYSE', finder.metadata_cache[0]['exchange'])

# Check that old data survived
self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])

def test_consume_metadata(self):

# Test dict consumption
finder = AssetFinder({0: {'asset_type': 'equity'}})
finder = AssetFinder()
dict_to_consume = {0: {'symbol': 'PLAY'},
1: {'symbol': 'MSFT'}}
finder.consume_metadata(dict_to_consume)
self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])

equity = finder.retrieve_asset(0)
self.assertIsInstance(equity, Equity)
self.assertEqual('PLAY', equity.symbol)

finder = AssetFinder()

# Test dataframe consumption
df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
Expand All @@ -473,11 +462,8 @@ def test_consume_metadata(self):
finder.consume_metadata(df)
self.assertEqual('NASDAQ', finder.metadata_cache[0]['exchange'])
self.assertEqual('Microsoft', finder.metadata_cache[1]['asset_name'])
# Check that old data survived
self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])

def test_consume_asset_as_identifier(self):

# Build some end dates
eq_end = pd.Timestamp('2012-01-01', tz='UTC')
fut_end = pd.Timestamp('2008-01-01', tz='UTC')
Expand All @@ -489,7 +475,6 @@ def test_consume_asset_as_identifier(self):
# Consume the Assets
finder = AssetFinder()
finder.consume_identifiers([equity_asset, future_asset])
finder.populate_cache()

# Test equality with newly built Assets
self.assertEqual(equity_asset, finder.retrieve_asset(1))
Expand All @@ -503,12 +488,15 @@ def test_sid_assignment(self):
metadata = {'PLAY': {'symbol': 'PLAY'},
'MSFT': {'symbol': 'MSFT'}}

today = normalize_date(pd.Timestamp('2015-07-09', tz='UTC'))

# Build a finder that is allowed to assign sids
finder = AssetFinder(metadata=metadata, allow_sid_assignment=True)
finder = AssetFinder(metadata=metadata,
allow_sid_assignment=True)

# Verify that Assets were built and different sids were assigned
play = finder.lookup_symbol('PLAY', datetime.now())
msft = finder.lookup_symbol('MSFT', datetime.now())
play = finder.lookup_symbol('PLAY', today)
msft = finder.lookup_symbol('MSFT', today)
self.assertEqual('PLAY', play.symbol)
self.assertIsNotNone(play.sid)
self.assertNotEqual(play.sid, msft.sid)
Expand Down
6 changes: 5 additions & 1 deletion tests/test_perf_tracking.py
Expand Up @@ -34,6 +34,7 @@
import numpy as np
from six.moves import range, zip

from zipline.assets import AssetFinder
import zipline.utils.factory as factory
import zipline.finance.performance as perf
from zipline.finance.slippage import Transaction, create_transaction
Expand Down Expand Up @@ -2132,7 +2133,10 @@ def test_update_last_sale(self, env=None):
metadata = {1: {'asset_type': 'equity'},
2: {'asset_type': 'future',
'contract_multiplier': 1000}}
env.update_asset_finder(asset_metadata=metadata)
asset_finder = AssetFinder()
env.update_asset_finder(
asset_finder=asset_finder,
asset_metadata=metadata)
pt = perf.PositionTracker()
dt = pd.Timestamp("1984/03/06 3:00PM")
pos1 = perf.Position(1, amount=np.float64(100.0),
Expand Down

0 comments on commit 11ae7a7

Please sign in to comment.