PERF: Change asset finder to be backed by sqlite3.

Attack the startup bottleneck of creating the asset finder's caches for a large universe, which took between 1 and 2 seconds on development and production machines. Instead, allow the AssetFinder to be passed a sqlite3 file that has already been populated, and then hydrate asset objects only when an equity is referenced for the first time. To create the aforementioned sqlite3 file, create an AssetFinder with a db_path and `create_table` set to True. If `create_table` is set to False, the prepopulated data in the sqlite file found at db_path will be used. The default behavior is to use an in-memory database. Behavior that changes: - Fuzzy lookup now only works on one character; that character needs to be specified at write/metadata consumption time, since the fuzzy lookup key is created by dropping the character from each symbol. - Overwriting partially written metadata is no longer supported, i.e. some unit tests previously allowed for inserting just the identifier and then later updating the symbol, end_date, etc. Instead of building an upsert behavior at this time, this patch changes the unit tests so that the data for each asset is only inserted once. Other notes: - populate_cache is now removed, since there is no longer a two-step process of inserting metadata and then realizing that metadata into assets. _spawn_asset is rolled into insert_metadata, so that a call to insert_metadata both converts the metadata and makes it available in the data store.
quantopian · Jul 13, 2015 · 11ae7a7 · 11ae7a7
1 parent 2ab9f8a
commit 11ae7a7
Show file tree

Hide file tree

Showing 5 changed files with 528 additions and 267 deletions.
diff --git a/tests/test_algorithm.py b/tests/test_algorithm.py
@@ -23,6 +23,7 @@
 import numpy as np
 import pandas as pd
 
+from zipline.assets import AssetFinder
 from zipline.utils.test_utils import (
     nullctx,
     setup_logger,
@@ -1277,25 +1278,33 @@ def test_asset_date_bounds(self):
         df_source, _ = factory.create_test_df_source(self.sim_params)
         metadata = {0: {'start_date': '1990-01-01',
                         'end_date': '2020-01-01'}}
-        algo = SetAssetDateBoundsAlgorithm(asset_metadata=metadata,
-                                           sim_params=self.sim_params,)
+        asset_finder = AssetFinder()
+        algo = SetAssetDateBoundsAlgorithm(
+            asset_finder=asset_finder,
+            asset_metadata=metadata,
+            sim_params=self.sim_params,)
         algo.run(df_source)
 
         # Run the algorithm with a sid that has already ended
         df_source, _ = factory.create_test_df_source(self.sim_params)
         metadata = {0: {'start_date': '1989-01-01',
                         'end_date': '1990-01-01'}}
-        algo = SetAssetDateBoundsAlgorithm(asset_metadata=metadata,
-                                           sim_params=self.sim_params,)
+        asset_finder = AssetFinder()
+        algo = SetAssetDateBoundsAlgorithm(
+            asset_finder=asset_finder,
+            asset_metadata=metadata,
+            sim_params=self.sim_params,)
         with self.assertRaises(TradingControlViolation):
             algo.run(df_source)
 
         # Run the algorithm with a sid that has not started
         df_source, _ = factory.create_test_df_source(self.sim_params)
         metadata = {0: {'start_date': '2020-01-01',
                         'end_date': '2021-01-01'}}
-        algo = SetAssetDateBoundsAlgorithm(asset_metadata=metadata,
-                                           sim_params=self.sim_params,)
+        algo = SetAssetDateBoundsAlgorithm(
+            asset_finder=asset_finder,
+            asset_metadata=metadata,
+            sim_params=self.sim_params,)
         with self.assertRaises(TradingControlViolation):
             algo.run(df_source)
 

diff --git a/tests/test_assets.py b/tests/test_assets.py
@@ -20,14 +20,12 @@
 import sys
 from unittest import TestCase
 
-from datetime import (
-    timedelta,
-    datetime
-)
+from datetime import datetime, timedelta
 import pickle
 import uuid
 import warnings
 import pandas as pd
+from pandas.tseries.tools import normalize_date
 
 from nose_parameterized import parameterized
 
@@ -289,7 +287,7 @@ def test_lookup_symbol_fuzzy(self):
                 for i in range(3)
             ]
         )
-        finder = AssetFinder(frame)
+        finder = AssetFinder(frame, fuzzy_char='@')
         asset_0, asset_1, asset_2 = (
             finder.retrieve_asset(i) for i in range(3)
         )
@@ -304,17 +302,15 @@ def test_lookup_symbol_fuzzy(self):
             # Adding an unnecessary fuzzy shouldn't matter.
             self.assertEqual(
                 asset_1,
-                finder.lookup_symbol('test@1', as_of, fuzzy='@')
+                finder.lookup_symbol('test@1', as_of, fuzzy=True)
             )
 
             # Shouldn't find this with no fuzzy_str passed.
             self.assertIsNone(finder.lookup_symbol('test1', as_of))
-            # Shouldn't find this with an incorrect fuzzy_str.
-            self.assertIsNone(finder.lookup_symbol('test1', as_of, fuzzy='*'))
-            # Should find it with the correct fuzzy_str.
+            # Should find it when fuzzy lookup is enabled.
             self.assertEqual(
                 asset_1,
-                finder.lookup_symbol('test1', as_of, fuzzy='@'),
+                finder.lookup_symbol('test1', as_of, fuzzy=True),
             )
 
     def test_lookup_symbol_resolve_multiple(self):
@@ -434,35 +430,28 @@ def test_insert_metadata(self):
                                foo_data="FOO",)
 
         # Test proper insertion
-        self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
-        self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
-        self.assertEqual('2015-01-01', finder.metadata_cache[0]['end_date'])
+        equity = finder.retrieve_asset(0)
+        self.assertIsInstance(equity, Equity)
+        self.assertEqual('PLAY', equity.symbol)
+        self.assertEqual(pd.Timestamp('2015-01-01', tz='UTC'),
+                         equity.end_date)
 
         # Test invalid field
         self.assertFalse('foo_data' in finder.metadata_cache[0])
 
-        # Test updating fields
-        finder.insert_metadata(0,
-                               asset_type='equity',
-                               start_date='2014-01-01',
-                               end_date='2015-02-01',
-                               symbol="PLAY",
-                               exchange="NYSE",)
-        self.assertEqual('2015-02-01', finder.metadata_cache[0]['end_date'])
-        self.assertEqual('NYSE', finder.metadata_cache[0]['exchange'])
-
-        # Check that old data survived
-        self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
-
     def test_consume_metadata(self):
 
         # Test dict consumption
-        finder = AssetFinder({0: {'asset_type': 'equity'}})
+        finder = AssetFinder()
         dict_to_consume = {0: {'symbol': 'PLAY'},
                            1: {'symbol': 'MSFT'}}
         finder.consume_metadata(dict_to_consume)
-        self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
-        self.assertEqual('PLAY', finder.metadata_cache[0]['symbol'])
+
+        equity = finder.retrieve_asset(0)
+        self.assertIsInstance(equity, Equity)
+        self.assertEqual('PLAY', equity.symbol)
+
+        finder = AssetFinder()
 
         # Test dataframe consumption
         df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
@@ -473,11 +462,8 @@ def test_consume_metadata(self):
         finder.consume_metadata(df)
         self.assertEqual('NASDAQ', finder.metadata_cache[0]['exchange'])
         self.assertEqual('Microsoft', finder.metadata_cache[1]['asset_name'])
-        # Check that old data survived
-        self.assertEqual('equity', finder.metadata_cache[0]['asset_type'])
 
     def test_consume_asset_as_identifier(self):
-
         # Build some end dates
         eq_end = pd.Timestamp('2012-01-01', tz='UTC')
         fut_end = pd.Timestamp('2008-01-01', tz='UTC')
@@ -489,7 +475,6 @@ def test_consume_asset_as_identifier(self):
         # Consume the Assets
         finder = AssetFinder()
         finder.consume_identifiers([equity_asset, future_asset])
-        finder.populate_cache()
 
         # Test equality with newly built Assets
         self.assertEqual(equity_asset, finder.retrieve_asset(1))
@@ -503,12 +488,15 @@ def test_sid_assignment(self):
         metadata = {'PLAY': {'symbol': 'PLAY'},
                     'MSFT': {'symbol': 'MSFT'}}
 
+        today = normalize_date(pd.Timestamp('2015-07-09', tz='UTC'))
+
         # Build a finder that is allowed to assign sids
-        finder = AssetFinder(metadata=metadata, allow_sid_assignment=True)
+        finder = AssetFinder(metadata=metadata,
+                             allow_sid_assignment=True)
 
         # Verify that Assets were built and different sids were assigned
-        play = finder.lookup_symbol('PLAY', datetime.now())
-        msft = finder.lookup_symbol('MSFT', datetime.now())
+        play = finder.lookup_symbol('PLAY', today)
+        msft = finder.lookup_symbol('MSFT', today)
         self.assertEqual('PLAY', play.symbol)
         self.assertIsNotNone(play.sid)
         self.assertNotEqual(play.sid, msft.sid)

diff --git a/tests/test_perf_tracking.py b/tests/test_perf_tracking.py
@@ -34,6 +34,7 @@
 import numpy as np
 from six.moves import range, zip
 
+from zipline.assets import AssetFinder
 import zipline.utils.factory as factory
 import zipline.finance.performance as perf
 from zipline.finance.slippage import Transaction, create_transaction
@@ -2132,7 +2133,10 @@ def test_update_last_sale(self, env=None):
         metadata = {1: {'asset_type': 'equity'},
                     2: {'asset_type': 'future',
                         'contract_multiplier': 1000}}
-        env.update_asset_finder(asset_metadata=metadata)
+        asset_finder = AssetFinder()
+        env.update_asset_finder(
+            asset_finder=asset_finder,
+            asset_metadata=metadata)
         pt = perf.PositionTracker()
         dt = pd.Timestamp("1984/03/06 3:00PM")
         pos1 = perf.Position(1, amount=np.float64(100.0),