Commit

⚖️ Feature: Volume Weighted Average ⚖️ (#61)
* resolved

* add mergers to planned data collection in readme

* add get_intraday_path

* comments

* add institutional sentiment to readme data checklist

* to compile

* limit polygon save timeframe to prevent retroactive weirdness

* add multiprocessing to speed up data updates

* delete local data files after each file update to preserve disk space

* constants tests done

* conflict

* conflict

* polygon test_intraday

* iex test_intraday

* test_save_intraday + repo rename

* readme

* git cleanup local branches

* package name change

* remove package upload in dev pipeline

* fix bugs and add volume weighted avg price to ohlc

* tests

* lint

* lint again

* pandas warning

* fix date polygon
suchak1 committed Nov 8, 2020
1 parent 7b2df8f commit 9c338b7
Showing 8 changed files with 36 additions and 47 deletions.
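
The headline change maps Polygon's volume weighted average price into the OHLC pipeline. As a refresher, VWAP over a window of bars is the volume-weighted mean price, VWAP = Σ(pᵢ·vᵢ) / Σ(vᵢ); Polygon's aggregate bars ship it precomputed in a 'vw' field, so the diff below threads that field through rather than computing it. A minimal sketch of the formula itself, using the typical price (H + L + C) / 3 as each bar's representative price (column names here are illustrative stand-ins for the repo's C.HIGH, C.LOW, C.CLOSE, C.VOL constants):

import pandas as pd

def vwap(df):
    # Typical price (H + L + C) / 3 stands in for each bar's trade price.
    typical = (df['High'] + df['Low'] + df['Close']) / 3
    return (typical * df['Vol']).sum() / df['Vol'].sum()

bars = pd.DataFrame({
    'High': [11.0, 12.0],
    'Low': [9.0, 10.0],
    'Close': [10.0, 11.0],
    'Vol': [100, 300],
})
print(vwap(bars))  # 10.75 == (10.0 * 100 + 11.0 * 300) / 400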
10 changes: 0 additions & 10 deletions .github/workflows/sandbox.yml
@@ -79,13 +79,3 @@ jobs:
 
       - name: Upload repo to S3
         run: python scripts/update_repo.py
-
-      - name: Publish package
-        env:
-          GITHUB: ${{ secrets.GITHUB }}
-          PYPI_TEST: ${{ secrets.PYPI_TEST }}
-        run: |
-          pip install twine python-dotenv setuptools wheel
-          python setup.py sdist bdist_wheel
-          python util/pypi.py
-          python -m twine upload --repository testpypi dist/*
2 changes: 1 addition & 1 deletion scripts/update_hist_ohlc.py
@@ -10,7 +10,7 @@
 iex = IEXCloud()
 poly_stocks = Polygon()
 poly_crypto = Polygon(os.environ['POLYGON'])
-stock_symbols = iex.get_symbols()[140:]
+stock_symbols = iex.get_symbols()[250:]
 crypto_symbols = POLY_CRYPTO_SYMBOLS
 # Double redundancy
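
The bumped slice offset ([140:] to [250:]) reads like a resume marker for the historical backfill: symbols below the index were already fetched by earlier runs, so a rerun picks up where the last one stopped.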
2 changes: 1 addition & 1 deletion setup.py
@@ -28,7 +28,7 @@ def get_readme():
 
 
 setup(
-    name='hyperdrive-quant',
+    name='hyperdrive',
     version=get_version(),
     description='An algorithmic trading platform',
     long_description=get_readme(),
1 change: 1 addition & 0 deletions src/Constants.py
@@ -58,6 +58,7 @@ def get_env_bool(var_name):
 LOW = 'Low'
 CLOSE = 'Close'
 VOL = 'Vol'
+AVG = 'Avg'
 
 # Sentiment
 POS = 'Pos'
32 changes: 9 additions & 23 deletions src/DataSource.py
@@ -3,8 +3,7 @@
 from time import sleep
 import pandas as pd
 from pytz import timezone
-from operator import attrgetter
-from datetime import datetime, timedelta
+from datetime import datetime
 from polygon import RESTClient
 from dotenv import load_dotenv
 from FileOps import FileReader, FileWriter
@@ -124,8 +123,8 @@ def save_splits(self, **kwargs):
     def standardize_ohlc(self, symbol, df):
         full_mapping = dict(
             zip(
-                ['date', 'open', 'high', 'low', 'close', 'volume'],
-                [C.TIME, C.OPEN, C.HIGH, C.LOW, C.CLOSE, C.VOL]
+                ['date', 'open', 'high', 'low', 'close', 'volume', 'average'],
+                [C.TIME, C.OPEN, C.HIGH, C.LOW, C.CLOSE, C.VOL, C.AVG]
             )
         )
         df = self.standardize(
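
The two extra entries ('average' → C.AVG) let standardize_ohlc carry Polygon's VWAP column through the same renaming path as the other OHLCV fields. A rough sketch of the effect, with a stand-in standardize that just renames and reorders known columns (the repo's real standardize is not shown in this diff):

import pandas as pd

full_mapping = dict(zip(
    ['date', 'open', 'high', 'low', 'close', 'volume', 'average'],
    ['Time', 'Open', 'High', 'Low', 'Close', 'Vol', 'Avg'],  # C.TIME ... C.AVG
))

def standardize(df, mapping):
    # Rename the vendor columns we recognize and keep only those.
    cols = {old: new for old, new in mapping.items() if old in df.columns}
    return df.rename(columns=cols)[list(cols.values())]

raw = pd.DataFrame([{
    'date': '2020-11-06', 'open': 118.3, 'high': 119.2, 'low': 116.9,
    'close': 118.7, 'volume': 1.1e8, 'average': 118.1,
}])
print(standardize(raw, full_mapping).columns.tolist())
# ['Time', 'Open', 'High', 'Low', 'Close', 'Vol', 'Avg']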
@@ -413,25 +412,11 @@ def _get_splits(symbol, timeframe='max'):
             return self.try_again(func=_get_splits, **kwargs)
 
     def get_ohlc(self, **kwargs):
-        def _get_prev_ohlc(symbol):
-            today = datetime.now(timezone('US/Eastern'))
-            one_day = timedelta(days=1)
-            yesterday = today - one_day
-            formatted_date = yesterday.strftime('%Y-%m-%d')
-            response = self.client.stocks_equities_daily_open_close(
-                symbol, formatted_date, unadjusted=False)
-            raw = attrgetter('from_', 'open', 'high', 'low',
-                             'close', 'volume')(response)
-            labels = ['date', 'open', 'high', 'low', 'close', 'volume']
-            data = dict(zip(labels, raw))
-            df = pd.DataFrame([data])
-            return self.standardize_ohlc(symbol, df)
-
         def _get_ohlc(symbol, timeframe='max'):
-            if timeframe == '1d':
-                return _get_prev_ohlc(symbol)
-            end = datetime.now(timezone('US/Eastern'))
-            delta = self.reader.convert_delta(timeframe)
+            end = datetime.now(timezone('US/Eastern')) - \
+                self.reader.convert_delta('1d')
+            delta = self.reader.convert_delta(
+                timeframe) - self.reader.convert_delta('1d')
             start = end - delta
             formatted_start = start.strftime('%Y-%m-%d')
             formatted_end = end.strftime('%Y-%m-%d')
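
The rewritten window math drops the special-cased _get_prev_ohlc path and instead shifts both endpoints: end = now − 1d and delta = timeframe − 1d, so start = end − delta = now − timeframe. The request still spans the full timeframe but ends yesterday, which keeps today's still-forming bar (the "retroactive weirdness" from the commit messages) out of saved data. A sketch of the arithmetic in isolation, assuming convert_delta reduces to whole-day timedeltas:

from datetime import datetime, timedelta
from pytz import timezone

def ohlc_window(timeframe_days):
    # Stand-in for self.reader.convert_delta: timeframes become day counts.
    one_day = timedelta(days=1)
    end = datetime.now(timezone('US/Eastern')) - one_day
    delta = timedelta(days=timeframe_days) - one_day
    start = end - delta  # == now - timeframe_days: full span, ending yesterday
    return start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')

print(ohlc_window(30))  # e.g. ('2020-10-09', '2020-11-07') on 2020-11-08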
@@ -440,7 +425,8 @@ def _get_ohlc(symbol, timeframe='max'):
                 from_=formatted_start, to=formatted_end, unadjusted=False
             ).results
             columns = {'t': 'date', 'o': 'open', 'h': 'high',
-                       'l': 'low', 'c': 'close', 'v': 'volume'}
+                       'l': 'low', 'c': 'close', 'v': 'volume',
+                       'vw': 'average'}
             df = pd.DataFrame(response).rename(columns=columns)
             df['date'] = df['date'].apply(
                 lambda x: datetime.fromtimestamp(int(x)/1000))
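
Polygon's aggregate results key their fields by single letters, and 'vw' is the bar's volume weighted average price, which is why the new column mapping gets VWAP for free; 't' is a Unix timestamp in milliseconds, hence the division by 1000 before datetime.fromtimestamp.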
19 changes: 11 additions & 8 deletions src/FileOps.py
@@ -1,7 +1,8 @@
 import os
 import json
 import time
-from datetime import date, datetime, timedelta
+from pytz import timezone
+from datetime import datetime, timedelta
 import pandas as pd
 from Storage import Store
 # consider combining fileoperations into one class
@@ -83,16 +84,18 @@ def convert_delta(self, timeframe):
 
         return delta
 
-    def data_in_timeframe(self, df, col, timeframe='max', tolerance='0d'):
+    def data_in_timeframe(self, df, col, timeframe='max'):  # noqa , tolerance='0d'):
         if col not in df:
             return df
         delta = self.convert_delta(timeframe)
-        tol = self.convert_delta(tolerance)
-        df[col] = pd.to_datetime(df[col])
-        filtered = df[df[col] > pd.to_datetime(date.today() - delta)]
-        if filtered.empty:
-            filtered = df[df[col] > pd.to_datetime(
-                date.today() - (delta + tol))]
+        # tol = self.convert_delta(tolerance)
+        tz = timezone('US/Eastern')
+        df[col] = pd.to_datetime(df[col]).dt.tz_localize(tz)
+        today = datetime.now(tz)
+        filtered = df[df[col] > pd.to_datetime(today - delta)].copy(deep=True)
+        # if filtered.empty:
+        #     filtered = df[df[col] > pd.to_datetime(today - (delta + tol))]
+        filtered[col] = filtered[col].dt.tz_localize(None)
         return filtered
 
     # def convert_dates(self, timeframe):
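
The net effect in data_in_timeframe: timestamps are localized to US/Eastern so they can be compared against an aware "now", and the tz info is stripped again before returning. The same pattern in isolation (column name and cutoff are illustrative):

import pandas as pd
from pytz import timezone
from datetime import datetime, timedelta

tz = timezone('US/Eastern')
df = pd.DataFrame({'Time': ['2020-11-02', '2020-11-06']})

# Localize the naive timestamps so comparing against an aware "now" is valid.
df['Time'] = pd.to_datetime(df['Time']).dt.tz_localize(tz)
cutoff = datetime.now(tz) - timedelta(days=7)
recent = df[df['Time'] > pd.to_datetime(cutoff)].copy(deep=True)

# Strip tz info again so callers keep getting naive timestamps.
recent['Time'] = recent['Time'].dt.tz_localize(None)
print(recent)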
13 changes: 11 additions & 2 deletions src/Storage.py
@@ -34,7 +34,7 @@ def upload_dir(self, **kwargs):
 
     def delete_objects(self, keys):
         if keys:
-            objects = [{'Key': key} for key in keys]
+            objects = [{'Key': key.replace('\\', '/')} for key in keys]
             bucket = self.get_bucket()
             bucket.delete_objects(Delete={'Objects': objects})
 
@@ -44,6 +44,7 @@ def get_all_keys(self):
         return keys
 
     def key_exists(self, key, download=False):
+        key = key.replace('\\', '/')
         try:
             if download:
                 self.download_file(key)
@@ -60,13 +61,17 @@ def download_file(self, key):
             self.finder.make_path(key)
             with open(key, 'wb') as file:
                 bucket = self.get_bucket()
-                bucket.download_fileobj(key, file)
+                s3_key = key.replace('\\', '/')
+                bucket.download_fileobj(s3_key, file)
         except ClientError as e:
             print(f'{key} does not exist in S3.')
             os.remove(key)
             print(e)
+            raise e
 
     def copy_object(self, src, dst):
+        src = src.replace('\\', '/')
+        dst = dst.replace('\\', '/')
         bucket = self.get_bucket()
         copy_source = {
             'Bucket': self.bucket_name,
@@ -75,16 +80,20 @@
         bucket.copy(copy_source, dst)
 
     def rename_key(self, old_key, new_key):
+        old_key = old_key.replace('\\', '/')
+        new_key = new_key.replace('\\', '/')
         self.copy_object(old_key, new_key)
         self.delete_objects([old_key])
 
     def last_modified(self, key):
+        key = key.replace('\\', '/')
         bucket = self.get_bucket()
         obj = bucket.Object(key)
         then = obj.last_modified.replace(tzinfo=None)
         return then
 
     def modified_delta(self, key):
+        key = key.replace('\\', '/')
         then = self.last_modified(key)
         now = datetime.utcnow()
         return now - then
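
All of the Storage changes are one idea: S3 keys always use forward slashes, while paths built with os.path.join use backslashes on Windows, so every public method now normalizes keys on the way in. A sketch of the rationale (the helper name is mine, not the repo's):

import os

def to_s3_key(path):
    # S3 keys use '/'; os.path.join produces '\\' separators on Windows.
    return path.replace('\\', '/')

local = os.path.join('data', 'ohlc', 'AAPL.csv')  # 'data\\ohlc\\AAPL.csv' on Windows
print(to_s3_key(local))  # 'data/ohlc/AAPL.csv' on any platform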
4 changes: 2 additions & 2 deletions test/test_DataSource.py
@@ -261,7 +261,7 @@ def test_save_ohlc(self):
 
         # assert md.reader.check_file_exists(intra_path)
         # assert md.reader.store.modified_delta(
-        #     intra_path).total_seconds() < 60)
+        #     intra_path).total_seconds() < 60
         # df = md.reader.load_csv(intra_path)
         # assert {C.TIME, C.OPEN, C.HIGH, C.LOW,
         #         C.CLOSE, C.VOL}.issubset(df.columns)
@@ -362,7 +362,7 @@ def test_get_splits(self):
     def test_get_ohlc(self):
         df = poly.get_ohlc(symbol='AAPL', timeframe='1m')
         assert {C.TIME, C.OPEN, C.HIGH, C.LOW,
-                C.CLOSE, C.VOL}.issubset(df.columns)
+                C.CLOSE, C.VOL, C.AVG}.issubset(df.columns)
         assert len(df) > 10
 
     # def test_get_intraday(self):
