# Download flat files
Downloading flat files has only been possible since April 2024. We simply download everything and split it into one file per ticker.

In [1]:
import os
import pytz
import gzip
import calendar
import pandas as pd
from datetime import datetime, date, time, timedelta
from pytz import timezone
from times import get_market_dates
from fastparquet import write
import boto3
from botocore.config import Config

DATA_PATH = "../data/polygon/"

START_DATE = date(2003, 9, 10)
END_DATE = date(2024, 4, 19)

# The S3 credentials are read from the environment so that no secret is stored
# in the notebook. Export POLYGON_S3_ACCESS_KEY_ID and
# POLYGON_S3_SECRET_ACCESS_KEY before starting the kernel; a missing variable
# raises KeyError here instead of failing later with an authentication error.
# NOTE(review): the key pair that used to be hard-coded in this cell should be
# treated as leaked and rotated.
session = boto3.Session(
    aws_access_key_id=os.environ['POLYGON_S3_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['POLYGON_S3_SECRET_ACCESS_KEY'],
)
# S3-compatible client pointed at the Polygon flat-file endpoint.
s3 = session.client(
    's3',
    endpoint_url='https://files.polygon.io',
    config=Config(signature_version='s3v4'),
)

# Initial download
Download everything

In [None]:
# Download one gzipped minute-aggregate file per market day into raw/flatfiles/.
# Make sure the target directory exists before the first download.
os.makedirs(DATA_PATH + 'raw/flatfiles/', exist_ok=True)

for day in get_market_dates(START_DATE, END_DATE):
    destination = DATA_PATH + f'raw/flatfiles/{day.isoformat()}.csv.gz'
    # Skip days that are already on disk so the cell can be re-run (or resumed
    # after an interruption) without downloading everything again.
    # NOTE(review): this assumes an existing file is complete - confirm that
    # download_file never leaves a partial file under the final name.
    if os.path.isfile(destination):
        continue
    s3.download_file('flatfiles', 
                f'us_stocks_sip/minute_aggs_v1/{day.year}/{day.strftime("%m")}/{day.isoformat()}.csv.gz', 
                destination)

Extract and concatenate into monthly files

In [None]:
def concatenate_and_save(start_date, end_date, file_name):
    """Combine the daily flat files of a date range into one parquet file.

    Reads raw/flatfiles/<day>.csv.gz for every market day returned by
    get_market_dates(start_date, end_date), keeps the OHLCV columns, converts
    the 'window_start' nanosecond timestamps to timezone-naive US/Eastern
    datetimes, and writes all rows, indexed and sorted by ticker, to
    raw/flatfiles/<file_name>.parquet.

    Parameters:
        start_date: first day passed to get_market_dates.
        end_date: last day passed to get_market_dates.
        file_name: name of the output parquet file, without extension.

    Returns None. Nothing is written when the range has no market days.
    """
    market_dates = get_market_dates(start_date, end_date)
    if len(market_dates) == 0:
        return

    daily_frames = []
    for day in market_dates:
        source = DATA_PATH + f'raw/flatfiles/{day.isoformat()}.csv.gz'
        with gzip.open(source) as f:
            # With the default NA handling pandas turns strings such as "NA",
            # "NULL" or "NaN" into missing values, so a ticker spelled like
            # that would end up as a null index entry. Only treat empty
            # fields as missing.
            day_bars = pd.read_csv(f, keep_default_na=False, na_values=[''])

        day_bars = day_bars[['window_start', 'ticker', 'open', 'high', 'low', 'close', 'volume']]
        day_bars = day_bars.rename(columns={'window_start': 'datetime'})
        day_bars = day_bars.set_index('datetime')
        # Nanosecond epoch -> naive UTC -> aware UTC -> US/Eastern -> naive again.
        day_bars.index = (
            pd.to_datetime(day_bars.index, unit='ns')
            .tz_localize(pytz.UTC)
            .tz_convert("US/Eastern")
            .tz_localize(None)
        )

        daily_frames.append(day_bars)
        print(day)

    # Index and sort by ticker.
    all_bars = pd.concat(daily_frames)
    all_bars = all_bars.reset_index()
    all_bars = all_bars.set_index('ticker')
    all_bars = all_bars.sort_index()

    all_bars.to_parquet(DATA_PATH + f"raw/flatfiles/{file_name}.parquet", engine="fastparquet", compression="snappy", row_group_offsets=25000)

In [None]:
# Build one parquet file per calendar month, named "<year>-<month>" (the month
# is not zero-padded). concatenate_and_save returns without writing anything
# when get_market_dates yields no days for that month.
for year in range(2003, 2024 + 1):
    for month in range(1, 12 + 1):
        # monthrange returns (weekday of the 1st, number of days in the month).
        _, last_day = calendar.monthrange(year, month)
        concatenate_and_save(date(year, month, 1), date(year, month, last_day), f"{year}-{month}")

Split the monthly files, which contain all tickers, into individual ticker files.

In [45]:
# Debug switches: restrict the run to a single (year, month) and/or a single
# ticker. They hold the values that were hard-coded in this cell; set either
# one to None to process every month / every ticker.
# NOTE: existing ticker files are appended to, so running this cell twice for
# the same month writes that month's rows twice.
ONLY_MONTH = (2022, 10)
ONLY_TICKER = 'TPC'

for year in range(2003, 2024+1):
    for month in range(1, 12+1):
        if ONLY_MONTH is not None and (year, month) != ONLY_MONTH:
            continue
        print(f'{datetime.now()} | {year}-{month}')
        monthly_path = DATA_PATH + f"raw/flatfiles/{year}-{month}.parquet"
        if not os.path.isfile(monthly_path):
            continue

        all_bars = pd.read_parquet(monthly_path)
        # Drop rows whose ticker (the index) is null.
        all_bars = all_bars[~all_bars.index.isna()]

        # If the earliest bar falls outside the month, keep only the bars
        # that belong to this month.
        first_bar = all_bars['datetime'].min()
        if first_bar.year != year or first_bar.month != month:
            all_bars = all_bars[(all_bars['datetime'].dt.month == month) & (all_bars['datetime'].dt.year == year)]
            print(f'{year} | {month} HAS OUT OF BOUND DATES')

        # File names are case insensitive! This led to big data errors (e.g. TpC and TPC were merged).
        # So we simply skip all tickers that contain lowercase letters.
        # We don't need them anyway, because a lowercase letter means non-common stock.
        for ticker in all_bars.index.unique():
            if any(s.islower() for s in ticker):
                continue
            if ONLY_TICKER is not None and ticker != ONLY_TICKER:
                continue

            bars = all_bars.loc[ticker]
            # A ticker with a single row comes back as a Series; select it
            # with a list to get a one-row DataFrame instead.
            if isinstance(bars, pd.Series):
                bars = all_bars.loc[[ticker]]
            bars = bars[['datetime', 'open', 'high', 'low', 'close', 'volume']]
            bars = bars.set_index('datetime').sort_index()

            # Windows quirk: you cannot save a file called 'prn', and of course there is a ticker named PRN.
            # So we store it as 'PRN_'. Hopefully there are no tickers named NULL or similar.
            file_ticker = 'PRN_' if ticker == 'PRN' else ticker
            ticker_path = DATA_PATH + f"raw/m1/{file_ticker}.parquet"

            if os.path.isfile(ticker_path):
                write(ticker_path, bars, append=True, compression="snappy", row_group_offsets=25000)
            else:
                bars.to_parquet(ticker_path, engine="fastparquet", compression="snappy", row_group_offsets=25000)

2024-05-09 12:43:46.688066 | 2022-10


The old version of the above code runs very fast for data up to the end of 2022. After that it becomes 20-50x slower. I found the problem:

Pandas DataFrame lookups use hash tables, so the time complexity is O(1). However, this no longer applies if the index contains null values! The bars from the end of 2022 onwards have null values in the ticker index.

In [3]:
# Load the October 2022 monthly file and keep only rows with a non-null ticker index.
all_bars = pd.read_parquet(DATA_PATH + f"raw/flatfiles/2022-10.parquet")
all_bars = all_bars.loc[all_bars.index.notna()]

In [19]:
# Show every bar in the month whose open price is exactly 17.0012.
all_bars.loc[all_bars['open'].eq(17.0012)]

Unnamed: 0_level_0,datetime,open,high,low,close,volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DFEN,2022-10-27 15:10:00,17.0012,17.01,17.0012,17.01,2192
PCY,2022-10-10 10:13:00,17.0012,17.01,17.0012,17.01,1154
SBNYP,2022-10-17 12:15:00,17.0012,17.0012,17.0012,17.0012,100
SH,2022-10-07 14:43:00,17.0012,17.0094,17.0,17.0094,16862
TpC,2022-10-19 15:59:00,17.0012,17.03,16.97,16.97,7230
ZHDG,2022-10-31 10:19:00,17.0012,17.0012,17.0012,17.0012,1765


In [6]:
# All rows for ticker 'TPC' (the variable is spelled TCP; later cells use that name).
TCP = all_bars.loc['TPC', :]

In [8]:
# Order the TPC bars chronologically.
TCP = TCP.sort_values(by='datetime')

In [17]:
# Show the TPC bars of 2022-10-19.
TCP.loc[TCP['datetime'].dt.date == date(2022, 10, 19)]

Unnamed: 0_level_0,datetime,open,high,low,close,volume
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TPC,2022-10-19 09:30:00,6.030,6.03,6.030,6.03,870
TPC,2022-10-19 09:31:00,6.030,6.03,6.030,6.03,101
TPC,2022-10-19 09:35:00,6.080,6.08,6.070,6.08,1207
TPC,2022-10-19 09:37:00,6.060,6.06,6.050,6.05,774
TPC,2022-10-19 09:41:00,6.020,6.02,6.010,6.01,608
...,...,...,...,...,...,...
TPC,2022-10-19 15:57:00,6.115,6.13,6.115,6.13,7624
TPC,2022-10-19 15:58:00,6.130,6.15,6.130,6.14,9381
TPC,2022-10-19 15:59:00,6.150,6.15,6.140,6.14,11743
TPC,2022-10-19 16:00:00,6.140,6.14,6.140,6.14,5543


# Updates
Process day-by-day

In [None]:
def process_flatfile(local_file_path):
    """Unzip one daily flat file and split or append it into per-ticker files.

    The file is read as CSV, reduced to the OHLCV columns, and its
    'window_start' nanosecond timestamps are converted to timezone-naive
    US/Eastern datetimes. The bars of each ticker are then written to
    raw/m1/<ticker>.parquet: appended when the file exists, created otherwise.

    Tickers containing lowercase letters are skipped and 'PRN' is stored as
    'PRN_', matching the cell that splits the monthly files (file names are
    case insensitive, and 'prn' cannot be used as a file name on Windows).

    Parameters:
        local_file_path: path of the downloaded .csv.gz flat file.

    Returns None.
    """
    with gzip.open(local_file_path) as f:
        # With the default NA handling pandas turns strings such as "NA",
        # "NULL" or "NaN" into missing values, which would lose a ticker
        # spelled like that. Only treat empty fields as missing.
        all_bars = pd.read_csv(f, keep_default_na=False, na_values=[''])

    all_bars = all_bars[['window_start', 'ticker', 'open', 'high', 'low', 'close', 'volume']]
    all_bars = all_bars.rename(columns={'window_start': 'datetime'})
    all_bars = all_bars.set_index('datetime')
    # Nanosecond epoch -> naive UTC -> aware UTC -> US/Eastern -> naive again.
    all_bars.index = (
        pd.to_datetime(all_bars.index, unit='ns')
        .tz_localize(pytz.UTC)
        .tz_convert("US/Eastern")
        .tz_localize(None)
    )

    # A single groupby pass replaces one full boolean mask per ticker.
    # groupby drops rows whose ticker is missing (empty field in the CSV).
    for ticker, bars in all_bars.groupby('ticker', sort=False):
        if any(ch.islower() for ch in ticker):
            continue

        bars = bars[['open', 'high', 'low', 'close', 'volume']].sort_index()

        file_ticker = 'PRN_' if ticker == 'PRN' else ticker
        ticker_path = DATA_PATH + f"raw/m1/{file_ticker}.parquet"

        if os.path.isfile(ticker_path):
            write(ticker_path, bars, append=True, compression="snappy", row_group_offsets=25000)
        else:
            bars.to_parquet(ticker_path, engine="fastparquet", compression="snappy", row_group_offsets=25000)

In [None]:
# Fetch each market day's flat file into raw/ and immediately split it into
# the per-ticker files.
for day in get_market_dates(START_DATE, END_DATE):
    destination = DATA_PATH + f'raw/{day.isoformat()}.csv.gz'
    s3.download_file(
        Bucket='flatfiles',
        Key=f'us_stocks_sip/minute_aggs_v1/{day.year}/{day.strftime("%m")}/{day.isoformat()}.csv.gz',
        Filename=destination,
    )
    process_flatfile(destination)