In [1]:
import struct
import sys
import gzip
import shutil
from pathlib import Path
from urllib.request import urlretrieve
from urllib.parse import urljoin

sys.path.append('../src')

In [2]:
# The ITCH dump is too large to keep next to the notebook, so it lives on an external drive.
data_path = Path("../../../..") / "Volumes" / "external_drive" / "nasdaq_itch" / "data"
# Name of the gzipped ITCH file and the remote directory it is served from
SOURCE_FILE = "12302019.NASDAQ_ITCH50.gz"
URL = "https://emi.nasdaq.com/ITCH/Nasdaq%20ITCH/"

In [3]:
def may_be_download(url, target_dir=None):
    """Download & unzip ITCH data if not yet available.

    Each step (create directory, download, unzip) is skipped when its result
    already exists on disk, so the function can be re-run cheaply.

    Parameters
    ----------
    url : str
        Address of the gzipped file. The last '/'-separated component is used
        as the local file name.
    target_dir : path-like, optional
        Directory that receives the download and the unpacked file.
        Defaults to the module-level ``data_path``.

    Returns
    -------
    pathlib.Path
        Location of the unpacked file (download name with its last suffix
        replaced by ``.bin``).
    """
    target_dir = data_path if target_dir is None else Path(target_dir)
    if not target_dir.exists():
        print('Creating directory')
        # parents=True: the data directory may sit below folders that do not exist yet
        target_dir.mkdir(parents=True, exist_ok=True)
    else:
        print('Directory exists')

    filename = target_dir / url.split('/')[-1]
    if not filename.exists():
        print('Downloading...', url)
        # Download under a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete file later.
        partial_download = filename.with_name(filename.name + '.part')
        try:
            urlretrieve(url, partial_download)
            partial_download.replace(filename)
        finally:
            if partial_download.exists():
                partial_download.unlink()
    else:
        print('File exists')

    unzipped = target_dir / (filename.stem + '.bin')
    if not unzipped.exists():
        print('Unzipping to', unzipped)
        # Same temp-then-rename approach: a half-written .bin must not be
        # reported as 'File already unpacked' on the next run.
        partial_unzip = unzipped.with_name(unzipped.name + '.part')
        try:
            with gzip.open(str(filename), 'rb') as f_in, open(partial_unzip, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            partial_unzip.replace(unzipped)
        finally:
            if partial_unzip.exists():
                partial_unzip.unlink()
    else:
        print('File already unpacked')
    return unzipped

In [4]:
# Build the full address of the gzipped file, then fetch and unpack it
# (steps whose result is already on disk are skipped by may_be_download).
source_url = urljoin(URL, SOURCE_FILE)
file_name = may_be_download(source_url)

Directory exists
File exists
File already unpacked


In [5]:
# struct formats for the body of each message type (the 1-byte type code itself is excluded).
# '>' selects big-endian byte order, which all fields use.
# Every format shares the leading 'HH6s' fields (stock locate, tracking number and
# timestamp in the ITCH 5.0 spec -- TODO confirm), so only the type-specific tail is listed.
message_types = {
    code: '>HH6s' + tail
    for code, tail in (
        ('A', 'QsI8sI'),
        ('B', 'Q'),
        ('C', 'QIQsI'),
        ('D', 'Q'),
        ('E', 'QIQ'),
        ('F', 'QsI8sI4s'),
        ('H', '8sss4s'),
        ('I', 'QQs8sIIIss'),
        ('J', '8sIIII'),
        ('K', '8sIsI'),
        ('L', '4s8ssss'),
        ('P', 'QsI8sIQ'),
        ('Q', 'Q8sIQs'),
        ('R', '8sssIss2ssssssIs'),
        ('S', 's'),
        ('U', 'QQII'),
        ('V', 'QQQ'),
        ('W', 's'),
        ('X', 'QI'),
        ('Y', '8ss'),
        ('h', '8sss'),
    )
}



In [6]:
# Copy every order-book message that belongs to MSFT from the full ITCH dump
# into its own, much smaller, length-prefixed binary file.
output_msft_file = "msft.bin"
# message types that are filtered by stock locate and copied to the output
stock_message_types = frozenset("AFECXUDPQ")
# stock locate code of MSFT; unknown until its 'R' record has been read,
# and until then no stock message can match it
msft_stock_locate = None
count = 0
with file_name.open('rb') as data, open(output_msft_file, 'wb') as outfile:
    while True:
        # each message is preceded by its size in bytes (2 bytes, big-endian)
        size_field = data.read(2)
        if len(size_field) < 2:
            # end of file without an end-of-messages event: stop instead of looping forever
            break
        message_size = int.from_bytes(size_field, byteorder='big', signed=False)
        if message_size == 0:
            # a zero size would turn the read below into "read the rest of the file"
            print('Zero-length message encountered, stopping')
            break
        raw_message = data.read(message_size)
        if len(raw_message) < message_size:
            print('Truncated message at end of file, stopping')
            break
        # first byte is the message type, the remainder is the body described by message_types
        message_type = chr(raw_message[0])
        binary_stream = raw_message[1:]
        try:
            if message_type == "S":
                s_message = struct.unpack(message_types[message_type], binary_stream)
                # event code 'C' ends the extraction
                if s_message[3].decode('ascii') == 'C':
                    break
            elif message_type == "R":
                r_message = struct.unpack(message_types[message_type], binary_stream)
                # the 'R' record for MSFT supplies the stock locate code used for filtering
                if r_message[3].decode('ascii').strip() == 'MSFT':
                    msft_stock_locate = r_message[0]
            elif message_type in stock_message_types:
                # unpack validates the body size and yields the stock locate as first field
                message = struct.unpack(message_types[message_type], binary_stream)
                if message[0] == msft_stock_locate:
                    # Unpacking and re-packing with the same format reproduces the body
                    # bytes exactly, so the raw type byte + body are written directly
                    # instead of packing the message twice.
                    # NOTE(review): the length written is len(type byte + body) + 1, i.e.
                    # one more than the length field read from the source file. Kept
                    # unchanged because readers of msft.bin presumably expect it -- confirm.
                    outfile.write(struct.pack(">H", message_size + 1) + raw_message)
                    count += 1
            # all other message types are read past and ignored
        except (struct.error, UnicodeDecodeError) as e:
            # best effort: report a malformed message and continue with the next one
            print(e)

count

1220971