In [3]:
from zipfile import ZipFile
import numpy as np
import tables as tb

In [4]:
# Path (relative to this notebook) of the zipped TAQ daily-quotes archive read below.
fname = '../local_data/EQY_US_ALL_BBO_20140206.zip'

```
fields of dailyquotes file taqquote
[0:8]HHMMSSXXX
[9] text EXCHANGE N Nyse  T/Q NASDAQ
[10:25] text symbol 6+10
[26:36] bid price 7+4
[37:43] bid size (units)
[44:54] ask price 7+4
[55:61] ask size
[62] text Condition of quote
[63:66] market maker
[67] bid exchange
[68] ask exchange
[69:84] int seqno
[85] int bbo indicator
[86] int NASDAQ BBO indicator
[87] text cancel/correction
[88] text C=CTA N=UTP
[89] text Retail interest indicator
[...]
```

### Zip Files are a PITA

In [5]:
# Open the archive for interactive inspection; this handle is not closed in the cells below.
zf = ZipFile(fname)

In [6]:
# ZipInfo entry for the first member stored in the archive.
f = zf.filelist[0]

In [7]:
f.filename

'taqquote20140206'

### Read this in

In [8]:
# Byte width of each field in a record, in file order. The two-character
# line terminator that follows every record is NOT included here.
widths = (
    [9, 1, 16, 11, 7, 11, 7, 1, 4, 1, 1, 16]  # Time ... Sequence_Number
    + [1] * 11                                # eleven single-byte indicator flags
)

In [9]:
# Running end offset of every field within a record.
w = np.asarray(widths)
np.cumsum(w)

array([ 9, 10, 26, 37, 44, 55, 62, 63, 67, 68, 69, 85, 86, 87, 88, 89, 90,
       91, 92, 93, 94, 95, 96])

Note that we'll have 98 bytes total with the `\r\n` on the end.

In [10]:
# 96 bytes of fixed-width fields plus the 2-byte `\r\n` terminator.
BYTES_PER_LINE = 96 + 2

```
# Note - we're using object here (as pandas will do this anyway), 
# and we'll need to convert back to fixed width strings later
# We can get the widths from the widths list above
old_dtype = [('Time', np.datetime64),
         ('Exchange', object),  # |S1
         ('Symbol', object),  # |S16, etc.
         ('Bid_Price', np.float64),
         ('Bid_Size', np.int32),
         ('Ask_Price', np.float64),
         ('Ask_Size', np.int32),
         ('Quote_Condition', object),
         ('Market_Maker', np.int), # This is blank - want to skip?
         ('Bid_Exchange', object),
         ('Ask_Exchange', object),
         ('Sequence_Number', np.int64),
         ('National_BBO_Ind', np.int8), # These aren't really numbers
         ('NASDAQ_BBO_Ind', np.int8), # Maybe should be string?
         ('Quote_Cancel_Correction', object),
         ('Source_of_Quote', object),
         ('Retail_Interest_Indicator_RPI', object),
         ('Short_Sale_Restriction_Indicator', object),
         ('LULD_BBO_Indicator_CQS', object),
         ('LULD_BBO_Indicator_UTP', object),
         ('FINRA_ADF_MPID_Indicator', object),
         ('SIP_generated_Message_Identifier', object),
         ('National_BBO_LULD_Indicator', object)
        ] # Then there's two characters for newline

# This was for pandas' screwball approach to dtype
# names = [a for a,b in dtype]
# dtype = dict(dtype)
```

In [11]:
# Raw record layout: every field is first read as a fixed-width byte string
# ('S<n>'); numeric conversion happens in a later step. The final entry
# swallows the `\r\n` that terminates each record.
initial_dtype = [
    (field, 'S%d' % nbytes)
    for field, nbytes in (
        ('Time', 9),                  # HHMMSSmmm
        ('Exchange', 1),
        ('Symbol', 16),
        ('Bid_Price', 11),            # 7.4 (fixed point)
        ('Bid_Size', 7),
        ('Ask_Price', 11),            # 7.4 (fixed point)
        ('Ask_Size', 7),
        ('Quote_Condition', 1),
        ('Market_Maker', 4),
        ('Bid_Exchange', 1),
        ('Ask_Exchange', 1),
        ('Sequence_Number', 16),
        ('National_BBO_Ind', 1),
        ('NASDAQ_BBO_Ind', 1),
        ('Quote_Cancel_Correction', 1),
        ('Source_of_Quote', 1),
        ('Retail_Interest_Indicator_RPI', 1),
        ('Short_Sale_Restriction_Indicator', 1),
        ('LULD_BBO_Indicator_CQS', 1),
        ('LULD_BBO_Indicator_UTP', 1),
        ('FINRA_ADF_MPID_Indicator', 1),
        ('SIP_generated_Message_Identifier', 1),
        ('National_BBO_LULD_Indicator', 1),
        ('newline', 2),
    )
]

In [54]:
# The three groupings below could be mostly auto-generated.

# Timestamp handling: Justin and Pandas use datetime64, as does PyTables.
# Milliseconds from the beginning of the day would be an alternative for now
# (maybe compare performance to datetime64? But dates should compress very well...)
time_col = 'Time'  # HHMMSSmmm in the raw file; to become np.datetime64

# Fields parsed from text into numbers, with their target types.
# Market_Maker is not currently used (should always be b'    '), so it is
# deliberately absent: ('Market_Maker', np.int8)
convert_dtype = [
    ('Bid_Price', np.float64), ('Bid_Size', np.int32),
    ('Ask_Price', np.float64), ('Ask_Size', np.int32),
    ('Sequence_Number', np.int64),
    ('National_BBO_Ind', np.int8), ('NASDAQ_BBO_Ind', np.int8),
]

# Fields kept as fixed-width byte strings, exactly as read.
passthrough_strings = '''
    Exchange
    Symbol
    Quote_Condition
    Bid_Exchange
    Ask_Exchange
    Quote_Cancel_Correction
    Source_of_Quote
    Retail_Interest_Indicator_RPI
    Short_Sale_Restriction_Indicator
    LULD_BBO_Indicator_CQS
    LULD_BBO_Indicator_UTP
    FINRA_ADF_MPID_Indicator
    SIP_generated_Message_Identifier
    National_BBO_LULD_Indicator
'''.split()

In [55]:
# Build the output column list: the timestamp first, then every raw field in
# file order that is either converted to a number or passed through as bytes.
# Fields in neither group (Market_Maker, newline) are dropped.
# NOTE(review): confirm PyTables can build a column from np.datetime64 before
# handing this list to create_table.
converted_dict = dict(convert_dtype)
pytables_dtype = [('Time', np.datetime64)]

for name, dtype in initial_dtype:
    if name in converted_dict:
        dtype = converted_dict[name]
    elif name not in passthrough_strings:
        continue
    pytables_dtype.append((name, dtype))

In [56]:
pytables_dtype

[('Time', numpy.datetime64),
 ('Exchange', '|S1'),
 ('Symbol', '|S16'),
 ('Bid_Price', numpy.float64),
 ('Bid_Size', numpy.int32),
 ('Ask_Price', numpy.float64),
 ('Ask_Size', numpy.int32),
 ('Quote_Condition', '|S1'),
 ('Bid_Exchange', '|S1'),
 ('Ask_Exchange', '|S1'),
 ('Sequence_Number', numpy.int64),
 ('National_BBO_Ind', numpy.int8),
 ('NASDAQ_BBO_Ind', numpy.int8),
 ('Quote_Cancel_Correction', '|S1'),
 ('Source_of_Quote', '|S1'),
 ('Retail_Interest_Indicator_RPI', '|S1'),
 ('Short_Sale_Restriction_Indicator', '|S1'),
 ('LULD_BBO_Indicator_CQS', '|S1'),
 ('LULD_BBO_Indicator_UTP', '|S1'),
 ('FINRA_ADF_MPID_Indicator', '|S1'),
 ('SIP_generated_Message_Identifier', '|S1'),
 ('National_BBO_LULD_Indicator', '|S1')]

In [59]:
tb.Filters??

In [84]:
class TAQ2HDF5:
    '''Convert a zipped, fixed-width TAQ daily-quotes file to compressed HDF5.

    Each member of the zip archive becomes its own ``<member name>.h5`` file
    holding one table, ``/daily_quotes``. The record layout comes from the
    notebook-level ``initial_dtype`` / ``BYTES_PER_LINE`` and the output
    columns from ``pytables_dtype``.

    Storage notes:
      * ``Time`` (HHMMSSmmm in the raw file) is written as milliseconds since
        midnight. A ``datetime64`` entry in ``pytables_dtype`` is stored as
        int64, because PyTables cannot build a column from ``datetime64``.
      * Prices are written exactly as parsed from the 11-digit field, i.e.
        without applying the implied 4-decimal fixed-point scaling.
    '''

    # Name of the HHMMSSmmm timestamp field in the raw records.
    TIME_FIELD = 'Time'

    def __init__(self, taq_fname):
        '''Remember the path of the zip archive; nothing is opened yet.'''
        self.taq_fname = taq_fname

    def convert_taq(self):
        '''Convert every member of the archive, one HDF5 file per member.

        The first line of each member is expected to look like
        ``<label>:<record count>``; the count sizes the output table and
        bounds how many records are read.
        '''
        # The below doesn't work for pandas (and neither does `unzip` from the command line). Probably want to use
        # something like `7z x -so my_file.zip 2> /dev/null` if we use pandas.
        with ZipFile(self.taq_fname) as zfile:
            for inside_f in zfile.filelist:
                # The original filename is available as inside_f.filename
                with zfile.open(inside_f.filename) as infile:
                    first = infile.readline()
                    # You need to use bytes to split bytes
                    _, numlines = first.split(b":")
                    numlines = int(numlines)
                    self.setup_hdf5(inside_f.filename, numlines)
                    try:
                        self.raw_conversion(numlines, infile, self.h5_table)
                    finally:
                        # Close (and flush) the HDF5 file even if conversion fails.
                        self.finalize_hdf5()

    def setup_hdf5(self, h5_fname_root, numlines):
        '''Create ``<h5_fname_root>.h5`` and an empty ``/daily_quotes`` table.

        Sets ``self.h5`` (the open file) and ``self.h5_table`` (the table).
        ``numlines`` is passed to PyTables as the expected row count.
        '''
        # We're using aggressive compression and checksums, since this will likely stick around
        # Stopping one level short of max compression - don't be greedy.
        self.h5 = tb.open_file(h5_fname_root + '.h5', title=h5_fname_root, mode='w',
                               filters=tb.Filters(complevel=8, complib='blosc:lz4hc', fletcher32=True))

        # create_table wants a real numpy dtype (not a list of tuples), and
        # cannot represent datetime64 fields, so those are stored as int64.
        requested = np.dtype(pytables_dtype)
        table_dtype = np.dtype([
            (name, np.dtype(np.int64) if requested[name].kind == 'M' else requested[name])
            for name in requested.names
        ])

        try:
            self.h5_table = self.h5.create_table('/', 'daily_quotes',
                                                 description=table_dtype, expectedrows=numlines)
        except Exception:
            # Don't leave a half-initialised file open if the table can't be made.
            self.h5.close()
            raise

    def finalize_hdf5(self):
        '''Close the HDF5 file opened by ``setup_hdf5``.'''
        self.h5.close()

    @staticmethod
    def _msec_since_midnight(hhmmssmmm):
        '''Turn HHMMSSmmm stamps (byte strings or ints) into int64 milliseconds since midnight.'''
        stamp = np.asarray(hhmmssmmm).astype(np.int64)
        hours, rest = stamp // 10**7, stamp % 10**7
        minutes, rest = rest // 10**5, rest % 10**5
        seconds, msec = rest // 1000, rest % 1000
        return ((hours * 60 + minutes) * 60 + seconds) * 1000 + msec

    def process_chunk(self, infile, out, chunksize, chunk_start):
        '''Read ``chunksize`` records from ``infile``, convert them, append to ``out``.

        ``out`` must expose a structured ``dtype`` and an ``append(rows)``
        method (as a PyTables table does). ``chunk_start`` is the index of the
        first record of this chunk; rows are appended in order, so it is only
        used for error reporting.

        Raises ValueError if fewer bytes than requested could be read.
        '''
        expected = BYTES_PER_LINE * chunksize
        raw_bytes = infile.read(expected)
        if len(raw_bytes) != expected:
            raise ValueError(
                'expected %d bytes for %d records starting at row %d, got %d'
                % (expected, chunksize, chunk_start, len(raw_bytes)))

        # If we use asarray, it crashes Python!
        # ndarray gives 'S' arrays instead of chararrays (as recarray does)
        all_strings = np.ndarray(chunksize, buffer=raw_bytes, dtype=np.dtype(initial_dtype))

        # A structured array is one-dimensional, so columns are copied field by
        # field. Raw fields without a matching output column (Market_Maker,
        # newline) are skipped; astype() parses the digit strings.
        # NOTE(review): a blank (b' ') value in a field converted to an integer
        # will make astype() raise -- confirm the indicator fields are always digits.
        converted = np.zeros(chunksize, dtype=out.dtype)
        for name in converted.dtype.names:
            if name not in all_strings.dtype.names:
                continue
            if name == self.TIME_FIELD:
                values = self._msec_since_midnight(all_strings[name])
            else:
                values = all_strings[name]
            converted[name] = values.astype(converted.dtype[name])

        out.append(converted)

    def raw_conversion(self, numlines, infile, out, chunksize=1000000):
        '''Read raw bytes from TAQ, write to HDF5

        Processes ``numlines`` records in chunks of at most ``chunksize``;
        the last chunk holds whatever is left over.
        '''
        if chunksize < 1:
            raise ValueError('chunksize must be a positive integer')

        chunk_start = 0
        while chunk_start < numlines:
            this_chunk = min(chunksize, numlines - chunk_start)
            self.process_chunk(infile, out, this_chunk, chunk_start)
            chunk_start += this_chunk

In [66]:
np.array?

In [16]:
two_lines

b'075300081PA               000000000000000000000007294000000027R    PP000000000007625512 C       \r\n075300085PA               000000000000000000000006076000000010R    PP000000000007625612 C       \r\n'

## Why is conversion so hard?

numpy doesn't seem to deal well with conversion from bytes, and I don't want to convert to string just to convert to bytes (though for all I know, pandas is doing this under the hood).

In [27]:
# If we use asarray, it crashes Python!
# ndarray gives 'S' arrays instead of chararrays (as recarray does)
# NOTE(review): `two_lines` is displayed above but never assigned in a visible
# cell; it needs to hold at least two full records -- confirm where it is created.
all_strings = np.ndarray(2, buffer=two_lines, dtype=initial_dtype)
all_strings

array([ (b'075300081', b'P', b'A               ', b'00000000000', b'0000000', b'00000729400', b'0000027', b'R', b'    ', b'P', b'P', b'0000000000076255', b'1', b'2', b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b'\r\n'),
       (b'075300085', b'P', b'A               ', b'00000000000', b'0000000', b'00000607600', b'0000010', b'R', b'    ', b'P', b'P', b'0000000000076256', b'1', b'2', b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', b'\r\n')], 
      dtype=[('Time', 'S9'), ('Exchange', 'S1'), ('Symbol', 'S16'), ('Bid_Price', 'S11'), ('Bid_Size', 'S7'), ('Ask_Price', 'S11'), ('Ask_Size', 'S7'), ('Quote_Condition', 'S1'), ('Market_Maker', 'S4'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', 'S16'), ('National_BBO_Ind', 'S1'), ('NASDAQ_BBO_Ind', 'S1'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP'

In [30]:
# NOTE(review): `all_strings_arr` is not assigned in any visible cell (only
# `all_strings` is) -- confirm which object this is meant to read from.
ap = all_strings_arr['Ask_Price']

In [34]:
all_strings_arr.astype?

In [31]:
ap.astype(np.float32)

array([ 729400.,  607600.], dtype=float32)

In [91]:
import pandas as pd

In [46]:
# Numeric fields and the numpy type each should be parsed into.
# Market_Maker is currently blank, so it stays out: ('Market_Maker', np.int8)
to_convert = [
    ('Bid_Price', np.float64), ('Bid_Size', np.int32),
    ('Ask_Price', np.float64), ('Ask_Size', np.int32),
    ('Sequence_Number', np.int64),
    ('National_BBO_Ind', np.int8), ('NASDAQ_BBO_Ind', np.int8),
]

In [47]:
# Transpose the (name, type) pairs into one tuple of names and one of types.
names, types = zip(*to_convert)

In [48]:
names

('Bid_Price',
 'Bid_Size',
 'Ask_Price',
 'Ask_Size',
 'Sequence_Number',
 'National_BBO_Ind',
 'NASDAQ_BBO_Ind')

In [49]:
all_strings_arr[list(names)].astype(to_convert)

array([(0.0, 0, 729400.0, 27, 76255, 1, 2),
       (0.0, 0, 607600.0, 10, 76256, 1, 2)], 
      dtype=[('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Sequence_Number', '<i8'), ('National_BBO_Ind', 'i1'), ('NASDAQ_BBO_Ind', 'i1')])

In [173]:
np.float32(all_strings_arr['Ask_Price'])

729400.0

In [148]:
np.float32(all_strings.Ask_Price)

ValueError: Can only create a chararray from string data.

In [142]:
pd.DataFrame(all_strings[list(names)])

Unnamed: 0,Bid_Price,Bid_Size,Ask_Price,Ask_Size,Market_Maker,Sequence_Number,National_BBO_Ind,NASDAQ_BBO_Ind
0,b'00000000000',b'0000000',b'00000729400',b'0000027',b' ',b'0000000000076255',b'1',b'2'
1,b'00000000000',b'0000000',b'00000607600',b'0000010',b' ',b'0000000000076256',b'1',b'2'


In [81]:
byte_float = all_strings.Bid_Price[0]

In [90]:
all_strings[['Bid_Price', 'Ask_Price']]

array([(b'00000000000', b'00000729400'), (b'00000000000', b'00000607600')], 
      dtype=[('Bid_Price', 'S11'), ('Ask_Price', 'S11')])

In [68]:
list(zip([name for name, type in initial_dtype], all_strings[0]))

[('Time', b'075300081'),
 ('Exchange', b'P'),
 ('Symbol', b'A               '),
 ('Bid_Price', b'00000000000'),
 ('Bid_Size', b'0000000'),
 ('Ask_Price', b'00000729400'),
 ('Ask_Size', b'0000027'),
 ('Quote_Condition', b'R'),
 ('Market_Maker', b'    '),
 ('Bid_Exchange', b'P'),
 ('Ask_Exchange', b'P'),
 ('Sequence_Number', b'0000000000076255'),
 ('National_BBO_Ind', b'1'),
 ('NASDAQ_BBO_Ind', b'2'),
 ('Quote_Cancel_Correction', b' '),
 ('Source_of_Quote', b'C'),
 ('Retail_Interest_Indicator_RPI', b' '),
 ('Short_Sale_Restriction_Indicator', b' '),
 ('LULD_BBO_Indicator_CQS', b' '),
 ('LULD_BBO_Indicator_UTP', b' '),
 ('FINRA_ADF_MPID_Indicator', b' '),
 ('SIP_generated_Message_Identifier', b' '),
 ('National_BBO_LULD_Indicator', b' '),
 ('newline', b'\r\n')]

In [70]:
list(zip([name for name, type in initial_dtype], all_strings[1]))

[('Time', b'075300085'),
 ('Exchange', b'P'),
 ('Symbol', b'A               '),
 ('Bid_Price', b'00000000000'),
 ('Bid_Size', b'0000000'),
 ('Ask_Price', b'00000607600'),
 ('Ask_Size', b'0000010'),
 ('Quote_Condition', b'R'),
 ('Market_Maker', b'    '),
 ('Bid_Exchange', b'P'),
 ('Ask_Exchange', b'P'),
 ('Sequence_Number', b'0000000000076256'),
 ('National_BBO_Ind', b'1'),
 ('NASDAQ_BBO_Ind', b'2'),
 ('Quote_Cancel_Correction', b' '),
 ('Source_of_Quote', b'C'),
 ('Retail_Interest_Indicator_RPI', b' '),
 ('Short_Sale_Restriction_Indicator', b' '),
 ('LULD_BBO_Indicator_CQS', b' '),
 ('LULD_BBO_Indicator_UTP', b' '),
 ('FINRA_ADF_MPID_Indicator', b' '),
 ('SIP_generated_Message_Identifier', b' '),
 ('National_BBO_LULD_Indicator', b' '),
 ('newline', b'\r\n')]

In [73]:
# Probe: can datetime64 parse a bare time of day? It cannot -- this raises
# ValueError (see the output below), so the HHMMSSmmm stamps need a date part.
np.datetime64('12:25')

ValueError: Error parsing datetime string "12:25" at position 2

## Screwing around with HDF5

In [71]:
# Close a handle left over from an earlier run so test.h5 can be reopened below.
# NOTE(review): `h5test` is first assigned in the next cell, so on a fresh
# kernel this presumably raises NameError -- confirm and reorder or guard.
h5test.close()

In [72]:
h5test = tb.open_file('test.h5', 'w')

In [73]:
# Use the PEP 8 method name, matching tb.open_file / create_table elsewhere in
# this notebook; the camelCase `createArray` is the old PyTables 2.x spelling.
arr = h5test.create_array('/', 'test', [0,0,0,0])

In [76]:
arr = h5test.root.test

In [77]:
arr[:] = ['1', '2', '3', '4']

In [81]:
arr[1:] = ['1', 2, '5']

In [82]:
arr[:]

[1, 1, 2, 5]