# process one file

In [None]:
from glob import glob
import numpy as np

import raw_taq


In [None]:
# You can run this if you update the raw_taq.py file
from importlib import reload
reload(raw_taq)

In [None]:
# Path to the zipped BBO file that the timing and parsing cells in this notebook read.
fname = "../local_data/EQY_US_ALL_BBO_20150102.zip"
# Wrap the archive in the project's chunked reader (raw_taq.TAQ2Chunks).
taq_file = raw_taq.TAQ2Chunks(fname)

# how far can we walk through the zip file

In [None]:
# let's time just walking through a file vs various chunk size
from itertools import islice

def walk_through_file(fname, chunk_size=1000, max_chunk=None):
    """
    Iterate over the converted chunks of a TAQ file and discard them.

    Used purely for timing: the chunks produced by
    ``raw_taq.TAQ2Chunks(fname).convert_taq(chunk_size)`` are consumed and
    thrown away.

    fname      -- path handed to raw_taq.TAQ2Chunks
    chunk_size -- value forwarded to convert_taq
    max_chunk  -- stop after this many chunks (None walks the whole file)
    """
    chunk_stream = raw_taq.TAQ2Chunks(fname).convert_taq(chunk_size)
    for _ in islice(chunk_stream, max_chunk):
        continue
    

```
(myenv3)rdhyee@mercury:~/dlab-finance/basic-taq$ time cat ../local_data/EQY_US_ALL_BBO_20150102.zip > /dev/null 

real	0m33.261s
user	0m0.013s
sys	0m1.698s
```

Trying `gzip` on the entire file caused an error:

```
time gzip -cdfq ../local_data/EQY_US_ALL_BBO_20150102.zip > /dev/null 

gzip: ../local_data/EQY_US_ALL_BBO_20150102.zip: invalid compressed data--length error

real	3m40.387s
user	3m39.351s
sys	0m1.000s

```

on savio:

```
[ryee@n0045 ~]$ time cat davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip > /dev/null

real	0m5.851s
user	0m0.005s
sys	0m4.411s
```

```
[ryee@n0045 ~]$ time gzip -cdfq davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip > /dev/null

gzip: davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip: invalid compressed data--length error

real	4m33.470s
user	4m31.036s
sys	0m2.561s

```

Since `gzip` fails on the archive, we need to use `unzip` instead:

```
[ryee@n0045 ~]$ time unzip -c davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip > /dev/null

real	4m30.988s
user	4m26.768s
sys	0m4.367s

```


In [None]:
%time walk_through_file(fname, chunk_size=200000, max_chunk=5)

1000000 records:

```
10000 chunks x 100/chunk 18.2s
1000 chunks x 1000/chunk 9.5s
100 chunks x 10000/chunk 7.32s
10 chunks x 100000/chunk 7.37s
5 chunks x 200000/chunk 7.69s
```

In [None]:
# 10,000,000 records

%time walk_through_file(fname, chunk_size=100000, max_chunk=100)

10,000,000 records:

```
10,000 chunks x 1000/chunk 85s
5,000 chunks x 2000/chunk 79s
100 chunks x 100,000/chunk 76s
```

In [None]:
# Rough extrapolation to the whole file, in hours: scale the ~76 s measured for
# 10,000,000 records up to 686,099,151 records (presumably the record count of
# this file -- TODO confirm), then convert seconds to hours.
686099151 / 10000000 * 76 / (3600)

In [None]:
# code to walk through a zip file

# function to calculate mapping of line len to index of last column

def record_len_to_last_column(initial_dtype):
    """
    Map each possible line length to the index of its last data column.

    initial_dtype is a list of (field_name, type_code) pairs in which every
    type_code is a one-letter kind followed by a byte width, e.g.:

        [('Time', 'S9'),
         ('Exchange', 'S1'),
         ('Symbol_root', 'S6'),
         ('Symbol_suffix', 'S10'),
         ('Bid_Price', 'S11'),
         ('Bid_Size', 'S7'),
         ('Ask_Price', 'S11'),
         ....
         ('newline', 'S2')]

    Assumption is that the last field is a newline field that is present in
    all versions of BBO.

    Returns a dict {line_length_in_bytes: index_of_last_data_column}. For
    every prefix of the data columns (fields 0..i) the key is the combined
    width of that prefix plus the width of the newline field, and the value
    is i. An empty initial_dtype yields an empty dict.
    """
    # Drop the leading kind character ('S') to get each field's byte width.
    # Use the argument, not the module-level raw_taq.initial_dtype, so the
    # function works for any layout it is handed.
    widths = [int(type_code[1:]) for (_field, type_code) in initial_dtype]
    if not widths:
        return {}

    newline_len = widths[-1]

    len_to_last_column = {}
    cum_len = 0
    # The newline field is excluded here; its width is added to every total.
    for (i, width) in enumerate(widths[:-1]):
        cum_len += width
        len_to_last_column[cum_len + newline_len] = i

    return len_to_last_column
    
    

def raw_chunks_from_zipfile(fname, chunksize=1000):
    """
    Yield undecoded blocks of bytes from every member of a zip archive.

    For each member the width of one record is taken from the length of the
    member's second line. The first line is skipped, and the remainder is
    yielded in blocks of (record width * chunksize) bytes; the final block of
    a member may be shorter.

    fname     -- path to the zip archive
    chunksize -- number of records per yielded block
    """
    import zipfile

    with zipfile.ZipFile(fname, 'r') as archive:
        for member in archive.filelist:
            member_name = member.filename

            # Pass 1: peek at the second line only to learn the record width.
            with archive.open(member_name) as peek:
                peek.readline()
                record_width = len(peek.readline())

            # Pass 2: reopen, skip the first line, then stream fixed-size blocks.
            with archive.open(member_name) as stream:
                stream.readline()
                block_size = record_width * chunksize
                # read() returns b'' at end of data, which ends the iteration.
                for block in iter(lambda: stream.read(block_size), b''):
                    yield block

# Lookup table built once when this cell runs: line length in bytes -> index of the last data column.
RECORD_LEN_TO_LAST_COLUMN_MAP = record_len_to_last_column(raw_taq.initial_dtype)                

def chunks_from_zipfile(fname, chunksize=1000):
    """
    Yield numpy record arrays of raw (string-typed) fields from a zip archive.

    For each member of the archive the first line is read to measure the
    line length; that line is consumed and not yielded (presumably it is a
    header -- confirm against the file format). The line length selects, via
    RECORD_LEN_TO_LAST_COLUMN_MAP, how many leading columns of
    raw_taq.initial_dtype are present; the newline field is always appended.

    fname     -- path to the zip archive
    chunksize -- maximum number of records per yielded array

    Yields np.ndarray views over the bytes read (so the arrays are
    read-only). Raises KeyError if a member's line length does not correspond
    to any known column layout. Empty members are skipped.
    """
    import zipfile

    with zipfile.ZipFile(fname, 'r') as zfile:
        for inside_f in zfile.filelist:

            with zfile.open(inside_f.filename) as infile:
                first = infile.readline()
                if not first:
                    # Nothing to measure or read in an empty member.
                    continue
                bytes_per_line = len(first)

                # The map stores the INDEX of the last data column, so the
                # slice must run to index + 1 to include that column;
                # otherwise the record dtype is narrower than a line and
                # every record after the first is misaligned.
                last_column = RECORD_LEN_TO_LAST_COLUMN_MAP[bytes_per_line]
                dtype = (raw_taq.initial_dtype[:last_column + 1]
                         + [raw_taq.initial_dtype[-1]])

                read_size = bytes_per_line * chunksize
                while True:
                    raw_bytes = infile.read(read_size)
                    if not raw_bytes:
                        break
                    # Only build the array once we know there is data.
                    yield np.ndarray(len(raw_bytes) // bytes_per_line,
                                     buffer=raw_bytes, dtype=dtype)
    

In [None]:
def walk_through_zip_raw(fname,chunksize=100000,max_chunk=None):
    """
    Consume the raw byte chunks of a zip archive without processing them.

    Timing baseline for raw_chunks_from_zipfile.

    fname     -- path to the zip archive
    chunksize -- records per chunk, forwarded to raw_chunks_from_zipfile
    max_chunk -- stop after this many chunks (None reads everything)

    Returns the zero-based index of the last chunk read, or -1 when no chunk
    was produced.
    """
    # Pre-bind the index so an empty archive (or max_chunk=0) does not leave
    # it undefined at the return statement.
    i = -1
    for (i, chunk) in enumerate(islice(raw_chunks_from_zipfile(fname, chunksize=chunksize),max_chunk)):
        pass
    return i

def walk_through_zip_init_conv(fname,chunksize=100000,max_chunk=None):
    """
    Consume raw byte chunks and view each one as a numpy record array.

    Timing variant of walk_through_zip_raw that adds the cost of wrapping
    each chunk in an ndarray with dtype raw_taq.initial_dtype. The arrays are
    discarded.

    fname     -- path to the zip archive
    chunksize -- records per chunk, forwarded to raw_chunks_from_zipfile
    max_chunk -- stop after this many chunks (None reads everything)

    Returns the zero-based index of the last chunk read, or -1 when no chunk
    was produced.
    """
    LINE_WIDTH = 98 # will have to generalize to get line size out

    # Pre-bind the index so an empty archive does not leave it undefined.
    i = -1
    for (i, chunk) in enumerate(islice(raw_chunks_from_zipfile(fname, chunksize=chunksize),max_chunk)):
        try:
            all_strings = np.ndarray(chunksize, buffer=chunk, dtype=raw_taq.initial_dtype)
        except TypeError:
            # numpy raises TypeError when the buffer is too small for the
            # requested shape, which is what happens on a short final chunk;
            # fall back to however many whole lines the chunk holds.
            all_strings = np.ndarray(len(chunk) // LINE_WIDTH, buffer=chunk, dtype=raw_taq.initial_dtype)

    return i
            
    
def walk_through_zip_init_conv_0(fname,chunksize=100000,max_chunk=None):
    """
    Consume raw byte chunks, viewing each as a fixed-size numpy record array.

    Simplest timing variant: every chunk is wrapped as exactly `chunksize`
    records of dtype raw_taq.initial_dtype, with no handling for a chunk that
    holds fewer records. A short chunk therefore makes np.ndarray raise, so
    use max_chunk to stay within full chunks.

    fname     -- path to the zip archive
    chunksize -- records per chunk, forwarded to raw_chunks_from_zipfile
    max_chunk -- stop after this many chunks (None reads everything)

    Returns the zero-based index of the last chunk read, or -1 when no chunk
    was produced.
    """
    # Pre-bind the index so an empty archive does not leave it undefined.
    i = -1
    for (i, chunk) in enumerate(islice(raw_chunks_from_zipfile(fname, chunksize=chunksize),max_chunk)):
        all_strings = np.ndarray(chunksize, buffer=chunk, dtype=raw_taq.initial_dtype)

    return i
                             


In [None]:
%time walk_through_zip_raw(fname,chunksize=1000000,max_chunk=10)

In [None]:
%time walk_through_zip_init_conv(fname,chunksize=1000000,max_chunk=10)

In [None]:
%time walk_through_zip_init_conv_0(fname,chunksize=1000000,max_chunk=10)

In [None]:
# let's play with parsing chunks

chunk = next(chunks_from_zipfile(fname,chunksize=1000))

In [None]:
len(chunk)

In [None]:
# http://docs.scipy.org/doc/numpy/reference/generated/numpy.fromstring.html

np.ndarray(1, buffer=chunk, dtype=raw_taq.initial_dtype)


In [None]:
from itertools import islice

# process by row or by chunk?
def taq_row(fname, chunk_size=1000):
    """
    Generate the rows of a TAQ file one at a time.

    Chunks are obtained from raw_taq.TAQ2Chunks(fname).convert_taq(chunk_size)
    and flattened, so callers see a single stream of rows.

    fname      -- path handed to raw_taq.TAQ2Chunks
    chunk_size -- value forwarded to convert_taq
    """
    reader = raw_taq.TAQ2Chunks(fname)
    for records in reader.convert_taq(chunk_size):
        yield from records
            

In [None]:
# Pull up to the first 1,000,000 rows, overwriting a one-line progress counter
# (the leading "\r" returns the cursor to the start of the line).
# `row` stays bound to the last row seen and is inspected by the next cells.
for (i,row) in enumerate(islice(taq_row(fname), 1000000)):
    print("\r {0}".format(i), end=" ")

In [None]:
row.converted_time

In [None]:
row.dtype.names

In [None]:
# If you want just the type
row.dtype

In [None]:
for field in row.dtype.names:
    print (field, row[field])

In [None]:
# Convert a POSIX timestamp (seconds since the epoch) into a naive datetime
# expressed in this machine's local timezone.
import datetime
datetime.datetime.fromtimestamp(1420230800.94)

In [None]:
from itertools import zip_longest
from collections import Counter

# Tally how many records in the sample chunk carry each Symbol_root value.
c = Counter()
# With return_counts=True, np.unique returns two parallel arrays:
# the distinct values and the number of occurrences of each.
counts = np.unique(chunk[:]['Symbol_root'], return_counts=True)
c.update(dict(zip_longest(counts[0], counts[1])))
c

In [None]:
# Accumulate (exchange, symbol_root, symbol_suffix)

from collections import Counter

def count_chunk_elements(fname, chunksize=1000000, max_chunk=None):
    """
    Count how often each Symbol_root value occurs in a zipped TAQ file.

    Chunks come from chunks_from_zipfile; the zero-based index of each
    finished chunk is printed on a single, overwritten line as progress.

    fname     -- path to the zip archive
    chunksize -- records per chunk, forwarded to chunks_from_zipfile
    max_chunk -- stop after this many chunks (None reads everything)

    Returns a Counter keyed by the Symbol_root values found in the chunks.
    """
    symbol_roots = Counter()
    chunk_stream = islice(chunks_from_zipfile(fname, chunksize), max_chunk)

    for (chunk_index, records) in enumerate(chunk_stream):
        # Distinct roots in this chunk and how many times each appears.
        roots, occurrences = np.unique(records['Symbol_root'], return_counts=True)
        symbol_roots.update(dict(zip(roots, occurrences)))

        print("\r {0}".format(chunk_index),end="")

    return symbol_roots

In [None]:
#faqname = "../local_data/EQY_US_ALL_BBO_20150102.zip"
# NOTE(review): `faqname` is not referenced anywhere else in the visible
# notebook; the next cell still passes `fname`. Possibly a typo for `fname` --
# confirm which file the count below is meant to run on.
faqname = "../local_data/EQY_US_ALL_BBO_20100104.zip"

In [None]:
%time c = count_chunk_elements(fname, max_chunk=None)

In [None]:
sum(c.values())

In [None]:
# Print the ten most frequent symbol roots as tab-separated rank, symbol, count.
for (i,(k,v)) in enumerate(c.most_common(10)):
    print ("{0}\t{1}\t{2}".format(i, k.decode('utf-8').strip(), v))

## You can also easily convert numpy record arrays to pandas dataframes

In [None]:
import pandas as pd

In [None]:
chunk_df = pd.DataFrame(chunk)

In [None]:
chunk_df

In [None]:
# note that time is not correctly parsed yet:
chunk_df.Time

# Goal: Compute some summary statistics across a few securities in the TAQ file

Processing an entire TAQ file will take a long time. So, maybe just run through the chunks for the first two securities (you can then exit out of a loop once you see the third security / symbol).

In [None]:
import numpy as np
from statistics import mode

# Summary statistics for the bid prices in the sample chunk.
# The raw chunk stores every field as a fixed-width byte string, so convert the
# column to floats first: np.mean / np.std cannot reduce a byte-string array,
# and max / min on byte strings compare lexicographically rather than numerically.
# NOTE(review): the raw field may use implied decimal places -- confirm the
# scaling against raw_taq before reading these numbers as prices.
bid_prices = chunk['Bid_Price'].astype(float)

#find the max bid price
max_price = bid_prices.max()

#find the min bid price
min_price = bid_prices.min()

#find the mean of bid price
avg_price = bid_prices.mean()

#find the mode of bid price
mod_price = mode(bid_prices.tolist())

#find the sd of bid price
sd_price = bid_prices.std()

print(" Max bid price: ", max_price, "\n", "Min bid price: ", min_price, "\n", 
      "Mean bid price: ", avg_price, "\n", "Mod bid price: ", mod_price, "\n", "Standard deviation bid price: ", sd_price)

In [None]:
# Summary statistics for the ask prices in the sample chunk.
# The raw chunk stores every field as a fixed-width byte string, so convert the
# column to floats first: np.mean / np.std cannot reduce a byte-string array,
# and max / min on byte strings compare lexicographically rather than numerically.
# NOTE(review): the raw field may use implied decimal places -- confirm the
# scaling against raw_taq before reading these numbers as prices.
ask_prices = chunk['Ask_Price'].astype(float)

#find the max Ask price
max_price = ask_prices.max()

#find the min Ask price
min_price = ask_prices.min()

#find the mean of Ask price
avg_price = ask_prices.mean()

#find the mode of Ask price
mod_price = mode(ask_prices.tolist())

#find the sd of Ask price
sd_price = ask_prices.std()

print(" Max Ask price: ", max_price, "\n", "Min Ask price: ", min_price, "\n", 
      "Mean Ask price: ", avg_price, "\n", "Mod Ask price: ", mod_price, "\n", "Standard deviation Ask price: ", sd_price)