# process one file

In [1]:
from glob import glob
import raw_taq

In [2]:
# You can run this if you update the raw_taq.py file
from importlib import reload
reload(raw_taq)

<module 'raw_taq' from '/home/rdhyee/dlab-finance/basic-taq/raw_taq.py'>

In [3]:
# Zipped TAQ quote file (EQY_US_ALL_BBO, stamped 20150102 in its name) used by the
# cells below; the path is relative to the notebook's working directory.
fname = "../local_data/EQY_US_ALL_BBO_20150102.zip"
# Chunked reader from the project-local raw_taq module. The helper functions
# defined later build their own reader from `fname`, so no visible cell reads
# this particular instance.
taq_file = raw_taq.TAQ2Chunks(fname)

# how far can we walk through the zip file

In [4]:
# let's time just walking through a file vs various chunk size
from itertools import islice

def walk_through_file(fname, chunk_size=1000, max_chunk=None):
    """Iterate over the converted chunks of a TAQ file and throw them away.

    This exists only so the cost of walking the file can be timed for
    different chunk sizes.

    Parameters
    ----------
    fname : path handed to ``raw_taq.TAQ2Chunks``.
    chunk_size : value forwarded to ``convert_taq``.
    max_chunk : stop after this many chunks; ``None`` walks everything.
    """
    reader = raw_taq.TAQ2Chunks(fname)
    limited_chunks = islice(reader.convert_taq(chunk_size), max_chunk)
    for _ in limited_chunks:
        pass
    

```
(myenv3)rdhyee@mercury:~/dlab-finance/basic-taq$ time cat ../local_data/EQY_US_ALL_BBO_20150102.zip > /dev/null 

real	0m33.261s
user	0m0.013s
sys	0m1.698s
```

Trying `gzip` on the entire file caused an error:

```
time gzip -cdfq ../local_data/EQY_US_ALL_BBO_20150102.zip > /dev/null 

gzip: ../local_data/EQY_US_ALL_BBO_20150102.zip: invalid compressed data--length error

real	3m40.387s
user	3m39.351s
sys	0m1.000s

```

on savio:

```
[ryee@n0045 ~]$ time cat davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip > /dev/null

real	0m5.851s
user	0m0.005s
sys	0m4.411s
```

```
[ryee@n0045 ~]$ time gzip -cdfq davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip > /dev/null

gzip: davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip: invalid compressed data--length error

real	4m33.470s
user	4m31.036s
sys	0m2.561s

```

`gzip` fails on this archive with a length error, so we need to use `unzip` instead:

```
[ryee@n0045 ~]$ time unzip -c davclark/taq-mirror/EQY_US_ALL_BBO/EQY_US_ALL_BBO_2015/EQY_US_ALL_BBO_201501/EQY_US_ALL_BBO_20150102.zip > /dev/null

real	4m30.988s
user	4m26.768s
sys	0m4.367s

```


In [None]:
%time walk_through_file(fname, chunk_size=200000, max_chunk=5)

1000000 records:

```
10000 chunks = 18.2s
1000 x 1000 = 9.5s
100 chunks x 10000/chunk 7.32s
10 chunks x 100000/chunk 7.37s
5 chunks x 200000/chunk 7.69s
```

In [None]:
# 10,000,000 records

%time walk_through_file(fname, chunk_size=100000, max_chunk=100)

10,000,000 records:

```
10,000 chunks x 1000/chunk 85s
5,000 chunks x 2000/chunk 79s
100 chunks x 100,000/chunk 76s
```

In [None]:
# Extrapolate the walk time for a whole file, in hours: scale the measured
# 76 s per 10,000,000 records to 686,099,151 records (presumably the total
# record count of this file -- confirm), then divide by 3600 s per hour.
686099151 / 10000000 * 76 / (3600)

In [11]:
# code to walk through a zip file

def raw_chunks_from_zipfile(fname, chunksize=1000, BYTES_PER_LINE=98):
    """Yield the raw bytes of every member of a zip archive, chunk by chunk.

    For each file stored in the archive, the first line is read with
    ``readline`` and discarded (presumably a header record -- confirm against
    the file format). The remainder is then read in blocks of
    ``BYTES_PER_LINE * chunksize`` bytes.

    Parameters
    ----------
    fname : str
        Path to the zip archive.
    chunksize : int
        Number of fixed-width lines per chunk.
    BYTES_PER_LINE : int
        Width of one line in bytes, including its line terminator.

    Yields
    ------
    bytes
        A non-empty block of raw data. The last block of each member may be
        shorter than ``BYTES_PER_LINE * chunksize``.
    """
    import zipfile

    block_size = BYTES_PER_LINE * chunksize

    with zipfile.ZipFile(fname, 'r') as zfile:
        for member in zfile.infolist():
            with zfile.open(member) as infile:
                # Skip the first line of the member before fixed-size reads.
                infile.readline()

                while True:
                    raw_bytes = infile.read(block_size)
                    if not raw_bytes:
                        # An empty read means this member is exhausted.
                        break
                    # Hand back the data itself; the original yielded the
                    # loop flag (always True) and so never exposed any bytes.
                    yield raw_bytes

                        

In [21]:
def walk_through_zip_raw(fname, chunksize=100000, max_chunk=None):
    """Walk the raw chunks of a zip file, discarding them, for timing purposes.

    Parameters
    ----------
    fname : str
        Path passed to ``raw_chunks_from_zipfile``.
    chunksize : int
        Lines per chunk, forwarded to ``raw_chunks_from_zipfile``.
    max_chunk : int or None
        Stop after this many chunks; ``None`` walks the whole archive.

    Returns
    -------
    int
        Zero-based index of the last chunk read (number of chunks minus one),
        or -1 when no chunk was produced at all.
    """
    # Pre-bind the result so an empty archive or max_chunk=0 returns -1
    # instead of raising UnboundLocalError at the return statement.
    last_index = -1
    chunks = islice(raw_chunks_from_zipfile(fname, chunksize=chunksize), max_chunk)
    for last_index, _ in enumerate(chunks):
        pass
    return last_index

In [22]:
%time walk_through_zip_raw(fname,chunksize=1000000,max_chunk=None)

CPU times: user 2min 31s, sys: 1min 5s, total: 3min 36s
Wall time: 3min 36s


686

In [None]:
from itertools import islice

# process by row or by chunk?
def taq_row(fname, chunk_size=1000):
    """Generate the rows of a TAQ file one at a time.

    The file is still read chunk by chunk (``chunk_size`` is forwarded to
    ``convert_taq``); each chunk is flattened so callers see single rows.
    """
    reader = raw_taq.TAQ2Chunks(fname)
    for rows in reader.convert_taq(chunk_size):
        yield from rows
            

In [None]:
# Walk up to the first 1,000,000 rows, redrawing a one-line counter via "\r".
# After the loop, `i` and `row` stay bound to the last index and row, which
# the following cells inspect.
for i, row in enumerate(
    islice(taq_row(fname), 1000000)
):
    print("\r {0}".format(i), end=" ")

In [None]:
row.converted_time

In [None]:
row.dtype.names

In [None]:
# If you want just the type
row.dtype

In [None]:
# Show each field name of `row` next to the value stored under it.
for field in row.dtype.names:
    print(field, row[field])

In [None]:
# Convert a POSIX timestamp (seconds since the epoch) into a naive datetime
# expressed in this machine's local timezone.
# NOTE(review): 1420230800.94 looks like a value copied from
# row.converted_time -- confirm.
import datetime
datetime.datetime.fromtimestamp(1420230800.94)

In [None]:
# Accumulate (exchange, symbol_root, symbol_suffix)

from collections import Counter

# Tally, over up to the first 10,000,000 rows, how often each exchange,
# each symbol root, and each (exchange, root, suffix) combination occurs.
exchanges = Counter()
symbol_roots = Counter()
exchange_symbol_root_suffixes = Counter()

for i, row in enumerate(islice(taq_row(fname, chunk_size=1000000), 10000000)):
    # The fields are decoded from bytes; root and suffix are stripped of
    # surrounding whitespace, the exchange code is kept as-is.
    exchange = row['Exchange'].decode("utf-8", "strict")
    symbol_root = row['Symbol_root'].decode("utf-8", "strict").strip()
    symbol_suffix = row['Symbol_suffix'].decode("utf-8", "strict").strip()
    triplet = (exchange, symbol_root, symbol_suffix)

    # Single-line progress counter, overwritten in place.
    print("\r {0}".format(i),end="")

    exchanges[exchange] += 1
    symbol_roots[symbol_root] += 1
    exchange_symbol_root_suffixes[triplet] += 1

exchanges, symbol_roots, exchange_symbol_root_suffixes

In [None]:
(row['Exchange'].decode("utf-8", "strict")) 

## You can also easily convert numpy record arrays to pandas DataFrames

In [None]:
import pandas as pd

In [None]:
# NOTE(review): `chunk` is not assigned at notebook top level in any visible
# cell (it only appears as a loop variable inside functions), so on a fresh
# kernel this presumably raises NameError -- confirm, and bind a chunk
# explicitly before this cell.
chunk_df = pd.DataFrame(chunk)

In [None]:
chunk_df

In [None]:
# note that time is not correctly parsed yet:
chunk_df.Time

# Goal: Compute some summary statistics across a few securities in the TAQ file

Processing an entire TAQ file will take a long time. So, maybe just run through the chunks for the first two securities (you can then exit out of a loop once you see the third security / symbol).

In [None]:
import numpy as np
from statistics import mode

# Summary statistics for the 'Bid_Price' column of the current chunk.
bid_prices = chunk['Bid_Price']

max_price = max(bid_prices)        # largest bid
min_price = min(bid_prices)        # smallest bid
avg_price = np.mean(bid_prices)    # arithmetic mean
mod_price = mode(bid_prices)       # mode (most common value)
sd_price = np.std(bid_prices)      # standard deviation (numpy default, ddof=0)

print(
    " Max bid price: ", max_price, "\n",
    "Min bid price: ", min_price, "\n",
    "Mean bid price: ", avg_price, "\n",
    "Mod bid price: ", mod_price, "\n",
    "Standard deviation bid price: ", sd_price,
)

In [None]:
# Summary statistics for the 'Ask_Price' column of the current chunk.
ask_prices = chunk['Ask_Price']

max_price = max(ask_prices)        # largest ask
min_price = min(ask_prices)        # smallest ask
avg_price = np.mean(ask_prices)    # arithmetic mean
mod_price = mode(ask_prices)       # mode (most common value)
sd_price = np.std(ask_prices)      # standard deviation (numpy default, ddof=0)

print(
    " Max Ask price: ", max_price, "\n",
    "Min Ask price: ", min_price, "\n",
    "Mean Ask price: ", avg_price, "\n",
    "Mod Ask price: ", mod_price, "\n",
    "Standard deviation Ask price: ", sd_price,
)