# Sampling to create Tick, Volume and Dollar-traded bars

+ 2.1 Tick Bar
+ 2.2 Volume Bar
+ 2.3 Dollar Bar

Here we take arbitrary threshold values for sampling the three types of bars.
* For tick bars we sample every 100 ticks.
* For volume bars we sample every 1,000 units traded.
* For dollar bars we sample every 1,000,000 dollars traded.


In [5]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import dask.dataframe as dd

from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm 
import statsmodels.graphics.tsaplots as tsp
from scipy.stats import jarque_bera

In [6]:
# Open the continuous tick data set as a dask DataFrame.
tick_data_path = 'continous_tick_data.parquet'
df = dd.read_parquet(tick_data_path)

In [9]:
# Optional: start a local dask.distributed client so the jobs below can be
# watched on the dask monitoring dashboard.
from dask.distributed import Client
client = Client() 
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.75 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61516,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.75 GiB

0,1
Comm: tcp://127.0.0.1:61551,Total threads: 2
Dashboard: http://127.0.0.1:61553/status,Memory: 3.94 GiB
Nanny: tcp://127.0.0.1:61519,
Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-n___w2x3,Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-n___w2x3

0,1
Comm: tcp://127.0.0.1:61557,Total threads: 2
Dashboard: http://127.0.0.1:61558/status,Memory: 3.94 GiB
Nanny: tcp://127.0.0.1:61522,
Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-fzbh4_17,Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-fzbh4_17

0,1
Comm: tcp://127.0.0.1:61548,Total threads: 2
Dashboard: http://127.0.0.1:61549/status,Memory: 3.94 GiB
Nanny: tcp://127.0.0.1:61520,
Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-nj30a493,Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-nj30a493

0,1
Comm: tcp://127.0.0.1:61552,Total threads: 2
Dashboard: http://127.0.0.1:61555/status,Memory: 3.94 GiB
Nanny: tcp://127.0.0.1:61521,
Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-82utkic2,Local directory: C:\Users\saeed\AppData\Local\Temp\dask-worker-space\worker-82utkic2


## 2.1 Tick Bars

In [10]:
%%time
transactions = 100
df['group'] =  (df['index']/transactions).astype(np.int64)
df_tick_bars = df.groupby('group').agg({'rolled_price':['first','last','max','min'],
                                     'volume': 'sum','timestamp':'min','instrument':'count'}).reset_index(drop=True)
df_tick_bars.columns = ['open','close','high','low','volume','timestamp','count']
df_tick_bars_dd = df_tick_bars.compute()

CPU times: total: 20.9 s
Wall time: 6min 9s


In [11]:
# Number of tick bars produced.
df_tick_bars_dd.shape[0]

8561831

In [12]:
# Preview the first five tick bars.
df_tick_bars_dd.head(5)

Unnamed: 0,open,close,high,low,volume,timestamp,count
0,959.75,961.25,961.25,959.5,324,2003-06-30 23:00:01,100
1,961.25,962.5,962.5,961.25,420,2003-07-01 01:03:04,100
2,962.5,961.5,963.0,961.5,441,2003-07-01 02:05:06,100
3,961.5,961.75,962.75,961.5,578,2003-07-01 02:23:29,100
4,961.75,961.25,962.0,961.25,472,2003-07-01 02:40:38,100


## 2.2 Volume Bars

In [13]:
%%time
traded_volume = 1000
df['volume'] = df['volume'].astype(np.uint64)
df['group'] =  (df['volume'].cumsum()/traded_volume).astype(np.uint64)
df_volume_bars = df.groupby('group').agg({'rolled_price':['first','last','max','min'],
                                     'volume': 'sum','timestamp':'min','instrument':'count'})#.reset_index(drop=True)
df_volume_bars.columns = ['open','close','high','low','volume','timestamp','count']
df_volume_bars_dd = df_volume_bars.compute()

CPU times: total: 1min 9s
Wall time: 13min 14s


In [14]:
# Number of volume bars produced.
df_volume_bars_dd.shape[0]

4803869

In [15]:
# Preview the first five volume bars.
df_volume_bars_dd.head(5)

Unnamed: 0_level_0,open,close,high,low,volume,timestamp,count
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,959.75,962.25,963.0,959.5,992,2003-06-30 23:00:01,270
1,962.25,962.0,962.75,961.25,1006,2003-07-01 02:16:17,177
2,962.0,961.0,962.0,959.25,997,2003-07-01 02:59:16,223
3,961.0,959.75,961.0,959.25,1004,2003-07-01 03:35:59,269
4,959.75,960.0,960.5,959.5,997,2003-07-01 04:27:02,302


## 2.3 Dollar Volume Bars

In [7]:
%%time
traded_dollar_volume = 1000000
df['group'] =  (df['dollar_volume'].cumsum()/traded_dollar_volume).astype(np.int64)
df_dol_vol_bars = df.groupby('group').agg({'rolled_price':['first','last','max','min'],
                                     'volume': 'sum','timestamp':'min','instrument':'count','dollar_volume':'sum'})#.reset_index(drop=True)
df_dol_vol_bars.columns = ['open','close','high','low','volume','timestamp','count','dv_sum']
df_dol_vol_bars_dd = df_dol_vol_bars.compute()

CPU times: total: 3min 47s
Wall time: 8min 7s


In [8]:
# Number of dollar-volume bars produced.
df_dol_vol_bars_dd.shape[0]

6161892

In [9]:
# Preview the first five dollar-volume bars.
df_dol_vol_bars_dd.head(5)

Unnamed: 0_level_0,open,close,high,low,volume,timestamp,count,dv_sum
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,959.75,962.0,963.0,959.5,1030,2003-06-30 23:00:01,274,990456.75
1,962.0,962.0,962.75,961.25,1040,2003-07-01 02:18:47,189,1000363.75
2,962.0,961.0,962.0,959.25,1029,2003-07-01 03:02:08,231,988433.75
3,960.75,960.25,961.0,959.25,1063,2003-07-01 03:37:07,311,1020434.25
4,960.25,960.5,960.75,959.5,1038,2003-07-01 04:42:51,310,996659.75


In [14]:
# Persist each set of sampled bars to its own parquet file, without the index.
for bars, out_path in (
    (df_dol_vol_bars_dd, 'dol_vol_bars.parquet'),
    (df_volume_bars_dd, 'vol_bars.parquet'),
    (df_tick_bars_dd, 'tick_bars.parquet'),
):
    bars.to_parquet(out_path, index=False)