In [235]:
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
output_notebook()

import python_analysis_functions as paf 
import panel as pn
import holoviews as hv
from holoviews import opts,dim
import os 
import numpy as np 
import functools as ft 
import operator as ops 
import datetime
from tqdm import tqdm

import bokeh
import pandas as pd

pn.extension()
hv.extension('bokeh') 

day = "2020_08_21"
pair = "ETHUSDT" 
fos = "FUTURES"
futures_or_spot = fos 
aggregate = "T"
ftype = "TRADE"

df = paf.extract_tbinned_data_from_file(day,pair,fos, ftype, aggregate) 

def viz(df,rsize=(850,400)) :
    ds = paf.to_hv_dataset(df)  
    pg = ds.to(hv.Curve, 'tbin', 'p_avg')
    vg = ds.to(hv.Curve, 'tbin', 'v_tot')
    vc = ds.to(hv.Curve, 'tbin', 'v_tot_cumulative')

    rs = [pg,vg,vc]
    rgs = [ paf.rsz(g,rsize) for g in rs ]
    plts = pn.Column( *rgs )  
    return plts 
    

def run_plot(df) : 
    runs_df = paf.get_runs(df)
    p = paf.analyze_runs(runs_df)
    return (p , runs_df) 

def candle_with_ma(df,sz=(1000,400)) : 
    spacing = 0.3
    info  =  [  { 'rectangle'  : (row['index'] -spacing , row['p_open'], row['index'] + spacing , row['p_close'])   , 'segment' :  ( row['index'], row['p_min'],row['index'], row['p_max'] )  }  for index, row in df.iterrows() ] 

    rectangles = [ x['rectangle'] for x in info ]
    segments   = [ x['segment']   for x in info ]

    color_exp = (dim('y1')>dim('y0')).categorize({True: 'green', False: 'red'})

    rsz = paf.rsz
    segp = rsz(hv.Segments(segments).opts(color="black"),sz)  
    recp = rsz(hv.Rectangles(rectangles).opts(color=color_exp),sz)
    
    candle = segp * recp 
    
    # add mvas 
    xs = df['index'] 
    
    def mva_pts(s) : 
        return [ (xs[i],df[s][i]) for i in range(len(xs)) ] 
    
    mva7, mva25, mva99 = [ rsz( hv.Curve(mva_pts(s),label=s)  ,sz) for s in ['mva_7','mva_25', 'mva_99'] ] 
    
    mvas = mva7 * mva25 * mva99 
    plt = candle * mvas      
    return (plt , candle, mvas) 


cache req: cache/extract_tbinned_data_from_file/2020_08_21_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!


## Data Overview

This first panel of graphs show the following (for a given pair/day): 
1. Candlestick graph (zooming capable)
2. average price over time 
3. avg quote asset (usually usdt in a x/usdt pair) volume per time bin 
4. cumulative quote asset volume over time 
5. distribution of run lengths 

In [236]:
sz = (850,400)
p, candle, mvas = candle_with_ma(df,sz)  
p.options(title='Candlestick graph (zoom in)')

In [237]:
viz(df)

In [238]:
p , runs_df = run_plot(df)
show(p)

## Entire timeseries analysis (selected pair over all available temporal data)

In the below cell we concatenate data for all time available (at the desired binning level). 

In [239]:
ads = paf.get_all_days_binned(pair,futures_or_spot,ftype,aggregate)

cache req: cache/extract_tbinned_data_from_file/2020_08_01_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_02_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_03_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_04_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_05_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_06_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_07_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_08_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_09_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/extract_tbinned_data_from_file/2020_08_10_ETHUSDT_FUTURES_TRADE_T.pkl
cache hit!
cache req: cache/ext

### Runs plot  / analysis (over all of time available)
The following pie chart shows the distribution of run lengths over the entire time interval. 
You can hover to see how many runs of a given length there are. 
I believe that as the time interval becomes sufficiently large, you are more likely to observe 
the distribution: 

1 - 50%   (0.5)^1 <br/>
2 - 25%   (0.5)^2 <br/>
3 - 12.5% (0.5)^3 <br/>

This makes sense if you ASSUME that the probability of an increase or decrease in the next bin is an independent event with probability 0.5 <br/>
This is sort of a funny conclusion, because it shows that (over a large enough time interval), it must be true that the NEXT time interval change is truly  <br/> 
random. 

One use for this is to simply detect if a market is "real". A synthesized market may not conform to this distribution. On the other hand, knowing this it would <br/>
be easy to synthesize in a conformant way. 

However, over smaller time intervals this is not the case. 


In [240]:
# ads is the aggregated data frame (all time) 
p , runs_df  = run_plot(ads)
show(p)

### Return vs Run Index

In [241]:
# QUESTION : "Is there an association between the p_change of an interval and its run index?"
# simply by plotting a scatter plot of the p_change_percent and the run_index 

In [242]:
adsr  = paf.add_run_info_to_df(ads,runs_df)
paf.return_vs_run_index(adsr)

### Return vs Run Index Graph Description 

This graph above shows a scatter plot of the interval percentage change (minutely) vs the run_index (what position is this minute interval in the context of its associated positive or negative run). 

What you can see is that for any given run_index, the distribution of percentage changes is approximately gaussian with a mean of 0 but a variable variance. <br/> 

The variance of the distribution appears to be negatively correlated with the run_index.  

Possible interpretation: A big percentage change is less likely to occur at higher run indeces. Not sure exactly why this is but it does make some intuitive sense. 

### Run Vizualiations

In [243]:
def plot_run(df,run_info,sz=(200,200)) : 
    "Plots an individual run" 

    # need to get the data slice 
    dta = paf.get_data_for_run(df,run_info)
    
    #print("using data") 
    #print(dta.head())
    
    spacing = 0.3
    info  =  [  { 'rectangle'  : (row['index'] -spacing , row['p_open'], row['index'] + spacing , row['p_close'])   , 'segment' :  ( row['index'], row['p_min'],row['index'], row['p_max'] )  }  for index, row in dta.iterrows() ] 

    rectangles = [ x['rectangle'] for x in info ]
    segments   = [ x['segment']   for x in info ]

    color_exp = (dim('y1')>dim('y0')).categorize({True: 'green', False: 'red'})

    rsz = paf.rsz
    segp = rsz(hv.Segments(segments).opts(color="black"),sz)  
    recp = rsz(hv.Rectangles(rectangles).opts(color=color_exp),sz)
    
    candle = segp * recp 

    return candle 
    

In [244]:
long_runs = runs_df[runs_df['length'] > 10] 

In [245]:
def plot_runs(ads,runs) : 
    g  = plot_run(ads,runs.iloc[0]) 
    for i in range(len(runs)-1) : 
        g = g + plot_run(ads,runs.iloc[i+1]) 
    return g 

In [246]:
plot_runs(ads,long_runs)

### Resistance Metrics 

In [247]:
adsr['p_round'] = np.around(adsr['p_avg'],0)

In [248]:
dta = adsr

In [249]:
# loop through the rows 
results = {} 
cols = ['v_tot','v_net','v_buy_tot', 'v_sell_tot']
for i, row in tqdm(dta.iterrows()) : 
    p_round = row['p_round']

    info = results.get(p_round) 
    
    new_info = {} 
    
    if info :
        # have initialized it 
        # so will update it 
        for col in cols :
            new_info[col]= row[col] + info[col] 
        
    else :
        # have not initialized it, so will just set it 
        for col in cols :
            new_info[col] = row[col] 
        
    # store the new info
    results[p_round] = new_info 
    

def fix(k,v) : 
    v['p_round'] = k 
    return v 

tmp = [fix(k,v) for (k,v) in results.items()]    
    
adf = pd.DataFrame(tmp, columns = ["p_round"] + cols ) 
#adf = adf.sort_values(by=[''],ascending=False)

41968it [00:03, 10867.30it/s]


In [256]:
top_buy  = adf.sort_values(by='v_buy_tot',ascending=False)[:100].get(["p_round","v_buy_tot"])
top_sell = adf.sort_values(by='v_sell_tot',ascending=False)[:100].get(["p_round","v_sell_tot"])


# [:10] #pd.concat([adf[:6],adf[-6:]])
bpts = [ (0,r['p_round'],len(ads),r['p_round'],r['v_buy_tot']) for (i,r) in top_buy.iterrows() ] 
spts = [ (0,r['p_round'],len(ads),r['p_round'],r['v_sell_tot']) for (i,r) in top_sell.iterrows() ] 


sz = (1400,800)

paf.rsz(hv.Curve(ads,'index', 'p_avg'),sz) * paf.rsz(hv.Segments(spts,vdims=['c']),sz).opts(line_width=2,cmap='Reds',color=dim('c'),colorbar=False) * paf.rsz(hv.Segments(bpts,vdims=['c']),sz).opts(line_width=2,cmap='Greens',color=dim('c'),colorbar=True)


  [cmap for cmap in cm.cmap_d if not


## Resistance Plot 

Green lines show BUY market volume (green shade) at specified price points (y position)
Red lines show the same for SELL volume 

Here only the top N of each are displayed. 

In [251]:
all_net = adf.sort_values(by='v_net',ascending=False).get(["p_round","v_net"]) 
top_net = pd.concat([all_net[:5], all_net[-5:]]) 

npts = [ (0,r['p_round'],len(ads),r['p_round'],r['v_net']) for (i,r) in top_net.iterrows() ] 

sz = (1400,800)
paf.rsz(hv.Curve(ads,'index', 'p_avg'),sz) * paf.rsz(hv.Segments(npts,vdims=['c']),sz).opts(line_width=2,cmap='bwr',color=dim('c'),colorbar=True) 


  [cmap for cmap in cm.cmap_d if not


In [252]:
# ability of LOB to refill after large volumes is a measure of resistance 
# quick refilling of the order book after a market order stops the price from moving as quickly 
# thus, a proxy for order book refilling is the ratio of v_net/p_change_percent 
# instead of going by proxy -- I could actually start to analyze the LOB data to come up with a metric of order book 
# refilling? 

In [253]:
ads['rr'] = ads['v_net'] / ads['p_change_percent']
ads = ads[~ads.isin([np.nan, np.inf, -np.inf]).any(1)] 

In [254]:
hads = paf.to_hv_dataset(ads)

In [255]:
(  paf.rsz(hads.to(hv.Curve,'tbin', 'p_avg'),(1000,400)) + paf.rsz(hads.to(hv.Scatter,'tbin', 'rr'),(1000,400)) ).cols(1)