# General analysis script for Binary.com feed
## Configuration Cell: Instruct how to do analysis

In [5]:
import pandas as pd
import os

# List of feed source(s), e.g. 'combined', 'idata', 'bloomberg'.
# The main cell takes feeds[0] as the primary feed.
feeds = ['combined'] # first listed feed is primary

# List of underlyings
#underlyings = ['frxAUDJPY','frxAUDUSD','frxEURAUD','frxEURCAD','frxEURCHF','frxEURGBP','frxEURJPY','frxEURUSD',
#               'frxGBPAUD','frxGBPJPY','frxGBPUSD','frxUSDCAD','frxUSDCHF','frxUSDJPY','frxXAUUSD']
underlyings = ['STI'] # Single underlying (comment out and use the list above for all forex pairs)

# Provide either a list of dates or start and end dates
date_start = pd.to_datetime('2016-12-27')
date_end   = pd.to_datetime('2016-12-27')
datelist = pd.date_range(start=date_start,end=date_end)
#datelist = pd.to_datetime(['11-Mar-16','18-Mar-16','25-Mar-16']) # Uncomment to specify list

################################# Types of analysis: What do you want to do?
analyze_gaps       = False       # check feed gaps
gap_duration1      = 1          # Short gap duration in minutes
gap_duration2      = 3         # Long gap duration in minutes
                                #
begin_trading_hour = 0          # First trading hour of day (GMT)
end_trading_hour   = 23         # Last trading hour of day (GMT)--inclusive
                                #
trading_break      = False      # Filter trading break
begin_break_hour   = 8          # Start of trading break (GMT)
end_break_hour     = 10         # End of trading break (GMT)
                                #
show_trades        = True      # show client trades with feed chart
client_id          = 'CR427589' # JackDB download file should be client_id.csv in trades directory
min_payout         = 10       # minimum payout for trades shown
                                #
narrow_trading     = False      # check for periods with narrow trading range
time_period        = 300        # narrow-trading window; NOTE(review): described as seconds, but it is passed
                                # as an integer rolling window, i.e. a number of ticks -- confirm intent
min_range          = 5          # narrow trading is considered to be within this many pips
                                #
analyze_vol        = False      # output a realized-volatility chart
feed_coverage      = False      # check feed coverage and frequency
feed_chart         = False       # output a feed chart
compare_feed       = False      # compare two different feeds for an underlying
analyze_digit      = False      # check distribution of last digit
#################################

# NOTE(review): absolute, user-specific paths; each must end with '/' because the
# other cells build file names by plain string concatenation.
feed_dir = '/home/frank/notebooks/binary/feed/' # root directory for fullfeed files
trades_dir = '/home/frank/notebooks/binary/Trader/' # directory for client trades
output_dir = '/home/frank/notebooks/binary/documents/output/' # place output files here

print('Configuration complete!')

Configuration complete!


## Check for common mistakes

In [6]:
# Sanity-check the configured locations before any processing starts.
# Every feed source needs one folder per underlying below the feed root.
for feed_source in feeds:
    for underlying in underlyings:
        underlying_folder = '/'.join([feed_dir+feed_source, underlying])
        if os.path.isdir(underlying_folder):
            continue
        print('No feed directory named ',underlying_folder)

# Reports are written here by the main cell.
if not os.path.isdir(output_dir):
    print('No output directory named ',output_dir)

# The trades file is only needed when client trades are plotted.
if show_trades:
    if not os.path.isfile(trades_dir+client_id+'.csv'):
        print('No trades file named ',trades_dir+client_id+'.csv')

print('Finished checking for files.')

Finished checking for files.


## Setup and Function Definitions: Save a copy before you modify

In [7]:
# Import needed packages
import datetime as dt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

# Pip size per forex underlying (values as of July 2016).
# Used to convert a price range into a number of pips.
pip_size = {
    'frxAUDJPY': 0.001,
    'frxAUDUSD': 0.00001,
    'frxEURAUD': 0.00001,
    'frxEURCAD': 0.00001,
    'frxEURCHF': 0.0001,
    'frxEURGBP': 0.00001,
    'frxEURJPY': 0.001,
    'frxEURUSD': 0.00001,
    'frxGBPAUD': 0.00001,
    'frxGBPJPY': 0.001,
    'frxGBPUSD': 0.0001,
    'frxUSDCAD': 0.00001,
    'frxUSDCHF': 0.0001,
    'frxUSDJPY': 0.001,
    'frxXAUUSD': 0.01,
}

# Define Functions to read and process fullfeed files

def read_combined(date):
    """Load one day of the 'combined' feed for the current underlying.

    The file read is ``<feed_dir>combined/<underlying>/<date>.fullfeed``, where
    the file name is ``date`` with leading zeros stripped ('05-Mar-16' ->
    '5-Mar-16'). ``feed_dir`` and ``underlying`` are module-level globals.

    Parameters
    ----------
    date : str
        Day formatted as '%d-%b-%y', e.g. '27-Dec-16'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'timestamp' with a 'combined' column. For underlyings whose
        name does not start with 'frx' the 'provider' column is kept as read.
        An empty frame is returned when the file does not exist.
    """
    timestamp_format = '%d-%b-%y %H:%M:%S'
    date_str = str(date).lstrip('0')
    input_file = feed_dir+'combined/'+underlying+'/'+date_str+'.fullfeed'
    if not os.path.isfile(input_file):
        return pd.DataFrame()

    # Space-separated file: field 0 is the time of day, 4 the quote, 5 the provider.
    single_df = pd.read_csv(input_file, sep=' ',usecols=[0,4,5], names=['timestamp','combined','provider'])
    single_df['timestamp'] = pd.to_datetime(date+' '+single_df['timestamp'],format=timestamp_format)
    single_df = single_df.set_index('timestamp')

    if underlying[0:3]=='frx': # Check forex for switch to or from panda
        is_panda = single_df['provider']=='panda'
        # A switch happened when some, but not all, ticks of the day came from
        # panda. Comparing all() against both False and True can never succeed.
        if is_panda.any() and not is_panda.all():
            print('!ALERT! Combined switched to or from panda provider')
        single_df = single_df.drop(['provider'],axis=1)
    return single_df

def read_provider(date,feed_source):
    """Load one day of a single provider's feed for the current underlying.

    Reads ``<feed_dir><feed_source>/<underlying>/<date>-fullfeed.csv`` (``date``
    with leading zeros stripped), converts the first field to timestamps by
    scaling it with 10**9, and discards every row carrying a rejection flag.

    Parameters
    ----------
    date : str
        Day string used to build the file name.
    feed_source : str
        Provider folder name; also the name given to the quote column.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'timestamp' with one column named ``feed_source``; empty
        when the file is missing.
    """
    rejected_flags = ['BADSRC','IGN|BADSRC','IGN|OUTL','OUTL','IGN']
    csv_path = feed_dir+feed_source+'/'+underlying+'/'+str(date).lstrip('0')+'-fullfeed.csv'
    if not os.path.isfile(csv_path):
        return pd.DataFrame()

    # Comma-separated file: field 0 is the timestamp, 4 the quote, 6 the flag.
    quotes = pd.read_csv(csv_path,sep=',',usecols=[0,4,6],names=['timestamp',feed_source,'flag'])
    quotes['timestamp'] = pd.to_datetime(quotes['timestamp']*10**9)
    quotes = quotes.set_index('timestamp')

    # Keep only rows whose flag is not one of the rejection markers.
    accepted = quotes[~quotes['flag'].isin(rejected_flags)]
    return accepted.drop(['flag'],axis=1)

def read_bloomberg(date, bb_dir='/Users/stanly/feed/BB/'):
    """Load the Bloomberg trade ticks of one day for the current underlying.

    Reads ``<bb_dir><underlying>.fullfeed`` (pipe-separated, three header lines
    and one footer line skipped), keeps rows of type 'T', and returns the first
    trade of every second on the requested day. ``underlying`` is a
    module-level global.

    The file's date field carries no year (it is parsed as '%m/%d' once a year
    is appended), so the year of ``date`` is used. Rows are matched on the full
    calendar date, not just the day of month.

    Parameters
    ----------
    date : str or datetime-like
        Requested day; anything ``pd.to_datetime`` accepts, e.g. '27-Dec-16'.
    bb_dir : str, optional
        Directory holding the Bloomberg files, including the trailing
        separator. NOTE(review): the default is a user-specific absolute path
        that is independent of ``feed_dir`` -- confirm where these files live.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'timestamp' with a 'bloomberg' column; empty when the file
        is missing or holds no trades for that day.
    """
    target_day = pd.to_datetime(date)
    input_file = bb_dir+underlying+'.fullfeed'
    if not os.path.isfile(input_file):
        # Same contract as the other readers: a missing file yields no quotes.
        return pd.DataFrame()

    bb_df = pd.read_csv(input_file,sep='|',engine='python',usecols=[0,1,2,3],index_col=False,skiprows=3,skipfooter=1,
                        names=['date','time','type','bloomberg'])
    bb_df = bb_df[bb_df['type']=='T']

    stamp_format = '%m/%d/%Y %H:%M:%S'
    stamps = pd.to_datetime(bb_df['date']+'/'+str(target_day.year)+' '+bb_df['time'],format=stamp_format)
    bb_df = bb_df.assign(timestamp=stamps).drop(['date','time','type'],axis=1).set_index('timestamp')

    # Select the day first, so resampling only spans that day instead of the
    # whole file; the per-second result for the day is the same.
    bb_df = bb_df[bb_df.index.normalize()==target_day.normalize()]
    bb_df = bb_df.resample('1s').first()
    return bb_df.dropna()

def read_trades():
    """Load the client's trades for the current underlying.

    Reads ``<trades_dir><client_id>.csv`` (header row skipped), parses the
    start times, indexes the rows by start time and keeps only the trades
    whose underlying equals the module-level ``underlying``.

    Returns
    -------
    pandas.DataFrame
        Columns 'type', 'payout', 'sell_price', 'start_time', 'underlying' and
        'duration', indexed by 'start_time' (the column is kept as well).
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    column_names = ['type','payout','sell_price','start_time','underlying','duration']
    csv_path = trades_dir+client_id+'.csv'

    all_trades = pd.read_csv(csv_path,sep=',',usecols=[4,6,7,8,10,11],names=column_names,skiprows=1)

    # Duration is currently a fixed placeholder; deriving it from the file is disabled:
#    trades_df.duration = (trades_df.duration.str.split('_').str.get(4).astype(int)-
#                          trades_df.duration.str.split('_').str.get(3).astype(int))/60
#    trades_df.duration = trades_df.duration.astype(str)+'m'
    all_trades['duration'] = '10T'

    all_trades['start_time'] = pd.to_datetime(all_trades['start_time'],format=stamp_format)
    all_trades = all_trades.set_index(all_trades['start_time'])
    return all_trades[all_trades['underlying']==underlying]

def process_dates(num_days,feed_source):
    """Read and filter one feed source for every day in ``datelist``.

    For each day the matching reader is called, days with fewer than two
    quotes are skipped, and the quotes are restricted to the configured
    trading hours (and, optionally, to outside the trading break). Coverage
    and narrow-range statistics are printed when enabled. All settings are
    read from module-level globals.

    Parameters
    ----------
    num_days : int
        Kept for backward compatibility and not used: incrementing it here
        could never reach the caller, so the day count has to be derived from
        the returned frame instead.
    feed_source : str
        'combined', 'bloomberg', or one of 'idata', 'panda', 'olsen'.

    Returns
    -------
    pandas.DataFrame
        The retained quotes of all days, in ``datelist`` order; empty when no
        day qualified.

    Raises
    ------
    ValueError
        If ``feed_source`` is not a known feed. Previously this surfaced as an
        unbound local, or silently reused the previous day's frame.
    """
    daily_frames = [] # one filtered frame per processed day
    for date_time in datelist:
        date = dt.datetime.strftime(date_time, "%d-%b-%y")
        print(date)
        if feed_source in ('idata','panda','olsen'):
            full_df = read_provider(date,feed_source)
        elif feed_source=='combined':
            full_df = read_combined(date)
        elif feed_source=='bloomberg':
            full_df = read_bloomberg(date)
        else:
            raise ValueError('Unknown feed source: '+str(feed_source))
        if len(full_df)<2: # Skip if only 1 quote
            continue
#        if full_df.index[0].dayofweek==6: # Remove quotes for Sunday
#            continue
#        if full_df.index[0].dayofweek==4: # Remove quotes after 21:00 GMT on Fridays
#            full_df = full_df[full_df.index.hour<21]
        full_df = full_df[(full_df.index.hour>=begin_trading_hour)&(full_df.index.hour<=end_trading_hour)]
        if trading_break:
            full_df = full_df[(full_df.index.hour<begin_break_hour)|(full_df.index.hour>end_break_hour-1)]

        # Every reader names its quote column after the feed source, so use
        # that column here; the primary feed's column is absent from the
        # frames of the other feeds.
        if feed_coverage:
            ticks = full_df[feed_source]
            total = ticks.count()
            asia = ticks[ticks.index.hour<8].count()
            america = ticks[ticks.index.hour>15].count()
            europe = total-asia-america
            print(underlying,date,'Total ticks:',total,'asia:',asia,'europe:',europe,'america:',america)
        if narrow_trading:
            # NOTE(review): an integer window counts ticks, while the
            # configuration describes time_period in seconds -- confirm intent.
            window = full_df[feed_source].rolling(window=time_period)
            pip_range = (window.max()-window.min())/pip_size[underlying]
            is_narrow = (pip_range < min_range)*1
            # A narrow period begins where the flag switches from 0 to 1.
            begin_narrow = (is_narrow-is_narrow.shift())>0
            narrow_hours = full_df.index[begin_narrow.values].hour
            total = len(narrow_hours)
            asia = int((narrow_hours<8).sum())
            america = int((narrow_hours>15).sum())
            europe = total-asia-america
            print(underlying,date,'Total Narrow Ranges:',total,'asia:',asia,'europe:',europe,'america:',america)
        daily_frames.append(full_df)

    if not daily_frames:
        return pd.DataFrame()
    # Concatenate once: DataFrame.append no longer exists in pandas 2.x and
    # growing a frame day by day copies all earlier rows each time.
    return pd.concat(daily_frames)

def process_sources():
    """Collect every configured feed into one frame, aligned on timestamp.

    Calls ``process_dates`` once per entry of the module-level ``feeds`` list
    and outer-joins the results, so each feed contributes its own column(s)
    and every timestamp seen in any feed is kept.

    Returns
    -------
    pandas.DataFrame
        The outer join of all per-feed frames.
    """
    merged = pd.DataFrame()
    for source in feeds:
        per_source = process_dates(num_days,source)
        merged = merged.join(per_source,how='outer')
    return merged

print("Functions defined.")

Functions defined.


## Main processing

In [21]:
# Commence Feed Processing for each underlying.
# For every underlying: load all configured feeds, then run whichever analyses
# are switched on in the configuration cell. Charts and CSV files are written
# to output_dir.
print('Will process for dates ',datelist)

if analyze_gaps: 
    duration1_str = str(gap_duration1)+'min_gaps'
    duration2_str = str(gap_duration2)+'min_gaps'
    # One summary row per underlying. Counts start as integers; statistics that
    # receive float values start as floats so later assignments keep the dtype.
    gaps_df = pd.DataFrame(index=pd.Index(underlyings,name='underlying'))
    for count_col in (duration1_str,duration2_str):
        gaps_df[count_col] = 0
    for stat_col in ('largest','average','total','worst_hour_interval'):
        gaps_df[stat_col] = 0.0
    gaps_df['worst_hour'] = 0

primary = feeds[0]
for underlying in underlyings:
    num_days = 0 # read as a global by process_sources()
    print('Processing for ',underlying)    
    long_df = process_sources()
    if primary not in long_df.columns:
        # Nothing was loaded for the primary feed; every analysis needs it.
        print('No data found for ',underlying)
        continue
    # Number of distinct days that actually contributed quotes. The counter
    # passed to process_dates() never comes back, so it is derived here.
    num_days = len(set(long_df.index.date))
    # Primary feed carried forward onto a 1-second grid (used to place trades).
    filled_df = long_df[primary].resample('1s').ffill()

    if analyze_vol:
        vol_df = long_df.resample('10s').first()
        vol_df['pct_return'] = vol_df[primary].pct_change()
        # 10-second returns annualised: 6 per minute * 60 * 24 * 365 periods per year.
        vol_df['vol'] = vol_df.pct_return.ewm(halflife=6).std()*np.sqrt(6*60*24*365)
        trace1 = go.Scatter(x=vol_df.index,y=vol_df.vol,name=underlying)
        data = [trace1]
        layout = go.Layout(title=underlying,yaxis=dict(title='Realized Volatility'),xaxis=dict(title='GMT'))
        fig = go.Figure(data=data,layout=layout)
        url = plotly.offline.plot(fig,filename=output_dir+underlying+'_vol.html',auto_open=False)

    if analyze_gaps: 
        # Seconds between consecutive ticks, on whole-second timestamps.
        tick_times = pd.Series(long_df.index.floor('s').values,index=long_df.index)
        gap = tick_times.diff().dt.total_seconds()
        # Gaps of 72000 s or more are blanked out.
        long_df['gap'] = gap.where(gap<72000).values
        # Reported gaps: longer than the short threshold but shorter than 7200 s.
        is_reported = ((long_df['gap']>gap_duration1*60)&(long_df['gap']<7200)).values
        gap_seconds = long_df['gap'].values[is_reported]
        if len(gap_seconds) > 0:
            gap_end = long_df.index[is_reported]
            gap_begin = gap_end-pd.to_timedelta(gap_seconds,unit='s')
            long_gaps = pd.DataFrame({'begin_gap':gap_begin.time,'end_gap':gap_end.time,'gap':gap_seconds},
                                     index=pd.Index(gap_begin.date,name='gap_date'),
                                     columns=['begin_gap','end_gap','gap'])
            long_gaps.to_csv(output_dir+underlying+'_gaps.csv')
            gaps_df.loc[underlying,duration1_str] = int(long_gaps.gap.count())
            gaps_df.loc[underlying,'largest'] = long_gaps.gap.max()
            gaps_df.loc[underlying,'average'] = np.round(long_gaps.gap.mean(),0)
            gaps_df.loc[underlying,'total'] = long_gaps.gap.sum()
            # Count the long gaps from the reported gaps themselves; masking
            # them with a series indexed by tick time does not line up.
            gaps_df.loc[underlying,duration2_str] = int((long_gaps.gap>gap_duration2*60).sum())
            binned = long_df.groupby(long_df.index.hour).count()
            ranked = binned[primary].sort_values(ascending=False)
            # NOTE(review): this picks the hour with the 4th-highest tick
            # count, not the lowest -- confirm that is the intended "worst"
            # hour. The position is capped so short data sets do not fail.
            rank_pos = min(3,len(ranked)-1)
            gaps_df.loc[underlying,'worst_hour'] = ranked.index[rank_pos]
            gaps_df.loc[underlying,'worst_hour_interval'] = np.round(
                3600*num_days/ranked.values[rank_pos],1)
            print('Worst hour (GMT):',gaps_df.loc[underlying,'worst_hour'],'Worst-hour interval:',
                  gaps_df.loc[underlying,'worst_hour_interval'])
            
            
    if analyze_digit:
        # NOTE(review): the 1e4 scale fixes the "last digit" at the 4th decimal
        # for every underlying -- confirm it matches each quote precision.
        long_df['digit'] = (long_df[primary]*1e4).astype(int).mod(10)
        print('Number of ticks for digits 0 through 9:')
        print(np.histogram(long_df.digit,bins=range(0,11))[0])
        trace1 = go.Histogram(x=long_df.digit,histnorm='percent',autobinx=True,name=underlying)
        data = [trace1]
        layout = go.Layout(title=underlying+' Last Digit',yaxis=dict(title='Percent of ticks'),
                           xaxis=dict(title='Last Digit',tickmode='array',tickvals=np.arange(0,10)),barmode = 'overlay')
        fig = go.Figure(data=data,layout=layout)
        url = plotly.offline.plot(fig,filename=output_dir+underlying+'_last-digit.html',auto_open=False)       
       
    if feed_chart: # Plot feed
        data = []
        for feed_source in feeds:
            # One line per feed, each drawn from its own column.
            trace = go.Scatter(x=long_df.index,y=long_df[feed_source],name=feed_source,mode='lines')
            data.append(trace)
        layout = go.Layout(title=underlying,yaxis=dict(title=underlying),xaxis=dict(title='GMT'))
        fig = go.Figure(data=data,layout=layout)
        url = plotly.offline.plot(fig,filename=output_dir+underlying+'_chart.html',auto_open=False)

    if feed_coverage: # Plot tick histogram
        trace1 = go.Histogram(x=long_df.index.hour,autobinx=True,name=underlying)
        data = [trace1]
        layout = go.Layout(title=underlying+' Tick Frequency',yaxis=dict(title='Number of ticks'),
                           xaxis=dict(title='GMT'),barmode = 'overlay')
        fig = go.Figure(data=data,layout=layout)
        url = plotly.offline.plot(fig,filename=output_dir+underlying+'_tick-histogram.html',auto_open=False)
        
    if show_trades:
        trades_df = read_trades()
        # Last second of the final requested day.
        datetime_end = datelist[-1].normalize()+pd.Timedelta(hours=23,minutes=59,seconds=59)
        in_scope = ((trades_df.index>=datelist[0])&(trades_df.index<=datetime_end)&
                    (trades_df['payout']>=min_payout).values)
        filtered_trades_df = trades_df[in_scope]
        call_trades_df = filtered_trades_df[filtered_trades_df['type']=='CALL']
        call_tradelist = call_trades_df.index
        put_trades_df = filtered_trades_df[filtered_trades_df['type']=='PUT']
        put_tradelist = put_trades_df.index
        digit_trades_df = filtered_trades_df[filtered_trades_df['type']=='DIGITMATCH']
        digit_tradelist = digit_trades_df.index
        # Markers sit on the 1-second filled feed. reindex() leaves a gap for a
        # trade outside the loaded feed instead of raising KeyError.
        trace1 = go.Scatter(x=long_df.index,y=long_df[primary],name=primary,line=dict(color='orange'))
        trace2 = go.Scatter(x=digit_tradelist,y=filled_df.reindex(digit_tradelist),name='digit',mode='markers',
                        marker=dict(color='yellow'))
        trace3 = go.Scatter(x=put_tradelist,y=filled_df.reindex(put_tradelist),name='put',
                        mode='markers',marker=dict(color='red'))
        trace4 = go.Scatter(x=call_tradelist,y=filled_df.reindex(call_tradelist),name='call',
                        mode='markers', marker=dict(size=12,symbol='circle-open',color='green',line=dict(width=2))) 
        data = [trace1,trace2,trace3,trace4]
        layout = go.Layout(title=client_id, yaxis=dict(title=underlying),xaxis=dict(title='GMT'))
        fig = go.Figure(data=data,layout=layout)
        url = plotly.offline.plot(fig,filename=output_dir+underlying+'_'+client_id+'_trades.html',auto_open=False)

# Write the per-underlying gap summary once all underlyings are processed.
if analyze_gaps:
    gaps_df.to_csv(output_dir+'feed-interval.csv')
        
print('Finished running script!')

Will process for dates  DatetimeIndex(['2016-12-27'], dtype='datetime64[ns]', freq='D')
Processing for  STI
27-Dec-16
Finished running script!


In [30]:
# Scratch check: display the bin edges used for the last-digit histogram.
# NOTE: long_df.digit is only created by the main cell when analyze_digit is True,
# so this cell fails after a fresh run with the configuration shown above.
np.histogram(long_df.digit,bins=range(0,11))[1]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [53]:
# Scratch cell: shows how `primary` is picked from a position in `feeds`.
# NOTE(review): this rebinds the module-level `feeds` and `primary` that the
# processing cells read; re-run the configuration cell before processing again.
feeds = ['Stan','Frank']
primary = feeds[1]
primary

'Frank'