# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope(y_axis):
    """Return the angle, in degrees, of the least-squares line through y_axis.

    The x coordinates are synthetic: point k (1-based) sits at
    1 + k * 0.0001 * 0.1, i.e. consecutive points are 0.00001 apart.
    The fitted slope is converted with atan and expressed in degrees,
    so the result lies between -90 and 90.
    """
    x_points = [1 + (step * 0.0001 * 0.1) for step in range(1, len(y_axis) + 1)]

    # Only the slope of the fit is needed; the remaining outputs are discarded.
    fitted_slope, *_ = linregress(x_points, y_axis)

    return math.degrees(math.atan(fitted_slope))

In [3]:
def chunk_ticks(df, number_of_ticks):
    """Aggregate raw quotes into consecutive chunks of number_of_ticks rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Bid' and 'Ask' columns. The frame is not modified.
    number_of_ticks : int
        Rows per chunk; the final chunk may be shorter.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns
        tick_act   - last mid price ((Bid + Ask) / 2) of the chunk,
        tick_avg   - last value of an exponentially weighted mean of the mid
                     prices whose span equals the chunk length,
        spread_avg - mean of Ask - Bid over the chunk,
        tick_sd    - np.std of the mid prices of the chunk.

    Raises
    ------
    ValueError
        If number_of_ticks is smaller than 1.
    """
    if number_of_ticks < 1:
        raise ValueError('number_of_ticks must be a positive integer')

    # Derive the working series without writing new columns into the caller's frame.
    mid_price = ((df['Bid'] + df['Ask']) / 2).to_numpy()
    spread = (df['Ask'] - df['Bid']).to_numpy()

    records = []
    for start in tqdm(range(0, len(mid_price), number_of_ticks)):
        stop = start + number_of_ticks
        chunk_mid = mid_price[start:stop]

        records.append({
            'tick_act': chunk_mid[-1],
            # The span follows the actual chunk length so a short final chunk is handled too.
            'tick_avg': pd.Series(chunk_mid).ewm(span=len(chunk_mid)).mean().iloc[-1],
            'spread_avg': np.mean(spread[start:stop]),
            'tick_sd': np.std(chunk_mid),
        })

    # Explicit columns keep the schema stable even when there were no rows to chunk.
    return pd.DataFrame(records, columns=['tick_act', 'tick_avg', 'spread_avg', 'tick_sd'])

In [4]:
def before_sma():
    """Append the current `val` to the short-window buffer data['ssma_list'].

    Works on the notebook-level `data` dict and returns an empty tuple.
    """
    # NOTE(review): `val` is read from notebook scope but is not assigned in
    # the visible cells - confirm where it is set before calling this.
    short_buffer = data['ssma_list']
    short_buffer.append(val)
    return ()

def after_sma():
    """Slide the short-window buffer by one value and refresh data['sema'].

    Drops the oldest entry of data['ssma_list'], appends the current `val`,
    and stores in data['sema'] the value at position sma_len - 1 of an
    exponentially weighted mean (span = data['sma_len']) over the buffer.
    data['sema_ready'] holds at most the two latest sema values;
    data['sema_diff'] is their difference, or NaN until two values exist.
    Returns an empty tuple.
    """
    # NOTE(review): `val` is read from notebook scope but is not assigned in
    # the visible cells - confirm where it is set before calling this.
    short_buffer = data['ssma_list']
    short_buffer.popleft()
    short_buffer.append(val)

    span = data['sma_len']
    smoothed = pd.DataFrame(list(short_buffer)).ewm(span=span).mean()[0]
    # Indexing at span - 1 requires the buffer to hold at least sma_len values.
    data['sema'] = list(smoothed)[span - 1]

    recent = data['sema_ready']
    if len(recent) >= 2:
        recent.popleft()
        recent.append(data['sema'])
        data['sema_diff'] = recent[-1] - recent[len(recent) - 2]
    else:
        recent.append(data['sema'])
        data['sema_diff'] = np.nan

    return ()

def before_lma():
    """Append the current `val` to the long-window buffer data['lsma_list'].

    Works on the notebook-level `data` dict and returns an empty tuple.
    """
    # NOTE(review): `val` is read from notebook scope but is not assigned in
    # the visible cells - confirm where it is set before calling this.
    long_buffer = data['lsma_list']
    long_buffer.append(val)
    return ()

def after_lma():
    """Slide the long-window buffer by one value and refresh data['lema'].

    Drops the oldest entry of data['lsma_list'], appends the current `val`,
    and stores in data['lema'] the value at position lma_len - 1 of an
    exponentially weighted mean (span = data['lma_len']) over the buffer.
    data['lema_ready'] holds at most the two latest lema values;
    data['lema_diff'] is their difference, or NaN until two values exist.
    Returns an empty tuple.
    """
    # NOTE(review): `val` is read from notebook scope but is not assigned in
    # the visible cells - confirm where it is set before calling this.
    long_buffer = data['lsma_list']
    long_buffer.popleft()
    long_buffer.append(val)

    span = data['lma_len']
    smoothed = pd.DataFrame(list(long_buffer)).ewm(span=span).mean()[0]
    # Indexing at span - 1 requires the buffer to hold at least lma_len values.
    data['lema'] = list(smoothed)[span - 1]

    recent = data['lema_ready']
    if len(recent) >= 2:
        recent.popleft()
        recent.append(data['lema'])
        data['lema_diff'] = recent[-1] - recent[len(recent) - 2]
    else:
        recent.append(data['lema'])
        data['lema_diff'] = np.nan

    return ()

In [5]:
def roll_ma(ma_list):
    """Return the final value of an exponentially weighted mean over ma_list.

    The EWM span equals the number of values supplied, so the function can be
    applied to rolling windows of any length.
    """
    window = len(ma_list)
    smoothed = pd.DataFrame(ma_list).ewm(span=window).mean()
    return list(smoothed[0])[window - 1]

## File paths

#### 1. A high number of ticks per chunk ensures pip-level movement and pip-level predictions
#### 2. A high sma_len helps ML model prediction accuracy (more predictable curve)
#### 3. A high sma_len removes the connection between the actual tick_avg and sema (the sema-based prediction scores high, but prediction of the actual tick movement is poor)

In [6]:
# Year of the tick file to process.
year = 2013

# Shared settings read by the helper functions and the feature cells.
data = {
    'number_of_ticks': 300,   # raw quotes per chunk
    'rsi_window': 14,         # rolling window of the RSI averages
    'sma_len': 5,             # short moving-average window
    'lma_len': 6,             # long moving-average window
    'pip_diff': 0.0001,       # threshold separating increase/decrease from same
}

# Column whose next-row change defines the `direction` label.
diff_col = 'sema'
#diff_col = 'tick_avg'

# Chunked and tabular outputs are written next to the source file.
source_file_path = f'data/yearly_tick_data/{year}.csv'
path = os.path.dirname(source_file_path)
file_name = os.path.basename(source_file_path)

chunk_file_name = 'chunk_'+file_name
chunk_file_path = os.path.join(path, chunk_file_name)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')

source_file_path : data/yearly_tick_data/2013.csv
chunk_file_path : data/yearly_tick_data\chunk_2013.csv
target_file_path : data/yearly_tick_data\tab_2013.csv


## Read data

In [7]:
%%time
# Load the whole raw quote file for the configured year into df.
# For a quicker experiment, cap the number of rows read instead:
#df = pd.read_csv(source_file_path, nrows=10000000)
df = pd.read_csv(source_file_path)
df.head()

Wall time: 29.2 s


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20130101 00:00:00.463,1.3195,1.31962,2
1,20130101 00:00:07.974,1.31951,1.31963,2
2,20130101 00:00:08.095,1.31945,1.31959,2
3,20130101 00:00:08.163,1.31955,1.31961,1
4,20130101 00:00:09.804,1.31945,1.31961,2


## Data manipulation

In [8]:
%%time

# Collapse the raw quotes into chunks of data['number_of_ticks'] rows and
# save the chunked table; from here on df refers to the chunked data.
df = chunk_ticks(df, data['number_of_ticks'])
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')

# Reload from the file just written so the following cells work from the
# persisted chunk data.
df = pd.read_csv(chunk_file_path)
df.head()

100%|██████████████████████████████████████████████████████| 62080/62080 [03:30<00:00, 295.27it/s]


Records : 62080
Wall time: 3min 33s


Unnamed: 0,tick_act,tick_avg,spread_avg,tick_sd
0,1.32042,1.320225,0.000208,0.000403
1,1.321175,1.320809,0.000277,0.000263
2,1.321135,1.321384,0.000184,0.000237
3,1.319735,1.319967,0.000235,0.000483
4,1.319115,1.319464,0.000251,0.000266


In [9]:
%%time

data['rs_max'] = 1e6

data['ssma_list'] = collections.deque([])
data['lsma_list'] = collections.deque([])
data['sema_ready'] = collections.deque([])
data['lema_ready'] = collections.deque([])
df['sema'] = ''
df['lema'] = ''
df['sema_diff'] = ''
df['lema_diff'] = ''


# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max']) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Simple Moving Averages ------------------
df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

df['max_gap'] = df['max_tick'] -  df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

Wall time: 112 ms


In [10]:
%%time
# Emas ----------------
# roll_ma is applied to every rolling window of tick_avg and returns the last
# value of an exponentially weighted mean whose span is the window length.
df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma)
df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma)

# Row-to-row change of each EMA.
df['sema_diff'] = df['sema'].diff()
df['lema_diff'] = df['lema'].diff()

62076it [01:12, 853.46it/s]
62075it [01:05, 941.61it/s]

Wall time: 2min 18s





In [11]:
%%time
# Slopes -----------------------------
# get_slope fits a line through each rolling window and returns its angle in
# degrees. Both columns are computed from sema; they differ only in the
# window length (sma_len vs lma_len).
df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope)
df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope)

62072it [00:20, 2968.30it/s]
62071it [00:19, 3212.62it/s]

Wall time: 40.2 s





In [12]:
%%time

df['ema_diff'] = df['sema'] - df['lema']

# Direction -------------------------
df['direction'] = 'same'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] > data['pip_diff']] = 'increase'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -data['pip_diff']] = 'decrease'

# Remove NaNs ------------------------
del df['gain']
del df['loss']
del df['tick_act']
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

df.tail()

Total records : 62067
Wall time: 72.7 ms


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
62062,1.375735,3.9e-05,0.00011,1.375701,1.375655,6.3e-05,8e-05,-0.000122,0.000108,0.000264,...,0.000131,8e-05,1.375857,1.375117,0.000122,-0.000618,85.547406,84.242405,4.576786e-05,same
62063,1.375651,9.5e-05,8.9e-05,1.375711,1.375677,1e-05,2.2e-05,-8.3e-05,0.0001,0.00027,...,8.7e-05,0.0001,1.375857,1.375651,0.000205,0.0,83.819391,84.6856,3.460536e-05,same
62064,1.375718,0.000124,0.000118,1.375716,1.375715,4e-06,3.8e-05,6.7e-05,0.000105,0.000241,...,0.0001,8e-06,1.375857,1.375651,0.000138,-6.7e-05,80.275393,82.167907,8.480189e-07,decrease
62065,1.374835,0.000165,0.000402,1.37538,1.375427,-0.000336,-0.000288,-0.000884,0.000105,0.000238,...,-0.00014,-1.8e-05,1.375857,1.374835,0.001022,0.0,-78.737406,-24.63405,-4.662449e-05,decrease
62066,1.374396,0.00043,6.6e-05,1.374978,1.375077,-0.000402,-0.00035,-0.000438,0.000105,0.00024,...,-0.000212,-9.8e-05,1.375735,1.374396,0.001338,0.0,-86.778895,-85.302456,-9.833354e-05,same


## Write data to csv

In [13]:
%%time
# Persist the final feature table.
df.to_csv(target_file_path, index = False)
# Play a sound asynchronously once the file is written.
# NOTE(review): winsound and this .wav location are Windows-specific.
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 2.5 s


## Print Report

In [14]:
# Absolute count and percentage share of each direction label, side by side.
g = df['direction']
label_counts = g.value_counts()
label_share = g.value_counts(normalize=True).mul(100)
print(pd.concat([label_counts, label_share], axis=1, keys=('counts', 'percentage')))
df.head(5)

          counts  percentage
same       29215   47.070102
increase   16428   26.468171
decrease   16424   26.461727


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
0,1.319512,0.000135,4.9e-05,1.319691,1.319704,-0.000109,-8.3e-05,-0.000223,0.000152,0.000203,...,-1.6e-05,-5e-06,1.320053,1.319512,0.000541,0.0,-0.855861,35.661365,-1.3e-05,same
1,1.31966,0.000135,9.3e-05,1.319674,1.319687,-1.8e-05,-1.7e-05,0.000148,0.000163,0.000203,...,-1.8e-05,-1.6e-05,1.320053,1.319512,0.000393,-0.000148,-56.577728,-47.500625,-1.3e-05,same
2,1.319769,0.000126,0.000131,1.319715,1.319709,4.1e-05,2.2e-05,0.000109,0.000129,0.000203,...,-5e-06,2.8e-05,1.320053,1.319512,0.000284,-0.000257,-73.671389,-48.748686,6e-06,increase
3,1.320469,0.000144,0.000373,1.319987,1.319965,0.000272,0.000256,0.0007,0.000138,0.000203,...,0.000148,-3.7e-05,1.320469,1.319512,0.0,-0.000957,75.9338,57.642639,2.2e-05,increase
4,1.321059,0.000127,0.000158,1.320411,1.320322,0.000424,0.000357,0.00059,0.00018,0.000102,...,0.000168,6e-05,1.321059,1.319512,0.0,-0.001547,86.738283,84.986027,9e-05,increase


In [15]:
# Direction -------------------------
# Compare the `direction` label with the direction actually taken by tick_avg.
# NOTE: this rebinds the notebook-level diff_col (set to 'sema' in the
# configuration cell); re-running the labelling cell afterwards would label on
# tick_avg instead.
diff_col = 'tick_avg'

# Actual next-row move of diff_col, classified with the same pip_diff
# threshold. It is kept in a standalone Series so df is never modified here
# and no chained `df[col].loc[mask] = ...` write is needed.
actual_move = df[diff_col].shift(-1) - df[diff_col]
act_direction = pd.Series('same', index=df.index, name='act_direction')
act_direction.loc[actual_move > data['pip_diff']] = 'increase'
act_direction.loc[actual_move < -data['pip_diff']] = 'decrease'

# For each predicted label, the share of rows per actual direction.
for predicted in ('same', 'increase', 'decrease'):
    print(f'prediction : {predicted}')
    print(act_direction[df['direction'] == predicted].value_counts(normalize=True))
    print('-------------')

# Side-by-side preview; tick_avg_diff is the tick_avg change scaled by 10000
# and rounded.
comparison = pd.DataFrame({
    'tick_avg_diff': (df['tick_avg'].diff() * 10000).round(),
    'tick_avg': df['tick_avg'],
    'sema': df['sema'],
    'direction': df['direction'],
    'act_direction': act_direction,
})

print('\n')
print(comparison.head(50))

prediction : same
same        0.394078
increase    0.303406
decrease    0.302516
Name: act_direction, dtype: float64
-------------
prediction : increase
increase    0.762235
same        0.170319
decrease    0.067446
Name: act_direction, dtype: float64
-------------
prediction : decrease
decrease    0.768692
same        0.171030
increase    0.060278
Name: act_direction, dtype: float64
-------------


    tick_avg_diff  tick_avg      sema direction act_direction
0             NaN  1.319512  1.319691      same      increase
1             1.0  1.319660  1.319674      same      increase
2             1.0  1.319769  1.319715  increase      increase
3             7.0  1.320469  1.319987  increase      increase
4             6.0  1.321059  1.320411  increase      decrease
5            -3.0  1.320790  1.320602  increase      increase
6             3.0  1.321126  1.320851      same      decrease
7            -2.0  1.320957  1.320946  increase      increase
8             3.0  1.321209  1.321071  