# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope(y_axis):
    global data
    ma_len = len(y_axis)
    
    x_axis = []
    for i in range(ma_len):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

In [3]:
def chunk_ticks(df, number_of_ticks):   
    global data
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['tick', 'spread']]
    
    temp_df = pd.DataFrame()
    tick_avg = []
    spread_avg = []
    tick_sd = []
    #tick_act = []
    
    for i in tqdm(range(0,len(df),number_of_ticks)):
        tick_list = list(df['tick'][i:i+number_of_ticks])
        spread_list = list(df['spread'][i:i+number_of_ticks])
        
        #tick_act.append(tick_list[-1])
        tick_avg.append(np.mean(tick_list))
        spread_avg.append(np.mean(spread_list))
        tick_sd.append(np.std(tick_list))
        
    #temp_df['tick_act'] = tick_act      
    temp_df['tick_avg'] = tick_avg  
    temp_df['spread_avg'] = spread_avg  
    temp_df['tick_sd'] = tick_sd  
    
    return(temp_df)

In [4]:
def before_sma():
    global data    
    data['ssma_list'].append(val)    
    return()

def after_sma():
    global data
    
    data['ssma_list'].popleft()
    data['ssma_list'].append(val)
    data['sema'] = list(pd.DataFrame(list(data['ssma_list'])).ewm(span=data['sma_len']).mean()[0])[data['sma_len'] - 1]
    
    if len(data['sema_ready']) < 2:
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = np.nan

    elif len(data['sema_ready']) > 1:
        data['sema_ready'].popleft()
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = data['sema_ready'][-1] - data['sema_ready'][len(data['sema_ready'])-2]
    
    return()

def before_lma():
    global data    
    data['lsma_list'].append(val)    
    return()

def after_lma():
    global data
    
    data['lsma_list'].popleft()
    data['lsma_list'].append(val)
    data['lema'] = list(pd.DataFrame(list(data['lsma_list'])).ewm(span=data['lma_len']).mean()[0])[data['lma_len'] - 1]
    
    if len(data['lema_ready']) < 2:
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = np.nan

    elif len(data['lema_ready']) > 1:
        data['lema_ready'].popleft()
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = data['lema_ready'][-1] - data['lema_ready'][len(data['lema_ready'])-2]
    
    return()

In [5]:
def roll_ma(ma_list):
    global data
    ma_len = len(ma_list)
    sema_val = list(pd.DataFrame(ma_list).ewm(span=ma_len).mean()[0])[ma_len - 1]    
    return(sema_val)

## File paths

#### 1. High number of ticks ensures pip level movement and pip level predictions
#### 2. High sma_len helps ml model prediction accuracy (Predictable curve)
#### 3. High sma_len remove connection between actual tick_avg and sema (Prediction is high, but actual prediction is poor)

In [6]:
year = 2018

data = {}
data['number_of_ticks'] = 60
data['rsi_window'] = 14
data['sma_len'] = 10
data['lma_len'] = 20

diff_col = 'sema'
data['pip_diff'] = 0.00002

source_file_path = f'data/tick_{year}.csv'
path, file_name = os.path.split(source_file_path)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = 'chunk_'+file_name
chunk_file_path = os.path.join(path, chunk_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')

source_file_path : data/tick_2018.csv
chunk_file_path : data\chunk_tick_2018.csv
target_file_path : data\tab_tick_2018.csv


## Read data

In [7]:
%%time
df = pd.read_csv(source_file_path, nrows=1000000)
#df = pd.read_csv(source_file_path)
df.head()

Wall time: 574 ms


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20180101 22:00:08.661,1.20102,1.20143,2
1,20180101 22:00:08.895,1.20102,1.20148,2
2,20180101 22:00:10.634,1.20102,1.20147,2
3,20180101 22:00:11.223,1.20102,1.20148,2
4,20180101 22:00:29.530,1.20102,1.20145,2


## Data manipulation

In [8]:
%%time

df = chunk_ticks(df, data['number_of_ticks'])
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')

df = pd.read_csv(chunk_file_path)
df.head()

100%|██████████████████████████████████████████████████████████████████████████| 16667/16667 [00:04<00:00, 4145.65it/s]


Records : 16667
Wall time: 4.21 s


Unnamed: 0,tick_avg,spread_avg,tick_sd
0,1.200786,0.000614,0.000207
1,1.200758,0.000314,0.000176
2,1.201033,0.000224,6.3e-05
3,1.201015,0.000208,0.000129
4,1.200761,0.000364,1e-05


In [9]:
%%time

data['rs_max'] = 1e6

data['ssma_list'] = collections.deque([])
data['lsma_list'] = collections.deque([])
data['sema_ready'] = collections.deque([])
data['lema_ready'] = collections.deque([])
df['sema'] = ''
df['lema'] = ''
df['sema_diff'] = ''
df['lema_diff'] = ''


# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max']) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Simple Moving Averages ------------------
df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

df['max_gap'] = df['max_tick'] -  df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

Wall time: 25.9 ms


In [10]:
%%time
# Emas ----------------
df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma)
df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma)

df['sema_diff'] = df['sema'].diff()
df['lema_diff'] = df['lema'].diff()

16658it [00:15, 1063.10it/s]
16648it [00:15, 1065.99it/s]

Wall time: 31.3 s





In [11]:
%%time
# Slopes -----------------------------
df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope)
df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope)

16649it [00:04, 3504.70it/s]
16639it [00:04, 3495.72it/s]

Wall time: 9.52 s





In [12]:
%%time

df['ema_diff'] = df['sema'] - df['lema']

# Direction -------------------------
df['direction'] = 'same'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] > data['pip_diff']] = 'increase'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -data['pip_diff']] = 'decrease'

# Remove NaNs ------------------------
del df['gain']
del df['loss']
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

df.tail()

Total records : 16639
Wall time: 20.9 ms


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
16634,1.228781,2.7e-05,4.3e-05,1.228898,1.228942,-2.3e-05,-2.2e-05,-0.000102,3.2e-05,7.4e-05,...,-1.505417e-05,-9.1e-05,1.22902,1.228618,0.000239,-0.000163,-14.188754,-36.860229,-4.4e-05,same
16635,1.228814,3.4e-05,1.4e-05,1.228888,1.228926,-1e-05,-1.5e-05,3.2e-05,3.5e-05,6e-05,...,-1.035417e-05,-6.1e-05,1.22902,1.228781,0.000207,-3.2e-05,-18.466817,-44.519539,-3.9e-05,increase
16636,1.228996,3.3e-05,9.6e-05,1.228912,1.228933,2.4e-05,6e-06,0.000182,4.8e-05,5.5e-05,...,-9.791667e-07,-4.5e-05,1.22902,1.228781,2.5e-05,-0.000214,-20.748185,-49.002373,-2.1e-05,increase
16637,1.229194,3.6e-05,3.7e-05,1.228969,1.22896,5.7e-05,2.7e-05,0.000199,6.2e-05,4.6e-05,...,8.15e-06,-3.2e-05,1.229194,1.228781,0.0,-0.000413,-1.61938,-50.056407,9e-06,increase
16638,1.229307,3.9e-05,2.3e-05,1.22904,1.228996,7.1e-05,3.6e-05,0.000112,7e-05,2.6e-05,...,9.26875e-06,-9e-06,1.229307,1.228781,0.0,-0.000525,34.813799,-47.233171,4.4e-05,same


## Write data to csv

In [13]:
%%time
df.to_csv(target_file_path, index = False)
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 649 ms


## Print Report

In [14]:
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
df.head(5)

          counts  percentage
same        5879   35.332652
increase    5452   32.766392
decrease    5308   31.900956


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
0,1.200929,0.000114,8.8e-05,1.200825,1.200957,1.1e-05,7.207686e-07,-8e-06,3.3e-05,7.5e-05,...,1.1e-05,-0.000253,1.201085,1.200534,0.000156,-0.000395,-80.674499,-41.175363,-0.000132,same
1,1.200891,0.000119,4.1e-05,1.200831,1.200954,7e-06,-3.691216e-06,-3.8e-05,3.3e-05,7.1e-05,...,9e-06,-0.000281,1.201025,1.200534,0.000134,-0.000357,-77.473045,-58.627797,-0.000122,same
2,1.200936,0.000121,4.6e-05,1.200848,1.200951,1.6e-05,-2.429719e-06,4.4e-05,3.3e-05,7.1e-05,...,-2e-06,-0.000287,1.200937,1.200534,2e-06,-0.000401,-68.809173,-66.675155,-0.000103,same
3,1.200887,0.000104,3.7e-05,1.200865,1.200938,1.7e-05,-1.283986e-05,-4.9e-05,3.2e-05,7.5e-05,...,-2.3e-05,-0.000229,1.200937,1.200593,5.1e-05,-0.000293,-47.633771,-70.355628,-7.3e-05,same
4,1.200881,0.000121,1.9e-05,1.200876,1.200925,1.1e-05,-1.297547e-05,-6e-06,3.2e-05,6.1e-05,...,-2.5e-05,-0.000175,1.200937,1.20065,5.6e-05,-0.000231,-1.813543,-72.172633,-4.9e-05,same


In [15]:
# Direction -------------------------
#diff_col = 'tick_act'
diff_col = 'tick_avg'

df['act_direction'] = 'same'
df['act_direction'].loc[df[diff_col].shift(-1) - df[diff_col] > 0.0001] = 'increase'
df['act_direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -0.0001] = 'decrease'

print('prediction : same')
print(df.loc[df['direction'] == 'same', 'act_direction'].value_counts(normalize=True))
print('-------------')

print('prediction : increase')
print(df.loc[df['direction'] == 'increase', 'act_direction'].value_counts(normalize=True))
print('-------------')

print('prediction : decrease')
print(df.loc[df['direction'] == 'decrease', 'act_direction'].value_counts(normalize=True))
print('-------------')

print('\n')
print(df[['tick_avg', 'sema', 'direction','act_direction']].head(50))
del df['act_direction']

prediction : same
same        0.788910
decrease    0.108352
increase    0.102739
Name: act_direction, dtype: float64
-------------
prediction : increase
same        0.590976
increase    0.362436
decrease    0.046588
Name: act_direction, dtype: float64
-------------
prediction : decrease
same        0.605878
decrease    0.354182
increase    0.039940
Name: act_direction, dtype: float64
-------------


    tick_avg      sema direction act_direction
0   1.200929  1.200825      same          same
1   1.200891  1.200831      same          same
2   1.200936  1.200848      same          same
3   1.200887  1.200865      same          same
4   1.200881  1.200876      same          same
5   1.200938  1.200895  increase      increase
6   1.201103  1.200944      same      decrease
7   1.200975  1.200956      same          same
8   1.200906  1.200948  increase      increase
9   1.201187  1.200999  increase      increase
10  1.201465  1.201099  increase          same
11  1.201527  1.201194  increase 