# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope(y_axis):
    """Return the angle, in degrees, of a least-squares line fitted to ``y_axis``.

    The i-th value (0-based) is paired with x = 1 + (i + 1) * 0.0001 * 0.1,
    the pairs are fitted with ``scipy.stats.linregress`` and the fitted slope
    is converted to an angle with ``atan``.

    Parameters
    ----------
    y_axis : sequence of float
        Values to fit; only ``len()`` and the values themselves are used.

    Returns
    -------
    float
        ``degrees(atan(slope))`` of the fitted line.
    """
    # Evenly spaced abscissa; consecutive samples are 0.0001 * 0.1 apart.
    x_axis = [1 + ((pos + 1) * 0.0001 * 0.1) for pos in range(len(y_axis))]

    fit = linregress(x_axis, y_axis)
    return math.degrees(math.atan(fit.slope))

In [3]:
def chunk_ticks(df, number_of_ticks):
    """Aggregate raw ticks into consecutive fixed-size chunks.

    For every run of ``number_of_ticks`` consecutive rows (the final run may
    be shorter) the mid price ``(Bid + Ask) / 2`` and the spread
    ``Ask - Bid`` are summarised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Bid`` and ``Ask`` columns. The frame is only read; no
        columns are added to it.
    number_of_ticks : int
        Number of rows per chunk. ``0`` raises ``ValueError`` (from ``range``).

    Returns
    -------
    pandas.DataFrame
        One row per chunk with columns ``tick_avg`` (mean mid price),
        ``spread_avg`` (mean spread) and ``tick_sd`` (population standard
        deviation of the mid price, i.e. ``np.std`` with its default ddof=0).
    """
    # Derive the series as standalone arrays so the caller's frame is not
    # modified and each chunk is a cheap array slice rather than a list copy.
    tick = ((df['Bid'] + df['Ask']) / 2).to_numpy()
    spread = (df['Ask'] - df['Bid']).to_numpy()

    tick_avg = []
    spread_avg = []
    tick_sd = []
    for start in range(0, len(tick), number_of_ticks):
        stop = start + number_of_ticks
        tick_chunk = tick[start:stop]

        tick_avg.append(np.mean(tick_chunk))
        spread_avg.append(np.mean(spread[start:stop]))
        tick_sd.append(np.std(tick_chunk))

    temp_df = pd.DataFrame()
    temp_df['tick_avg'] = tick_avg
    temp_df['spread_avg'] = spread_avg
    temp_df['tick_sd'] = tick_sd

    return temp_df

In [4]:
def before_sma():
    """Append the module-level ``val`` to the ``data['ssma_list']`` buffer.

    Both ``data`` and ``val`` are read from the notebook namespace; ``val`` is
    not assigned in the visible cells, so it must exist before this is called.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    short_buffer = data['ssma_list']
    short_buffer.append(val)
    return ()

def after_sma():
    """Slide the short buffer by one value and refresh ``sema`` / ``sema_diff``.

    Reads the notebook-level ``data`` dict and ``val`` (``val`` is not assigned
    in the visible cells, so it must exist before this is called).

    Steps:
      1. drop the oldest entry of ``data['ssma_list']`` and append ``val``;
      2. store in ``data['sema']`` the EWM mean (span ``data['sma_len']``) of
         the buffer, taken at position ``data['sma_len'] - 1``;
      3. keep at most two recent ``sema`` values in ``data['sema_ready']`` and
         store their difference in ``data['sema_diff']`` (NaN until two values
         were already held).

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    span = data['sma_len']

    short_buffer = data['ssma_list']
    short_buffer.popleft()
    short_buffer.append(val)

    smoothed = pd.DataFrame(list(short_buffer)).ewm(span=span).mean()[0]
    data['sema'] = list(smoothed)[span - 1]

    recent = data['sema_ready']
    if len(recent) >= 2:
        # Already holding two values: rotate and diff newest against previous.
        recent.popleft()
        recent.append(data['sema'])
        data['sema_diff'] = recent[-1] - recent[-2]
    else:
        # Still warming up: no difference can be formed yet.
        recent.append(data['sema'])
        data['sema_diff'] = np.nan

    return ()

def before_lma():
    """Append the module-level ``val`` to the ``data['lsma_list']`` buffer.

    Both ``data`` and ``val`` are read from the notebook namespace; ``val`` is
    not assigned in the visible cells, so it must exist before this is called.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    long_buffer = data['lsma_list']
    long_buffer.append(val)
    return ()

def after_lma():
    """Slide the long buffer by one value and refresh ``lema`` / ``lema_diff``.

    Reads the notebook-level ``data`` dict and ``val`` (``val`` is not assigned
    in the visible cells, so it must exist before this is called).

    Steps:
      1. drop the oldest entry of ``data['lsma_list']`` and append ``val``;
      2. store in ``data['lema']`` the EWM mean (span ``data['lma_len']``) of
         the buffer, taken at position ``data['lma_len'] - 1``;
      3. keep at most two recent ``lema`` values in ``data['lema_ready']`` and
         store their difference in ``data['lema_diff']`` (NaN until two values
         were already held).

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    span = data['lma_len']

    long_buffer = data['lsma_list']
    long_buffer.popleft()
    long_buffer.append(val)

    smoothed = pd.DataFrame(list(long_buffer)).ewm(span=span).mean()[0]
    data['lema'] = list(smoothed)[span - 1]

    recent = data['lema_ready']
    if len(recent) >= 2:
        # Already holding two values: rotate and diff newest against previous.
        recent.popleft()
        recent.append(data['lema'])
        data['lema_diff'] = recent[-1] - recent[-2]
    else:
        # Still warming up: no difference can be formed yet.
        recent.append(data['lema'])
        data['lema_diff'] = np.nan

    return ()

In [5]:
def roll_ma(ma_list):
    """Return the last value of an exponentially weighted mean over ``ma_list``.

    The EWM span equals the number of values supplied, so the function is
    suitable as a ``rolling(...).apply`` callback where each call receives one
    window.

    Parameters
    ----------
    ma_list : sequence of float, numpy array or pandas.Series
        The window values, oldest first.

    Returns
    -------
    float
        EWM mean (``span=len(ma_list)``, pandas defaults otherwise) evaluated
        at the final position.
    """
    ma_len = len(ma_list)
    # A Series plus positional .iloc avoids depending on the column label
    # being 0, which fails when the incoming window is a named Series.
    smoothed = pd.Series(ma_list).ewm(span=ma_len).mean()
    return float(smoothed.iloc[-1])

## File paths

#### 1. High number of ticks ensures pip level movement and pip level predictions
#### 2. High sma_len helps ml model prediction accuracy (Predictable curve)
#### 3. High sma_len removes the connection between the actual tick_avg and sema (prediction accuracy on sema is high, but prediction of the actual tick_avg is poor)

In [6]:
year = 2020

# Shared parameters read by the cells below.
data = {
    'number_of_ticks': 300,  # rows per chunk passed to chunk_ticks
    'rsi_window': 14,        # rolling window for avg_gain / avg_loss
    'sma_len': 5,            # short moving-average window
    'lma_len': 10,           # long moving-average window
}

# Column whose next-step change defines `direction`, and the threshold for it.
diff_col = 'sema'
data['pip_diff'] = 0.0001

source_file_path = f'data/yearly_tick_data/{year}.csv'
path, file_name = os.path.split(source_file_path)

# Derived outputs live next to the source file, distinguished by a prefix.
target_file_name = f'tab_{file_name}'
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = f'chunk_{file_name}'
chunk_file_path = os.path.join(path, chunk_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')

source_file_path : data/yearly_tick_data/2020.csv
chunk_file_path : data/yearly_tick_data\chunk_2020.csv
target_file_path : data/yearly_tick_data\tab_2020.csv


## Read data

In [7]:
%%time
# Load the raw tick file for the configured year into `df`.
# Quick-iteration variant that reads only the first 100000 rows:
#df = pd.read_csv(source_file_path, nrows=100000)
df = pd.read_csv(source_file_path)
# Last expression of the cell: show the first rows.
df.head()

Wall time: 23.1 s


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20200101 22:01:12.821,1.12106,1.1216,1
1,20200101 22:01:17.176,1.1212,1.1216,1
2,20200101 22:01:18.545,1.12117,1.1216,1
3,20200101 22:01:19.145,1.12123,1.12161,1
4,20200101 22:01:19.246,1.1212,1.12161,1


## Data manipulation

In [8]:
%%time

# Collapse the raw ticks into chunks of data['number_of_ticks'] rows and
# save the chunk-level table (without the index) to chunk_file_path.
df = chunk_ticks(df, data['number_of_ticks'])
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')

# Re-read the saved file; from here on `df` is the chunk-level table as
# loaded from disk.
df = pd.read_csv(chunk_file_path)
df.head()

100%|████████████████████████████████████████████████████████████████████████| 109213/109213 [00:34<00:00, 3177.90it/s]


Records : 109213
Wall time: 37.1 s


Unnamed: 0,tick_avg,spread_avg,tick_sd
0,1.121443,0.000238,6.3e-05
1,1.121549,0.000145,4.8e-05
2,1.121619,0.000117,1.3e-05
3,1.121626,0.000109,2.6e-05
4,1.121618,0.000106,5.9e-05


In [9]:
%%time

data['rs_max'] = 1e6

data['ssma_list'] = collections.deque([])
data['lsma_list'] = collections.deque([])
data['sema_ready'] = collections.deque([])
data['lema_ready'] = collections.deque([])
df['sema'] = ''
df['lema'] = ''
df['sema_diff'] = ''
df['lema_diff'] = ''


# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max']) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Simple Moving Averages ------------------
df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

df['max_gap'] = df['max_tick'] -  df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

Wall time: 81.8 ms


In [10]:
%%time
# Emas ----------------
# Each rolling window of tick_avg is handed to roll_ma, once with the short
# window length and once with the long one. progress_apply is the
# tqdm-enabled apply registered by tqdm.pandas() in the imports cell.
df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma)
df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma)

# Row-to-row change of each EMA column.
df['sema_diff'] = df['sema'].diff()
df['lema_diff'] = df['lema'].diff()

109209it [01:44, 1047.16it/s]
109204it [01:44, 1048.73it/s]

Wall time: 3min 28s





In [11]:
%%time
# Slopes -----------------------------
# get_slope is applied to rolling windows of the `sema` column. Both slope
# columns are computed from `sema`; they differ only in window length
# (sma_len vs lma_len).
df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope)
df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope)

109205it [00:31, 3453.84it/s]
109200it [00:31, 3461.34it/s]

Wall time: 1min 3s





In [12]:
%%time

df['ema_diff'] = df['sema'] - df['lema']

# Direction -------------------------
df['direction'] = 'same'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] > data['pip_diff']] = 'increase'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -data['pip_diff']] = 'decrease'

# Remove NaNs ------------------------
del df['gain']
del df['loss']
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

df.tail()

Total records : 109200
Wall time: 97.7 ms


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
109195,1.222372,4.6e-05,0.00022,1.221984,1.221805,0.000244,0.00014,0.00046,0.00014,9.2e-05,...,4.2e-05,0.000135,1.222372,1.221566,0.0,-0.000806,84.783261,37.625794,0.000179,same
109196,1.222057,5.7e-05,6.9e-05,1.222033,1.22185,4.9e-05,4.4e-05,-0.000315,0.00014,0.000113,...,-5e-06,0.000238,1.222372,1.221616,0.000315,-0.000442,85.525893,77.65709,0.000184,same
109197,1.221835,7e-05,0.000119,1.221976,1.221858,-5.7e-05,8e-06,-0.000222,0.00014,0.000126,...,3.8e-05,0.000233,1.222372,1.221616,0.000537,-0.000219,84.345456,81.623676,0.000118,same
109198,1.221717,7.9e-05,6.5e-05,1.221895,1.22185,-8.1e-05,-7e-06,-0.000118,0.000129,0.000134,...,6.4e-05,0.00019,1.222372,1.221717,0.000655,0.0,71.671488,81.386132,4.4e-05,decrease
109199,1.221635,0.000267,5.5e-05,1.221794,1.221817,-0.000101,-3.4e-05,-8.2e-05,0.000117,0.00014,...,1.9e-05,0.000115,1.222372,1.221635,0.000737,0.0,-79.073615,78.970075,-2.3e-05,same


## Write data to csv

In [13]:
%%time
# Save the final feature table (without the index) to target_file_path.
df.to_csv(target_file_path, index = False)
# Audible "done" cue. winsound is a Windows-only standard-library module and
# the .wav path is a Windows system file; SND_ASYNC returns without waiting
# for playback to finish.
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 4.14 s


## Print Report

In [14]:
# Class balance of the `direction` label: absolute counts next to percentages.
g = df['direction']
direction_counts = g.value_counts()
direction_pct = g.value_counts(normalize=True).mul(100)
print(pd.concat([direction_counts, direction_pct], axis=1, keys=('counts', 'percentage')))
df.head(5)

          counts  percentage
same       64748   59.293040
increase   22445   20.554029
decrease   22007   20.152930


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
0,1.12189,2.6e-05,7.9e-05,1.121775,1.121809,5.5e-05,2.7e-05,0.000172,6.3e-05,3.1e-05,...,2.6e-05,-7.7e-05,1.12189,1.12163,0.0,-0.000259,-76.852918,19.699694,-3.3e-05,increase
1,1.12209,2.4e-05,6.1e-05,1.121892,1.121873,0.000117,6.4e-05,0.0002,7.8e-05,3.1e-05,...,4.7e-05,-7.8e-05,1.12209,1.12163,0.0,-0.00046,54.681072,-30.71745,1.9e-05,increase
2,1.122291,2.6e-05,5.5e-05,1.122056,1.121959,0.000164,8.6e-05,0.000201,8.4e-05,3.1e-05,...,3.4e-05,1e-05,1.122291,1.12163,0.0,-0.000661,82.75904,-3.637628,9.7e-05,increase
3,1.122379,2.3e-05,3.5e-05,1.122201,1.122046,0.000146,8.7e-05,8.8e-05,8.6e-05,3.1e-05,...,3.8e-05,0.000121,1.122379,1.121717,0.0,-0.000662,85.400742,58.144592,0.000156,same
4,1.12235,2.5e-05,2.1e-05,1.122283,1.122109,8.1e-05,6.3e-05,-3e-05,8.5e-05,3.4e-05,...,2.9e-05,0.000219,1.122379,1.12189,3e-05,-0.00046,85.684777,75.691082,0.000174,same


In [15]:
# Direction -------------------------
# Compare the `direction` label with the realised next-step move of tick_avg,
# using the same +/- pip_diff thresholds.
# NOTE: this rebinds the notebook-level `diff_col`; re-running the cell that
# builds `direction` after this one would label on 'tick_avg' instead.
#diff_col = 'tick_act'
diff_col = 'tick_avg'

actual_change = df[diff_col].shift(-1) - df[diff_col]
df['act_direction'] = 'same'
# Assign through df.loc so the write is guaranteed to land in `df`
# (chained `df[col].loc[mask] = ...` is not).
df.loc[actual_change > data['pip_diff'], 'act_direction'] = 'increase'
df.loc[actual_change < -data['pip_diff'], 'act_direction'] = 'decrease'

# For each predicted class, show the share of each realised class.
for predicted in ('same', 'increase', 'decrease'):
    print(f'prediction : {predicted}')
    print(df.loc[df['direction'] == predicted, 'act_direction'].value_counts(normalize=True))
    print('-------------')

print('\n')
print(df[['tick_avg', 'sema', 'direction','act_direction']].head(50))
# act_direction is a temporary diagnostic column; remove it again.
del df['act_direction']

prediction : same
same        0.510070
increase    0.247529
decrease    0.242401
Name: act_direction, dtype: float64
-------------
prediction : increase
increase    0.771352
same        0.200490
decrease    0.028158
Name: act_direction, dtype: float64
-------------
prediction : decrease
decrease    0.776753
same        0.192212
increase    0.031036
Name: act_direction, dtype: float64
-------------


    tick_avg      sema direction act_direction
0   1.121890  1.121775  increase      increase
1   1.122090  1.121892  increase      increase
2   1.122291  1.122056  increase          same
3   1.122379  1.122201      same          same
4   1.122350  1.122283      same          same
5   1.122331  1.122321      same      decrease
6   1.122039  1.122224  decrease          same
7   1.121972  1.122124      same          same
8   1.122027  1.122074      same      decrease
9   1.121922  1.122002      same      increase
10  1.122048  1.122003      same      increase
11  1.122256  1.122098      same 