# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time
import imblearn

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope(y_axis):
    """Return the angle, in degrees, of the least-squares line through `y_axis`.

    The values are regressed against a synthetic, evenly spaced x-axis whose
    i-th point (0-based) is ``1 + (i + 1) * 0.0001 * 0.1``. The fitted slope
    is converted to an angle with ``atan`` and expressed in degrees, so the
    result lies strictly between -90 and 90.

    Parameters
    ----------
    y_axis : sequence of float
        Values to fit, in order. Anything `len()` and `linregress` accept
        works (list, NumPy array, pandas Series).

    Returns
    -------
    float
        Slope angle in degrees, or NaN when fewer than two points are given
        (a line cannot be fitted, and `linregress` would raise instead).
    """
    point_count = len(y_axis)
    if point_count < 2:
        return float('nan')

    # Same x spacing as before; only the construction is more compact.
    x_axis = [1 + ((i + 1) * 0.0001 * 0.1) for i in range(point_count)]

    slope_tick, _, _, _, _ = linregress(x_axis, y_axis)
    return math.degrees(math.atan(slope_tick))

In [3]:
def chunk_ticks(df, number_of_ticks, progress=True):
    """Aggregate raw ticks into consecutive chunks of `number_of_ticks` rows.

    The mid price ``(Bid + Ask) / 2`` and the spread ``Ask - Bid`` are derived
    from `df` without modifying it (no columns are added to the caller's
    frame). The last chunk may contain fewer rows than `number_of_ticks`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Bid'`` and ``'Ask'``.
    number_of_ticks : int
        Number of rows per chunk.
    progress : bool, default True
        Show a tqdm progress bar over the chunks. Pass False to run silently.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns:
        ``tick_act``   - mid price of the last row in the chunk,
        ``tick_avg``   - last value of the exponentially weighted mean of the
                         chunk's mid prices, with ``span`` = chunk length,
        ``spread_avg`` - arithmetic mean of the chunk's spreads,
        ``tick_sd``    - ``np.std`` (default settings) of the chunk's mid prices.
    """
    mid_price = ((df['Bid'] + df['Ask']) / 2).to_numpy()
    spread = (df['Ask'] - df['Bid']).to_numpy()

    chunk_starts = range(0, len(mid_price), number_of_ticks)
    if progress:
        chunk_starts = tqdm(chunk_starts)

    tick_act, tick_avg, spread_avg, tick_sd = [], [], [], []
    for start in chunk_starts:
        tick_chunk = mid_price[start:start + number_of_ticks]
        spread_chunk = spread[start:start + number_of_ticks]

        tick_act.append(tick_chunk[-1])
        # Only the final EWM value is kept, so take it straight from the Series.
        tick_avg.append(pd.Series(tick_chunk).ewm(span=len(tick_chunk)).mean().iloc[-1])
        spread_avg.append(np.mean(spread_chunk))
        tick_sd.append(np.std(tick_chunk))

    return pd.DataFrame({
        'tick_act': tick_act,
        'tick_avg': tick_avg,
        'spread_avg': spread_avg,
        'tick_sd': tick_sd,
    })

In [4]:
def before_sma():
    """Append the current `val` to the short-window buffer `data['ssma_list']`.

    Takes no arguments: `data` and `val` are both read from module scope.
    NOTE(review): `val` is not assigned anywhere in the visible notebook;
    confirm where it is expected to come from before calling this.

    Returns an empty tuple.
    """
    short_buffer = data['ssma_list']
    short_buffer.append(val)
    return ()

def after_sma():
    """Slide the short window by one value and refresh `sema` / `sema_diff`.

    Works entirely on the module-level `data` dict and the module-level `val`
    (NOTE(review): `val` is not assigned anywhere in the visible notebook).

    Steps:
      1. Drop the oldest entry of `data['ssma_list']` and append `val`.
      2. Store in `data['sema']` the element at position ``sma_len - 1`` of the
         exponentially weighted mean (``span = data['sma_len']``) of the buffer.
      3. Keep the most recent `sema` values in `data['sema_ready']`. While that
         deque holds fewer than two values on entry, `data['sema_diff']` is NaN;
         afterwards it is the newest value minus the one before it.

    Returns an empty tuple.
    """
    short_buffer = data['ssma_list']
    short_buffer.popleft()
    short_buffer.append(val)

    smoothed = pd.DataFrame(list(short_buffer)).ewm(span=data['sma_len']).mean()[0]
    data['sema'] = list(smoothed)[data['sma_len'] - 1]

    history = data['sema_ready']
    if len(history) >= 2:
        history.popleft()
        history.append(data['sema'])
        data['sema_diff'] = history[-1] - history[len(history) - 2]
    else:
        history.append(data['sema'])
        data['sema_diff'] = np.nan

    return ()

def before_lma():
    """Append the current `val` to the long-window buffer `data['lsma_list']`.

    Takes no arguments: `data` and `val` are both read from module scope.
    NOTE(review): `val` is not assigned anywhere in the visible notebook;
    confirm where it is expected to come from before calling this.

    Returns an empty tuple.
    """
    long_buffer = data['lsma_list']
    long_buffer.append(val)
    return ()

def after_lma():
    """Slide the long window by one value and refresh `lema` / `lema_diff`.

    Works entirely on the module-level `data` dict and the module-level `val`
    (NOTE(review): `val` is not assigned anywhere in the visible notebook).

    Steps:
      1. Drop the oldest entry of `data['lsma_list']` and append `val`.
      2. Store in `data['lema']` the element at position ``lma_len - 1`` of the
         exponentially weighted mean (``span = data['lma_len']``) of the buffer.
      3. Keep the most recent `lema` values in `data['lema_ready']`. While that
         deque holds fewer than two values on entry, `data['lema_diff']` is NaN;
         afterwards it is the newest value minus the one before it.

    Returns an empty tuple.
    """
    long_buffer = data['lsma_list']
    long_buffer.popleft()
    long_buffer.append(val)

    smoothed = pd.DataFrame(list(long_buffer)).ewm(span=data['lma_len']).mean()[0]
    data['lema'] = list(smoothed)[data['lma_len'] - 1]

    history = data['lema_ready']
    if len(history) >= 2:
        history.popleft()
        history.append(data['lema'])
        data['lema_diff'] = history[-1] - history[len(history) - 2]
    else:
        history.append(data['lema'])
        data['lema_diff'] = np.nan

    return ()

In [5]:
def roll_ma(ma_list):
    """Return the final value of the exponentially weighted mean of `ma_list`.

    The EWM uses ``span = len(ma_list)``, so the window length itself sets the
    smoothing. Intended as the callback for ``Series.rolling(...).apply``.

    Parameters
    ----------
    ma_list : sequence of float
        Window values, oldest first (list, NumPy array or pandas Series;
        a Series may carry any name).

    Returns
    -------
    float
        Last value of the EWM mean, or NaN for an empty window (``span`` must
        be at least 1, so pandas would raise otherwise).
    """
    ma_len = len(ma_list)
    if ma_len == 0:
        return float('nan')

    # Working on a Series avoids the DataFrame column lookup by label 0, which
    # fails when `ma_list` is a Series that has a name.
    return float(pd.Series(ma_list).ewm(span=ma_len).mean().iloc[-1])

## File paths

#### 1. A high number of ticks per chunk ensures pip-level movement and pip-level predictions
#### 2. A high sma_len helps ML model prediction accuracy (the smoothed curve is more predictable)
#### 3. A high sma_len removes the connection between the actual tick_avg and sema (prediction accuracy on sema is high, but accuracy on the actual ticks is poor)

In [6]:
# Year of the tick file to process; it selects the source CSV below.
year = 2020

# Shared parameter dict read by the later cells.
data = {
    'number_of_ticks': 300,   # rows per chunk in chunk_ticks
    'rsi_window': 5,          # rolling window for avg_gain / avg_loss
    'sma_len': 5,             # short moving-average window
    'lma_len': 6,             # long moving-average window
    'pip_diff': 0.0002,       # threshold on the next-step change for the labels
}

# Column whose next-step change defines the `direction` label.
diff_col = 'sema'
#diff_col = 'tick_avg'

# The chunked and final files are written next to the source file.
source_file_path = f'data/yearly_tick_data/{year}.csv'
path, file_name = os.path.split(source_file_path)

target_file_name = f'tab_{file_name}'
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = f'chunk_{file_name}'
chunk_file_path = os.path.join(path, chunk_file_name)

print(
    f'source_file_path : {source_file_path}',
    f'chunk_file_path : {chunk_file_path}',
    f'target_file_path : {target_file_path}',
    sep='\n',
)

source_file_path : data/yearly_tick_data/2020.csv
chunk_file_path : data/yearly_tick_data\chunk_2020.csv
target_file_path : data/yearly_tick_data\tab_2020.csv


## Read data

In [7]:
%%time
# Load the raw tick file for the configured year into `df`.
# The commented variant reads only the first 10,000,000 rows.
#df = pd.read_csv(source_file_path, nrows=10000000)
df = pd.read_csv(source_file_path)
# Preview the first rows.
df.head()

Wall time: 18.6 s


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20200101 22:01:12.821,1.12106,1.1216,1
1,20200101 22:01:17.176,1.1212,1.1216,1
2,20200101 22:01:18.545,1.12117,1.1216,1
3,20200101 22:01:19.145,1.12123,1.12161,1
4,20200101 22:01:19.246,1.1212,1.12161,1


## Data manipulation

In [8]:
%%time

# Collapse the raw ticks into chunks of data['number_of_ticks'] rows and save them.
# NOTE: `df` is rebound from the raw ticks to the chunked frame here, so it no
# longer has the 'Bid'/'Ask' columns chunk_ticks needs; re-run the read cell
# before re-running this one.
df = chunk_ticks(df, data['number_of_ticks'])
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')

# Continue from the file that was just written.
df = pd.read_csv(chunk_file_path)
df.head()

100%|████████████████████████████████████████████████████████████████████████████████████████| 109213/109213 [02:21<00:00, 771.61it/s]


Records : 109213
Wall time: 2min 24s


Unnamed: 0,tick_act,tick_avg,spread_avg,tick_sd
0,1.12145,1.121447,0.000238,6.3e-05
1,1.121605,1.121567,0.000145,4.8e-05
2,1.12162,1.121621,0.000117,1.3e-05
3,1.12165,1.121636,0.000109,2.6e-05
4,1.12188,1.121614,0.000106,5.9e-05


In [9]:
%%time

# Upper bound applied to the relative-strength ratio (avg_gain / avg_loss).
data['rs_max'] = 1e6

# Buffers read and written by the incremental before_*/after_* helpers.
data['ssma_list'] = collections.deque([])
data['lsma_list'] = collections.deque([])
data['sema_ready'] = collections.deque([])
data['lema_ready'] = collections.deque([])

# Placeholder columns: creating them here fixes their position in the frame.
# The EMA cell overwrites all four with real values.
df['sema'] = ''
df['lema'] = ''
df['sema_diff'] = ''
df['lema_diff'] = ''


# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
# gain/loss hold the size of the up/down move and 0 otherwise (also for the
# first row, whose diff is NaN). Built with Series.where rather than
# `df['gain'].loc[mask] = ...`: that chained assignment writes to a temporary
# object and is a silent no-op when pandas Copy-on-Write is active.
df['gain'] = df['diff'].where(df['diff'] > 0, 0.0)
df['loss'] = (-df['diff']).where(df['diff'] < 0, 0.0)
df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
df['rs'] = df['avg_gain'] / df['avg_loss']
# Cap the ratio at rs_max (this is what tames the inf from avg_loss == 0).
# NOTE(review): NaN fails the `<=` test too, so warm-up rows and 0/0 windows
# also become rs_max (rsi close to 100) - confirm that this is intended.
df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max'])
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Simple Moving Averages ------------------
df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

# Rolling extremes over the short window and their distance to the current value
# (max_gap is >= 0, min_gap is <= 0).
df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

df['max_gap'] = df['max_tick'] - df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

Wall time: 84.1 ms


In [10]:
%%time
# Emas ----------------
# roll_ma is applied to every rolling tick_avg window: the short window
# (sma_len) gives `sema`, the long window (lma_len) gives `lema`.
df = df.assign(
    sema=df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma),
    lema=df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma),
)

# Row-to-row change of each EMA, and the gap between the two EMAs.
df = df.assign(
    sema_diff=df['sema'].diff(),
    lema_diff=df['lema'].diff(),
    ema_diff=df['sema'] - df['lema'],
)

109209it [01:47, 1019.06it/s]
109208it [01:45, 1037.54it/s]

Wall time: 3min 32s





In [11]:
%%time
# Slopes -----------------------------
# get_slope is applied to rolling windows of `sema`; both slope columns are
# derived from `sema` and differ only in the window length (sma_len vs lma_len).
df = df.assign(
    small_sema_slope=df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope),
    long_sema_slope=df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope),
)

df = df.assign(slope_diff=df['small_sema_slope'] - df['long_sema_slope'])

109205it [00:32, 3350.27it/s]
109204it [00:32, 3358.76it/s]

Wall time: 1min 5s





In [12]:
%%time
df = df.round(5)

# Direction -------------------------
# Label each row by the change of `diff_col` from this row to the next one:
# above +pip_diff -> 'increase', below -pip_diff -> 'decrease', otherwise 'same'.
# np.select writes the whole column in one step; the previous
# `df['direction'].loc[mask] = ...` chained assignment is a silent no-op when
# pandas Copy-on-Write is active.
# NOTE(review): the last row has no next value, so both comparisons are False
# and it is labelled 'same'.
next_step_change = df[diff_col].shift(-1) - df[diff_col]
df['direction'] = np.select(
    [next_step_change > data['pip_diff'], next_step_change < -data['pip_diff']],
    ['increase', 'decrease'],
    default='same',
)

# Remove NaNs ------------------------
# Drop the intermediate RSI columns, then every row that still contains a NaN.
df = df.drop(columns=['gain', 'loss'])
#del df['tick_act']
df = df.dropna().reset_index(drop=True)
print(f'Total records : {len(df)}')

df.tail()

Total records : 109204
Wall time: 220 ms


Unnamed: 0,tick_act,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,...,sma_diff,max_tick,min_tick,max_gap,min_gap,ema_diff,small_sema_slope,long_sema_slope,slope_diff,direction
109199,1.22195,1.2223,5e-05,0.00022,1.22198,1.22194,0.0002,0.00019,0.00031,0.00017,...,1e-05,1.2223,1.22154,0.0,-0.00076,4e-05,84.76995,83.17892,1.59104,same
109200,1.22198,1.22204,6e-05,7e-05,1.22203,1.22198,5e-05,4e-05,-0.00026,0.00017,...,7e-05,1.2223,1.2216,0.00026,-0.00044,4e-05,85.24346,84.74207,0.5014,same
109201,1.22173,1.22178,7e-05,0.00012,1.22195,1.22194,-8e-05,-5e-05,-0.00026,0.00014,...,4e-05,1.2223,1.2216,0.00052,-0.00018,1e-05,83.54918,83.61919,-0.07001,same
109202,1.22158,1.22172,8e-05,7e-05,1.22188,1.22188,-7e-05,-6e-05,-6e-05,0.00014,...,6e-05,1.2223,1.22172,0.00058,0.0,0.0,59.40508,79.00383,-19.59874,same
109203,1.22158,1.22161,0.00027,6e-05,1.22177,1.2218,-0.00011,-7e-05,-0.00011,6e-05,...,-2e-05,1.2223,1.22161,0.00069,0.0,-3e-05,-79.96272,-50.01793,-29.94479,same


## Write data to csv

In [13]:
%%time
# Persist the final feature table next to the source file.
df.to_csv(target_file_path, index = False)
# Play a completion sound without blocking (SND_ASYNC).
# NOTE: `winsound` and this .wav path are Windows-only.
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 2.31 s


## Print Report

In [14]:
# Class balance of the `direction` label: absolute counts and percentages.
g = df['direction']
print(
    pd.concat(
        [g.value_counts(), g.value_counts(normalize=True) * 100],
        axis=1,
        keys=('counts', 'percentage'),
    )
)
df.head(5)

          counts  percentage
same       95887   87.805392
decrease    6799    6.225962
increase    6518    5.968646


Unnamed: 0,tick_act,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,...,sma_diff,max_tick,min_tick,max_gap,min_gap,ema_diff,small_sema_slope,long_sema_slope,slope_diff,direction
0,1.12184,1.12185,3e-05,3e-05,1.12192,1.12191,-2e-05,-1e-05,-4e-05,0.0001,...,6e-05,1.12209,1.12185,0.00024,0.0,1e-05,77.33828,81.3419,-4.00362,same
1,1.12163,1.12166,3e-05,6e-05,1.12182,1.12184,-0.0001,-7e-05,-0.00019,3e-05,...,-1e-05,1.12209,1.12166,0.00043,0.0,-2e-05,-42.56335,58.30642,-100.86977,same
2,1.12164,1.12163,3e-05,1e-05,1.12174,1.12177,-8e-05,-7e-05,-2e-05,2e-05,...,-3e-05,1.12209,1.12163,0.00046,0.0,-3e-05,-79.78926,-70.34724,-9.44203,same
3,1.12184,1.12176,3e-05,9e-05,1.12173,1.12176,-1e-05,-1e-05,0.00013,3e-05,...,-6e-05,1.12189,1.12163,0.00012,-0.00013,-2e-05,-80.70459,-79.20934,-1.49526,same
4,1.12206,1.12193,3e-05,8e-05,1.1218,1.1218,7e-05,4e-05,0.00016,6e-05,...,-2e-05,1.12193,1.12163,0.0,-0.00029,0.0,-73.63685,-75.87025,2.2334,same


In [15]:
# Direction -------------------------
# Compare the stored `direction` label with the direction actually observed in
# `diff_col` (here: tick_avg), using the same +/- pip_diff thresholds.
#diff_col = 'tick_act'
#data['pip_diff'] = 0.0001
diff_col = 'tick_avg'

# Built as a standalone Series, so `df` is never given temporary columns that
# must be deleted again. np.select also replaces the chained
# `df[...].loc[mask] = ...` assignment, which is a silent no-op when pandas
# Copy-on-Write is active.
actual_change = df[diff_col].shift(-1) - df[diff_col]
act_direction = pd.Series(
    np.select(
        [actual_change > data['pip_diff'], actual_change < -data['pip_diff']],
        ['increase', 'decrease'],
        default='same',
    ),
    index=df.index,
    name='act_direction',
)

# For each label value: distribution of the observed direction.
for predicted in ('same', 'increase', 'decrease'):
    print(f'prediction : {predicted}')
    print(act_direction[df['direction'] == predicted].value_counts(normalize=True))
    print('-------------')

# Side-by-side preview; tick_avg_diff is the row-to-row change of tick_avg
# multiplied by 10000 and rounded.
preview = df[['tick_avg', 'sema', 'direction']].assign(
    tick_avg_diff=round(df['tick_avg'].diff() * 10000),
    act_direction=act_direction,
)

print('\n')
print(preview[['tick_avg_diff', 'tick_avg', 'sema', 'direction', 'act_direction']].head(50))

prediction : same
same        0.720442
increase    0.141156
decrease    0.138402
Name: act_direction, dtype: float64
-------------
prediction : increase
increase    0.789506
same        0.200828
decrease    0.009666
Name: act_direction, dtype: float64
-------------
prediction : decrease
decrease    0.791587
same        0.202530
increase    0.005883
Name: act_direction, dtype: float64
-------------


    tick_avg_diff  tick_avg     sema direction act_direction
0             NaN   1.12185  1.12192      same          same
1            -2.0   1.12166  1.12182      same          same
2            -0.0   1.12163  1.12174      same          same
3             1.0   1.12176  1.12173      same          same
4             2.0   1.12193  1.12180      same          same
5             2.0   1.12210  1.12191      same      increase
6             2.0   1.12231  1.12208      same          same
7             1.0   1.12237  1.12221      same          same
8            -0.0   1.12234  1.12229      same  

In [16]:
# Direction -------------------------
# Compare the stored `direction` label with the direction actually observed in
# `diff_col` (here: tick_act), using the same +/- pip_diff thresholds.
diff_col = 'tick_act'
#data['pip_diff'] = 0.0001
#diff_col = 'tick_avg'

# Built as a standalone Series, so `df` is never given temporary columns that
# must be deleted again. np.select also replaces the chained
# `df[...].loc[mask] = ...` assignment, which is a silent no-op when pandas
# Copy-on-Write is active.
actual_change = df[diff_col].shift(-1) - df[diff_col]
act_direction = pd.Series(
    np.select(
        [actual_change > data['pip_diff'], actual_change < -data['pip_diff']],
        ['increase', 'decrease'],
        default='same',
    ),
    index=df.index,
    name='act_direction',
)

# For each label value: distribution of the observed direction.
for predicted in ('same', 'increase', 'decrease'):
    print(f'prediction : {predicted}')
    print(act_direction[df['direction'] == predicted].value_counts(normalize=True))
    print('-------------')

# Side-by-side preview; tick_avg_diff is the row-to-row change of tick_avg
# multiplied by 10000 and rounded.
preview = df[['tick_avg', 'sema', 'direction']].assign(
    tick_avg_diff=round(df['tick_avg'].diff() * 10000),
    act_direction=act_direction,
)

print('\n')
print(preview[['tick_avg_diff', 'tick_avg', 'sema', 'direction', 'act_direction']].head(50))

prediction : same
same        0.618864
increase    0.192278
decrease    0.188858
Name: act_direction, dtype: float64
-------------
prediction : increase
increase    0.615986
same        0.334305
decrease    0.049708
Name: act_direction, dtype: float64
-------------
prediction : decrease
decrease    0.633181
same        0.314752
increase    0.052066
Name: act_direction, dtype: float64
-------------


    tick_avg_diff  tick_avg     sema direction act_direction
0             NaN   1.12185  1.12192      same      decrease
1            -2.0   1.12166  1.12182      same          same
2            -0.0   1.12163  1.12174      same          same
3             1.0   1.12176  1.12173      same      increase
4             2.0   1.12193  1.12180      same          same
5             2.0   1.12210  1.12191      same          same
6             2.0   1.12231  1.12208      same          same
7             1.0   1.12237  1.12221      same          same
8            -0.0   1.12234  1.12229      same  