# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def chunk_ticks(df, number_of_ticks):   
    global data
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['tick', 'spread']]
    
    temp_df = pd.DataFrame()
    spread_cls = []
    spread_avg = []
    
    tick_opn = []
    tick_hig = []
    tick_low = []
    tick_cls = []
    tick_avg = []
    tick_sd = []
    tick_ema_10 = []
    tick_ema_25 = []
    tick_ema_50 = []
    tick_ema_75 = []
    tick_ema = []
    tick_slope = []
    
    for i in tqdm(range(0,len(df),number_of_ticks)):
        tick_list = list(df['tick'][i:i+number_of_ticks])
        spread_list = list(df['spread'][i:i+number_of_ticks])      
        
        spread_cls.append(spread_list[-1])
        spread_avg.append(np.mean(spread_list))

        tick_opn.append(tick_list[0])
        tick_hig.append(np.max(tick_list))
        tick_low.append(np.min(tick_list))
        tick_cls.append(tick_list[-1])
        tick_avg.append(np.mean(tick_list))  
        tick_sd.append(np.std(tick_list))
        
        l = tick_list[-(int(len(tick_list)*0.10)):]
        tick_ema_10.append(list(pd.DataFrame(l).ewm(span=len(l)).mean()[0])[len(l) - 1])        
        
        l = tick_list[-(int(len(tick_list)*0.25)):]
        tick_ema_25.append(list(pd.DataFrame(l).ewm(span=len(l)).mean()[0])[len(l) - 1])        
        
        l = tick_list[-(int(len(tick_list)*0.50)):]
        tick_ema_50.append(list(pd.DataFrame(l).ewm(span=len(l)).mean()[0])[len(l) - 1])        
        
        l = tick_list[-(int(len(tick_list)*0.75)):]
        tick_ema_75.append(list(pd.DataFrame(l).ewm(span=len(l)).mean()[0])[len(l) - 1])        
        
        tick_ema.append(list(pd.DataFrame(tick_list).ewm(span=len(tick_list)).mean()[0])[len(tick_list) - 1]) 
        tick_slope.append(get_slope(tick_list))
        
    temp_df['spread_avg'] = spread_avg  
    temp_df['spread_cls'] = spread_cls  

    temp_df['tick_opn'] = tick_opn
    temp_df['tick_high'] = tick_hig
    temp_df['tick_low'] = tick_low
    temp_df['tick_cls'] = tick_cls
    temp_df['tick_avg'] = tick_avg  
    temp_df['tick_sd'] = tick_sd
    
    temp_df['tick_ema_10'] = tick_ema_10  
    temp_df['tick_ema_25'] = tick_ema_25  
    temp_df['tick_ema_50'] = tick_ema_50  
    temp_df['tick_ema_75'] = tick_ema_75  
    temp_df['tick_ema'] = tick_ema 
    temp_df['tick_slope'] = tick_slope 

    return(temp_df)

In [3]:
def get_slope(y_axis):
    global data
    x_axis = []
    for i in range(len(y_axis)):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))

    return(slope_tick)

## File paths

In [4]:
year = 2018
source_file_path = f'data/tick_{year}.csv'
path, file_name = os.path.split(source_file_path)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = 'chunk_'+file_name
chunk_file_path = os.path.join(path, chunk_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')

source_file_path : data/tick_2018.csv
chunk_file_path : data\chunk_tick_2018.csv
target_file_path : data\tab_tick_2018.csv


## Read data

In [5]:
data = {}
diff_col = 'sema'
data['pip_diff'] = 0.00002

data['number_of_ticks'] = 100

data['sma_len'] = 20
data['lma_len'] = 50

data['rsi_window'] = 14

In [18]:
%%time
df = pd.read_csv(source_file_path, nrows=1000000)
#df = pd.read_csv(source_file_path)
df.head()

Wall time: 581 ms


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20180101 22:00:08.661,1.20102,1.20143,2
1,20180101 22:00:08.895,1.20102,1.20148,2
2,20180101 22:00:10.634,1.20102,1.20147,2
3,20180101 22:00:11.223,1.20102,1.20148,2
4,20180101 22:00:29.530,1.20102,1.20145,2


In [19]:
%%time
df = chunk_ticks(df, data['number_of_ticks'])
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')
df = pd.read_csv(chunk_file_path)
df.head()

100%|█████████████████| 10000/10000 [00:52<00:00, 191.57it/s]


Records : 10000
Wall time: 52.5 s


Unnamed: 0,spread_avg,spread_cls,tick_opn,tick_high,tick_low,tick_cls,tick_avg,tick_sd,tick_ema_10,tick_ema_25,tick_ema_50,tick_ema_75,tick_ema,tick_slope
0,0.000491,0.0003,1.201225,1.20125,1.20046,1.20065,1.200736,0.000183,1.200601,1.200619,1.200646,1.200662,1.200686,-19.644262
1,0.000242,0.00025,1.2007,1.201165,1.200695,1.201115,1.201024,8.9e-05,1.201109,1.201077,1.201048,1.201043,1.201036,4.611156
2,0.000302,0.0004,1.20112,1.201145,1.200735,1.20076,1.200852,0.000145,1.200762,1.200761,1.200761,1.200765,1.200798,-21.937026
3,0.000256,0.00025,1.200775,1.200785,1.200655,1.200705,1.200723,3.6e-05,1.200714,1.200715,1.200714,1.200711,1.200716,-3.494262
4,0.00025,0.00025,1.20072,1.20073,1.200705,1.200715,1.200718,6e-06,1.200717,1.200716,1.200717,1.200718,1.200718,0.13993


In [20]:
%%time

df['tick_next_ema_diff'] = df['tick_ema'].shift(-1) - df['tick_ema']
df['direction'] = 'same'
df.loc[df['tick_next_ema_diff'] >= 0.0001, 'direction'] = 'increase'
df.loc[df['tick_next_ema_diff'] <= -0.0001, 'direction'] = 'decrease'

df.head()

Wall time: 5.98 ms


Unnamed: 0,spread_avg,spread_cls,tick_opn,tick_high,tick_low,tick_cls,tick_avg,tick_sd,tick_ema_10,tick_ema_25,tick_ema_50,tick_ema_75,tick_ema,tick_slope,tick_next_ema_diff,direction
0,0.000491,0.0003,1.201225,1.20125,1.20046,1.20065,1.200736,0.000183,1.200601,1.200619,1.200646,1.200662,1.200686,-19.644262,0.000349,increase
1,0.000242,0.00025,1.2007,1.201165,1.200695,1.201115,1.201024,8.9e-05,1.201109,1.201077,1.201048,1.201043,1.201036,4.611156,-0.000237,decrease
2,0.000302,0.0004,1.20112,1.201145,1.200735,1.20076,1.200852,0.000145,1.200762,1.200761,1.200761,1.200765,1.200798,-21.937026,-8.2e-05,same
3,0.000256,0.00025,1.200775,1.200785,1.200655,1.200705,1.200723,3.6e-05,1.200714,1.200715,1.200714,1.200711,1.200716,-3.494262,2e-06,same
4,0.00025,0.00025,1.20072,1.20073,1.200705,1.200715,1.200718,6e-06,1.200717,1.200716,1.200717,1.200718,1.200718,0.13993,-1e-06,same


In [21]:
%%time
df['ema_gap'] = df['tick_ema_10'] - df['tick_ema']    

col_list = ['tick_opn', 'tick_high', 'tick_low', 'tick_cls', 'tick_avg', 'tick_ema', 'spread_cls', 'tick_ema_10', 'tick_ema_25', 'tick_ema_50', 'tick_ema_75']
for i, val in enumerate(col_list):    
    df[val+'_diff'] =  df[val].diff()
    del df[val]        

Wall time: 13 ms


In [22]:
%%time

data['rs_max'] = 1e6
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['tick_avg_diff'] > 0] = abs(df['tick_avg_diff'])
df['loss'].loc[df['tick_avg_diff'] < 0] = abs(df['tick_avg_diff'])
df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max']) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

Wall time: 13 ms


In [12]:
col_order = [
 'spread_avg',
 'spread_cls_diff', 
 'tick_opn_diff',
 'tick_high_diff',
 'tick_low_diff',
 'tick_cls_diff',
 'tick_avg_diff',
 'tick_sd',
 'tick_ema_10_diff',
 'tick_ema_25_diff',
 'tick_ema_50_diff',
 'tick_ema_75_diff',
 'tick_ema_diff',
 'ema_gap',
 'gain',
 'loss',
 'avg_gain',
 'avg_loss',
 'rsi',
 'tick_slope',
 'direction'
 ]

df = df[col_order]
df = df.dropna()

## Write data to csv

In [13]:
%%time
df.to_csv(target_file_path, index = False)
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 348 ms


In [14]:
print(f'Total records : {len(df)}')
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
df.head()

Total records : 9987
          counts  percentage
same        5297   53.038951
increase    2382   23.851006
decrease    2308   23.110043


Unnamed: 0,spread_avg,spread_cls_diff,tick_opn_diff,tick_high_diff,tick_low_diff,tick_cls_diff,tick_avg_diff,tick_sd,tick_ema_10_diff,tick_ema_25_diff,...,tick_ema_75_diff,tick_ema_diff,ema_gap,gain,loss,avg_gain,avg_loss,rsi,tick_slope,direction
13,0.000106,-0.00015,-0.00058,-0.000455,7.5e-05,0.000195,-0.000249,4.5e-05,0.000188,0.000161,...,-1.8e-05,-9.8e-05,5.7e-05,0.0,0.000249,7.9e-05,9e-05,46.774637,6.271693,increase
14,4.8e-05,-2e-05,0.00019,0.00011,0.000165,4e-05,0.000114,3.7e-05,6.3e-05,9.9e-05,...,0.000113,0.000114,6e-06,0.000114,0.0,8.7e-05,9e-05,49.233689,5.992988,increase
15,6.7e-05,6e-05,5.5e-05,0.00015,8.5e-05,0.00017,9.7e-05,6.4e-05,0.000164,0.000152,...,0.00012,0.000112,5.7e-05,9.7e-05,0.0,7.3e-05,9e-05,44.966399,10.706193,increase
16,0.000117,1e-05,0.00015,0.000145,7e-05,6.5e-05,0.000135,8.7e-05,0.000108,0.000131,...,0.000147,0.000142,2.3e-05,0.000135,0.0,8.3e-05,7.7e-05,51.711323,13.887784,same
17,0.000113,-1e-05,0.0001,-6.5e-05,0.0,-6.5e-05,-4e-05,5.7e-05,-0.00011,-0.000129,...,-0.000104,-8e-05,-6e-06,0.0,4e-05,8.3e-05,7.1e-05,53.837956,-1.632136,same
