# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def _slope_degrees(y_axis):
    """Return the angle, in degrees, of the least-squares line through ``y_axis``.

    The x coordinates are synthetic and evenly spaced: the point at 0-based
    position ``i`` is placed at ``1 + (i + 1) * 0.0001 * 0.1``. The fitted
    slope is converted to an angle with ``atan`` and expressed in degrees.

    The x axis is sized from ``len(y_axis)``, so the result does not depend on
    any module-level window setting.
    """
    x_axis = [1 + ((i + 1) * 0.0001 * 0.1) for i in range(len(y_axis))]
    slope_tick = linregress(x_axis, y_axis).slope
    return math.degrees(math.atan(slope_tick))


def get_slope_s(y_axis):
    """Slope angle (degrees) of a short-window sample.

    Parameters
    ----------
    y_axis : sequence of float
        Values of one window, oldest first.

    Returns
    -------
    float
        Angle of the regression line in degrees.
    """
    return _slope_degrees(y_axis)


def get_slope_l(y_axis):
    """Slope angle (degrees) of a long-window sample.

    Same computation as ``get_slope_s``; kept as a separate name so existing
    calls for the long window keep working.

    Parameters
    ----------
    y_axis : sequence of float
        Values of one window, oldest first.

    Returns
    -------
    float
        Angle of the regression line in degrees.
    """
    return _slope_degrees(y_axis)

In [3]:
def chunk_ticks(df, number_of_ticks):
    """Aggregate raw ticks into fixed-size, non-overlapping chunks.

    For every run of ``number_of_ticks`` consecutive rows (the last run may be
    shorter) one output row is produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Bid'`` and ``'Ask'``. The frame is not
        modified.
    number_of_ticks : int
        Number of rows per chunk; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``tick_avg`` (mean of the mid price ``(Bid + Ask) / 2``),
        ``spread_avg`` (mean of ``Ask - Bid``) and ``tick_sd`` (standard
        deviation of the mid price, computed with ``np.std`` defaults).

    Raises
    ------
    ValueError
        If ``number_of_ticks`` is smaller than 1.
    """
    if number_of_ticks < 1:
        raise ValueError('number_of_ticks must be a positive integer')

    # Derive the per-tick series as plain arrays instead of adding helper
    # columns to the caller's frame.
    tick = np.asarray((df['Bid'] + df['Ask']) / 2)
    spread = np.asarray(df['Ask'] - df['Bid'])

    tick_avg = []
    spread_avg = []
    tick_sd = []

    # Positional slicing of arrays avoids building a Series and a list for
    # every chunk, which made the per-chunk loop needlessly slow.
    for start in range(0, len(tick), number_of_ticks):
        stop = start + number_of_ticks
        tick_chunk = tick[start:stop]

        tick_avg.append(np.mean(tick_chunk))
        spread_avg.append(np.mean(spread[start:stop]))
        tick_sd.append(np.std(tick_chunk))

    return pd.DataFrame({
        'tick_avg': tick_avg,
        'spread_avg': spread_avg,
        'tick_sd': tick_sd,
    })

In [4]:
def before_sma():
    """Warm-up step for the short window: store the current tick average.

    Reads the module-level ``row`` (the row currently being processed) and
    appends its ``'tick_avg'`` to ``data['ssma_list']``. Returns an empty tuple.
    """
    short_window = data['ssma_list']
    short_window.append(row['tick_avg'])
    return ()

def after_sma():
    """Slide the short window by one row and refresh the short EMA state.

    Reads the module-level ``row`` and updates the module-level ``data`` dict:

    * ``data['ssma_list']``  - oldest value dropped, ``row['tick_avg']`` appended.
    * ``data['sema']``       - exponentially weighted mean (``span=data['sma_len']``)
      of the window, taken at position ``data['sma_len'] - 1``.
    * ``data['sema_ready']`` - holds at most the two latest ``sema`` values.
    * ``data['sema_diff']``  - latest ``sema`` minus the previous one, or NaN
      while ``sema_ready`` held fewer than two values on entry.

    Returns an empty tuple.
    """
    window = data['ssma_list']
    span = data['sma_len']

    window.popleft()
    window.append(row['tick_avg'])

    smoothed = pd.DataFrame(list(window)).ewm(span=span).mean()[0]
    data['sema'] = list(smoothed)[span - 1]

    recent = data['sema_ready']
    if len(recent) < 2:
        # NOTE(review): on the second call two values exist after the append,
        # yet the diff is still reported as NaN - confirm this is intended.
        recent.append(data['sema'])
        data['sema_diff'] = np.nan
    else:
        recent.popleft()
        recent.append(data['sema'])
        data['sema_diff'] = recent[-1] - recent[-2]

    return ()

In [5]:
def before_lma():
    """Warm-up step for the long window: store the current tick average.

    Reads the module-level ``row`` (the row currently being processed) and
    appends its ``'tick_avg'`` to ``data['lsma_list']``. Returns an empty tuple.
    """
    long_window = data['lsma_list']
    long_window.append(row['tick_avg'])
    return ()

def after_lma():
    """Slide the long window by one row and refresh the long EMA state.

    Reads the module-level ``row`` and updates the module-level ``data`` dict:

    * ``data['lsma_list']``  - oldest value dropped, ``row['tick_avg']`` appended.
    * ``data['lema']``       - exponentially weighted mean (``span=data['lma_len']``)
      of the window, taken at position ``data['lma_len'] - 1``.
    * ``data['lema_ready']`` - holds at most the two latest ``lema`` values.
    * ``data['lema_diff']``  - latest ``lema`` minus the previous one, or NaN
      while ``lema_ready`` held fewer than two values on entry.

    Returns an empty tuple.
    """
    window = data['lsma_list']
    span = data['lma_len']

    window.popleft()
    window.append(row['tick_avg'])

    smoothed = pd.DataFrame(list(window)).ewm(span=span).mean()[0]
    data['lema'] = list(smoothed)[span - 1]

    recent = data['lema_ready']
    if len(recent) < 2:
        # NOTE(review): on the second call two values exist after the append,
        # yet the diff is still reported as NaN - confirm this is intended.
        recent.append(data['lema'])
        data['lema_diff'] = np.nan
    else:
        recent.popleft()
        recent.append(data['lema'])
        data['lema_diff'] = recent[-1] - recent[-2]

    return ()

## Parameters

In [6]:
# Input file ------------------------------------------------------------
year = 2018
source_file_path = f'data/tick_{year}.csv'
path, file_name = os.path.split(source_file_path)

# Output files: same directory as the source, distinguished by a prefix.
chunk_file_name = f'chunk_{file_name}'
chunk_file_path = os.path.join(path, chunk_file_name)

target_file_name = f'tab_{file_name}'
target_file_path = os.path.join(path, target_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')


# Tunable parameters ----------------------------------------------------
number_of_ticks = 20   # raw ticks aggregated into one chunk row
pip_diff = 0.00001     # threshold on the next-step change used for the direction label
rsi_window = 14        # rolling window for the average gain / loss
rs_max = 1e6           # upper cap applied to the rs ratio
window_s_ma = 20       # rolling window of the short moving average
window_l_ma = 50       # rolling window of the long moving average

source_file_path : data/tick_2018.csv
chunk_file_path : data\chunk_tick_2018.csv
target_file_path : data\tab_tick_2018.csv


## Read data

In [7]:
%%time
# Load only the first 100,000 rows of the source CSV; the commented-out line
# below is the variant that reads the whole file.
df = pd.read_csv(source_file_path, nrows=100000)
#df = pd.read_csv(source_file_path)
# Preview the first rows.
df.head()

Wall time: 159 ms


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20180101 22:00:08.661,1.20102,1.20143,2
1,20180101 22:00:08.895,1.20102,1.20148,2
2,20180101 22:00:10.634,1.20102,1.20147,2
3,20180101 22:00:11.223,1.20102,1.20148,2
4,20180101 22:00:29.530,1.20102,1.20145,2


## Data manipulation

In [8]:
%%time
# Replace the raw tick frame with one summary row per `number_of_ticks` ticks.
df = chunk_ticks(df, number_of_ticks)
# NOTE(review): `chunk_ticks_rolling` is not defined in the cells shown here.
#df = chunk_ticks_rolling(df, number_of_ticks)

# Persist the chunked frame, then reload it from that file so the rest of the
# notebook works on exactly what was written to disk.
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')
df = pd.read_csv(chunk_file_path)
df.head()

100%|████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 1712.29it/s]


Records : 5000
Wall time: 3.05 s


Unnamed: 0,tick_avg,spread_avg,tick_sd
0,1.200982,0.000382,0.000245
1,1.200657,0.000704,9.2e-05
2,1.20072,0.000756,3.2e-05
3,1.200684,0.000365,0.000109
4,1.200634,0.000248,7.5e-05


In [9]:
%%time
# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Simple Moving Averages ------------------
df['ssma'] = df['tick_avg'].rolling(window=window_s_ma).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=window_l_ma).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

df['max_tick'] = df['tick_avg'].rolling(window=window_s_ma).max()
df['min_tick'] = df['tick_avg'].rolling(window=window_s_ma).min()

df['max_gap'] = df['max_tick'] -  df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

df.tail()

Wall time: 48 ms


Unnamed: 0,tick_avg,spread_avg,tick_sd,diff,gain,loss,avg_gain,avg_loss,rs,rsi,ssma,ssma_diff,lsma,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap
4995,1.204505,3.2e-05,2.1e-05,-6.2e-05,0.0,6.2e-05,7e-06,4.5e-05,0.155104,13.427684,1.204865,-3.9e-05,1.205387,-2.4e-05,-0.000522,1.205265,1.204505,0.00076,0.0
4996,1.204463,2.4e-05,1.7e-05,-4.1e-05,0.0,4.1e-05,7e-06,4.5e-05,0.155104,13.427684,1.204825,-4e-05,1.205362,-2.5e-05,-0.000537,1.205176,1.204463,0.000713,0.0
4997,1.20451,2.3e-05,1.2e-05,4.7e-05,4.7e-05,0.0,1e-05,4.4e-05,0.231418,18.792822,1.204792,-3.3e-05,1.205338,-2.4e-05,-0.000546,1.205109,1.204463,0.000599,-4.7e-05
4998,1.204505,2.2e-05,1.1e-05,-5e-06,0.0,5e-06,1e-05,3.7e-05,0.280292,21.892816,1.204762,-3e-05,1.205313,-2.5e-05,-0.000552,1.205074,1.204463,0.000568,-4.2e-05
4999,1.204501,3.2e-05,7e-06,-4e-06,0.0,4e-06,1e-05,2.9e-05,0.350152,25.934264,1.204733,-2.9e-05,1.205287,-2.6e-05,-0.000554,1.205074,1.204463,0.000572,-3.8e-05


In [10]:
# Mutable state shared with before_sma/after_sma and before_lma/after_lma.
data = {}

# Short EMA: window length taken from the notebook parameter instead of a
# repeated literal, so the two cannot drift apart.
data['sma_len'] = window_s_ma

data['ssma_list'] = collections.deque([])    # sliding window of tick averages
data['sema_ready'] = collections.deque([])   # latest short EMA values

# Pre-fill the output columns with NaN rather than '' so that rows which never
# receive a value count as missing for `dropna()` and the columns stay numeric.
df['sema'] = np.nan
df['sema_diff'] = np.nan

# Long EMA: same layout as above.
data['lma_len'] = window_l_ma

data['lsma_list'] = collections.deque([])    # sliding window of tick averages
data['lema_ready'] = collections.deque([])   # latest long EMA values

df['lema'] = np.nan
df['lema_diff'] = np.nan

In [11]:
# Walk the frame row by row, feeding each tick average into the short and long
# EMA state. The helpers read the current row through the module-level name
# `row`, so the loop variable must keep that name.
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Build sema ---------------------------------------------
    if len(data['ssma_list']) < data['sma_len']:
        before_sma()
    elif len(data['ssma_list']) == data['sma_len']:
        after_sma()
        # `df.at[row_label, column]` writes straight into `df`; the former
        # chained form `df[column][i] = ...` is not guaranteed to do so.
        df.at[i, 'sema'] = data['sema']
        df.at[i, 'sema_diff'] = data['sema_diff']

    # Build lema ---------------------------------------------
    if len(data['lsma_list']) < data['lma_len']:
        before_lma()
    elif len(data['lsma_list']) == data['lma_len']:
        after_lma()
        df.at[i, 'lema'] = data['lema']
        df.at[i, 'lema_diff'] = data['lema_diff']

# Drop every row that still has a NaN in any column.
df = df.dropna()

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:28<00:00, 176.85it/s]


In [12]:
# Inspect the last rows, now including the sema / lema columns.
df.tail()

Unnamed: 0,tick_avg,spread_avg,tick_sd,diff,gain,loss,avg_gain,avg_loss,rs,rsi,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,sema,sema_diff,lema,lema_diff
4995,1.204505,3.2e-05,2.1e-05,-6.2e-05,0.0,6.2e-05,7e-06,4.5e-05,0.155104,13.427684,...,-2.4e-05,-0.000522,1.205265,1.204505,0.00076,0.0,1.20476,-3.9604e-05,1.20513,-3.32671e-05
4996,1.204463,2.4e-05,1.7e-05,-4.1e-05,0.0,4.1e-05,7e-06,4.5e-05,0.155104,13.427684,...,-2.5e-05,-0.000537,1.205176,1.204463,0.000713,0.0,1.20472,-3.99817e-05,1.2051,-3.38093e-05
4997,1.20451,2.3e-05,1.2e-05,4.7e-05,4.7e-05,0.0,1e-05,4.4e-05,0.231418,18.792822,...,-2.4e-05,-0.000546,1.205109,1.204463,0.000599,-4.7e-05,1.20469,-2.97056e-05,1.20507,-3.0486e-05
4998,1.204505,2.2e-05,1.1e-05,-5e-06,0.0,5e-06,1e-05,3.7e-05,0.280292,21.892816,...,-2.5e-05,-0.000552,1.205074,1.204463,0.000568,-4.2e-05,1.20466,-2.64102e-05,1.20504,-2.96378e-05
4999,1.204501,3.2e-05,7e-06,-4e-06,0.0,4e-06,1e-05,2.9e-05,0.350152,25.934264,...,-2.6e-05,-0.000554,1.205074,1.204463,0.000572,-3.8e-05,1.20464,-2.37611e-05,1.20501,-2.90506e-05


In [13]:
# Exp Moving Averages ------------------
# df['sema'] = df['tick_avg'].ewm(span=window_s_ma).mean()
# df['sema_diff'] = df['sema'].diff()
# df['lema'] = df['tick_avg'].ewm(span=window_l_ma).mean()
# df['lema_diff'] = df['lema'].diff()
# df['ema_diff'] = df['sema'] - df['lema']

In [14]:

# Slopes -----------------------------
#df['small_sema_slope'] = df['sema'].rolling(window=window_s_ma).progress_apply(get_slope_s)
#df['long_sema_slope'] = df['sema'].rolling(window=window_l_ma).progress_apply(get_slope_l)

# Direction -------------------------
# Label each row by how `diff_col` moves on the NEXT row: a change larger than
# `pip_diff` in either direction is 'increase' / 'decrease', anything else 'same'.
diff_col = 'sema'
next_step = df[diff_col].shift(-1) - df[diff_col]

df['direction'] = 'same'
# `df.loc[mask, column]` assigns into `df` itself; the former chained form
# `df[column].loc[mask] = ...` is not guaranteed to write back.
df.loc[next_step > pip_diff, 'direction'] = 'increase'
df.loc[next_step < -pip_diff, 'direction'] = 'decrease'
# NOTE(review): the last row has no next value (next_step is NaN there), so it
# keeps the default label 'same' - confirm that is acceptable.

# Remove NaNs ------------------------
# The intermediate gain / loss columns are dropped before the final cleanup.
df = df.drop(columns=['gain', 'loss'])
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

Total records : 4948


## Write data to csv

In [15]:
%%time
# Write the prepared table to the target CSV without the index column.
df.to_csv(target_file_path, index = False)
# Play a completion sound without blocking (SND_ASYNC).
# NOTE(review): `winsound` and this hard-coded media path only exist on
# Windows, so this line ties the notebook to that platform.
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 471 ms


## Print Report

df = df[['sema_diff',
'min_gap',
'max_gap',
'lema_diff',
'ssma_diff',
'rsi',
'rs',
'avg_loss',
'avg_gain',
'tick_sd',
'diff',
'direction']]

In [16]:
# Class balance of the direction label: absolute counts next to percentages.
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
# Preview the first rows of the final table.
df.head(5)

          counts  percentage
same        2242   45.311237
increase    1434   28.981407
decrease    1272   25.707357


Unnamed: 0,tick_avg,spread_avg,tick_sd,diff,avg_gain,avg_loss,rs,rsi,ssma,ssma_diff,...,sma_diff,max_tick,min_tick,max_gap,min_gap,sema,sema_diff,lema,lema_diff,direction
0,1.201499,0.000148,2e-05,5e-06,1.1e-05,1.2e-05,0.886398,46.988906,1.201447,1.12875e-05,...,0.00038,1.201534,1.201319,3.5e-05,-0.00018,1.20146,7.33704e-06,1.20122,1.63754e-05,same
1,1.201461,0.000141,2.6e-05,-3.8e-05,1e-05,1.5e-05,0.653706,39.529758,1.201452,5.875e-06,...,0.00037,1.201534,1.201319,7.4e-05,-0.000142,1.20147,1.3611e-06,1.20123,1.42081e-05,decrease
2,1.201332,0.00021,2.2e-05,-0.000128,1e-05,2.2e-05,0.445364,30.813288,1.201452,-7.5e-08,...,0.000356,1.201534,1.201319,0.000202,-1.4e-05,1.20145,-1.27295e-05,1.20124,8.14962e-06,decrease
3,1.201305,0.000126,2e-05,-2.8e-05,9e-06,2.4e-05,0.397271,28.431905,1.201451,-1.675e-06,...,0.000347,1.201534,1.201305,0.00023,0.0,1.20144,-1.46361e-05,1.20125,4.60951e-06,decrease
4,1.201228,0.0001,5.6e-05,-7.6e-05,9e-06,2.9e-05,0.293538,22.692674,1.201446,-4.5375e-06,...,0.00034,1.201534,1.201228,0.000306,0.0,1.20142,-2.13796e-05,1.20125,8.80187e-08,decrease


In [17]:
# Frequency of RSI values after rounding to whole numbers, most frequent first.
np.round((df['rsi']),0).value_counts()

60.0    96
67.0    95
62.0    92
50.0    86
37.0    86
        ..
4.0      7
97.0     7
98.0     6
3.0      5
96.0     5
Name: rsi, Length: 101, dtype: int64