# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool


import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope_s(y_axis):
    global window_s_ma
    x_axis = []
    for i in range(window_s_ma):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

def get_slope_l(y_axis):
    global window_l_ma
    x_axis = []
    for i in range(window_l_ma):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

In [3]:
def chunk_ticks(df, number_of_ticks):   
    global pip_diff
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['tick', 'spread']]
    
    temp_df = pd.DataFrame()
    tick_avg = []
    spread_avg = []
    tick_sd = []
    
    for i in tqdm(range(0,len(df),number_of_ticks)):
        tick_list = list(df['tick'][i:i+number_of_ticks])
        spread_list = list(df['spread'][i:i+number_of_ticks])
        
        tick_avg.append(np.mean(tick_list))
        spread_avg.append(np.mean(spread_list))
        tick_sd.append(np.std(tick_list))
        
        
    temp_df['tick_avg'] = tick_avg  
    temp_df['spread_avg'] = spread_avg  
    temp_df['tick_sd'] = tick_sd  
    
    return(temp_df)

In [4]:
def chunk_ticks_rolling(df, number_of_ticks):   
    global pip_diff
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['tick', 'spread']]
    
    temp_df = pd.DataFrame()
    
    temp_df['tick_avg'] = df['tick'].rolling(window=number_of_ticks).mean()
    temp_df['spread_avg'] = df['spread'].rolling(window=number_of_ticks).mean()
    temp_df['tick_sd'] = df['spread'].rolling(window=number_of_ticks).mean()
    
    temp_df = temp_df.dropna()
    
    return(temp_df)

## Parameters

In [5]:
year = 2019
source_file_path = f'data/tick_{year}.csv'
path, file_name = os.path.split(source_file_path)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = 'chunk_'+file_name
chunk_file_path = os.path.join(path, chunk_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')


number_of_ticks = 20
pip_diff = 0.00001
rsi_window = 14
rs_max = 1e6
window_s_ma = 20
window_l_ma = 50

source_file_path : data/tick_2019.csv
chunk_file_path : data\chunk_tick_2019.csv
target_file_path : data\tab_tick_2019.csv


## Read data

In [6]:
%%time
df = pd.read_csv(source_file_path, nrows=1000000)
#df = pd.read_csv(source_file_path)
df.head()

Wall time: 568 ms


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20190101 22:02:37.254,1.14598,1.14682,4
1,20190101 22:02:38.590,1.14599,1.14682,2
2,20190101 22:02:39.138,1.14599,1.14684,4
3,20190101 22:02:55.787,1.14598,1.14684,4
4,20190101 22:03:02.060,1.14598,1.14684,4


## Data manipulation

In [7]:
%%time
df = chunk_ticks(df, number_of_ticks)
#df = chunk_ticks_rolling(df, number_of_ticks)

df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')
df = pd.read_csv(chunk_file_path)
df.head()

100%|█████████████████████████████████| 50000/50000 [00:11<00:00, 4238.67it/s]


Records : 50000
Wall time: 12.2 s


Unnamed: 0,tick_avg,spread_avg,tick_sd
0,1.146403,0.000732,3.6e-05
1,1.146509,0.000655,5.4e-05
2,1.146389,0.000359,0.000146
3,1.146712,0.000216,3.8e-05
4,1.146684,0.000273,2.8e-05


In [8]:
%%time
# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Moving Averages ------------------
df['sema'] = df['tick_avg'].ewm(span=window_s_ma).mean()
df['sema_diff'] = df['sema'].diff()
df['lema'] = df['tick_avg'].ewm(span=window_l_ma).mean()
df['lema_diff'] = df['lema'].diff()
df['ema_diff'] = df['sema'] - df['lema']

df['ssma'] = df['tick_avg'].rolling(window=window_s_ma).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=window_l_ma).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

df['max_tick'] = df['tick_avg'].rolling(window=window_s_ma).max()
df['min_tick'] = df['tick_avg'].rolling(window=window_s_ma).min()

df['max_gap'] = df['max_tick'] -  df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

# Slopes -----------------------------
#df['small_sema_slope'] = df['sema'].rolling(window=window_s_ma).progress_apply(get_slope_s)
#df['long_sema_slope'] = df['sema'].rolling(window=window_l_ma).progress_apply(get_slope_l)

# Direction -------------------------
diff_col = 'sema'
df['direction'] = 'same'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] > pip_diff] = 'increase'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -pip_diff] = 'decrease'

# Remove NaNs ------------------------
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

Total records : 49950
Wall time: 70.2 ms


## Write data to csv

In [9]:
%%time
df.to_csv(target_file_path, index = False)
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 1.76 s


## Print Report

df = df[['sema_diff',
'min_gap',
'max_gap',
'lema_diff',
'ssma_diff',
'rsi',
'rs',
'avg_loss',
'avg_gain',
'tick_sd',
'diff',
'direction']]

In [10]:
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
df.head(5)

          counts  percentage
same       30019   60.098098
increase   10026   20.072072
decrease    9905   19.829830


Unnamed: 0,tick_avg,spread_avg,tick_sd,diff,gain,loss,avg_gain,avg_loss,rs,rsi,...,ssma,ssma_diff,lsma,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,direction
0,1.146322,0.000234,1.1e-05,-9e-06,0.0,9e-06,7e-06,1e-05,0.726481,42.078708,...,1.146312,3e-06,1.146357,-2e-06,-4.6e-05,1.146361,1.146198,3.9e-05,-0.000124,same
1,1.146271,0.000306,1.5e-05,-5.1e-05,0.0,5.1e-05,7e-06,1.1e-05,0.686985,40.722656,...,1.146315,4e-06,1.146353,-5e-06,-3.7e-05,1.146361,1.146224,9e-05,-4.7e-05,same
2,1.14626,0.000335,7e-06,-1.1e-05,0.0,1.1e-05,7e-06,1.2e-05,0.597542,37.403846,...,1.146317,2e-06,1.14635,-3e-06,-3.3e-05,1.146361,1.146228,0.000101,-3.2e-05,same
3,1.14631,0.000254,2.8e-05,5e-05,5e-05,0.0,1e-05,1.2e-05,0.88172,46.857143,...,1.146321,4e-06,1.146342,-8e-06,-2.1e-05,1.146361,1.14626,5.2e-05,-5e-05,same
4,1.146294,0.000307,2.8e-05,-1.5e-05,0.0,1.5e-05,1e-05,9e-06,1.205882,54.666667,...,1.146319,-2e-06,1.146334,-8e-06,-1.5e-05,1.146361,1.14626,6.7e-05,-3.5e-05,same


In [11]:
np.round(df['diff'][0],6)

-9e-06

In [12]:
df['rsi'].value_counts()

0.000000     21
99.999900    20
50.000000     9
49.339623     2
73.292868     2
             ..
69.168467     1
36.887967     1
98.760331     1
56.891839     1
38.411725     1
Name: rsi, Length: 49852, dtype: int64