# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope_s(y_axis):
    global window_s_ma
    x_axis = []
    for i in range(window_s_ma):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

def get_slope_l(y_axis):
    global window_l_ma
    x_axis = []
    for i in range(window_l_ma):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)


def get_month(row):
    month_val = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M').date().month
    return(month_val)

def get_day(row):
    day_val = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M').date().day
    return(day_val)   

def get_hour(row):
    hour_val = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M').time().hour
    return(hour_val)   

def get_min(row):
    min_val = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M').time().minute
    return(min_val)   


def get_dow(row):
    dow = dt.datetime.strptime(df['TS'][0], '%Y-%m-%d %H:%M').weekday()
    return(dow)

In [3]:
def chunk_ticks(df, number_of_ticks):   
    global pip_diff
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df = df[['tick']]

    temp_df = pd.DataFrame()
    open_list = []
    high_list = []
    low_list = []
    close_list = []


    for i in tqdm(range(0,len(df),number_of_ticks)):
        chunk_list = list(df['tick'][i:i+number_of_ticks])

        open_val = chunk_list[0]
        high_val = np.max(chunk_list)
        low_val = np.min(chunk_list)
        close_val = chunk_list[-1]

        open_list.append(chunk_list[0]) 
        high_list.append(np.max(chunk_list))
        low_list.append(np.min(chunk_list))
        close_list.append(chunk_list[-1]) 

    temp_df['open'] = open_list
    temp_df['high'] = high_list
    temp_df['low'] = low_list    
    temp_df['close'] = close_list    
    
    return(temp_df)

## Parameters

In [4]:
#pd.DataFrame(np.random.rand(14,4), columns=['a', 'b', 'c', 'd'])

year = 2019
source_file_path = f'data/tick_{year}.csv'

number_of_ticks = 60
pip_diff = 0.0001

diff_col = 'Close'

rsi_window = 10
rs_max = 1e6

window_s_ma = 10
window_l_ma = 50

## Read data

In [5]:
%%time
path, file_name = os.path.split(source_file_path)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

df = pd.read_csv(source_file_path)

Wall time: 40.6 s


In [6]:
#df = df[0:50000]

print(f'Total recs : {len(df)}')
df = chunk_ticks(df, number_of_ticks)
print(f'Reduced recs : {len(df)}')
df.head()

Total recs : 29186310


100%|████████████████████████████████████████████████████████████████████████| 486439/486439 [02:11<00:00, 3699.30it/s]


Reduced recs : 486439


Unnamed: 0,open,high,low,close
0,1.1464,1.1467,1.14622,1.1467
1,1.1467,1.14674,1.14663,1.146685
2,1.14669,1.14677,1.1464,1.146435
3,1.14644,1.14644,1.146215,1.14622
4,1.14623,1.146415,1.14622,1.14626


## Data manipulation

In [7]:
%%time
df['target_gain'] = abs(df['high'].shift(-1) - df['close'])
df['target_loss'] = abs(df['low'].shift(-1) - df['close'])

df['direction'] = 'same'
df.loc[(df['target_gain'] > df['target_loss']) & (df['target_gain'] > pip_diff), 'direction'] = 'increase'
df.loc[(df['target_loss'] > df['target_gain']) & (df['target_loss'] > pip_diff), 'direction'] = 'decrease'

df['diff'] = df['close'].diff()
df['open_diff'] = df['open'].diff()
df['high_diff'] = df['high'].diff()
df['low_diff'] = df['low'].diff()

df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])

df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()

df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max) 

df['rsi'] = 100 - (100 / (df['rs'] + 1))

df['ssma'] = df['close'].rolling(window=window_s_ma).mean()
df['lsma'] = df['close'].rolling(window=window_l_ma).mean()
df['sma_diff'] = df['ssma'] - df['lsma']

df['sema'] = df['close'].ewm(span=window_s_ma).mean()
df['lema'] = df['close'].ewm(span=window_l_ma).mean()
df['ema_diff'] = df['sema'] - df['lema']

%time df['slope_s_c'] = df['close'].rolling(window=window_s_ma).apply(get_slope_s)
%time df['slope_s_h'] = df['high'].rolling(window=window_s_ma).apply(get_slope_s)
%time df['slope_s_l'] = df['low'].rolling(window=window_s_ma).apply(get_slope_s)
%time df['slope_s_o'] = df['open'].rolling(window=window_s_ma).apply(get_slope_s)

#df['slope_l'] = df['close'].rolling(window=window_l_ma).apply(get_slope_l)
#df['sma_slope'] = df['ssma'].rolling(window=rsi_window).apply(get_slope)

df = df.dropna()
df = df.reset_index(drop=True)

Wall time: 2min 10s
Wall time: 2min 7s
Wall time: 2min 42s
Wall time: 2min 18s
Wall time: 9min 18s


## Write data to csv

In [8]:
remove_columns = ['target_gain', 'target_loss']
df = df[df.columns.difference(remove_columns)]

df.to_csv(target_file_path, index = False)
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

## Print Report

In [9]:
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))

df.head(10)

          counts  percentage
same      332595   68.380453
decrease   77303   15.893246
increase   76491   15.726301


Unnamed: 0,avg_gain,avg_loss,close,diff,direction,ema_diff,gain,high,high_diff,lema,...,open_diff,rs,rsi,sema,slope_s_c,slope_s_h,slope_s_l,slope_s_o,sma_diff,ssma
0,1.7e-05,1.4e-05,1.146605,-2.5e-05,same,0.000152,0.0,1.146645,-5e-06,1.146438,...,0.0,1.214286,54.83871,1.146591,27.937143,29.275254,48.193449,35.10838,0.000222,1.146609
1,1.5e-05,1.4e-05,1.146605,0.0,same,0.000148,0.0,1.14664,-5e-06,1.146446,...,-1.5e-05,1.107143,52.542373,1.146593,21.200217,24.155512,27.255328,30.061738,0.000225,1.14661
2,1.5e-05,1.1e-05,1.1466,-5e-06,same,0.000142,0.0,1.146605,-3.5e-05,1.146453,...,-1.5e-05,1.47619,59.615385,1.146594,-3.814075,0.868051,8.615648,21.651578,0.000232,1.146615
3,1.2e-05,1.1e-05,1.146595,-5e-06,same,0.000136,0.0,1.146605,0.0,1.146459,...,-5e-06,1.090909,52.173913,1.146595,-20.59409,-26.146841,-12.473645,-1.388717,0.00023,1.146616
4,8e-06,1.1e-05,1.14659,-5e-06,same,0.000129,0.0,1.146605,0.0,1.146465,...,-5e-06,0.695652,41.025641,1.146594,-24.587684,-32.962798,-21.651578,-20.897765,0.000219,1.146613
5,8e-06,1.5e-05,1.146545,-4.5e-05,same,0.000117,0.0,1.146595,-1e-05,1.146468,...,5e-06,0.533333,34.782609,1.146585,-38.976007,-39.080784,-30.579227,-26.146841,0.000206,1.146605
6,1e-05,1.5e-05,1.14661,6.5e-05,same,0.000115,6.5e-05,1.14662,2.5e-05,1.146475,...,-4e-05,0.666667,40.0,1.146589,-22.841416,-32.840404,-33.448993,-37.256839,0.000194,1.1466
7,1e-05,8e-06,1.146615,5e-06,same,0.000113,5e-06,1.14665,3e-05,1.146481,...,5.5e-05,1.235294,55.263158,1.146594,-20.74608,-14.606737,-23.720396,-22.693795,0.000192,1.146602
8,7e-06,8e-06,1.146615,2.220446e-16,same,0.000111,2.220446e-16,1.14665,0.0,1.146486,...,1.5e-05,0.823529,45.16129,1.146598,-7.253195,-3.29521,-13.29857,-17.017286,0.000186,1.146601
9,7e-06,9e-06,1.14661,-5e-06,same,0.000108,0.0,1.14665,0.0,1.146492,...,5e-06,0.777778,43.75,1.1466,6.568764,12.804266,2.775803,0.173623,0.000178,1.146599
