# Data Preparation

## Packages

In [3]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [4]:
def get_slope_s(y_axis):
    global window_s_ma
    x_axis = []
    for i in range(window_s_ma):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

def get_slope_l(y_axis):
    global window_l_ma
    x_axis = []
    for i in range(window_l_ma):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

In [5]:
def chunk_ticks(df, number_of_ticks):   
    global pip_diff
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['tick', 'spread']]
    
    temp_df = pd.DataFrame()
    tick_avg = []
    spread_avg = []

    for i in tqdm(range(0,len(df),number_of_ticks)):
        tick_list = list(df['tick'][i:i+number_of_ticks])
        spread_list = list(df['spread'][i:i+number_of_ticks])
        
        tick_avg.append(np.mean(tick_list))
        spread_avg.append(np.mean(spread_list))

    temp_df['tick_avg'] = tick_avg  
    temp_df['spread_avg'] = spread_avg  
    
    return(temp_df)

## Parameters

In [6]:
#pd.DataFrame(np.random.rand(14,4), columns=['a', 'b', 'c', 'd'])

year = 2019
source_file_path = f'data/tick_{year}.csv'
path, file_name = os.path.split(source_file_path)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = 'chunk_'+file_name
chunk_file_path = os.path.join(path, chunk_file_name)

number_of_ticks = 10
pip_diff = 0.00001
rsi_window = 10
rs_max = 1e6
window_s_ma = 10
window_l_ma = 100

## Read data

%%time

#df = pd.read_csv(source_file_path, nrows=10000)
df = pd.read_csv(source_file_path)
df.head()

## Data manipulation

%%time
df = chunk_ticks(df, number_of_ticks)

df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')
df.head()

In [7]:
%%time
df = pd.read_csv(chunk_file_path)
df.head()

Wall time: 1.93 s


Unnamed: 0,tick_avg,spread_avg
0,1.146424,0.000826
1,1.146383,0.000638
2,1.146515,0.000694
3,1.146503,0.000615
4,1.1464,0.000404


In [8]:
%%time

df['tick_diff'] = df['tick_avg'].diff()

# Moving Averages ------------------
df['sema'] = df['tick_avg'].ewm(span=window_s_ma).mean()
df['lema'] = df['tick_avg'].ewm(span=window_l_ma).mean()
df['ema_diff'] = df['sema'] - df['tick_avg']


# RSI -----------------------------
df['diff'] = df['sema'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Slopes -----------------------------
df['small_sema_slope'] = df['sema'].rolling(window=window_s_ma).progress_apply(get_slope_s)
df['long_sema_slope'] = df['sema'].rolling(window=window_l_ma).progress_apply(get_slope_s)

# Direction -------------------------
diff_col = 'sema'
df['direction'] = 'same'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] > pip_diff] = 'increase'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -pip_diff] = 'decrease'

# Remove NaNs ------------------------
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

2918622it [21:49, 2228.91it/s]
1it [00:00,  4.50it/s]


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 10 and the array at index 1 has size 100

## Write data to csv

In [None]:
%%time
df.to_csv(target_file_path, index = False)
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

## Print Report

In [None]:
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))

In [None]:
df.head(5)