# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time
import imblearn

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope(y_axis):
    """Return the inclination, in degrees, of a least-squares line fitted to ``y_axis``.

    The i-th sample (0-based) is placed at x = 1 + (i + 1) * 0.0001 * 0.1, the
    points are fitted with ``scipy.stats.linregress`` and the fitted slope is
    converted to an angle via ``atan``.

    Parameters
    ----------
    y_axis : sequence of float
        Values to fit, in order.

    Returns
    -------
    float
        Angle of the fitted line in degrees.
    """
    # Fixed, evenly spaced abscissa: one step of 0.00001 per sample.
    x_axis = [1 + ((position + 1) * 0.0001 * 0.1) for position in range(len(y_axis))]

    fit = linregress(x_axis, y_axis)
    return math.degrees(math.atan(fit.slope))

In [3]:
def chunk_ticks(df, number_of_ticks):
    """Aggregate raw bid/ask quotes into consecutive chunks of ``number_of_ticks`` rows.

    Unlike the earlier version, the input frame is left untouched (no ``tick`` /
    ``spread`` columns are written back into the caller's DataFrame).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Bid`` and ``Ask`` columns.
    number_of_ticks : int
        Number of consecutive rows per chunk; the final chunk may be shorter.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns
        ``tick_act``   - last mid price ((Bid + Ask) / 2) of the chunk,
        ``tick_avg``   - final value of an EWM mean over the chunk's mid prices,
                         with span equal to the chunk length,
        ``spread_avg`` - arithmetic mean of Ask - Bid over the chunk,
        ``tick_sd``    - population standard deviation (``np.std``) of the mid prices.

    Raises
    ------
    ValueError
        If ``number_of_ticks`` is not positive.
    """
    if number_of_ticks <= 0:
        raise ValueError('number_of_ticks must be a positive integer')

    # Work on derived Series so the caller's frame is never modified.
    mid_price = (df['Bid'] + df['Ask']) / 2
    spread = df['Ask'] - df['Bid']

    tick_act = []
    tick_avg = []
    spread_avg = []
    tick_sd = []

    for start in tqdm(range(0, len(df), number_of_ticks)):
        stop = start + number_of_ticks
        # Plain ndarrays keep np.mean / np.std on their NumPy (ddof=0) definitions.
        ticks = mid_price.iloc[start:stop].to_numpy()
        spreads = spread.iloc[start:stop].to_numpy()

        tick_act.append(ticks[-1])
        # Only the last EWM value is needed; span follows the actual chunk length
        # so a short trailing chunk is handled the same way as a full one.
        tick_avg.append(pd.Series(ticks).ewm(span=len(ticks)).mean().iloc[-1])
        spread_avg.append(np.mean(spreads))
        tick_sd.append(np.std(ticks))

    temp_df = pd.DataFrame()
    temp_df['tick_act'] = tick_act
    temp_df['tick_avg'] = tick_avg
    temp_df['spread_avg'] = spread_avg
    temp_df['tick_sd'] = tick_sd

    return temp_df

In [4]:
def before_sma():
    """Append the module-level ``val`` to the short-window buffer ``data['ssma_list']``.

    NOTE(review): ``val`` is read from module scope but is not assigned anywhere
    in the visible notebook - confirm where it is expected to come from.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    global data
    short_buffer = data['ssma_list']
    short_buffer.append(val)
    return ()

def after_sma():
    """Slide the short-window buffer by one value and refresh the short EMA state.

    Steps, all on the module-level ``data`` dict:
    - drop the oldest entry of ``data['ssma_list']`` and append the module-level ``val``;
    - store in ``data['sema']`` the EWM mean (span ``data['sma_len']``) of the buffer,
      read at position ``data['sma_len'] - 1``;
    - keep the two most recent EMA values in ``data['sema_ready']`` and store their
      difference (newest minus previous) in ``data['sema_diff']``; until two values
      were already present before the call, ``data['sema_diff']`` is NaN.

    NOTE(review): ``val`` is read from module scope but is not assigned anywhere
    in the visible notebook - confirm where it is expected to come from.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    global data

    window = data['ssma_list']
    window.popleft()
    window.append(val)

    smoothed = pd.DataFrame(list(window)).ewm(span=data['sma_len']).mean()
    data['sema'] = list(smoothed[0])[data['sma_len'] - 1]

    history = data['sema_ready']
    if len(history) >= 2:
        # Already holding two values: rotate, then diff newest against previous.
        history.popleft()
        history.append(data['sema'])
        data['sema_diff'] = history[-1] - history[len(history) - 2]
    else:
        # Still warming up: just collect the value.
        history.append(data['sema'])
        data['sema_diff'] = np.nan

    return ()

def before_lma():
    """Append the module-level ``val`` to the long-window buffer ``data['lsma_list']``.

    NOTE(review): ``val`` is read from module scope but is not assigned anywhere
    in the visible notebook - confirm where it is expected to come from.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    global data
    long_buffer = data['lsma_list']
    long_buffer.append(val)
    return ()

def after_lma():
    """Slide the long-window buffer by one value and refresh the long EMA state.

    Steps, all on the module-level ``data`` dict:
    - drop the oldest entry of ``data['lsma_list']`` and append the module-level ``val``;
    - store in ``data['lema']`` the EWM mean (span ``data['lma_len']``) of the buffer,
      read at position ``data['lma_len'] - 1``;
    - keep the two most recent EMA values in ``data['lema_ready']`` and store their
      difference (newest minus previous) in ``data['lema_diff']``; until two values
      were already present before the call, ``data['lema_diff']`` is NaN.

    NOTE(review): ``val`` is read from module scope but is not assigned anywhere
    in the visible notebook - confirm where it is expected to come from.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    global data

    window = data['lsma_list']
    window.popleft()
    window.append(val)

    smoothed = pd.DataFrame(list(window)).ewm(span=data['lma_len']).mean()
    data['lema'] = list(smoothed[0])[data['lma_len'] - 1]

    history = data['lema_ready']
    if len(history) >= 2:
        # Already holding two values: rotate, then diff newest against previous.
        history.popleft()
        history.append(data['lema'])
        data['lema_diff'] = history[-1] - history[len(history) - 2]
    else:
        # Still warming up: just collect the value.
        history.append(data['lema'])
        data['lema_diff'] = np.nan

    return ()

In [5]:
def roll_ma(ma_list):
    """Return the final EWM-mean value of ``ma_list`` using a span equal to its length.

    Parameters
    ----------
    ma_list : sequence of float
        Values of one window, oldest first.

    Returns
    -------
    float
        Last element of ``ewm(span=len(ma_list)).mean()`` over the window.
    """
    window = len(ma_list)
    smoothed = pd.DataFrame(ma_list).ewm(span=window).mean()
    # Column 0 is the single data column; the last row is the fully weighted value.
    return smoothed[0].tolist()[window - 1]

In [6]:
def print_custom_value_counts(df, target_column, filter_column = None, filter_value = None):
    """Print absolute and percentage value counts of ``df[target_column]``.

    When both ``filter_column`` and ``filter_value`` are None the whole column is
    summarised; otherwise only rows where ``df[filter_column] == filter_value``
    are counted. A header line is printed first and a separator line last.

    Parameters
    ----------
    df : pandas.DataFrame
    target_column : str
        Column whose values are counted.
    filter_column : str, optional
        Column used to restrict the rows.
    filter_value : object, optional
        Value ``filter_column`` must equal for a row to be counted.
    """
    unfiltered = filter_column is None and filter_value is None
    if unfiltered:
        print(f'target_column : {target_column}')
        selected = df[target_column]
    else:
        print(f'{filter_column} : {filter_value}')
        selected = df.loc[df[filter_column] == filter_value, target_column]

    absolute = selected.value_counts()
    relative = selected.value_counts(normalize=True).mul(100)
    print(pd.concat([absolute, relative], axis=1, keys=('counts', 'percentage')))
    print('=======================')

In [7]:
def _label_direction(frame, column, pip_diff):
    """Label every row by the move of ``frame[column]`` from that row to the next one.

    Returns a Series aligned with ``frame.index`` holding 'increase' when the next
    value is at least ``pip_diff`` higher, 'decrease' when it is at least
    ``pip_diff`` lower, and 'same' otherwise (this includes the last row, whose
    successor is missing).
    """
    step = frame[column].shift(-1) - frame[column]
    labels = pd.Series('same', index=frame.index)
    # Single-step .loc assignment: unlike chained `df[col].loc[mask] = ...`,
    # this is guaranteed to write into `labels` itself.
    labels.loc[step >= pip_diff] = 'increase'
    labels.loc[step <= -pip_diff] = 'decrease'
    return labels


def run_data_prep(year):
    """Build, save and summarise the feature table for one year of tick data.

    Reads ``data/yearly_tick_data/{year}.csv``, aggregates it with ``chunk_ticks``
    (result written to ``chunk_{year}.csv`` and read back), derives RSI, simple
    and exponential moving averages, slopes and a look-ahead ``direction`` label,
    drops rows containing NaN and writes the result to ``tab_{year}.csv``.
    Afterwards two further label columns are added to the returned frame (not to
    the saved file) and cross-tabulated against ``direction`` on stdout.

    Reads settings from the module-level ``data`` dict (``input_rows``,
    ``number_of_ticks``, ``rsi_window``, ``sma_len``, ``lma_len``, ``pip_diff``)
    and, as a side effect, sets ``data['rs_max']`` and resets the four deques
    ``ssma_list``, ``lsma_list``, ``sema_ready`` and ``lema_ready``.

    Parameters
    ----------
    year : int or str
        Used to build the input and output file names.

    Returns
    -------
    pandas.DataFrame
        The saved table plus the ``tick_act_direction`` and ``act_direction`` columns.
    """
    global data
    print(f'-----------------------------------{year}--------------------------------------')

    # Column the saved `direction` label is derived from ('tick_avg' is the alternative).
    diff_col = 'sema'

    source_file_path = f'data/yearly_tick_data/{year}.csv'
    path, file_name = os.path.split(source_file_path)
    target_file_path = os.path.join(path, 'tab_' + file_name)
    chunk_file_path = os.path.join(path, 'chunk_' + file_name)

    print(f'source_file_path : {source_file_path}')
    print(f'chunk_file_path : {chunk_file_path}')
    print(f'target_file_path : {target_file_path}')

    # nrows=None is pandas' default and reads the whole file.
    df = pd.read_csv(source_file_path, nrows=data['input_rows'])
    print(f'Total input recs : {len(df)}')
    print("Data manipulation...")
    df = chunk_ticks(df, data['number_of_ticks'])
    df.to_csv(chunk_file_path, index = False)
    print(f'Records : {len(df)}')

    # Continue from the values as stored on disk (CSV round trip kept on purpose).
    df = pd.read_csv(chunk_file_path)

    data['rs_max'] = 1e6

    data['ssma_list'] = collections.deque([])
    data['lsma_list'] = collections.deque([])
    data['sema_ready'] = collections.deque([])
    data['lema_ready'] = collections.deque([])

    # Placeholders only reserve the column positions right after the chunk
    # columns; every one of them is overwritten further down.
    for placeholder in ('sema', 'lema', 'sema_diff', 'lema_diff'):
        df[placeholder] = np.nan

    # RSI -----------------------------
    df['diff'] = df['tick_avg'].diff()
    price_change = df['diff']
    magnitude = price_change.abs()
    # Rows where the condition is False (including the NaN first row) count as 0.
    # Built with `where` instead of chained `df['gain'].loc[...] = ...`, which
    # silently does nothing under pandas Copy-on-Write.
    gain = magnitude.where(price_change > 0, 0.0)
    loss = magnitude.where(price_change < 0, 0.0)
    df['avg_gain'] = gain.rolling(window=data['rsi_window']).mean()
    df['avg_loss'] = loss.rolling(window=data['rsi_window']).mean()
    df['rs'] = df['avg_gain']/df['avg_loss']
    # Caps inf (avg_loss == 0) at rs_max. NOTE(review): a NaN rs also fails the
    # `<=` test and is therefore replaced by rs_max as well - confirm that is intended.
    df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max'])
    df['rsi'] = 100 - (100 / (df['rs'] + 1))

    # Simple Moving Averages ------------------
    df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
    df['ssma_diff'] = df['ssma'].diff()
    df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
    df['lsma_diff'] = df['lsma'].diff()
    df['sma_diff'] = df['ssma'] - df['lsma']

    df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
    df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

    df['max_gap'] = df['max_tick'] -  df['tick_avg']
    df['min_gap'] = df['min_tick'] - df['tick_avg']

    print("Emas creation...")
    # Emas ----------------
    df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma)
    df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma)

    df['sema_diff'] = df['sema'].diff()
    df['lema_diff'] = df['lema'].diff()

    df['ema_diff'] = df['sema'] - df['lema']

    print("slope creation...")
    # Slopes -----------------------------
    df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope)
    df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope)

    df['slope_diff'] = df['small_sema_slope'] - df['long_sema_slope']

    print('Direction identification...')
    df = df.round(5)

    # Direction -------------------------
    # Computed on the rounded values and before the NaN rows are dropped.
    df['direction'] = _label_direction(df, diff_col, data['pip_diff'])

    # Remove NaNs ------------------------
    df = df.dropna()
    df = df.reset_index(drop=True)
    print(f'Total records : {len(df)}')

    df.to_csv(target_file_path, index = False)
    # Audible "finished" signal; winsound is only available on Windows.
    winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

    print_custom_value_counts(df = df, target_column = 'direction')

    print('Avg Direction -------------------------')
    df['act_direction'] = _label_direction(df, 'tick_avg', data['pip_diff'])
    for label in ('same', 'increase', 'decrease'):
        print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = label)

    print('\n')
    # The tick_avg-based label is kept under the existing name 'tick_act_direction'.
    df['tick_act_direction'] = df['act_direction']
    del df['act_direction']

    print('Act Direction -------------------------')
    df['act_direction'] = _label_direction(df, 'tick_act', data['pip_diff'])
    for label in ('same', 'increase', 'decrease'):
        print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = label)

    print('\n')
    print(f'-----------------------------------{year}--------------------------------------')
    return df

In [8]:
# Settings read by run_data_prep through the module-level `data` dict.
data = {
    'number_of_ticks': 300,   # rows of raw quotes aggregated into one chunk
    'rsi_window': 14,         # rolling window for the average gain / loss
    'sma_len': 5,             # short moving-average window
    'lma_len': 10,            # long moving-average window
    'pip_diff': 0.00012,      # minimum next-row move labelled increase / decrease
    'input_rows': None,       # None reads the whole CSV; an int (e.g. 5_000_000) caps the rows read
}

train_files = [2019]
#train_files = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

In [9]:
%%time
# Run the preparation for every year in train_files; after the loop `df` holds
# the frame returned for the last year processed.
for year in train_files:
    df = run_data_prep(year)

-----------------------------------2019--------------------------------------
source_file_path : data/yearly_tick_data/2019.csv
chunk_file_path : data/yearly_tick_data\chunk_2019.csv
target_file_path : data/yearly_tick_data\tab_2019.csv
Total input recs : 29186310
Data manipulation...


100%|██████████████████████████████████████████████████████████████████████████████████████████| 97288/97288 [02:07<00:00, 765.10it/s]
0it [00:00, ?it/s]

Records : 97288
Emas creation...


97284it [01:31, 1058.14it/s]
97279it [01:33, 1043.78it/s]
680it [00:00, 3377.22it/s]

slope creation...


97280it [00:28, 3444.22it/s]
97275it [00:28, 3414.62it/s]


Direction identification...
Total records : 97275
target_column : direction
          counts  percentage
same       83200   85.530712
decrease    7307    7.511694
increase    6768    6.957594
Avg Direction -------------------------
direction : same
          counts  percentage
same       57299   68.868990
increase   13103   15.748798
decrease   12798   15.382212
direction : increase
          counts  percentage
increase    5362   79.225768
same        1330   19.651300
decrease      76    1.122931
direction : decrease
          counts  percentage
decrease    5761   78.842206
same        1449   19.830300
increase      97    1.327494


Act Direction -------------------------
direction : same
          counts  percentage
same       48155   57.878606
increase   17737   21.318510
decrease   17308   20.802885
direction : increase
          counts  percentage
increase    4156   61.406619
same        2172   32.092199
decrease     440    6.501182
direction : decrease
          counts  percentage

# Dump the frame returned by the last run_data_prep call to temp.csv in the
# working directory. The index is written too (index=False is not passed here).
df.to_csv('temp.csv')