# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time
import imblearn

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope(y_axis):
    """Return the angle, in degrees, of the least-squares line through ``y_axis``.

    The values are regressed against a synthetic x axis whose k-th position
    (k counted from 1) is ``1 + k * 0.0001 * 0.1``. The fitted slope is passed
    through ``atan`` and converted to degrees.

    Parameters
    ----------
    y_axis : sequence of float
        Values to fit; its length determines how many x positions are generated.

    Returns
    -------
    float
        ``degrees(atan(slope))`` of the fitted line.
    """
    # The product is evaluated left to right, (k * 0.0001) * 0.1, which fixes
    # the float rounding of every x position.
    x_positions = [1 + (k * 0.0001 * 0.1) for k in range(1, len(y_axis) + 1)]

    fit = linregress(x_positions, y_axis)
    return math.degrees(math.atan(fit.slope))

In [3]:
def chunk_ticks(df, number_of_ticks):
    """Collapse raw quotes into one summary row per block of consecutive rows.

    The mid price ``(Bid + Ask) / 2`` and the spread ``Ask - Bid`` are computed
    for every input row, then the rows are walked in consecutive blocks of
    ``number_of_ticks`` (the final block may be shorter).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Bid'`` and ``'Ask'`` columns. It is not modified.
    number_of_ticks : int
        Number of consecutive rows per block; must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per block with the columns

        * ``tick_act``   - last mid price of the block,
        * ``tick_avg``   - final value of an exponentially weighted mean of the
          block's mid prices, with ``span`` equal to the block length,
        * ``spread_avg`` - arithmetic mean of the block's spreads,
        * ``tick_sd``    - ``np.std`` of the block's mid prices.

    Raises
    ------
    ValueError
        If ``number_of_ticks`` is not positive.
    """
    if number_of_ticks <= 0:
        raise ValueError(f'number_of_ticks must be positive, got {number_of_ticks}')

    # Derived values stay local so the caller's frame does not gain
    # 'tick' / 'spread' columns as a side effect.
    mid_prices = ((df['Bid'] + df['Ask']) / 2).to_numpy()
    spreads = (df['Ask'] - df['Bid']).to_numpy()

    tick_act = []
    tick_avg = []
    spread_avg = []
    tick_sd = []

    for start in tqdm(range(0, len(mid_prices), number_of_ticks)):
        stop = start + number_of_ticks
        block_ticks = mid_prices[start:stop]

        tick_act.append(block_ticks[-1])
        # Only the last value of the EWM is kept; span tracks the block length
        # so a short final block is smoothed over its own size.
        tick_avg.append(pd.Series(block_ticks).ewm(span=len(block_ticks)).mean().iloc[-1])
        spread_avg.append(np.mean(spreads[start:stop]))
        tick_sd.append(np.std(block_ticks))

    return pd.DataFrame({
        'tick_act': tick_act,
        'tick_avg': tick_avg,
        'spread_avg': spread_avg,
        'tick_sd': tick_sd,
    })

In [4]:
def before_sma():
    """Append the current ``val`` to the short-window buffer ``data['ssma_list']``.

    Both ``data`` and ``val`` are looked up as module-level names at call time.
    ``val`` is not a parameter, and no assignment to it is visible in this part
    of the notebook, so it has to be defined before this function is called.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    short_buffer = data['ssma_list']
    short_buffer.append(val)
    return ()

def after_sma():
    """Slide the short-window buffer by one value and refresh ``data['sema']``.

    Reads and updates the module-level ``data`` dict:

    * drops the oldest entry of ``data['ssma_list']`` and appends ``val``;
    * stores in ``data['sema']`` the element at position ``sma_len - 1`` of the
      exponentially weighted mean (``span = data['sma_len']``) of the buffer;
    * keeps the most recent EMA values in ``data['sema_ready']`` and stores the
      difference of the last two in ``data['sema_diff']`` (``np.nan`` while
      fewer than two values had been recorded before this call).

    ``val`` is looked up as a module-level name; it is not a parameter, and no
    assignment to it is visible in this part of the notebook.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    short_buffer = data['ssma_list']
    short_buffer.popleft()
    short_buffer.append(val)

    span = data['sma_len']
    smoothed = pd.DataFrame(list(short_buffer)).ewm(span=span).mean()[0]
    data['sema'] = list(smoothed)[span - 1]

    history = data['sema_ready']
    if len(history) < 2:
        # Still filling the two-slot history: no difference can be formed yet.
        history.append(data['sema'])
        data['sema_diff'] = np.nan
    else:
        history.popleft()
        history.append(data['sema'])
        data['sema_diff'] = history[-1] - history[-2]

    return ()

def before_lma():
    """Append the current ``val`` to the long-window buffer ``data['lsma_list']``.

    Both ``data`` and ``val`` are looked up as module-level names at call time.
    ``val`` is not a parameter, and no assignment to it is visible in this part
    of the notebook, so it has to be defined before this function is called.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    long_buffer = data['lsma_list']
    long_buffer.append(val)
    return ()

def after_lma():
    """Slide the long-window buffer by one value and refresh ``data['lema']``.

    Reads and updates the module-level ``data`` dict:

    * drops the oldest entry of ``data['lsma_list']`` and appends ``val``;
    * stores in ``data['lema']`` the element at position ``lma_len - 1`` of the
      exponentially weighted mean (``span = data['lma_len']``) of the buffer;
    * keeps the most recent EMA values in ``data['lema_ready']`` and stores the
      difference of the last two in ``data['lema_diff']`` (``np.nan`` while
      fewer than two values had been recorded before this call).

    ``val`` is looked up as a module-level name; it is not a parameter, and no
    assignment to it is visible in this part of the notebook.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    long_buffer = data['lsma_list']
    long_buffer.popleft()
    long_buffer.append(val)

    span = data['lma_len']
    smoothed = pd.DataFrame(list(long_buffer)).ewm(span=span).mean()[0]
    data['lema'] = list(smoothed)[span - 1]

    history = data['lema_ready']
    if len(history) < 2:
        # Still filling the two-slot history: no difference can be formed yet.
        history.append(data['lema'])
        data['lema_diff'] = np.nan
    else:
        history.popleft()
        history.append(data['lema'])
        data['lema_diff'] = history[-1] - history[-2]

    return ()

In [5]:
def roll_ma(ma_list):
    """Return the last value of an exponentially weighted mean over ``ma_list``.

    The smoothing ``span`` equals the number of values supplied, so the result
    is the EWM evaluated at the final position of the window.

    Parameters
    ----------
    ma_list : sequence of float
        Window of values; wrapped in a DataFrame whose column ``0`` is read
        back, so it must produce a column labelled ``0`` (a list or an unnamed
        Series does).

    Returns
    -------
    float
        EWM value at index ``len(ma_list) - 1``.
    """
    window = len(ma_list)
    smoothed = pd.DataFrame(ma_list).ewm(span=window).mean()
    return list(smoothed[0])[window - 1]

In [6]:
def print_custom_value_counts(df, target_column, filter_column = None, filter_value = None):
    """Print absolute and percentage value counts of ``target_column``.

    When both ``filter_column`` and ``filter_value`` are ``None`` the whole
    column is summarised under a ``target_column : <name>`` header. Otherwise
    only rows where ``df[filter_column] == filter_value`` are summarised, under
    a ``<filter_column> : <filter_value>`` header. A separator line of ``=``
    characters is printed last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns referred to below.
    target_column : str
        Column whose values are counted.
    filter_column : str, optional
        Column used to select rows.
    filter_value : object, optional
        Value ``filter_column`` must equal for a row to be counted.

    Returns
    -------
    None
    """
    no_filter = filter_column is None and filter_value is None
    if no_filter:
        print(f'target_column : {target_column}')
        selected = df[target_column]
    else:
        print(f'{filter_column} : {filter_value}')
        selected = df.loc[df[filter_column] == filter_value, target_column]

    counts = selected.value_counts()
    percentages = selected.value_counts(normalize=True).mul(100)
    print(pd.concat([counts, percentages], axis=1, keys=('counts', 'percentage')))
    print('=======================')

In [7]:
def _label_direction(series, pip_diff):
    """Label each row by the move from its value to the next row's value.

    Returns a Series aligned with ``series`` that holds ``'increase'`` where the
    next value exceeds the current one by more than ``pip_diff``, ``'decrease'``
    where it is lower by more than ``pip_diff`` and ``'same'`` otherwise. Rows
    whose step is NaN (e.g. the last row, which has no successor) are ``'same'``.
    """
    step = series.shift(-1) - series
    labels = pd.Series('same', index=series.index, dtype=object)
    # Single-step .loc writes on an owned Series: no chained assignment.
    labels.loc[step > pip_diff] = 'increase'
    labels.loc[step < -pip_diff] = 'decrease'
    return labels


def run_data_prep(year):
    """Build the feature table for one year of tick data.

    Reads ``data/yearly_tick_data/<year>.csv``, chunks it with ``chunk_ticks``,
    derives RSI, simple/exponential moving averages, slope angles and direction
    labels, and prints label distributions along the way.

    Settings are read from the module-level ``data`` dict (``input_rows``,
    ``number_of_ticks``, ``rsi_window``, ``sma_len``, ``lma_len``,
    ``pip_diff``).

    Side effects
    ------------
    * writes ``chunk_<year>.csv`` (chunked ticks) and ``tab_<year>.csv``
      (feature table with the ``direction`` label) next to the source file;
    * sets ``data['rs_max']`` and resets the ``ssma_list`` / ``lsma_list`` /
      ``sema_ready`` / ``lema_ready`` deques in ``data``;
    * plays a sound through ``winsound`` once the table has been written.

    Parameters
    ----------
    year : int or str
        Used to build the source file name.

    Returns
    -------
    pandas.DataFrame
        The feature table with NaN rows removed, plus the
        ``tick_act_direction`` and ``act_direction`` columns, which are added
        after ``tab_<year>.csv`` is written and so are not in that file.
    """
    global data
    print(f'-----------------------------------{year}--------------------------------------')

    # Column whose next-step move defines the 'direction' label
    # ('tick_avg' is the alternative).
    diff_col = 'sema'
    pip_diff = data['pip_diff']

    source_file_path = f'data/yearly_tick_data/{year}.csv'
    path, file_name = os.path.split(source_file_path)
    target_file_path = os.path.join(path, 'tab_' + file_name)
    chunk_file_path = os.path.join(path, 'chunk_' + file_name)

    print(f'source_file_path : {source_file_path}')
    print(f'chunk_file_path : {chunk_file_path}')
    print(f'target_file_path : {target_file_path}')

    # nrows=None (the read_csv default) reads the whole file.
    df = pd.read_csv(source_file_path, nrows=data['input_rows'])
    print(f'Total input recs : {len(df)}')
    print("Data manipulation...")
    df = chunk_ticks(df, data['number_of_ticks'])
    df.to_csv(chunk_file_path, index = False)
    print(f'Records : {len(df)}')

    # Continue from the file just written, so the features are derived from
    # exactly the values stored in the chunk file.
    df = pd.read_csv(chunk_file_path)

    data['rs_max'] = 1e6

    # Reset the incremental moving-average state kept in the shared dict.
    data['ssma_list'] = collections.deque([])
    data['lsma_list'] = collections.deque([])
    data['sema_ready'] = collections.deque([])
    data['lema_ready'] = collections.deque([])

    # Placeholders: they fix the position of these columns in the output
    # table; the real values are filled in by the EMA section below.
    for placeholder in ('sema', 'lema', 'sema_diff', 'lema_diff'):
        df[placeholder] = ''

    # RSI -----------------------------
    price_change = df['tick_avg'].diff()
    df['diff'] = price_change
    # Built in one assignment each instead of writing through
    # df['gain'].loc[...], which is chained assignment and does not reliably
    # reach the frame. The first row (NaN change) stays 0 in both columns.
    df['gain'] = price_change.abs().where(price_change > 0, 0)
    df['loss'] = price_change.abs().where(price_change < 0, 0)
    df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
    df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
    df['rs'] = df['avg_gain']/df['avg_loss']
    # Caps the ratio at rs_max. NOTE(review): NaN ratios also fail the `<=`
    # test and are therefore replaced by rs_max as well - confirm intended.
    df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max'])
    df['rsi'] = 100 - (100 / (df['rs'] + 1))

    # Simple Moving Averages ------------------
    df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
    df['ssma_diff'] = df['ssma'].diff()
    df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
    df['lsma_diff'] = df['lsma'].diff()
    df['sma_diff'] = df['ssma'] - df['lsma']

    df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
    df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

    df['max_gap'] = df['max_tick'] -  df['tick_avg']
    df['min_gap'] = df['min_tick'] - df['tick_avg']

    print("Emas creation...")
    # Emas ----------------
    df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma)
    df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma)

    df['sema_diff'] = df['sema'].diff()
    df['lema_diff'] = df['lema'].diff()

    df['ema_diff'] = df['sema'] - df['lema']

    print("slope creation...")
    # Slopes -----------------------------
    df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope)
    df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope)

    df['slope_diff'] = df['small_sema_slope'] - df['long_sema_slope']

    print('Direction identification...')
    df = df.round(5)

    # Direction -------------------------
    # Labelled on the rounded values and before the NaN rows are removed.
    df['direction'] = _label_direction(df[diff_col], pip_diff)

    # Remove NaNs ------------------------
    df = df.drop(columns=['gain', 'loss'])
    df = df.dropna()
    df = df.reset_index(drop=True)
    print(f'Total records : {len(df)}')

    df.to_csv(target_file_path, index = False)
    # winsound is a Windows-only module; this is the completion chime.
    winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

    print_custom_value_counts(df = df, target_column = 'direction')

    direction_labels = ('same', 'increase', 'decrease')

    print('Avg Direction -------------------------')
    df['act_direction'] = _label_direction(df['tick_avg'], pip_diff)
    for label in direction_labels:
        print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = label)

    print('\n')
    # The tick_avg-based labels are kept under the name 'tick_act_direction'.
    df = df.rename(columns={'act_direction': 'tick_act_direction'})

    print('Act Direction -------------------------')
    df['act_direction'] = _label_direction(df['tick_act'], pip_diff)
    for label in direction_labels:
        print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = label)

    print('\n')
    print(f'-----------------------------------{year}--------------------------------------')
    return(df)

In [8]:
# Pipeline settings; the data-prep functions read them from this module-level dict.
data = {
    'number_of_ticks': 180,
    'rsi_window': 14,
    'sma_len': 5,
    'lma_len': 10,
    'pip_diff': 0.0002,
    # None reads every row of a source file; set an int (e.g. 5_000_000)
    # to read only that many rows.
    'input_rows': None,
}

# Years to process.
train_files = [2020]
#train_files = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

In [9]:
%%time
# Run the preparation once per configured year. `df` is rebound on every pass,
# so after the loop it holds only the table for the last year in `train_files`.
for year in train_files:
    df = run_data_prep(year)

-----------------------------------2020--------------------------------------
source_file_path : data/yearly_tick_data/2020.csv
chunk_file_path : data/yearly_tick_data\chunk_2020.csv
target_file_path : data/yearly_tick_data\tab_2020.csv
Total input recs : 32763877
Data manipulation...


100%|████████████████████████████████████████████████████████████████████████████████████████| 182022/182022 [03:50<00:00, 790.12it/s]


Records : 182022


94it [00:00, 932.86it/s]

Emas creation...


182018it [02:54, 1042.78it/s]
182013it [02:53, 1051.79it/s]
630it [00:00, 3154.51it/s]

slope creation...


182014it [00:53, 3407.17it/s]
182009it [00:53, 3404.09it/s]


Direction identification...
Total records : 182009
target_column : direction
          counts  percentage
same      171025   93.965134
decrease    5637    3.097100
increase    5347    2.937767
Avg Direction -------------------------
direction : same
          counts  percentage
same      137322   80.293524
increase   16892    9.876919
decrease   16811    9.829557
direction : increase
          counts  percentage
increase    4267   79.801758
same        1048   19.599776
decrease      32    0.598466
direction : decrease
          counts  percentage
decrease    4512   80.042576
same        1102   19.549406
increase      23    0.408018


Act Direction -------------------------
direction : same
          counts  percentage
same      122632   71.704137
increase   24278   14.195585
decrease   24115   14.100278
direction : increase
          counts  percentage
increase    3299   61.698148
same        1830   34.224799
decrease     218    4.077053
direction : decrease
          counts  percentag

In [10]:
# Dump the last prepared table to the working directory. Unlike the to_csv calls
# inside run_data_prep, no index=False is passed, so the row index is written too.
df.to_csv('temp.csv')