# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
#import modin.pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time
import imblearn

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_date_hour(df):    
    
    weekday_val = []
    hour = []
    
    for i in tqdm(df['DateTime']):
        date_val  = dt.datetime.strptime(i, '%Y%m%d %H:%M:%S.%f')
        weekday_val.append(date_val.weekday())
        hour.append(date_val.hour)

    df['weekday'] = weekday_val
    df['hour'] = hour
    return(df)

In [3]:
def get_slope(y_axis):
    global data
    ma_len = len(y_axis)
    
    x_axis = []
    for i in range(ma_len):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

In [4]:
def chunk_ticks(df, number_of_ticks):   
    global data
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['weekday', 'hour','tick', 'spread']]
    
    temp_df = pd.DataFrame()
    tick_avg = []
    spread_avg = []
    tick_sd = []
    tick_act = []
    candle_height = []
    candle_max_val = []
    candle_min_val = []
    weekday = []
    hour = []
    
    for i in tqdm(range(0,len(df),number_of_ticks)):
        tick_list = list(df['tick'][i:i+number_of_ticks])
        spread_list = list(df['spread'][i:i+number_of_ticks])
        weekday_list = list(df['weekday'][i:i+number_of_ticks])
        hour_list = list(df['hour'][i:i+number_of_ticks])
        
        tick_act.append(tick_list[-1])              
        temp = list(pd.DataFrame(tick_list).ewm(span=len(tick_list)).mean()[0])[len(tick_list) - 1]
        tick_avg.append(temp)

        spread_avg.append(np.mean(spread_list))
        tick_sd.append(np.std(tick_list))
        candle_height.append(np.max(tick_list) - np.min(tick_list))
        candle_max_val.append(np.max(tick_list))
        candle_min_val.append(np.min(tick_list))
        weekday.append(weekday_list[-1])
        hour.append(hour_list[-1])
        
    temp_df['weekday']      =  weekday
    temp_df['hour']      = hour
    temp_df['tick_act']      = tick_act      
    temp_df['tick_avg']      = tick_avg  
    temp_df['spread_avg']    = spread_avg  
    temp_df['tick_sd']       = tick_sd  
    temp_df['candle_height'] = candle_height
    temp_df['candle_max_val'] = candle_max_val
    temp_df['candle_min_val'] = candle_min_val
    
    return(temp_df)

In [5]:
def before_sma():
    global data    
    data['ssma_list'].append(val)    
    return()

def after_sma():
    global data
    
    data['ssma_list'].popleft()
    data['ssma_list'].append(val)
    data['sema'] = list(pd.DataFrame(list(data['ssma_list'])).ewm(span=data['sma_len']).mean()[0])[data['sma_len'] - 1]
    
    if len(data['sema_ready']) < 2:
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = np.nan

    elif len(data['sema_ready']) > 1:
        data['sema_ready'].popleft()
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = data['sema_ready'][-1] - data['sema_ready'][len(data['sema_ready'])-2]
    
    return()

def before_lma():
    global data    
    data['lsma_list'].append(val)    
    return()

def after_lma():
    global data
    
    data['lsma_list'].popleft()
    data['lsma_list'].append(val)
    data['lema'] = list(pd.DataFrame(list(data['lsma_list'])).ewm(span=data['lma_len']).mean()[0])[data['lma_len'] - 1]
    
    if len(data['lema_ready']) < 2:
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = np.nan

    elif len(data['lema_ready']) > 1:
        data['lema_ready'].popleft()
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = data['lema_ready'][-1] - data['lema_ready'][len(data['lema_ready'])-2]
    
    return()

In [6]:
def roll_ma(ma_list):
    global data
    ma_len = len(ma_list)
    sema_val = list(pd.DataFrame(ma_list).ewm(span=ma_len).mean()[0])[ma_len - 1]    
    return(sema_val)

In [7]:
def print_custom_value_counts(df, target_column, filter_column = None, filter_value = None):    
    if filter_column is None and filter_value is None:
        print(f'target_column : {target_column}')
        g= df[target_column]
        print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
    else:
        print(f'{filter_column} : {filter_value}')
        g= df.loc[df[filter_column] == filter_value, target_column]
        print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
    print('=======================')

In [8]:
def get_dir_cols_old(df):
    
    print('Calculating overall and instant dirs ...')
    
    cont_dir = 0
    cont_dir_list = []

    dir = 0
    dir_list = []

    for s_diff in tqdm(df['sema_diff']):

        if s_diff >= data['pip_diff']:
            cont_dir += 1
            dir = 1

        elif s_diff <= -data['pip_diff']:
            cont_dir -= 1
            dir = -1

        else:
            cont_dir += 0
            dir = 0

        cont_dir_list.append(cont_dir)
        dir_list.append(dir)

    df['overall_dir'] =  cont_dir_list
    df['dir_val'] =  dir_list
    return(df)

In [9]:
def get_dir_cols(df):
    
    print('Calculating overall and instant dirs ...')
    time.sleep(1)
    
    cont_dir = 0
    cont_dir_list = []

    dir = 0
    dir_list = []

    for s_diff in tqdm(df['sema_diff']):

        if s_diff >= data['pip_diff']:
            cont_dir += 1
            dir += 1

        elif s_diff <= -data['pip_diff']:
            cont_dir -= 1
            dir -= 1

        else:
            cont_dir += 0
            dir = 0

        cont_dir_list.append(cont_dir)
        dir_list.append(dir)

    df['overall_dir'] =  cont_dir_list
    df['dir_val'] =  dir_list
    return(df)

In [10]:
def run_data_prep(year):
    global data
    print(f'-----------------------------------{year}--------------------------------------')
    
    diff_col = 'sema'
    #diff_col = 'tick_avg'

    source_file_path = f'data/yearly_tick_data/{year}.csv'
    path, file_name = os.path.split(source_file_path)

    target_file_name = 'tab_'+file_name
    target_file_path = os.path.join(path, target_file_name)

    chunk_file_name = 'chunk_'+file_name
    chunk_file_path = os.path.join(path, chunk_file_name)

    print(f'source_file_path : {source_file_path}')
    print(f'chunk_file_path : {chunk_file_path}')
    print(f'target_file_path : {target_file_path}')

    if data['input_rows'] is None:
        df = pd.read_csv(source_file_path)
    else:
        df = pd.read_csv(source_file_path, nrows=data['input_rows'])
    
    print(f'Total input recs : {len(df)}')
    print("Data manipulation...")
    time.sleep(1)
    
    print('Extracting weekday and hour...')
    time.sleep(1)
    df = get_date_hour(df)
    
    print('Chunking ticks...')
    time.sleep(1)
    df = chunk_ticks(df, data['number_of_ticks'])
    
    df.to_csv(chunk_file_path, index = False)
        
    print(f'Records : {len(df)}')

    df = pd.read_csv(chunk_file_path)

    data['rs_max'] = 1e6

    data['ssma_list'] = collections.deque([])
    data['lsma_list'] = collections.deque([])
    data['sema_ready'] = collections.deque([])
    data['lema_ready'] = collections.deque([])
    df['sema'] = ''
    df['lema'] = ''
    df['sema_diff'] = ''
    df['lema_diff'] = ''

    df['top_diff'] = df['candle_max_val'] - df['tick_act']
    df['bottom_diff'] = df['tick_act'] - df['candle_min_val'] 
    
    # RSI -----------------------------
    df['diff'] = df['tick_avg'].diff()
    df['gain'] = 0
    df['loss'] = 0
    df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
    df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
    df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
    df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
    df['rs'] = df['avg_gain']/df['avg_loss']
    df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max']) 
    df['rsi'] = 100 - (100 / (df['rs'] + 1))

    # Simple Moving Averages ------------------
    df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
    df['ssma_diff'] = df['ssma'].diff()
    df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
    df['lsma_diff'] = df['lsma'].diff()
    df['sma_diff'] = df['ssma'] - df['lsma']

    df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
    df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

    df['max_gap'] = df['max_tick'] -  df['tick_avg']
    df['min_gap'] = df['min_tick'] - df['tick_avg']

    print("Emas creation...")
    time.sleep(1)
    # Emas ----------------
    df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma)
    df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma)

    df['sema_diff'] = df['sema'].diff()
    df['lema_diff'] = df['lema'].diff()

    df['ema_diff'] = df['sema'] - df['lema']

  
    print("slope creation...")
    time.sleep(1)
    # Slopes -----------------------------
    df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope)
    df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope)

    df['slope_diff'] = df['small_sema_slope'] - df['long_sema_slope']
    
    print('Direction identification...')
    df = df.round(5)
    time.sleep(1)
    df = get_dir_cols(df)
    
    # Direction -------------------------
    df['direction'] = 'same'
    df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] >= data['pip_diff']] = 'increase'
    df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] <= -data['pip_diff']] = 'decrease'

    # Remove NaNs ------------------------
    del df['gain']
    del df['loss']
    
    del df['candle_max_val']
    del df['candle_min_val']    
    
    df = df.dropna()
    df = df.reset_index(drop=True)
    print(f'Total records : {len(df)}')
    

    if data['write_to_csv']:        
        df.to_csv(target_file_path, index = False)
    #winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

    print_custom_value_counts(df = df, target_column = 'direction')    
    
    print('Avg Direction -------------------------')
    diff_col = 'tick_avg'

    df['act_direction'] = 'same'
    df['act_direction'].loc[df[diff_col].shift(-1) - df[diff_col] >= data['pip_diff']] = 'increase'
    df['act_direction'].loc[df[diff_col].shift(-1) - df[diff_col] <= -data['pip_diff']] = 'decrease'

    print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = 'same')    
    print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = 'increase')
    print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = 'decrease')

    print('\n')
    df['tick_act_direction'] = df['act_direction']
    del df['act_direction']    

    print('Act Direction -------------------------')
    diff_col = 'tick_act'

    df['act_direction'] = 'same'
    df['act_direction'].loc[df[diff_col].shift(-1) - df[diff_col] >= data['pip_diff']] = 'increase'
    df['act_direction'].loc[df[diff_col].shift(-1) - df[diff_col] <= -data['pip_diff']] = 'decrease'

    print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = 'same')
    print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = 'increase')
    print_custom_value_counts(df = df, target_column = 'act_direction', filter_column = 'direction', filter_value = 'decrease')


    print('\n')
    #del df['act_direction']
    print(f'-----------------------------------{year}--------------------------------------')
    return(df)

In [11]:
data = {}
data['number_of_ticks']   = 300
data['rsi_window']        = 14
data['sma_len']           = 5
data['lma_len']           = 10
data['pip_diff']          = 0.00012


#data['input_rows']        = 1_000_000
data['input_rows']        = None

data['write_to_csv'] = True

train_files = [2020]
#train_files = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

In [12]:
%%time
for year in train_files:
    df = run_data_prep(year)

-----------------------------------2020--------------------------------------
source_file_path : data/yearly_tick_data/2020.csv
chunk_file_path : data/yearly_tick_data\chunk_2020.csv
target_file_path : data/yearly_tick_data\tab_2020.csv
Total input recs : 32763877
Data manipulation...
Extracting weekday and hour...


100%|████████████| 32763877/32763877 [05:46<00:00, 94437.29it/s]


Chunking ticks...


100%|██████████████████| 109213/109213 [03:04<00:00, 591.76it/s]


Records : 109213
Emas creation...


109209it [01:16, 1427.14it/s]
109204it [01:17, 1400.13it/s]


slope creation...


109205it [00:27, 3949.31it/s]
109200it [00:27, 3913.84it/s]


Direction identification...
Calculating overall and instant dirs ...


100%|██████████████| 109213/109213 [00:00<00:00, 1869226.00it/s]


Total records : 109200
target_column : direction
          counts  percentage
same       73270   67.097070
increase   18039   16.519231
decrease   17891   16.383700
Avg Direction -------------------------
direction : same
          counts  percentage
same       40734   55.594377
increase   16475   22.485328
decrease   16061   21.920295
direction : increase
          counts  percentage
increase   14058   77.931149
same        3556   19.712844
decrease     425    2.356006
direction : decrease
          counts  percentage
decrease   13929   77.854787
same        3548   19.831200
increase     414    2.314013


Act Direction -------------------------
direction : same
          counts  percentage
same       31967   43.629043
increase   20866   28.478231
decrease   20437   27.892726
direction : increase
          counts  percentage
increase   11467   63.567825
same        4832   26.786407
decrease    1740    9.645768
direction : decrease
          counts  percentage
decrease   11395   63.6912

In [13]:
df

Unnamed: 0,weekday,hour,tick_act,tick_avg,spread_avg,tick_sd,candle_height,sema,lema,sema_diff,...,min_gap,ema_diff,small_sema_slope,long_sema_slope,slope_diff,overall_dir,dir_val,direction,tick_act_direction,act_direction
0,3,1,1.12206,1.12193,0.00003,0.00008,0.00026,1.12180,1.12182,0.00007,...,-0.00029,-0.00002,-73.63685,21.70225,-95.33910,1,0,same,increase,increase
1,3,1,1.12226,1.12210,0.00002,0.00006,0.00025,1.12191,1.12189,0.00011,...,-0.00047,0.00003,67.66099,-23.89354,91.55453,1,0,increase,increase,increase
2,3,1,1.12240,1.12231,0.00003,0.00005,0.00024,1.12208,1.12197,0.00017,...,-0.00068,0.00011,83.36255,12.49880,70.86374,2,1,increase,same,same
3,3,1,1.12238,1.12237,0.00002,0.00004,0.00013,1.12221,1.12206,0.00013,...,-0.00061,0.00016,85.40420,62.21494,23.18926,3,2,same,same,same
4,3,1,1.12234,1.12234,0.00002,0.00002,0.00010,1.12229,1.12212,0.00007,...,-0.00042,0.00017,85.51911,76.68733,8.83178,3,0,same,same,same
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109195,3,21,1.22195,1.22230,0.00005,0.00022,0.00085,1.22198,1.22181,0.00020,...,-0.00076,0.00017,84.76995,49.54013,35.22982,206,2,same,decrease,same
109196,3,21,1.22198,1.22204,0.00006,0.00007,0.00025,1.22203,1.22185,0.00005,...,-0.00044,0.00018,85.24346,78.42883,6.81463,206,0,same,decrease,decrease
109197,3,21,1.22173,1.22178,0.00007,0.00012,0.00040,1.22195,1.22185,-0.00008,...,-0.00018,0.00010,83.54918,81.57060,1.97858,206,0,same,same,decrease
109198,3,21,1.22158,1.22172,0.00008,0.00007,0.00029,1.22188,1.22184,-0.00007,...,0.00000,0.00003,59.40508,80.91217,-21.50708,206,0,same,same,same


In [15]:
df[['sema', 'sema_diff', 'overall_dir', 'dir_val', 'direction']].head(50)

Unnamed: 0,sema,sema_diff,overall_dir,dir_val,direction
0,1.1218,7e-05,1,0,same
1,1.12191,0.00011,1,0,increase
2,1.12208,0.00017,2,1,increase
3,1.12221,0.00013,3,2,same
4,1.12229,7e-05,3,0,same
5,1.12232,3e-05,3,0,same
6,1.1222,-0.00011,3,0,same
7,1.12212,-8e-05,3,0,same
8,1.12207,-5e-05,3,0,same
9,1.122,-7e-05,3,0,same
