# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope(y_axis):
    global data
    ma_len = len(y_axis)
    
    x_axis = []
    for i in range(ma_len):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

In [3]:
def chunk_ticks(df, number_of_ticks):   
    global data
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['tick', 'spread']]
    
    temp_df = pd.DataFrame()
    tick_avg = []
    spread_avg = []
    tick_sd = []
    
    for i in tqdm(range(0,len(df),number_of_ticks)):
        tick_list = list(df['tick'][i:i+number_of_ticks])
        spread_list = list(df['spread'][i:i+number_of_ticks])
        
        tick_avg.append(np.mean(tick_list))
        spread_avg.append(np.mean(spread_list))
        tick_sd.append(np.std(tick_list))
        
        
    temp_df['tick_avg'] = tick_avg  
    temp_df['spread_avg'] = spread_avg  
    temp_df['tick_sd'] = tick_sd  
    
    return(temp_df)

In [4]:
def before_sma():
    global data    
    data['ssma_list'].append(val)    
    return()

def after_sma():
    global data
    
    data['ssma_list'].popleft()
    data['ssma_list'].append(val)
    data['sema'] = list(pd.DataFrame(list(data['ssma_list'])).ewm(span=data['sma_len']).mean()[0])[data['sma_len'] - 1]
    
    if len(data['sema_ready']) < 2:
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = np.nan

    elif len(data['sema_ready']) > 1:
        data['sema_ready'].popleft()
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = data['sema_ready'][-1] - data['sema_ready'][len(data['sema_ready'])-2]
    
    return()

def before_lma():
    global data    
    data['lsma_list'].append(val)    
    return()

def after_lma():
    global data
    
    data['lsma_list'].popleft()
    data['lsma_list'].append(val)
    data['lema'] = list(pd.DataFrame(list(data['lsma_list'])).ewm(span=data['lma_len']).mean()[0])[data['lma_len'] - 1]
    
    if len(data['lema_ready']) < 2:
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = np.nan

    elif len(data['lema_ready']) > 1:
        data['lema_ready'].popleft()
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = data['lema_ready'][-1] - data['lema_ready'][len(data['lema_ready'])-2]
    
    return()

In [5]:
def roll_ma(ma_list):
    global data
    ma_len = len(ma_list)
    sema_val = list(pd.DataFrame(ma_list).ewm(span=ma_len).mean()[0])[ma_len - 1]    
    return(sema_val)

## File paths

In [6]:
year = 2017
source_file_path = f'data/tick_{year}.csv'
path, file_name = os.path.split(source_file_path)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = 'chunk_'+file_name
chunk_file_path = os.path.join(path, chunk_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')

source_file_path : data/tick_2017.csv
chunk_file_path : data\chunk_tick_2017.csv
target_file_path : data\tab_tick_2017.csv


## Read data

In [7]:
%%time
#df = pd.read_csv(source_file_path, nrows=1000000)
df = pd.read_csv(source_file_path)
df.head()

Wall time: 20.3 s


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20170101 22:00:20.786,1.05148,1.05236,1
1,20170101 22:00:36.636,1.05153,1.05236,2
2,20170101 22:01:37.024,1.05153,1.05236,2
3,20170101 22:02:18.648,1.0517,1.05236,2
4,20170101 22:02:30.583,1.0517,1.05248,2


## Data manipulation

In [8]:
%%time
data = {}
data['number_of_ticks'] = 100
data['pip_diff'] = 0.00001
data['rsi_window'] = 14
data['sma_len'] = 10
data['lma_len'] = 50


df = chunk_ticks(df, data['number_of_ticks'])
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')

df = pd.read_csv(chunk_file_path)
df.head()

100%|████████████████████████████████████████████████████████████████████████| 217041/217041 [01:51<00:00, 1949.30it/s]


Records : 217041
Wall time: 1min 56s


Unnamed: 0,tick_avg,spread_avg,tick_sd
0,1.052075,0.000614,9.1e-05
1,1.052608,0.000752,0.000435
2,1.052657,0.000628,0.000176
3,1.05242,0.000683,3.7e-05
4,1.052459,0.000642,8.3e-05


In [9]:
%%time

data['rs_max'] = 1e6

data['ssma_list'] = collections.deque([])
data['lsma_list'] = collections.deque([])
data['sema_ready'] = collections.deque([])
data['lema_ready'] = collections.deque([])
df['sema'] = ''
df['lema'] = ''
df['sema_diff'] = ''
df['lema_diff'] = ''


# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max']) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Simple Moving Averages ------------------
df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

df['max_gap'] = df['max_tick'] -  df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

Wall time: 304 ms


In [10]:
%%time
# Emas ----------------
df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_ma)
df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_ma)

df['sema_diff'] = df['sema'].diff()
df['lema_diff'] = df['lema'].diff()

217032it [08:41, 416.33it/s]
216992it [04:54, 737.66it/s]

Wall time: 13min 35s





In [11]:
%%time
# Slopes -----------------------------
df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope)
df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope)

217023it [01:33, 2312.15it/s]
216983it [01:25, 2545.05it/s]

Wall time: 2min 59s





In [12]:
%%time

df['ema_diff'] = df['sema'] - df['lema']

# Direction -------------------------
diff_col = 'sema'
df['direction'] = 'same'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] > data['pip_diff']] = 'increase'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -data['pip_diff']] = 'decrease'

# Remove NaNs ------------------------
del df['gain']
del df['loss']
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

df.tail()

Total records : 216983
Wall time: 198 ms


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
216978,1.199388,5.9e-05,7.3e-05,1.199605,1.200019,-5.6e-05,-3.487974e-05,1.2e-05,2.6e-05,8.9e-05,...,-2.9e-05,-0.000558,1.199939,1.199376,0.000551,-1.2e-05,-68.681776,-70.446925,-0.000414,increase
216979,1.199755,4.5e-05,5e-05,1.199633,1.200002,2.8e-05,-1.765005e-05,0.000367,5.2e-05,6.6e-05,...,-2.4e-05,-0.000532,1.199939,1.199376,0.000183,-0.000379,-69.404853,-70.623162,-0.000369,increase
216980,1.199833,0.000153,5.1e-05,1.199673,1.199987,4e-05,-1.430889e-05,7.8e-05,5.8e-05,5.7e-05,...,-2.5e-05,-0.000495,1.199939,1.199376,0.000106,-0.000457,-68.597731,-70.592586,-0.000315,increase
216981,1.200122,0.000454,0.000251,1.199764,1.199987,9.1e-05,-7.552187e-07,0.000289,7.9e-05,4.6e-05,...,-2e-05,-0.00044,1.200122,1.199376,0.0,-0.000746,-62.255643,-70.282501,-0.000222,increase
216982,1.200367,0.001297,9.6e-05,1.199886,1.199997,0.000122,1.053261e-05,0.000246,9.6e-05,4.3e-05,...,-1.4e-05,-0.000382,1.200367,1.199376,0.0,-0.000992,-24.675949,-69.64365,-0.000111,same


## Write data to csv

In [13]:
%%time
df.to_csv(target_file_path, index = False)
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 7.98 s


## Print Report

In [14]:
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
df.head(5)

          counts  percentage
increase   92082   42.437426
decrease   90384   41.654876
same       34517   15.907698


Unnamed: 0,tick_avg,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,avg_gain,avg_loss,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,small_sema_slope,long_sema_slope,ema_diff,direction
0,1.051767,0.0001,2.2e-05,1.051715,1.051827,7.699175e-06,-4e-06,-3.3e-05,3.2e-05,4.8e-05,...,-4e-06,-0.000163,1.051852,1.051399,8.5e-05,-0.000368,-64.238891,-17.880531,-0.000112,same
1,1.051766,6.7e-05,3.7e-05,1.051722,1.051824,6.7082e-06,-4e-06,-1e-06,3.2e-05,4.5e-05,...,-4e-06,-0.000168,1.051799,1.051399,3.4e-05,-0.000366,-45.202398,-16.589788,-0.000102,same
2,1.051711,7e-05,6.8e-05,1.051721,1.051817,-1.198336e-06,-6e-06,-5.4e-05,3.2e-05,4.1e-05,...,-6e-06,-0.000159,1.051799,1.051399,8.8e-05,-0.000312,-4.687474,-15.440384,-9.7e-05,same
3,1.051697,0.000181,2.5e-05,1.05172,1.051811,-8.231908e-07,-7e-06,-1.4e-05,3.1e-05,4.2e-05,...,-6e-06,-0.000141,1.051799,1.051399,0.000102,-0.000298,28.167559,-14.559632,-9.1e-05,same
4,1.05171,0.000195,1.1e-05,1.051727,1.051805,7.046343e-06,-6e-06,1.3e-05,3e-05,4.2e-05,...,-6e-06,-0.000104,1.051799,1.051602,8.9e-05,-0.000109,34.137909,-13.787003,-7.8e-05,same
