# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import  Pool
import time

import math, collections
from scipy.stats import linregress

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import warnings
warnings.filterwarnings('ignore')

In [2]:
def chunk_ticks(df, number_of_ticks):   
    global data
    
    df['tick'] = (df['Bid'] + df['Ask'])/2
    df['spread'] = df['Ask'] - df['Bid']
    df = df[['tick', 'spread']]
    
    temp_df = pd.DataFrame()
    tick_avg = []
    spread_avg = []
    tick_sd = []
    tick_high = []
    tick_low = []
    
    for i in tqdm(range(0,len(df),number_of_ticks)):
        tick_list = list(df['tick'][i:i+number_of_ticks])
        spread_list = list(df['spread'][i:i+number_of_ticks])
        
        
        tick_avg.append(list(pd.DataFrame(tick_list).ewm(span=len(tick_list)).mean()[0])[len(tick_list) - 1])        
        #tick_avg.append(np.mean(tick_list))  
        
        spread_avg.append(np.mean(spread_list))
        tick_sd.append(np.std(tick_list))
        tick_high.append(np.max(tick_list))
        tick_low.append(np.min(tick_list))
        
        
    temp_df['tick_avg'] = tick_avg  
    temp_df['tick_high'] = tick_high
    temp_df['tick_low'] = tick_low
    temp_df['spread_avg'] = spread_avg  
    temp_df['tick_sd'] = tick_sd

    return(temp_df)

In [3]:
def before_sma():
    global data    
    data['ssma_list'].append(val)    
    return()

def after_sma():
    global data
    
    data['ssma_list'].popleft()
    data['ssma_list'].append(val)
    data['sema'] = list(pd.DataFrame(list(data['ssma_list'])).ewm(span=data['sma_len']).mean()[0])[data['sma_len'] - 1]
    
    if len(data['sema_ready']) < 2:
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = np.nan

    elif len(data['sema_ready']) > 1:
        data['sema_ready'].popleft()
        data['sema_ready'].append(data['sema'])
        data['sema_diff'] = data['sema_ready'][-1] - data['sema_ready'][len(data['sema_ready'])-2]
    
    return()

def before_lma():
    global data    
    data['lsma_list'].append(val)    
    return()

def after_lma():
    global data
    
    data['lsma_list'].popleft()
    data['lsma_list'].append(val)
    data['lema'] = list(pd.DataFrame(list(data['lsma_list'])).ewm(span=data['lma_len']).mean()[0])[data['lma_len'] - 1]
    
    if len(data['lema_ready']) < 2:
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = np.nan

    elif len(data['lema_ready']) > 1:
        data['lema_ready'].popleft()
        data['lema_ready'].append(data['lema'])
        data['lema_diff'] = data['lema_ready'][-1] - data['lema_ready'][len(data['lema_ready'])-2]
    
    return()

In [4]:
def roll_sma(ssma_list):
    global data
    sema_val = list(pd.DataFrame(ssma_list).ewm(span=data['sma_len']).mean()[0])[data['sma_len'] - 1]    
    return(sema_val)

def roll_lma(lsma_list):
    global data
    lema_val = list(pd.DataFrame(ssma_list).ewm(span=data['lma_len']).mean()[0])[data['lma_len'] - 1]    
    return(lema_val)

In [5]:
def get_slope_s(y_axis):
    global data
    x_axis = []
    for i in range(data['sma_len']):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))

    return(slope_tick)

def get_slope_l(y_axis):
    global data
    x_axis = []
    for i in range(data['lma_len']):
        x_axis.append(1 + ((i+1) * 0.0001 * 0.1))
    
    slope_tick, intercept, _, _, _ = linregress(x_axis, y_axis)
    slope_tick = math.degrees(math.atan(slope_tick))
    
    return(slope_tick)

## File paths

In [6]:
year = 2018
source_file_path = f'data/tick_{year}.csv'
path, file_name = os.path.split(source_file_path)

target_file_name = 'tab_'+file_name
target_file_path = os.path.join(path, target_file_name)

chunk_file_name = 'chunk_'+file_name
chunk_file_path = os.path.join(path, chunk_file_name)

print(f'source_file_path : {source_file_path}')
print(f'chunk_file_path : {chunk_file_path}')
print(f'target_file_path : {target_file_path}')

source_file_path : data/tick_2018.csv
chunk_file_path : data\chunk_tick_2018.csv
target_file_path : data\tab_tick_2018.csv


## Read data

In [7]:
%%time
df = pd.read_csv(source_file_path, nrows=1000000)
#df = pd.read_csv(source_file_path)
df.head()

Wall time: 583 ms


Unnamed: 0,DateTime,Bid,Ask,Volume
0,20180101 22:00:08.661,1.20102,1.20143,2
1,20180101 22:00:08.895,1.20102,1.20148,2
2,20180101 22:00:10.634,1.20102,1.20147,2
3,20180101 22:00:11.223,1.20102,1.20148,2
4,20180101 22:00:29.530,1.20102,1.20145,2


## Data manipulation

In [8]:
data = {}
diff_col = 'tick_avg'
data['pip_diff'] = 0.0002

data['number_of_ticks'] = 200

data['sma_len'] = 20
data['lma_len'] = 50

data['rsi_window'] = 14

In [9]:
%%time

df = chunk_ticks(df, data['number_of_ticks'])
df.to_csv(chunk_file_path, index = False)
print(f'Records : {len(df)}')

df = pd.read_csv(chunk_file_path)
df.head()

100%|███████████████████████████████████████████████| 5000/5000 [00:06<00:00, 789.64it/s]

Records : 5000
Wall time: 6.45 s





Unnamed: 0,tick_avg,tick_high,tick_low,spread_avg,tick_sd
0,1.200944,1.20125,1.20046,0.000366,0.000204
1,1.200747,1.201145,1.200655,0.000279,0.000123
2,1.200715,1.200785,1.200685,0.000245,1.5e-05
3,1.201365,1.201695,1.200725,0.000219,0.000239
4,1.201463,1.20157,1.20139,0.000167,4.8e-05


In [10]:
%%time

data['rs_max'] = 1e6

data['ssma_list'] = collections.deque([])
data['lsma_list'] = collections.deque([])
data['sema_ready'] = collections.deque([])
data['lema_ready'] = collections.deque([])
df['sema'] = ''
df['lema'] = ''
df['sema_diff'] = ''
df['lema_diff'] = ''


# RSI -----------------------------
df['diff'] = df['tick_avg'].diff()
df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])
df['avg_gain'] = df['gain'].rolling(window=data['rsi_window']).mean()
df['avg_loss'] = df['loss'].rolling(window=data['rsi_window']).mean()
df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= data['rs_max'], data['rs_max']) 
df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Simple Moving Averages ------------------
df['ssma'] = df['tick_avg'].rolling(window=data['sma_len']).mean()
df['ssma_diff'] = df['ssma'].diff()
df['lsma'] = df['tick_avg'].rolling(window=data['lma_len']).mean()
df['lsma_diff'] = df['lsma'].diff()
df['sma_diff'] = df['ssma'] - df['lsma']

df['max_tick'] = df['tick_avg'].rolling(window=data['sma_len']).max()
df['min_tick'] = df['tick_avg'].rolling(window=data['sma_len']).min()

df['max_gap'] = df['max_tick'] -  df['tick_avg']
df['min_gap'] = df['min_tick'] - df['tick_avg']

df['tick_high_diff'] = df['tick_high'] - df['tick_avg']
df['tick_low_diff'] = df['tick_avg'] - df['tick_low']

Wall time: 20.9 ms


In [11]:
%%time
# Emas ----------------
df['sema'] = df['tick_avg'].rolling(window=data['sma_len']).progress_apply(roll_sma)
df['lema'] = df['tick_avg'].rolling(window=data['lma_len']).progress_apply(roll_sma)

df['sema_diff'] = df['sema'].diff()
df['lema_diff'] = df['lema'].diff()

4981it [00:04, 1057.40it/s]
4951it [00:04, 1063.68it/s]

Wall time: 9.38 s





%%time
# Slopes -----------------------------
df['small_sema_slope'] = df['sema'].rolling(window=data['sma_len']).progress_apply(get_slope_s)
df['long_sema_slope'] = df['sema'].rolling(window=data['lma_len']).progress_apply(get_slope_l)

In [12]:
%%time

df['ema_diff'] = df['sema'] - df['lema']

# Direction -------------------------
df['direction'] = 'same'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] > data['pip_diff']] = 'increase'
df['direction'].loc[df[diff_col].shift(-1) - df[diff_col] < -data['pip_diff']] = 'decrease'

# Remove NaNs ------------------------
del df['gain']
del df['loss']
df = df.dropna()
df = df.reset_index(drop=True)
print(f'Total records : {len(df)}')

Total records : 4950
Wall time: 11 ms


## Write data to csv

In [13]:
%%time
df.to_csv(target_file_path, index = False)
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

Wall time: 233 ms


## Print Report

In [14]:
g= df['direction']
print(pd.concat([g.value_counts(), g.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage')))
df.head(5)

          counts  percentage
same        3416   69.010101
increase     794   16.040404
decrease     740   14.949495


Unnamed: 0,tick_avg,tick_high,tick_low,spread_avg,tick_sd,sema,lema,sema_diff,lema_diff,diff,...,lsma_diff,sma_diff,max_tick,min_tick,max_gap,min_gap,tick_high_diff,tick_low_diff,ema_diff,direction
0,1.201148,1.20124,1.201055,3.4e-05,4.7e-05,1.201501,1.201165,-4.5e-05,-6e-06,-5.9e-05,...,4e-06,0.000308,1.202251,1.201028,0.001103,-0.00012,9.2e-05,9.3e-05,0.000336,same
1,1.201045,1.20119,1.200915,3.2e-05,7.3e-05,1.201446,1.201155,-5.5e-05,-1e-05,-0.000104,...,6e-06,0.000262,1.202251,1.201028,0.001207,-1.6e-05,0.000145,0.00013,0.000291,same
2,1.200871,1.201,1.2008,3.3e-05,4.5e-05,1.201372,1.201157,-7.4e-05,2e-06,-0.000173,...,3e-06,0.000196,1.202251,1.200871,0.00138,0.0,0.000129,7.1e-05,0.000216,same
3,1.200926,1.20102,1.200795,3.2e-05,7.2e-05,1.201313,1.201178,-6e-05,2.2e-05,5.5e-05,...,-9e-06,0.000147,1.202251,1.200871,0.001325,-5.5e-05,9.4e-05,0.000131,0.000135,same
4,1.201045,1.20117,1.200925,2.6e-05,6.5e-05,1.20127,1.201224,-4.3e-05,4.6e-05,0.000118,...,-8e-06,9.7e-05,1.202251,1.200871,0.001207,-0.000173,0.000125,0.00012,4.6e-05,increase


In [32]:
inc = df[df['direction'] == 'increase']['diff']
dec = df[df['direction'] == 'decrease']['diff']

In [16]:
pd.DataFrame(np.round(df['tick_low_diff'],4))['tick_low_diff'].value_counts()

0.0001    2292
0.0002    1701
0.0003     541
0.0004     173
0.0000     102
0.0005      75
0.0006      32
0.0008      11
0.0007       9
0.0009       3
0.0012       2
0.0015       2
0.0010       2
0.0013       2
0.0018       1
0.0019       1
0.0024       1
Name: tick_low_diff, dtype: int64

In [17]:
pd.DataFrame(np.round(df['tick_high_diff'],4))['tick_high_diff'].value_counts()

0.0001    2336
0.0002    1641
0.0003     542
0.0004     203
0.0000      97
0.0005      75
0.0006      24
0.0007      13
0.0008       7
0.0009       4
0.0011       2
0.0015       2
0.0010       2
0.0016       1
0.0012       1
Name: tick_high_diff, dtype: int64