# Data Preparation

## Packages

In [2]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm

import math, collections
from scipy.stats import linregress

import warnings
warnings.filterwarnings('ignore')

In [None]:
def get_slope_s(y_axis):
    """Return the angle, in degrees, of the regression line through a window.

    Intended as the callback for ``Series.rolling(window=window_s_ma).apply``.
    The x positions are synthetic and evenly spaced (1.00001, 1.00002, ...),
    one per value in ``y_axis``.

    Parameters
    ----------
    y_axis : sequence of float
        The values of one rolling window.

    Returns
    -------
    float
        ``degrees(atan(slope))`` of the least-squares line fitted to the window.
    """
    # Size the x axis from the data itself rather than from the global
    # `window_s_ma`, so the function works for whatever window it is given.
    x_axis = [1 + ((i + 1) * 0.0001 * 0.1) for i in range(len(y_axis))]

    slope_tick, _, _, _, _ = linregress(x_axis, y_axis)
    return math.degrees(math.atan(slope_tick))

def get_slope_l(y_axis):
    """Return the angle, in degrees, of the regression line through a window.

    Intended as the callback for ``Series.rolling(window=window_l_ma).apply``.
    The x positions are synthetic and evenly spaced (1.00001, 1.00002, ...),
    one per value in ``y_axis``.

    Parameters
    ----------
    y_axis : sequence of float
        The values of one rolling window.

    Returns
    -------
    float
        ``degrees(atan(slope))`` of the least-squares line fitted to the window.
    """
    # Size the x axis from the data itself rather than from the global
    # `window_l_ma`, so the function works for whatever window it is given.
    x_axis = [1 + ((i + 1) * 0.0001 * 0.1) for i in range(len(y_axis))]

    slope_tick, _, _, _, _ = linregress(x_axis, y_axis)
    return math.degrees(math.atan(slope_tick))


def get_month(row):
    """Return the month number parsed from the row's 'TS' timestamp string."""
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.month

def get_day(row):
    """Return the day of the month parsed from the row's 'TS' timestamp string."""
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.day

def get_hour(row):
    """Return the hour (0-23) parsed from the row's 'TS' timestamp string."""
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.hour

def get_min(row):
    """Return the minute (0-59) parsed from the row's 'TS' timestamp string."""
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.minute


def get_dow(row):
    """Return the weekday of the row's 'TS' timestamp (Monday=0 ... Sunday=6).

    Parameters
    ----------
    row : mapping
        A record exposing a 'TS' string formatted as '%Y-%m-%d %H:%M'.

    Returns
    -------
    int
        ``datetime.weekday()`` of the parsed timestamp.
    """
    # Parse this row's own timestamp. The previous version read df['TS'][0],
    # so every row received the weekday of the first record.
    dow = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M').weekday()
    return dow

## Parameters

In [None]:
#pd.DataFrame(np.random.rand(14,4), columns=['a', 'b', 'c', 'd'])

# Year of the one-minute source file to process.
year = 2019

# Built with os.path.join: the former literal 'data\M1_...' relied on the
# invalid escape sequence '\M' and on the Windows path separator.
source_file_path = os.path.join('data', f'M1_{year}.csv')

# Keep every `min_bar`-th row of the source when down-sampling.
min_bar = 10

# Threshold on the next-bar change of `diff_col` used to label the target.
pip_diff = 0.0003
diff_col = 'Close'

# Rolling window for the average gain/loss, and the cap applied to their ratio.
rsi_window = 10
rs_max = 1e6

# Short and long moving-average windows.
window_s_ma = 10
window_l_ma = 50

## Read data

In [None]:
# The output file sits next to the source file, with a 'tab_' prefix.
path, file_name = os.path.split(source_file_path)
target_file_name = f'tab_{file_name}'
target_file_path = os.path.join(path, target_file_name)

raw_df = pd.read_csv(source_file_path)

# Disabled alternatives kept for reference:
#raw_df['Close'] = (raw_df['High'] - raw_df['Low'])/2
#raw_df['tick'] = raw_df['Close']
#df = raw_df[['TS', 'Open', 'High', 'Low', 'Close']]

# Work only with the four price columns.
df = raw_df.loc[:, ['Open', 'High', 'Low', 'Close']]
print(f'{len(df)} records in df')
df.head()

## Data manipulation

In [None]:
%%time
# Down-sample: keep every `min_bar`-th row. The explicit copy gives an
# independent frame, so the column assignments below never target a slice.
# NOTE: this rebinds `df`; re-run the read cell before re-running this one,
# otherwise the data is down-sampled a second time.
df = df.iloc[::min_bar].copy()
print(f'{len(df)} records under {min_bar} min_bar')

# Optional calendar features (disabled; they need a 'TS' column in df).
#df['month'] = df.apply(get_month, axis=1)
##df['day'] = df.apply(get_day, axis=1)
#df['dow'] = df.apply(get_dow, axis=1)
#df['hour'] = df.apply(get_hour, axis=1)
#df['min'] = df.apply(get_min, axis=1)

# --- RSI -------------------------------------------------------------------
df['diff'] = df['Close'].diff()

# Split the bar-to-bar change into its positive and negative magnitudes.
# Each column is assigned in one step: the former chained form
# `df['gain'].loc[mask] = ...` writes to a temporary object and is not
# guaranteed to update df (it is a no-op under pandas Copy-on-Write).
# Rows where the condition is False or the change is NaN get 0.
df['gain'] = df['diff'].where(df['diff'] > 0, 0.0)
df['loss'] = (-df['diff']).where(df['diff'] < 0, 0.0)

df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()

# Any ratio that is not <= rs_max (including inf and NaN) is set to rs_max.
df['rs'] = df['avg_gain'] / df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max)

df['rsi'] = 100 - (100 / (df['rs'] + 1))

# --- Moving averages -------------------------------------------------------
df['ssma'] = df['Close'].rolling(window=window_s_ma).mean()
df['lsma'] = df['Close'].rolling(window=window_l_ma).mean()
df['sma_diff'] = df['ssma'] - df['lsma']

df['sema'] = df['Close'].ewm(span=window_s_ma).mean()
df['lema'] = df['Close'].ewm(span=window_l_ma).mean()

df['ema_diff'] = df['sema'] - df['lema']

# Optional slope features (disabled). `get_slope` in the last line is not
# defined anywhere in the visible notebook.
#df['slope_s'] = df['Close'].rolling(window=window_s_ma).apply(get_slope_s)
#df['slope_l'] = df['Close'].rolling(window=window_l_ma).apply(get_slope_l)

#df['sma_slope'] = df['ssma'].rolling(window=rsi_window).apply(get_slope)

# --- Target ----------------------------------------------------------------
# Label each bar by the change of `diff_col` from this bar to the next one.
next_move = df[diff_col].shift(-1) - df[diff_col]

df['target'] = 'same'
df.loc[next_move > pip_diff, 'target'] = 'increase'
df.loc[next_move < -pip_diff, 'target'] = 'decrease'

# The last bar has no successor, so it has no valid label; blank it so the
# dropna below removes it instead of keeping a spurious 'same'.
df['target'] = df['target'].where(next_move.notna())

# Drop rows made incomplete by the rolling warm-up (and the unlabeled last bar).
df = df.dropna()
df = df.reset_index(drop=True)

## Write data to csv

In [None]:
# Write the prepared table, without the index, to the path derived in the read cell.
#del df['TS']
done_sound_path = 'C:\\Windows\\Media\\tada.wav'
df.to_csv(target_file_path, index=False)
# Play the sound asynchronously once the file has been written.
winsound.PlaySound(done_sound_path, winsound.SND_ASYNC)

## Print Report

In [None]:
# Summarise the label distribution of the prepared table.
target_col = 'target'
separator = '--------------------------'

label_counts = df[target_col].value_counts()
label_shares = df[target_col].value_counts(normalize=True)

print(f'Record count : {len(df)}')
print(separator)
print(label_counts)
print(separator)
print(label_shares)

df.head(10)

In [None]:
# Display the High column.
df.loc[:, 'High']

In [None]:
# Previous row's High minus the current row's Close.
previous_high = df['High'].shift(1)
previous_high.sub(df['Close'])

In [None]:
# Incomplete scratch expression, disabled because it is a SyntaxError that
# stops "Run All". TODO: supply the missing right-hand operand or delete the cell.
# 1.14943 -

In [120]:
# Load the tick file. NOTE: this rebinds `df`, replacing the bar table built above.
#df = pd.read_csv('data/tick_2019_small.csv')
tick_file_path = 'data/tick_2019.csv'
df = pd.read_csv(tick_file_path)

In [121]:
# Reduce each record to the Bid/Ask average and keep only that column.
mid_price = (df['Bid'] + df['Ask']) / 2
df = mid_price.to_frame(name='tick')

In [122]:
#df = df[0:50]

In [123]:
def chunk_ticks(number_of_ticks, ticks=None, min_pip=0.0001, show_progress=True):
    """Aggregate consecutive ticks into fixed-size bars with a direction label.

    The tick sequence is cut into consecutive chunks of ``number_of_ticks``
    values (the last chunk may be shorter). Each chunk becomes one output row.

    Parameters
    ----------
    number_of_ticks : int
        Number of ticks per chunk; must be positive.
    ticks : sequence of float, optional
        Tick prices to aggregate. Defaults to the notebook-level ``df['tick']``
        so existing ``chunk_ticks(n)`` calls keep working.
    min_pip : float, optional
        Smallest move away from the open that counts as a direction.
    show_progress : bool, optional
        Wrap the loop in a ``tqdm`` progress bar.

    Returns
    -------
    pandas.DataFrame
        Columns ``open``, ``high``, ``low``, ``close``, ``high_diff``
        (high - open), ``low_diff`` (open - low) and ``direction``
        ('increase', 'decrease' or 'same').

    Raises
    ------
    ValueError
        If ``number_of_ticks`` is not positive.
    """
    if number_of_ticks <= 0:
        raise ValueError('number_of_ticks must be a positive integer')

    if ticks is None:
        # Backward-compatible default: the notebook's global tick table.
        ticks = df['tick']
    # One conversion up front; each chunk below is then a cheap array slice.
    prices = np.asarray(ticks, dtype=float)

    chunk_starts = range(0, len(prices), number_of_ticks)
    if show_progress:
        chunk_starts = tqdm(chunk_starts)

    rows = []
    for start in chunk_starts:
        chunk = prices[start:start + number_of_ticks]

        open_val = chunk[0]
        high_val = chunk.max()
        low_val = chunk.min()
        close_val = chunk[-1]
        high_diff = high_val - open_val
        low_diff = open_val - low_val

        # The larger excursion from the open decides the direction, provided
        # it reaches min_pip; ties and small moves are labelled 'same'.
        if high_diff > low_diff and high_diff >= min_pip:
            direction = 'increase'
        elif low_diff > high_diff and low_diff >= min_pip:
            direction = 'decrease'
        else:
            direction = 'same'

        rows.append((open_val, high_val, low_val, close_val,
                     high_diff, low_diff, direction))

    columns = ['open', 'high', 'low', 'close', 'high_diff', 'low_diff', 'direction']
    return pd.DataFrame(rows, columns=columns)

100%|████████████████████████████████████████████████████████████████████████| 486439/486439 [01:05<00:00, 7478.51it/s]


In [124]:
# Build the bar table before inspecting it: `temp_df` was otherwise never
# defined in the notebook, so this cell failed on a fresh kernel.
# 60 ticks per bar reproduces the recorded output (29,186,310 ticks -> 486,439 bars).
temp_df = chunk_ticks(60)

print(len(df))
print(len(temp_df))
temp_df.head()

29186310
486439


Unnamed: 0,open,high,low,close,high_diff,low_diff,direction
0,1.1464,1.1467,1.14622,1.1467,0.0003,0.00018,increase
1,1.1467,1.14674,1.14663,1.146685,4e-05,7e-05,same
2,1.14669,1.14677,1.1464,1.146435,8e-05,0.00029,decrease
3,1.14644,1.14644,1.146215,1.14622,0.0,0.000225,decrease
4,1.14623,1.146415,1.14622,1.14626,0.000185,1e-05,increase
