# Data Preparation

## Packages

In [12]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import winsound
import datetime as dt

import math, collections
from scipy.stats import linregress

import warnings
warnings.filterwarnings('ignore')

In [13]:
def _slope_degrees(y_axis):
    """Fit a straight line through ``y_axis`` and return its angle in degrees.

    The x coordinates are a synthetic, evenly spaced grid
    ``1 + (i + 1) * 0.0001 * 0.1`` for ``i = 0 .. len(y_axis) - 1``. The grid
    length is taken from the input itself, so the helper does not depend on
    any notebook-level window variable being defined beforehand.

    Parameters
    ----------
    y_axis : array-like of float
        Values of one window (needs at least two points for a regression).

    Returns
    -------
    float
        ``degrees(atan(slope))`` of the least-squares line, in (-90, 90).
    """
    # Same arithmetic as the original grid so the fitted slope is unchanged.
    x_axis = [1 + ((i + 1) * 0.0001 * 0.1) for i in range(len(y_axis))]

    slope_tick, _, _, _, _ = linregress(x_axis, y_axis)
    return math.degrees(math.atan(slope_tick))


def get_slope_s(y_axis):
    """Angle (degrees) of the regression line over a short-window sample.

    Intended for ``rolling(window=window_s_ma).apply(...)``; the window length
    is inferred from ``len(y_axis)``.
    """
    return _slope_degrees(y_axis)


def get_slope_l(y_axis):
    """Angle (degrees) of the regression line over a long-window sample.

    Intended for ``rolling(window=window_l_ma).apply(...)``; the window length
    is inferred from ``len(y_axis)``.
    """
    return _slope_degrees(y_axis)


def get_month(row):
    """Return the calendar month (1-12) parsed from ``row['TS']``.

    ``row['TS']`` must be a string formatted as ``'%Y-%m-%d %H:%M'``.
    """
    parsed = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed.month

def get_day(row):
    """Return the day of the month parsed from ``row['TS']``.

    ``row['TS']`` must be a string formatted as ``'%Y-%m-%d %H:%M'``.
    """
    parsed = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed.day

def get_hour(row):
    """Return the hour (0-23) parsed from ``row['TS']``.

    ``row['TS']`` must be a string formatted as ``'%Y-%m-%d %H:%M'``.
    """
    parsed = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed.hour

def get_min(row):
    """Return the minute (0-59) parsed from ``row['TS']``.

    ``row['TS']`` must be a string formatted as ``'%Y-%m-%d %H:%M'``.
    """
    parsed = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed.minute


def get_dow(row):
    """Return the day of the week (Monday=0 .. Sunday=6) parsed from ``row['TS']``.

    ``row['TS']`` must be a string formatted as ``'%Y-%m-%d %H:%M'``.

    The timestamp is read from the row that is passed in. Reading the first
    record of the global frame instead would give every row the same weekday.
    """
    dow = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M').weekday()
    return dow

## Read data

In [34]:
# NOTE(review): this cell displays raw_df but is placed before the cell that
# creates it with pd.read_csv, and its execution count is later than that
# cell's. On a fresh kernel (Restart & Run All) it raises NameError; move it
# below the load cell or remove it.
raw_df

Unnamed: 0,TS,Open,High,Low,Close,tick
0,2019-01-01 17:02,1.14598,1.14599,1.14598,1.14598,1.14598
1,2019-01-01 17:03,1.14598,1.14607,1.14598,1.14607,1.14607
2,2019-01-01 17:04,1.14607,1.14607,1.14606,1.14606,1.14606
3,2019-01-01 17:05,1.14606,1.14621,1.14606,1.14621,1.14621
4,2019-01-01 17:06,1.14619,1.14666,1.14604,1.14665,1.14665
...,...,...,...,...,...,...
372525,2019-12-31 16:55,1.12117,1.12118,1.12115,1.12118,1.12118
372526,2019-12-31 16:56,1.12118,1.12120,1.12115,1.12115,1.12115
372527,2019-12-31 16:57,1.12115,1.12115,1.12105,1.12105,1.12105
372528,2019-12-31 16:58,1.12105,1.12110,1.12099,1.12099,1.12099


In [14]:
# Load the one-minute bars. A forward slash is used in the path: '\M' is not a
# valid string escape, and '/' works on every platform (the export cell uses
# the same 'data/...' style).
raw_df = pd.read_csv('data/M1_2019.csv')

# The bar's close price is used as its single 'tick' value.
raw_df['tick'] = raw_df['Close']

# Take an independent copy so that columns added to df later do not write
# into (or warn about) a view of raw_df.
df = raw_df[['TS', 'tick', 'Open', 'High', 'Low']].copy()
print(f'{len(df)} records in df')
df.head()

372530 records in df


Unnamed: 0,TS,tick
0,2019-01-01 17:02,1.14598
1,2019-01-01 17:03,1.14607
2,2019-01-01 17:04,1.14606
3,2019-01-01 17:05,1.14621
4,2019-01-01 17:06,1.14665


## Parameters

In [15]:
# --- Resampling -------------------------------------------------------------
# Keep every min_bar-th row of the one-minute data (used as the iloc step).
min_bar = 2

# --- Moving-average windows (in rows, after resampling) ---------------------
window_s_ma = 10     # short window for ssma / sema / slope_s
window_l_ma = 100    # long window for lsma / lema

# --- RSI --------------------------------------------------------------------
rsi_window = 10      # rolling window for avg_gain / avg_loss
rs_max = 1e6         # upper cap applied to the gain/loss ratio 'rs'

# --- Target labelling -------------------------------------------------------
diff_col = 'ssma'    # column whose next-step change defines the target
pip_diff = 0.0001    # a change beyond +/- this labels 'increase' / 'decrease'

## Data manipulation

In [16]:
%%time
# Downsample to one row every `min_bar` rows. The copy makes df an independent
# frame so the column assignments below never target a view.
# NOTE: df is rebuilt from itself, so re-running this cell without first
# re-running the load cell downsamples (and drops warm-up rows) a second time.
df = df.iloc[::min_bar].copy()
print(f'{len(df)} records under {min_bar} min_bar')

# Calendar features: parse the timestamp column once, vectorized, instead of
# calling strptime row by row for every feature. The weekday comes from each
# row's own timestamp (Monday=0 .. Sunday=6).
ts_parsed = pd.to_datetime(df['TS'], format='%Y-%m-%d %H:%M')
df['month'] = ts_parsed.dt.month
df['day'] = ts_parsed.dt.day
df['dow'] = ts_parsed.dt.dayofweek
df['hour'] = ts_parsed.dt.hour
df['min'] = ts_parsed.dt.minute

# Bar-to-bar change, split into its positive and negative magnitudes.
# Series.where keeps the value where the condition holds and writes 0.0
# elsewhere (including the first row, whose diff is NaN). This replaces the
# chained `df['gain'].loc[...] = ...` assignment, which does not update df
# under pandas copy-on-write.
df['diff'] = df['tick'].diff()
df['gain'] = df['diff'].where(df['diff'] > 0, 0.0)
df['loss'] = (-df['diff']).where(df['diff'] < 0, 0.0)

# RSI from simple rolling means of gains and losses.
df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()

df['rs'] = df['avg_gain']/df['avg_loss']
# Cap the ratio at rs_max. Any entry that fails `rs <= rs_max` is replaced,
# which covers inf (avg_loss == 0) and also NaN (0/0 on a completely flat
# window), so flat windows end up with an RSI close to 100.
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max)

df['rsi'] = 100 - (100 / (df['rs'] + 1))

# Short / long simple and exponential moving averages and their spreads.
df['ssma'] = df['tick'].rolling(window=window_s_ma).mean()
df['lsma'] = df['tick'].rolling(window=window_l_ma).mean()
df['sma_diff'] = df['ssma'] - df['lsma']

df['sema'] = df['tick'].ewm(span=window_s_ma).mean()
df['lema'] = df['tick'].ewm(span=window_l_ma).mean()

df['ema_diff'] = df['sema'] - df['lema']

# Regression-line angle over the short window. raw=True hands plain ndarrays
# to get_slope_s, avoiding the construction of a Series per window.
df['slope_s'] = df['tick'].rolling(window=window_s_ma).apply(get_slope_s, raw=True)

# Long-window slope is currently disabled:
#df['slope_l'] = df['tick'].rolling(window=window_l_ma).apply(get_slope_l)

# Target: direction of the next-step change of `diff_col`, with a dead band of
# +/- pip_diff. Labels are written with a single df.loc[...] call per class so
# the assignment reaches df itself.
# NOTE: the last row has no successor (its change is NaN) and keeps 'same'.
next_change = df[diff_col].shift(-1) - df[diff_col]
df['target'] = 'same'
df.loc[next_change > pip_diff, 'target'] = 'increase'
df.loc[next_change < -pip_diff, 'target'] = 'decrease'

# Drop the warm-up rows that still contain NaN from the rolling windows.
df = df.dropna()
df = df.reset_index(drop=True)

df.head()

186265 records under 2 min_bar
Wall time: 2min 8s


## Write data to csv

In [9]:
# Remove the raw timestamp string before export. drop(..., errors='ignore')
# leaves df without 'TS' exactly like `del df['TS']` did, but does not raise
# KeyError when this cell is run a second time.
df = df.drop(columns='TS', errors='ignore')
df.to_csv('data/tab_df.csv', index = False)
# Audible "done" signal; winsound is only available on Windows.
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

## Print Report

In [18]:
# Summary of the prepared frame: total rows, then the class balance of the
# target column as absolute counts and as fractions.
target_col = 'target'

print(f'Record count : {len(df)}')
for as_fraction in (False, True):
    print('--------------------------')
    print(df[target_col].value_counts(normalize=as_fraction))

df.head(10)

Record count : 186166
--------------------------
same        180369
decrease      2992
increase      2805
Name: target, dtype: int64
--------------------------
same        0.968861
decrease    0.016072
increase    0.015067
Name: target, dtype: float64


Unnamed: 0,TS,tick,diff,gain,loss,avg_gain,avg_loss,rs,rsi,ssma,...,sma_diff,sema,lema,ema_diff,slope_s,target,month,day,hour,min
0,2019-01-01 20:20,1.14541,9e-05,9e-05,0.0,3.8e-05,6e-05,0.633333,38.77551,1.1453,...,-0.000744,1.145337,1.145868,-0.000531,-52.366769,same,1,1,20,20
1,2019-01-01 20:22,1.14547,6e-05,6e-05,0.0,4.4e-05,4.8e-05,0.916667,47.826087,1.145296,...,-0.000743,1.145361,1.145859,-0.000498,45.855097,same,1,1,20,22
2,2019-01-01 20:24,1.14541,-6e-05,0.0,6e-05,4.4e-05,4.5e-05,0.977778,49.438202,1.145295,...,-0.000737,1.14537,1.145849,-0.000479,68.029719,same,1,1,20,24
3,2019-01-01 20:26,1.14533,-8e-05,0.0,8e-05,4.4e-05,3.8e-05,1.157895,53.658537,1.145301,...,-0.000718,1.145363,1.145837,-0.000474,68.222514,same,1,1,20,26
4,2019-01-01 20:28,1.14526,-7e-05,0.0,7e-05,3.6e-05,4.5e-05,0.8,44.444444,1.145292,...,-0.000719,1.145344,1.145824,-0.00048,69.007147,same,1,1,20,28
5,2019-01-01 20:30,1.14526,0.0,0.0,0.0,3.6e-05,2.6e-05,1.384615,58.064516,1.145302,...,-0.000701,1.145329,1.145811,-0.000482,57.196955,same,1,1,20,30
6,2019-01-01 20:32,1.14521,-5e-05,0.0,5e-05,3.6e-05,2.9e-05,1.241379,55.384615,1.145309,...,-0.000685,1.145307,1.145798,-0.00049,-1.735705,same,1,1,20,32
7,2019-01-01 20:34,1.14526,5e-05,5e-05,0.0,4.1e-05,2.6e-05,1.576923,61.19403,1.145324,...,-0.000662,1.145299,1.145786,-0.000487,-58.380571,same,1,1,20,34
8,2019-01-01 20:36,1.14531,5e-05,5e-05,0.0,2.6e-05,2.6e-05,1.0,50.0,1.145324,...,-0.000654,1.145301,1.145775,-0.000474,-60.863287,same,1,1,20,36
9,2019-01-01 20:38,1.14526,-5e-05,0.0,5e-05,2.5e-05,3.1e-05,0.806452,44.642857,1.145318,...,-0.000652,1.145293,1.145763,-0.00047,-65.255314,same,1,1,20,38
