# Data Preparation

## Packages

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import winsound
import datetime as dt

import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides pandas diagnostics (e.g. SettingWithCopyWarning,
# FutureWarning) that can point at real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

## Read data

In [2]:
# Load the raw bars. The path uses a forward slash: a backslash before 'M' is an
# invalid string escape and only resolves as a path separator on Windows.
raw_df = pd.read_csv('data/M1_2019.csv')

# The close price of each bar is used as the tick value.
# (A previous experiment used (High - Low) / 2, i.e. half the bar range.)
raw_df['tick'] = raw_df['Close']

# .copy() makes df an independent frame, so columns added to it later are not
# written through a slice of raw_df.
df = raw_df[['TS', 'tick']].copy()
print(f'{len(df)} records in df')
df.head()

372530 records in df


Unnamed: 0,TS,tick
0,2019-01-01 17:02,1.14598
1,2019-01-01 17:03,1.14607
2,2019-01-01 17:04,1.14606
3,2019-01-01 17:05,1.14621
4,2019-01-01 17:06,1.14665


## Parameters

In [3]:
# --- Resampling ---
min_bar = 2          # keep every min_bar-th row of the input frame

# --- Target labelling ---
diff_col = 'ssma'    # column whose next-row change defines the target label
pip_diff = 0.0001    # change threshold separating 'same' from 'increase' / 'decrease'

# --- RSI ---
rsi_window = 14      # rolling window for the average gain / average loss
rs_max = 1e6         # cap applied to the RS ratio

# --- Moving averages (short, long) ---
window_s_ma, window_l_ma = 10, 100

## Data manipulation

In [4]:
# Keep every `min_bar`-th row. The frame is rebuilt from raw_df (rather than
# re-slicing df) so re-running this cell does not downsample twice, and .copy()
# gives an independent frame for the column assignments below.
df = raw_df[['TS', 'tick']].iloc[::min_bar].copy()
print(f'{len(df)} records under {min_bar} min_bar')

# --- RSI ---------------------------------------------------------------------
df['diff'] = df['tick'].diff()

# Split each change into its positive part (gain) and the magnitude of its
# negative part (loss). Unchanged rows and the first row (diff is NaN) get 0.0.
# Built with Series.where instead of chained `df[col].loc[mask] = ...`, which
# may write to a temporary copy and leave df untouched.
move = df['diff']
df['gain'] = move.where(move > 0, 0.0)
df['loss'] = (-move).where(move < 0, 0.0)

df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()

# RS = avg_gain / avg_loss, capped at rs_max. Every value that fails
# `rs <= rs_max` is replaced: that covers inf (avg_loss == 0) and also NaN
# (0 / 0 on a completely flat window), so such rows end up with RSI close to 100.
# NOTE(review): confirm that treating a flat window as RSI ~100 is intended.
df['rs'] = df['avg_gain'] / df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max)

df['rsi'] = 100 - (100 / (df['rs'] + 1))

# --- Moving averages -----------------------------------------------------------
df['ssma'] = df['tick'].rolling(window=window_s_ma).mean()
df['lsma'] = df['tick'].rolling(window=window_l_ma).mean()

df['sema'] = df['tick'].ewm(span=window_s_ma).mean()
df['lema'] = df['tick'].ewm(span=window_l_ma).mean()

# --- Target --------------------------------------------------------------------
# Label each row by the change of `diff_col` from this row to the next one.
next_step = df[diff_col].shift(-1) - df[diff_col]

df['target'] = 'same'
df.loc[next_step > pip_diff, 'target'] = 'increase'
df.loc[next_step < -pip_diff, 'target'] = 'decrease'

# Rows without a computable next-step change must not keep the default 'same'
# label: the final row has no successor, and since 'target' is never NaN,
# dropna() alone would retain it. Warm-up rows are removed by dropna() as before.
has_next = next_step.notna()
df = df[has_next].dropna().reset_index(drop=True)

186265 records under 2 min_bar


## Write data to csv

In [5]:
# Drop the timestamp column before export. drop(..., errors='ignore') replaces
# `del df['TS']` so this cell can be re-run without a KeyError once 'TS' is gone.
df = df.drop(columns=['TS'], errors='ignore')
df.to_csv('data/tab_df.csv', index = False)

# Audible completion cue, played asynchronously (winsound is a Windows-only module).
winsound.PlaySound('C:\\Windows\\Media\\tada.wav', winsound.SND_ASYNC)

## Print Report

In [6]:
# Summarise the label distribution: absolute counts first, then relative shares.
target_col = 'target'
rule = '--------------------------'

label_counts = df[target_col].value_counts()
label_shares = df[target_col].value_counts(normalize=True)

print(f'Record count : {len(df)}')
for summary in (label_counts, label_shares):
    print(rule)
    print(summary)

# Preview of the prepared feature table.
df.head(10)

Record count : 186166
--------------------------
same        180369
decrease      2992
increase      2805
Name: target, dtype: int64
--------------------------
same        0.968861
decrease    0.016072
increase    0.015067
Name: target, dtype: float64


Unnamed: 0,tick,diff,gain,loss,avg_gain,avg_loss,rs,rsi,ssma,lsma,sema,lema,target
0,1.14541,9e-05,9e-05,0.0,2.9e-05,4.3e-05,0.683333,40.594059,1.1453,1.146044,1.145337,1.145868,same
1,1.14547,6e-05,6e-05,0.0,3.4e-05,4.3e-05,0.783333,43.925234,1.145296,1.146039,1.145361,1.145859,same
2,1.14541,-6e-05,0.0,6e-05,3.4e-05,4.7e-05,0.712121,41.59292,1.145295,1.146032,1.14537,1.145849,same
3,1.14533,-8e-05,0.0,8e-05,3.3e-05,5.3e-05,0.621622,38.333333,1.145301,1.146019,1.145363,1.145837,same
4,1.14526,-7e-05,0.0,7e-05,3.1e-05,5.8e-05,0.54321,35.2,1.145292,1.146011,1.145344,1.145824,same
5,1.14526,0.0,0.0,0.0,3.1e-05,4.9e-05,0.637681,38.938053,1.145302,1.146003,1.145329,1.145811,same
6,1.14521,-5e-05,0.0,5e-05,3.1e-05,4.6e-05,0.676923,40.366972,1.145309,1.145994,1.145307,1.145798,same
7,1.14526,5e-05,5e-05,0.0,3.5e-05,3.6e-05,0.98,49.494949,1.145324,1.145986,1.145299,1.145786,same
8,1.14531,5e-05,5e-05,0.0,3.3e-05,3.6e-05,0.92,47.916667,1.145324,1.145978,1.145301,1.145775,same
9,1.14526,-5e-05,0.0,5e-05,3.3e-05,2.6e-05,1.277778,56.097561,1.145318,1.14597,1.145293,1.145763,same
