# Data Preparation

## Packages

In [1]:
import os
import winsound
import numpy as np
import pandas as pd
import datetime as dt
from tqdm import tqdm

import math, collections
from scipy.stats import linregress

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_slope_s(y_axis):
    """Return the angle, in degrees, of the least-squares line through `y_axis`.

    Intended as the callback for the short-window
    `rolling(window=window_s_ma).apply(...)` call.

    Parameters
    ----------
    y_axis : sequence of float
        The values of one window (list, ndarray or Series). Needs at least two
        points, otherwise `linregress` raises ValueError.

    Returns
    -------
    float
        `degrees(atan(slope))`, so the result lies strictly between -90 and 90.
    """
    # Size the x-axis from the window actually received rather than from the
    # module-level `window_s_ma`: the function then works before the
    # parameters cell has run and cannot get out of sync with the window size
    # passed to `rolling`.
    # x is spaced 0.00001 apart (0.0001 * 0.1), starting at 1.00001; this
    # scaling determines how steep a given price move looks in degrees.
    x_axis = [1 + ((i + 1) * 0.0001 * 0.1) for i in range(len(y_axis))]

    # Only the slope of the fit is needed; the remaining results are discarded.
    slope_tick, *_ = linregress(x_axis, y_axis)

    return math.degrees(math.atan(slope_tick))

def get_slope_l(y_axis):
    """Return the angle, in degrees, of the least-squares line through `y_axis`.

    Intended as the callback for a long-window
    `rolling(window=window_l_ma).apply(...)` call.

    Parameters
    ----------
    y_axis : sequence of float
        The values of one window (list, ndarray or Series). Needs at least two
        points, otherwise `linregress` raises ValueError.

    Returns
    -------
    float
        `degrees(atan(slope))`, so the result lies strictly between -90 and 90.
    """
    # Size the x-axis from the window actually received rather than from the
    # module-level `window_l_ma`: the function then works before the
    # parameters cell has run and cannot get out of sync with the window size
    # passed to `rolling`.
    # x is spaced 0.00001 apart (0.0001 * 0.1), starting at 1.00001; this
    # scaling determines how steep a given price move looks in degrees.
    x_axis = [1 + ((i + 1) * 0.0001 * 0.1) for i in range(len(y_axis))]

    # Only the slope of the fit is needed; the remaining results are discarded.
    slope_tick, *_ = linregress(x_axis, y_axis)

    return math.degrees(math.atan(slope_tick))


def get_month(row):
    """Return the month number (1-12) parsed from the row's 'TS' string.

    `row['TS']` must match '%Y-%m-%d %H:%M'; otherwise `strptime` raises
    ValueError.
    """
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.month

def get_day(row):
    """Return the day of the month parsed from the row's 'TS' string.

    `row['TS']` must match '%Y-%m-%d %H:%M'; otherwise `strptime` raises
    ValueError.
    """
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.day

def get_hour(row):
    """Return the hour (0-23) parsed from the row's 'TS' string.

    `row['TS']` must match '%Y-%m-%d %H:%M'; otherwise `strptime` raises
    ValueError.
    """
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.hour

def get_min(row):
    """Return the minute (0-59) parsed from the row's 'TS' string.

    `row['TS']` must match '%Y-%m-%d %H:%M'; otherwise `strptime` raises
    ValueError.
    """
    parsed_ts = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M')
    return parsed_ts.minute


def get_dow(row):
    """Return the day of the week for the row's 'TS' string.

    Parameters
    ----------
    row : mapping
        Must provide `row['TS']` formatted as '%Y-%m-%d %H:%M'; otherwise
        `strptime` raises ValueError.

    Returns
    -------
    int
        `datetime.weekday()` value: 0 for Monday through 6 for Sunday.
    """
    # Parse the timestamp of the row being processed. The previous version
    # read `df['TS'][0]`, which gave every row the weekday of the first record
    # and depended on a global `df` that still had a 'TS' column.
    dow = dt.datetime.strptime(row['TS'], '%Y-%m-%d %H:%M').weekday()
    return dow

## Read data

In [3]:
# Input CSV, relative to the notebook's working directory. Built with
# os.path.join so the separator matches the platform and no backslash escape
# (the former '\M') ends up inside a string literal.
#source_file_path = os.path.join('data', 'M1_2019.csv')
source_file_path = os.path.join('data', 'M1_2018.csv')

In [4]:
# The output file goes next to the source file, with its name prefixed 'tab_'.
path, file_name = os.path.split(source_file_path)
target_file_name = f'tab_{file_name}'
target_file_path = os.path.join(path, target_file_name)

# Load the source CSV in full; `raw_df` itself is not modified afterwards.
raw_df = pd.read_csv(source_file_path)

# Disabled alternatives: derive 'Close' from the High/Low range and keep the
# 'TS' column (needed by the get_month/get_day/get_hour/get_min/get_dow helpers).
#raw_df['Close'] = (raw_df['High'] - raw_df['Low'])/2
#raw_df['tick'] = raw_df['Close']
#df = raw_df[['TS', 'Open', 'High', 'Low', 'Close']]

# Work only with the four price columns from here on.
df = raw_df[
    ['Open', 'High', 'Low', 'Close']
]
print(f'{len(df)} records in df')
df.head()

372607 records in df


Unnamed: 0,Open,High,Low,Close
0,1.20037,1.201,1.20037,1.201
1,1.20083,1.20095,1.20017,1.2003
2,1.20035,1.20043,1.20035,1.20043
3,1.20041,1.2005,1.20031,1.20046
4,1.20049,1.20049,1.20046,1.20048


## Parameters

In [5]:
# Down-sampling: the data-manipulation cell keeps every `min_bar`-th row.
min_bar = 2

# Moving-average windows (in rows): short and long.
window_s_ma = 10
window_l_ma = 100

# RSI: rolling window for average gain/loss, and the cap applied to the
# gain/loss ratio so a zero average loss does not leave an infinite value.
rsi_window = 10
rs_max = 1e6

# Target labelling: the next-row change of `diff_col` must exceed `pip_diff`
# in absolute value to be labelled 'increase' or 'decrease'.
diff_col = 'ssma'
pip_diff = 0.0001

## Data manipulation

In [6]:
%%time
df = df.iloc[::min_bar]
print(f'{len(df)} records under {min_bar} min_bar')

#df['month'] = df.apply(get_month, axis=1)
##df['day'] = df.apply(get_day, axis=1)
#df['dow'] = df.apply(get_dow, axis=1)
#df['hour'] = df.apply(get_hour, axis=1)
#df['min'] = df.apply(get_min, axis=1)

df['diff'] = df['Close'].diff()

df['gain'] = 0
df['loss'] = 0
df['gain'].loc[df['diff'] > 0] = abs(df['diff'])
df['loss'].loc[df['diff'] < 0] = abs(df['diff'])

df['avg_gain'] = df['gain'].rolling(window=rsi_window).mean()
df['avg_loss'] = df['loss'].rolling(window=rsi_window).mean()

df['rs'] = df['avg_gain']/df['avg_loss']
df['rs'] = df['rs'].where(df['rs'] <= rs_max, rs_max) 

df['rsi'] = 100 - (100 / (df['rs'] + 1))

df['ssma'] = df['Close'].rolling(window=window_s_ma).mean()
df['lsma'] = df['Close'].rolling(window=window_l_ma).mean()
df['sma_diff'] = df['ssma'] - df['lsma']

df['sema'] = df['Close'].ewm(span=window_s_ma).mean()
df['lema'] = df['Close'].ewm(span=window_l_ma).mean()

df['ema_diff'] = df['sema'] - df['lema']

df['slope_s'] = df['Close'].rolling(window=window_s_ma).apply(get_slope_s)

#df['slope_l'] = df['Close'].rolling(window=window_l_ma).apply(get_slope_l)
#df['sma_slope'] = df['ssma'].rolling(window=rsi_window).apply(get_slope)

df['target'] = 'same'

df['target'].loc[df[diff_col].shift(-1) - df[diff_col] > pip_diff] = 'increase'
df['target'].loc[df[diff_col].shift(-1) - df[diff_col] < -pip_diff] = 'decrease'

df = df.dropna()
df = df.reset_index(drop=True)

186304 records under 2 min_bar
Wall time: 50.9 s


## Write data to csv

In [7]:
# 'TS' is not among the loaded columns in this run, so there is nothing to drop.
#del df['TS']

# Write the feature table without the positional index column.
df.to_csv(target_file_path, index=False)

# Audible "done" signal; SND_ASYNC returns immediately instead of blocking
# until the sound finishes. `winsound` is only available on Windows.
done_sound = 'C:\\Windows\\Media\\tada.wav'
winsound.PlaySound(done_sound, winsound.SND_ASYNC)

## Print Report

In [8]:
# Summarise the class balance of the label column: absolute counts first,
# then the same counts as fractions of the total.
target_col = 'target'

print(f'Record count : {len(df)}')
for as_fraction in (False, True):
    print('--------------------------')
    print(df[target_col].value_counts(normalize=as_fraction))

# Last expression of the cell: rich display of the first ten rows.
df.head(10)

Record count : 186205
--------------------------
same        168535
decrease      9137
increase      8533
Name: target, dtype: int64
--------------------------
same        0.905105
decrease    0.049070
increase    0.045826
Name: target, dtype: float64


Unnamed: 0,Open,High,Low,Close,diff,gain,loss,avg_gain,avg_loss,rs,rsi,ssma,lsma,sma_diff,sema,lema,ema_diff,slope_s,target
0,1.20224,1.20228,1.20223,1.20228,2e-05,2e-05,0.0,8.6e-05,1.9e-05,4.526316,81.904762,1.202058,1.201216,0.000842,1.202029,1.201389,0.00064,78.289211,same
1,1.20217,1.20221,1.20207,1.20214,-0.00014,0.0,0.00014,6.7e-05,3.3e-05,2.030303,67.0,1.202092,1.201227,0.000865,1.202049,1.201406,0.000643,74.274375,same
2,1.20219,1.2022,1.20208,1.20213,-1e-05,0.0,1e-05,6.7e-05,3.4e-05,1.970588,66.336634,1.202125,1.201244,0.000881,1.202064,1.201422,0.000641,61.108353,same
3,1.20218,1.20218,1.2021,1.20218,5e-05,5e-05,0.0,5.4e-05,3.4e-05,1.588235,61.363636,1.202145,1.201261,0.000884,1.202085,1.20144,0.000645,48.878525,same
4,1.20222,1.20223,1.20219,1.20222,4e-05,4e-05,0.0,4e-05,3.4e-05,1.176471,54.054054,1.202151,1.201279,0.000872,1.20211,1.201457,0.000652,58.851419,same
5,1.20223,1.20228,1.20223,1.20227,5e-05,5e-05,0.0,4.5e-05,3e-05,1.5,60.0,1.202166,1.201293,0.000873,1.202139,1.201476,0.000663,64.504524,same
6,1.20224,1.20224,1.20221,1.20223,-4e-05,0.0,4e-05,4.5e-05,2.3e-05,1.956522,66.176471,1.202188,1.201307,0.000881,1.202155,1.201493,0.000663,54.5793,same
7,1.20221,1.20221,1.20193,1.20206,-0.00017,0.0,0.00017,4.5e-05,3.6e-05,1.25,55.555556,1.202197,1.20132,0.000877,1.202138,1.201505,0.000633,-36.702855,same
8,1.20186,1.20192,1.20186,1.20191,-0.00015,0.0,0.00015,2.2e-05,5.1e-05,0.431373,30.136986,1.202168,1.20133,0.000838,1.202097,1.201514,0.000582,-66.418349,same
9,1.20192,1.20192,1.20182,1.20182,-9e-05,0.0,9e-05,1.6e-05,6e-05,0.266667,21.052632,1.202124,1.201344,0.00078,1.202046,1.201521,0.000525,-74.375765,same
