# Test Notebook 1: Data Gathering and Preparation
<br>

##### Purpose:
* Gather and wrangle data to prepare for building Machine Learning (ML) models

#####  How:

* Gather data using API calls to OANDA Foreign Exchange Market
* Using indicators, create extra features from the initial data
* Build the two decision variables (long and short)
    * Long: represents the result of a Long trade
    * Short: represents the result of a Short trade
    * A success is defined as high(long)/low(short) >10 pips in the direction of the trade within the two hour data window (candle) and <30 pips in the opposing direction of the trade
    * A failure is defined as whatever is not a success

In [1]:
import pandas as pd
import numpy as np
import OANDA_api as api
import Indicators

### Data Gathering

In [2]:
'''
Initialize an API object to make web calls, gather data, and convert data into a Pandas DataFrame.

For more documentaion of the oanda_api object visit OANDA_api python module file
'''

sess = api.oanda_api()

In [3]:
'''
Import data into a pandas dataframe within the oanda_api object
'''

candles = sess.complete_candles_df('USD_CAD', 35000, 'H2', 'MBA')

In [4]:
candles.head()

Unnamed: 0,time,volume,mid_o,mid_h,mid_l,mid_c,bid_o,bid_h,bid_l,bid_c,ask_o,ask_h,ask_l,ask_c
0,2011-06-19T01:00:00.000000000Z,1,0.97942,0.97942,0.97942,0.97942,0.97892,0.97892,0.97892,0.97892,0.97992,0.97992,0.97992,0.97992
1,2011-06-19T03:00:00.000000000Z,2,0.97942,0.97942,0.97942,0.97942,0.97892,0.97892,0.97892,0.97892,0.97992,0.97992,0.97992,0.97992
2,2011-06-19T05:00:00.000000000Z,1,0.97942,0.97942,0.97942,0.97942,0.97892,0.97892,0.97892,0.97892,0.97992,0.97992,0.97992,0.97992
3,2011-06-19T07:00:00.000000000Z,1,0.97942,0.97942,0.97942,0.97942,0.97892,0.97892,0.97892,0.97892,0.97992,0.97992,0.97992,0.97992
4,2011-06-19T09:00:00.000000000Z,1,0.97942,0.97942,0.97942,0.97942,0.97892,0.97892,0.97892,0.97892,0.97992,0.97992,0.97992,0.97992


In [5]:
'''
Build more variables using some common forex indicators

For more details on these function, visit the Indicators python module
'''
Indicators.relative_strength_index(candles)
Indicators.macd(candles)
Indicators.stochastic(candles)
Indicators.spread(candles)
Indicators.candle_range(candles)

  stochastic_k = ((curr_close - low) / (high - low)) * 100


In [6]:
'''
Shift mid_c to lag by one data point
'''

candles['mid_c_prev'] = candles['mid_c'].shift(1)

In [7]:
'''
Build a variable that is a combination of the range and stochastic
'''

candles['stochastic_range_k'] = candles['range_14'] * candles['stochastic_k']
candles['stochastic_range_d'] = candles['range_14'] * candles['stochastic_d']

In [8]:
'''
Set lags of lenghts 1 to 10 so that each candle data point contains the information of the previous
10 candles
'''

for lag in range(1,11):
    candles[f'rsi_14_lag_{lag}'] = candles['rsi_14'].shift(lag)
    candles[f'MACD_12_26_9_lag_{lag}'] = candles[f'MACD_12_26_9'].shift(lag)
    candles[f'mid_o_lag_{lag}'] = candles['mid_o'].shift(lag)
    candles[f'mid_l_lag_{lag}'] = candles['mid_l'].shift(lag)
    candles[f'mid_h_lag_{lag}'] = candles['mid_h'].shift(lag)
    candles[f'stochastic_range_k_lag_{lag}'] = candles['stochastic_range_k'].shift(lag)
    candles[f'stochastic_range_d_lag_{lag}'] = candles['stochastic_range_d'].shift(lag)

In [9]:
'''
Set the decision variable for each dataframe as described in the opening notes
'''

candles['long'] = False
candles['short'] = False

for candle in range(len(candles)):
    if (candles.loc[candle, 'bid_o'] + 0.001 <= candles.loc[candle, 'ask_h'] and 
        candles.loc[candle, 'bid_o'] - 0.003 <= candles.loc[candle, 'ask_l']):
        candles.loc[candle, 'short'] = True
    if (candles.loc[candle, 'ask_o'] - 0.001 >= candles.loc[candle, 'bid_l'] and 
        candles.loc[candle, 'ask_o'] + 0.003 >= candles.loc[candle, 'bid_h']):
        candles.loc[candle, 'long'] = True

In [10]:
candles.columns

Index(['time', 'volume', 'mid_o', 'mid_h', 'mid_l', 'mid_c', 'bid_o', 'bid_h',
       'bid_l', 'bid_c', 'ask_o', 'ask_h', 'ask_l', 'ask_c', 'rsi_14',
       'MACD_12_26_9', 'stochastic_k', 'stochastic_d', 'range_14',
       'mid_c_prev', 'stochastic_range_k', 'stochastic_range_d',
       'rsi_14_lag_1', 'MACD_12_26_9_lag_1', 'mid_o_lag_1', 'mid_l_lag_1',
       'mid_h_lag_1', 'stochastic_range_k_lag_1', 'stochastic_range_d_lag_1',
       'rsi_14_lag_2', 'MACD_12_26_9_lag_2', 'mid_o_lag_2', 'mid_l_lag_2',
       'mid_h_lag_2', 'stochastic_range_k_lag_2', 'stochastic_range_d_lag_2',
       'rsi_14_lag_3', 'MACD_12_26_9_lag_3', 'mid_o_lag_3', 'mid_l_lag_3',
       'mid_h_lag_3', 'stochastic_range_k_lag_3', 'stochastic_range_d_lag_3',
       'rsi_14_lag_4', 'MACD_12_26_9_lag_4', 'mid_o_lag_4', 'mid_l_lag_4',
       'mid_h_lag_4', 'stochastic_range_k_lag_4', 'stochastic_range_d_lag_4',
       'rsi_14_lag_5', 'MACD_12_26_9_lag_5', 'mid_o_lag_5', 'mid_l_lag_5',
       'mid_h_lag_5', 'stochast

In [11]:
'''
Manipulate data so that the lag information of the previos candles are in relation to the current
candle. This standardizes the data and makes it more useful for comparison across data points
'''
candle_data = ['mid_o_lag_1', 'mid_l_lag_1', 'mid_h_lag_1','mid_o_lag_2', 
               'mid_l_lag_2', 'mid_h_lag_2', 'mid_o_lag_3', 'mid_l_lag_3', 
               'mid_h_lag_3', 'mid_o_lag_4', 'mid_l_lag_4', 'mid_h_lag_4', 
               'mid_o_lag_5', 'mid_l_lag_5', 'mid_h_lag_5', 'mid_o_lag_6', 
               'mid_l_lag_6', 'mid_h_lag_6', 'mid_o_lag_7', 'mid_l_lag_7', 
               'mid_h_lag_7', 'mid_o_lag_8', 'mid_l_lag_8', 'mid_h_lag_8', 
               'mid_o_lag_9', 'mid_l_lag_9', 'mid_h_lag_9', 'mid_o_lag_10', 
               'mid_l_lag_10', 'mid_h_lag_10']

for col in candle_data:
    candles[col] = candles['mid_c_prev'] - candles[col]

In [12]:
'''
Drop columns that are not valuable or not relevant to predictions
'''

candles.drop(['mid_o', 'mid_h', 'mid_l', 'mid_c', 'bid_o', 'bid_h', 
              'bid_l', 'bid_c', 'ask_o', 'ask_h', 'ask_l', 'ask_c', 
              'time', 'volume', 'rsi_14', 'MACD_12_26_9', 'stochastic_k', 'stochastic_d'],
             axis=1, inplace=True)

In [13]:
'''
Drop data points that have missing features. All of these data points will be at the start 
of DataFrame as a result of creating the lagged variables
'''

candles.dropna(inplace=True)

In [14]:
candles.tail()

Unnamed: 0,range_14,mid_c_prev,stochastic_range_k,stochastic_range_d,rsi_14_lag_1,MACD_12_26_9_lag_1,mid_o_lag_1,mid_l_lag_1,mid_h_lag_1,stochastic_range_k_lag_1,...,stochastic_range_d_lag_9,rsi_14_lag_10,MACD_12_26_9_lag_10,mid_o_lag_10,mid_l_lag_10,mid_h_lag_10,stochastic_range_k_lag_10,stochastic_range_d_lag_10,long,short
34995,0.00815,1.29948,0.589,0.583207,49.867949,-9.2e-05,-0.00222,0.00153,-0.00302,0.409,...,0.179667,46.039819,-0.00115,0.0016,0.00232,0.00032,0.215,0.218777,False,True
34996,0.00956,1.30128,0.039,0.421597,54.088268,4.6e-05,0.00179,0.00289,-0.00226,0.589,...,0.145857,44.970714,-0.001124,0.00279,0.00354,0.00196,0.168,0.179667,True,False
34997,0.00994,1.29437,0.336,0.364971,40.081103,-0.000311,-0.0069,0.00039,-0.0069,0.039,...,0.077761,42.518643,-0.001136,-0.00358,-0.00243,-0.00384,0.054571,0.145857,False,True
34998,0.00994,1.29696,0.222,0.199517,45.818741,-0.000353,0.00262,0.00336,-0.0008,0.336,...,0.026312,41.504948,-0.00111,-9e-05,0.00042,-0.00024,0.029355,0.077761,True,True
34999,0.00994,1.29582,0.02,0.192667,43.851134,-0.000432,-0.00114,0.00036,-0.00195,0.222,...,0.13487,39.192653,-0.001115,-0.00072,0.00038,-0.00176,0.002,0.026312,True,False


In [15]:
'''
Export DataFrame
'''
candles.to_pickle('CAD_USD_H2')