## Forex Backtester using Machine Learning

In this project, the goal is to compare different machine learning models to see which performs best. The models used are Linear Regression, Logistic Regression, Gaussian Naive Bayes, and SVC (a Support Vector Machine classifier). The data set is USDJPY on the hourly (H1) timeframe.

In [1]:
import pandas as pd
import quandl, datetime, math
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
style.use('ggplot')
# Read in the USDJPY bars. The file is read with header=None, so the column
# names are assigned explicitly from the list below.
df = pd.read_csv('USDJPY60.csv',header=None)
c=['Date','Time','Open','High','Low','Close','Volume']
df.columns = c

df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume
0,2009.08.13,13:00,95.72,95.83,95.29,95.37,928
1,2009.08.13,14:00,95.36,95.65,95.29,95.47,1124
2,2009.08.13,15:00,95.48,95.5,95.23,95.43,537
3,2009.08.13,16:00,95.42,95.49,95.38,95.38,310
4,2009.08.13,17:00,95.39,95.41,95.05,95.19,692


In [2]:
# Create 2 new columns holding simple moving averages of the close:
# a shorter one over sma1 rows and a longer one over sma2 rows.
sma1 = 15
sma2 = 30

df['sma1'] = df['Close'].rolling(sma1).mean()
df['sma2'] = df['Close'].rolling(sma2).mean()
# Rows that do not yet have a full rolling window are NaN; drop them.
df.dropna(inplace=True)
df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,sma1,sma2
29,2009.08.14,18:00,94.74,94.88,94.69,94.87,440,94.934,95.128667
30,2009.08.14,19:00,94.88,94.93,94.79,94.81,254,94.908,95.11
31,2009.08.14,20:00,94.82,94.94,94.77,94.92,293,94.88,95.091667
32,2009.08.16,22:00,94.77,94.77,94.64,94.72,155,94.843333,95.068
33,2009.08.16,23:00,94.73,94.87,94.67,94.73,428,94.812667,95.046333


In [4]:
# Create returns column: log return of each close versus the previous row's close
df['Returns'] = np.log(df['Close']/df['Close'].shift(1))
# shift(1) leaves the first row without a previous close (NaN); drop it
df.dropna(inplace=True)

In [5]:
df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,sma1,sma2,Returns
30,2009.08.14,19:00,94.88,94.93,94.79,94.81,254,94.908,95.11,-0.000633
31,2009.08.14,20:00,94.82,94.94,94.77,94.92,293,94.88,95.091667,0.00116
32,2009.08.16,22:00,94.77,94.77,94.64,94.72,155,94.843333,95.068,-0.002109
33,2009.08.16,23:00,94.73,94.87,94.67,94.73,428,94.812667,95.046333,0.000106
34,2009.08.17,00:00,94.74,94.74,94.54,94.59,535,94.774,95.026333,-0.001479


In [6]:
# Create trading rules: go long (1) if the shorter sma is above the longer sma,
# otherwise go short (-1). Equal averages therefore also map to -1.
df['Position'] = np.where(df['sma1']> df['sma2'],1,-1)

In [7]:
# Multiply the previous row's position (shifted by 1 row) by the current row's
# return, so a position decided on one bar earns the following bar's return.
# The first row has no previous position and becomes NaN.
df['Strategy'] = df['Position'].shift(1) * df['Returns']
df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,sma1,sma2,Returns,Position,Strategy
30,2009.08.14,19:00,94.88,94.93,94.79,94.81,254,94.908,95.11,-0.000633,-1,
31,2009.08.14,20:00,94.82,94.94,94.77,94.92,293,94.88,95.091667,0.00116,-1,-0.00116
32,2009.08.16,22:00,94.77,94.77,94.64,94.72,155,94.843333,95.068,-0.002109,-1,0.002109
33,2009.08.16,23:00,94.73,94.87,94.67,94.73,428,94.812667,95.046333,0.000106,-1,-0.000106
34,2009.08.17,00:00,94.74,94.74,94.54,94.59,535,94.774,95.026333,-0.001479,-1,0.001479


In [8]:
# drop na
df.dropna(inplace=True)
df.tail()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,sma1,sma2,Returns,Position,Strategy
64995,2020.02.18,14:00,109.773,109.941,109.762,109.913,4819,109.7446,109.8176,0.001265,-1,-0.001265
64996,2020.02.18,15:00,109.912,109.927,109.79,109.8,4383,109.7482,109.815433,-0.001029,-1,0.001029
64997,2020.02.18,16:00,109.8,109.828,109.767,109.799,3602,109.7504,109.8131,-9e-06,-1,9e-06
64998,2020.02.18,17:00,109.8,109.884,109.8,109.863,2172,109.763133,109.812533,0.000583,-1,-0.000583
64999,2020.02.18,18:00,109.863,109.9,109.838,109.855,1396,109.773533,109.811933,-7.3e-05,-1,7.3e-05


In [9]:
# Sum the Strategy column: the total log return of the sma crossover rule
ret = df['Strategy'].sum()
ret

0.39322044663786315

In [10]:
# direction: sign of each row's return as an integer (+1 up, -1 down).
# NOTE: np.sign gives 0 when the return is exactly 0, so this column can
# hold three distinct values rather than two.
df['direction'] = np.sign(df['Returns']).astype(int)

In [11]:
df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,sma1,sma2,Returns,Position,Strategy,direction
31,2009.08.14,20:00,94.82,94.94,94.77,94.92,293,94.88,95.091667,0.00116,-1,-0.00116,1
32,2009.08.16,22:00,94.77,94.77,94.64,94.72,155,94.843333,95.068,-0.002109,-1,0.002109,-1
33,2009.08.16,23:00,94.73,94.87,94.67,94.73,428,94.812667,95.046333,0.000106,-1,-0.000106,1
34,2009.08.17,00:00,94.74,94.74,94.54,94.59,535,94.774,95.026333,-0.001479,-1,0.001479,-1
35,2009.08.17,01:00,94.6,94.6,94.47,94.56,455,94.739333,95.007667,-0.000317,-1,0.000317,-1


In [12]:
# create lag columns: lag_1 is Returns lagged by 1 bar (1 hour on this data), lag_2 by 2 bars, etc.
lags = 3
def create_lags(df, n_lags=None):
    """Add lagged copies of the ``Returns`` column to ``df`` in place.

    For each ``k`` in ``1..n_lags`` a column ``lag_k`` is written holding
    ``Returns`` shifted down by ``k`` rows, so the first ``k`` rows of that
    column are NaN. Dropping those rows is left to the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Returns`` column; it is modified in place.
    n_lags : int, optional
        Number of lag columns to create. When omitted, the module-level
        ``lags`` variable is used, so a bare ``create_lags(df)`` call
        behaves exactly as before.

    Returns
    -------
    list of str
        The lag column names in order. The same list is also stored in the
        module-level ``cols`` for code that reads it from there.
    """
    global cols
    if n_lags is None:
        n_lags = lags  # fall back to the notebook-level setting
    cols = ['lag_{}'.format(k) for k in range(1, n_lags + 1)]
    for k, col in enumerate(cols, start=1):
        df[col] = df['Returns'].shift(k)
    return cols
create_lags(df)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,sma1,sma2,Returns,Position,Strategy,direction,lag_1,lag_2,lag_3
34,2009.08.17,00:00,94.74,94.74,94.54,94.59,535,94.774,95.026333,-0.001479,-1,0.001479,-1,0.000106,-0.002109,0.00116
35,2009.08.17,01:00,94.6,94.6,94.47,94.56,455,94.739333,95.007667,-0.000317,-1,0.000317,-1,-0.001479,0.000106,-0.002109
36,2009.08.17,02:00,94.55,94.66,94.48,94.51,298,94.705333,94.979,-0.000529,-1,0.000529,-1,-0.000317,-0.001479,0.000106
37,2009.08.17,03:00,94.5,94.65,94.5,94.6,251,94.69,94.949667,0.000952,-1,-0.000952,1,-0.000529,-0.000317,-0.001479
38,2009.08.17,04:00,94.61,94.62,94.52,94.58,254,94.673333,94.923667,-0.000211,-1,0.000211,-1,0.000952,-0.000529,-0.000317


In [13]:
# Use linear regression on the lag columns, once with the raw return as the
# target (pos_ols_1) and once with its sign (pos_ols_2).
# NOTE: each fit is followed by predict on the same rows, so both columns
# hold in-sample predictions.
lr = LinearRegression()
df['pos_ols_1'] = lr.fit(df[cols],df['Returns']).predict(df[cols])
df['pos_ols_2'] = lr.fit(df[cols],df['direction']).predict(df[cols])
df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,sma1,sma2,Returns,Position,Strategy,direction,lag_1,lag_2,lag_3,pos_ols_1,pos_ols_2
34,2009.08.17,00:00,94.74,94.74,94.54,94.59,535,94.774,95.026333,-0.001479,-1,0.001479,-1,0.000106,-0.002109,0.00116,1.7e-05,0.051235
35,2009.08.17,01:00,94.6,94.6,94.47,94.56,455,94.739333,95.007667,-0.000317,-1,0.000317,-1,-0.001479,0.000106,-0.002109,3.1e-05,0.095263
36,2009.08.17,02:00,94.55,94.66,94.48,94.51,298,94.705333,94.979,-0.000529,-1,0.000529,-1,-0.000317,-0.001479,0.000106,2.2e-05,0.063858
37,2009.08.17,03:00,94.5,94.65,94.5,94.6,251,94.69,94.949667,0.000952,-1,-0.000952,1,-0.000529,-0.000317,-0.001479,2.3e-05,0.061353
38,2009.08.17,04:00,94.61,94.62,94.52,94.58,254,94.673333,94.923667,-0.000211,-1,0.000211,-1,0.000952,-0.000529,-0.000317,4e-06,-0.003312


In [14]:
# Change the regression outputs to positions: 1 where the prediction is
# positive, otherwise -1
df[['pos_ols_1','pos_ols_2']]=np.where(df[['pos_ols_1','pos_ols_2']]>0,1,-1)

In [15]:
# Calculate returns on different strategies: position times the row's log return
df['strat_1'] =df['pos_ols_1'] * df['Returns']
df['strat_2'] =df['pos_ols_2'] * df['Returns']
df.dropna(inplace=True)
# Sum the log returns and exponentiate to get each column's gross return multiple
df[['Returns','strat_1','strat_2']].sum().apply(np.exp)


Returns    1.159664
strat_1    2.597954
strat_2    2.176838
dtype: float64

Looking at the results, the first strategy (regression on Returns) performed better than the second (regression on direction).

In [16]:
df_strat = pd.DataFrame(df[['Returns','strat_1','strat_2']])

In [17]:
df_strat.head()

Unnamed: 0,Returns,strat_1,strat_2
34,-0.001479,-0.001479,-0.001479
35,-0.000317,-0.000317,-0.000317
36,-0.000529,-0.000529,-0.000529
37,0.000952,0.000952,0.000952
38,-0.000211,-0.000211,0.000211


In [18]:
# create bins 
def create_bins(data, bins=(0,), columns=None):
    """Digitize feature columns of ``data`` into integer bin indices, in place.

    For every source column a companion ``<column>_bin`` column is written
    with the result of ``np.digitize`` on that column. With the default
    single edge at 0, values below 0 map to 0 and values at or above 0
    map to 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the source columns; it is modified in place.
    bins : sequence of numbers, optional
        Bin edges handed to ``np.digitize``. The default is an immutable
        tuple so the default object can never be altered between calls.
    columns : iterable of str, optional
        Source columns to digitize. When omitted, the module-level ``cols``
        list (set by ``create_lags``) is used, so a bare
        ``create_bins(df)`` call behaves exactly as before.

    Returns
    -------
    list of str
        Names of the bin columns in order. The same list is also stored in
        the module-level ``cols_bin`` for code that reads it from there.
    """
    global cols_bin
    if columns is None:
        columns = cols  # lag columns recorded by create_lags
    cols_bin = []
    for col in columns:
        col_bin = col + '_bin'
        data[col_bin] = np.digitize(data[col], bins=bins)
        cols_bin.append(col_bin)
    return cols_bin
create_bins(df)
df[cols_bin + ['direction']].head()

Unnamed: 0,lag_1_bin,lag_2_bin,lag_3_bin,direction
34,1,0,1,-1
35,0,1,0,-1
36,0,0,1,-1
37,0,0,0,1
38,1,0,0,-1


In [22]:
# implement different ml techniques
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import linear_model

# Value passed as the C argument to both LogisticRegression and SVC below
C = 1

# Classifiers to compare, keyed by the short name that is later used to
# build the 'pos_<name>' and 'strat_<name>' column names
models = {
    'log_reg':linear_model.LogisticRegression(C=C,solver='liblinear',multi_class='auto'),
    'gauss_nb': GaussianNB(),
    'svm':SVC(C=C,gamma='scale')
}

def fit_models(data):
    """Fit every estimator in the module-level ``models`` dict.

    Each estimator is trained in place on the binned feature columns
    ``data[cols_bin]`` with ``data['direction']`` as the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``cols_bin`` feature columns and ``direction``.

    Returns
    -------
    dict
        Maps each model name to whatever its ``fit`` call returned (for
        scikit-learn estimators that is the fitted estimator itself).
        Previously this mapping was built and then thrown away.
    """
    features = data[cols_bin]
    target = data['direction']
    return {name: estimator.fit(features, target)
            for name, estimator in models.items()}
fit_models(df)
def derive_positions(data):
    """Store each model's predictions on ``data`` as ``pos_<model name>``.

    Iterates over the module-level ``models`` dict, predicts from the
    ``cols_bin`` feature columns, writes the result into ``data`` in place
    and prints the resulting column.
    """
    for name, estimator in models.items():
        column = 'pos_' + name
        data[column] = estimator.predict(data[cols_bin])
        print(data[column])
derive_positions(df)

34      -1
35       1
36       1
37       1
38       1
39       1
40       1
41       1
42      -1
43       1
44      -1
45      -1
46       1
47      -1
48       1
49       1
50       1
51       1
52       1
53      -1
54       1
55      -1
56       1
57       1
58       1
59      -1
60      -1
61       1
62       1
63       1
        ..
64970    1
64971    1
64972    1
64973   -1
64974    1
64975   -1
64976   -1
64977   -1
64978    1
64979    1
64980    1
64981    1
64982    1
64983    1
64984    1
64985   -1
64986   -1
64987   -1
64988   -1
64989    1
64990   -1
64991    1
64992   -1
64993    1
64994   -1
64995   -1
64996   -1
64997    1
64998    1
64999    1
Name: pos_log_reg, Length: 64966, dtype: int64
34      -1
35       1
36       1
37       1
38       1
39       1
40       1
41       1
42      -1
43       1
44      -1
45      -1
46       1
47      -1
48       1
49       1
50       1
51       1
52       1
53      -1
54       1
55      -1
56       1
57       1
58       1
59     

In [23]:
# evaluate the strategies
def evaluate_strats(data):
    """Compute the per-row strategy return for every model's positions.

    For each name in the module-level ``models`` dict, writes
    ``strat_<name> = pos_<name> * Returns`` into ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``Returns`` and one ``pos_<name>`` column per model.

    Returns
    -------
    list of str
        ``'Returns'`` followed by the strategy column names, in model
        order. The same list is stored in the module-level ``sel``.
        Returning it means an assignment such as
        ``strats = evaluate_strats(df)`` receives the list instead of None.
    """
    global sel
    sel = ['Returns']  # benchmark column goes first
    for name in models:
        col = 'strat_' + name
        data[col] = data['pos_' + name] * data['Returns']
        sel.append(col)
    return sel
strats = evaluate_strats(df)

In [24]:
df[sel].sum().apply(np.exp)

Returns           1.159664
strat_log_reg     1.778881
strat_gauss_nb    1.778881
strat_svm         1.778881
dtype: float64

Looking at the results, all three models produced the same result, and none of them beat the Linear Regression model above.

In [25]:
# Create 5 lags and bins.
# create_lags(df) reads the module-level `lags`, so it is reassigned first.
# The lag columns start with NaN rows, which are dropped before binning.
lags = 5
create_lags(df)
df.dropna(inplace=True)
create_bins(df)
df[cols_bin].tail()

Unnamed: 0,lag_1_bin,lag_2_bin,lag_3_bin,lag_4_bin,lag_5_bin
64995,1,1,0,1,0
64996,1,1,1,0,1
64997,0,1,1,1,0
64998,0,0,1,1,1
64999,1,0,0,1,1


In [26]:
df.dropna(inplace=True)

In [27]:
#fit models, derive positions, and evaluate strategies
fit_models(df)
derive_positions(df)
evaluate_strats(df)


39       1
40       1
41       1
42      -1
43       1
44      -1
45      -1
46       1
47      -1
48       1
49       1
50       1
51       1
52       1
53      -1
54       1
55      -1
56       1
57       1
58       1
59      -1
60      -1
61       1
62       1
63       1
64       1
65      -1
66      -1
67      -1
68       1
        ..
64970    1
64971    1
64972    1
64973   -1
64974    1
64975   -1
64976   -1
64977   -1
64978    1
64979    1
64980    1
64981    1
64982    1
64983    1
64984    1
64985   -1
64986   -1
64987   -1
64988   -1
64989    1
64990   -1
64991    1
64992   -1
64993    1
64994   -1
64995   -1
64996   -1
64997    1
64998    1
64999    1
Name: pos_log_reg, Length: 64961, dtype: int64
39       1
40       1
41      -1
42      -1
43       1
44      -1
45      -1
46       1
47      -1
48       1
49       1
50       1
51       1
52       1
53      -1
54       1
55      -1
56       1
57       1
58      -1
59      -1
60      -1
61       1
62       1
63       1
64     

Returns           1.161503
strat_log_reg     1.776440
strat_gauss_nb    1.917308
strat_svm         1.804198
dtype: float64

In [28]:
df[sel].sum().apply(np.exp)

Returns           1.161503
strat_log_reg     1.776440
strat_gauss_nb    1.917308
strat_svm         1.804198
dtype: float64

Using more lags and bins, the Gaussian Naive Bayes model performed the strongest, but still not as well as the linear regression model, though that model has a tendency to overfit.