# Model Testing and Validation

In [1]:
import pandas as pd
import numpy as np
import sklearn as skl

### Upload Data

In [2]:
candles = pd.read_pickle('CAD_USD_H2')

In [3]:
candles.head()

Unnamed: 0,range_14,mid_c_prev,stochastic_range_k,stochastic_range_d,rsi_14_lag_1,MACD_12_26_9_lag_1,mid_o_lag_1,mid_l_lag_1,mid_h_lag_1,stochastic_range_k_lag_1,...,stochastic_range_d_lag_9,rsi_14_lag_10,MACD_12_26_9_lag_10,mid_o_lag_10,mid_l_lag_10,mid_h_lag_10,stochastic_range_k_lag_10,stochastic_range_d_lag_10,long,short
37,0.00959,0.97308,0.222,0.196333,33.408488,-0.000375,0.00057,0.00057,-0.00083,0.213,...,0.066845,37.432008,0.0,-0.0034,-0.00311,-0.00532,0.151,0.106078,False,False
38,0.00959,0.97317,0.065361,0.166787,33.796158,-0.000232,6e-05,0.00019,-0.00052,0.222,...,0.095262,33.116789,-0.000124,-0.00455,-0.003,-0.00555,0.043,0.066845,True,False
39,0.00851,0.97153,0.148587,0.134529,28.903525,-0.000218,-0.00162,9e-05,-0.00176,0.065361,...,0.079146,27.659356,-0.000305,-0.00505,-0.00277,-0.00592,0.05,0.095262,False,True
40,0.00819,0.97238,0.426892,0.20857,34.162447,-0.000126,0.00083,0.00121,-0.00022,0.148587,...,0.12418,22.628277,-0.00054,-0.00239,0.00092,-0.00322,0.108,0.079146,False,True
41,0.00777,0.975,0.371,0.303889,47.310287,0.000123,0.0026,0.00283,-0.00028,0.426892,...,0.079521,24.485597,-0.000635,0.00249,0.00372,0.00141,0.198906,0.12418,True,False


### Data Preparation

In [4]:
'''
Separate the decision (y) variable (long, short) from the rest of the features (X)
'''

candle_features = candles[[col for col in candles.columns if col not in ['long', 'short']]]
long = candles['long']
short = candles['short']

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
'''
Normalize features for faster model building
'''

scaler = StandardScaler()
scaler.fit(candle_features)

scaled_candles = scaler.transform(candle_features)

In [7]:
'''
Split the training data from the testing data
'''
x_train_candles = scaled_candles[:-3069]
y_train_long = long[:-3069]
y_train_short = short[:-3069]

x_test_candles = scaled_candles[-3069:]
y_test_long = long[-3069:]
y_test_short = short[-3069:]

y_test_long.reset_index(drop=True, inplace=True)
y_test_short.reset_index(drop=True, inplace=True)

### Model Training

In [8]:
from sklearn.linear_model import LogisticRegression as LR

In [9]:
'''
Initialize models with the optimal parameters from model validation
'''
long_model = LR(penalty='l1', solver='liblinear', C=1)
short_model = LR(penalty='l1', solver='liblinear', C=0.1)

In [10]:
'''
Fit the model to training data
'''
long_model.fit(x_train_candles, y_train_long)
short_model.fit(x_train_candles, y_train_short)

### Model Testing

In [11]:
'''
Identify the predicted wins as those are the trades we will be taking and therefore what the 
succes of the model is going to based off of
'''
long_test_results = pd.DataFrame(long_model.predict(x_test_candles), columns=['predictions'])
long_test_results['actual'] = y_test_long
long_takes = long_test_results.loc[long_test_results['predictions'] == True]
long_takes_count = long_takes['actual'].count()

short_test_results = pd.DataFrame(short_model.predict(x_test_candles), columns=['predictions'])
short_test_results['actual'] = y_test_short
short_takes = short_test_results.loc[short_test_results['predictions'] == True]
short_takes_count = short_takes['actual'].count()

In [12]:
'''
Final metrics of model testing.
'''
print(f"Long Trades:\n\nModel accuracy: {long_takes['actual'].mean()} \nCount: {long_takes_count}")
print(f"Random accuracy: {y_test_long.mean()}")

Long Trades:

Model accuracy: 0.7929324240545568 
Count: 1613
Random accuracy: 0.545128706419029


In [13]:
print(f"Short Trades: \n\nModel accuracy: {short_takes['actual'].mean()} \nCount: {short_takes_count}")
print(f"Random accuracy: {y_test_short.mean()}")

Short Trades: 

Model accuracy: 0.807865892972276 
Count: 1551
Random accuracy: 0.5493646138807429


In [14]:
'''
Calculate the growth multiplier over the test data (approx. a calendar year) based on a 1% risk
'''
long_wins = long_takes.loc[long_takes['actual'] == True, 'actual'].count()
long_ror = ((301/300) ** long_wins) * (0.99 ** (long_takes_count - long_wins))
long_ror

2.458204863953639

In [15]:
short_wins = short_takes.loc[short_takes['actual'] == True, 'actual'].count()
short_ror = ((301/300) ** short_wins) * (0.99 ** (short_takes_count - short_wins))
short_ror

3.2372393100069967