1. Import libraries

In [69]:
import numpy as np
import pandas as pd
import sklearn
from scipy.stats import loguniform, randint, uniform
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from numpy import inf
from sklearn.metrics import make_scorer

2. Define functions to load the data, create the features, create the target, and score the predictions.

---



In [70]:
def csv_loader(path, index_col=True, flip=True):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pd.read_csv``.
    index_col : bool, default True
        When truthy, the first column becomes the index and pandas is asked
        to parse the index as dates. When falsy, the file is read with the
        pandas defaults.
    flip : bool, default True
        When truthy, the row order is reversed before returning.

    Returns
    -------
    pandas.DataFrame
    """
    read_options = {'index_col': 0, 'parse_dates': True} if index_col else {}
    frame = pd.read_csv(path, **read_options)
    return frame.iloc[::-1] if flip else frame

def create_features(df):
    """Add the base columns 'Spread', 'LastClose' and 'Gap' to ``df`` in place.

    - 'Spread' is High minus Low of the same row.
    - 'LastClose' is the Close of the previous row (NaN on the first row).
    - 'Gap' is Open minus the previous row's Close.

    The same DataFrame object is returned for convenience.
    """
    previous_close = df['Close'].shift(1)
    df['Spread'] = df['High'].sub(df['Low'])
    df['LastClose'] = previous_close
    df['Gap'] = df['Open'].sub(previous_close)
    return df

def drop_features(df):
    """Remove the intermediate and raw price columns from ``df`` in place.

    The helper columns added by ``create_features`` are dropped first, then
    the raw 'Open', 'High', 'Low', 'Close', 'Volume' and 'Adj Close' columns.
    A missing column raises ``KeyError`` (pandas' default for ``drop``).

    The same DataFrame object is returned for convenience.
    """
    derived_columns = ['Spread', 'LastClose', 'Gap']
    raw_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
    # Two separate passes, derived columns first.
    for column_group in (derived_columns, raw_columns):
        df.drop(columns=column_group, inplace=True)
    return df

def process_features(df, lookback, step):
    """Add windowed features to ``df`` in place and return it.

    One set of columns is produced for every window size in
    ``step, 2*step, ...`` up to and including ``lookback``. For a window
    ``w`` the new columns are, in this order:

    - '<w> Max' / '<w> Min': one-row percentage change of the rolling
      maximum of High / rolling minimum of Low.
    - '<w> Spread' / '<w> Gap': percentage change of the column over ``w`` rows.
    - '<w> Rolling Avg Spread' / '<w> Rolling Avg Gap': rolling mean (raw level).
    - '<w> Rolling Avg Open|High|Low|Close': one-row percentage change of
      the rolling mean of that price column.

    ``df`` must already contain 'Open', 'High', 'Low', 'Close', 'Spread'
    and 'Gap'. Missing values are never filled before a percentage change.
    """
    def _pct(series, periods=1):
        # Percentage change without forward-filling gaps first.
        return series.pct_change(periods=periods, fill_method=None)

    for window in range(step, lookback + 1, step):
        df[f'{window} Max'] = _pct(df['High'].rolling(window=window).max())
        df[f'{window} Min'] = _pct(df['Low'].rolling(window=window).min())

        for base in ('Spread', 'Gap'):
            df[f'{window} {base}'] = _pct(df[base], periods=window)
            df[f'{window} Rolling Avg {base}'] = df[base].rolling(window=window).mean()

        for price in ('Open', 'High', 'Low', 'Close'):
            smoothed = df[price].rolling(window=window).mean()
            df[f'{window} Rolling Avg {price}'] = _pct(smoothed)
    return df
  
def features(df, lookback, step):
    """Run the whole feature pipeline on ``df`` and return it.

    The stages run in a fixed order: ``create_features`` adds the base
    columns, ``process_features`` derives the windowed columns from them,
    and ``drop_features`` removes the base and raw price columns again.
    Every stage works on ``df`` in place.
    """
    with_base = create_features(df)
    with_windows = process_features(with_base, lookback, step)
    return drop_features(with_windows)

def create_target(df, lookforward=2, target='Open'):
    """Add a 'Target' column to ``df`` in place and return ``df``.

    For each row the target is the natural log of the ratio between the
    ``target`` column ``lookforward`` rows ahead and the same column one row
    ahead. Rows without enough future data get NaN.
    """
    prices = df[target]
    exit_price = prices.shift(-lookforward)
    entry_price = prices.shift(-1)
    df['Target'] = np.log(exit_price / entry_price)
    return df

def custom_score(y_true, y_pred):
    """Mean realised value over all samples, counting only positive forecasts.

    A sample contributes its ``y_true`` value when the sign of ``y_pred`` is
    +1 and contributes 0 otherwise; the mean is taken over every sample.
    ``y_true`` is squeezed first so a single-column input lines up with a
    1-D ``y_pred``.
    """
    realized = np.squeeze(y_true)
    goes_long = np.sign(y_pred) == 1
    per_sample = np.where(goes_long, realized, 0)
    return per_sample.mean()

# Wrap custom_score for use as a `scoring=` argument; a larger value is better.
custom_scorer = make_scorer(score_func=custom_score, greater_is_better=True)

3. Define the models we are going to use


In [71]:
# Base learners. estimator3 (ElasticNet) is currently disabled; it is not
# imported at the top of the notebook either.
estimator1 = Ridge()
estimator2 = Lasso(alpha=.001)
estimator4 = KNeighborsRegressor()
models = [estimator1, estimator2, estimator4]

# The names given here are the prefixes used by the keys of the search
# space ('Ridge__alpha', 'Lasso__alpha', 'KNN__n_neighbors').
named_estimators = list(zip(('Ridge', 'Lasso', 'KNN'), models))
estimator = VotingRegressor(estimators=named_estimators)

4. Define target, cross validation folds, interval, and lookback parameters.

In [72]:
# Horizon used by create_target(): how many rows ahead the target looks.
lookforward = 2

# Window sizes used by process_features(): step, 2*step, ... up to lookback.
step, lookback = 1, 2

# Time-ordered folds; `gap` leaves `lookforward` rows unused between the end
# of each training fold and the start of its validation fold.
tscv = TimeSeriesSplit(n_splits=4, gap=lookforward)

5. Load data

In [73]:
# NOTE(review): absolute paths (they look like Colab's /content directory);
# adjust them when running somewhere else.
spy = csv_loader(path='/content/SPY.csv', index_col=True, flip=False)
agg = csv_loader(path='/content/AGG.csv', index_col=True, flip=False)

# The target has to be created before features(): features() drops the raw
# price columns that create_target() reads.
spy = create_target(spy, lookforward, target='Open')

spy = features(spy, lookback, step).add_suffix(' SPY')
agg = features(agg, lookback, step).add_suffix(' AGG')
cv = pd.merge(spy, agg, how='inner', on='Date')

# Trim the last `lookforward` rows (their target needs future rows that do
# not exist) and the first `lookback` rows (rolling windows not yet full).
cv = cv.drop(cv.tail(lookforward).index)
cv = cv.drop(cv.head(lookback).index)

y = cv[['Target SPY']]
X = cv.drop(columns=['Target SPY'])

# Clean-up order matters: forward-fill gaps first, then zero out the +/-inf
# values produced by percentage changes from a zero base, then zero whatever
# NaN is still left (e.g. leading rows with nothing to forward-fill from).
# `.ffill()` replaces `fillna(method="ffill")`, which is deprecated in
# recent pandas releases.
X = X.ffill()
X = X.replace([np.inf, -np.inf], 0)
X = X.fillna(0)

6. Define the parameter grid. (Look up each model in the scikit-learn docs to find which of its parameters you can change.)

In [74]:
# Search space for the random search. Each key is '<estimator name>__<parameter>',
# where the estimator name is the one given to the VotingRegressor.
param_grid = dict(
    # Regularisation strengths: log-uniform between 1e-5 and 1.
    Ridge__alpha=loguniform(1e-5, 1e0),
    Lasso__alpha=loguniform(1e-5, 1e0),
    # randint excludes the upper bound, so this samples 1..9 neighbours.
    KNN__n_neighbors=randint(1, 10),
)

7. Split train and test data and run Random Search on train data

In [75]:
# shuffle=False keeps the rows in their original (time) order, so the last
# 20% of the rows form the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

search = RandomizedSearchCV(
    estimator,
    param_distributions=param_grid,
    n_iter=100,
    cv=tscv,
    scoring=custom_scorer,
    n_jobs=-1,
    verbose=1,
    # Fixed seed so the sampled candidates, and therefore the reported best
    # parameters, are the same on every Restart & Run All.
    random_state=42,
)

# y_train is a one-column frame; flatten it so scikit-learn does not emit a
# DataConversionWarning (the `column_or_1d` message) on every fit.
search.fit(X_train, np.ravel(y_train))

Fitting 4 folds for each of 100 candidates, totalling 400 fits


  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=TimeSeriesSplit(gap=2, max_train_size=None, n_splits=4, test_size=None),
                   estimator=VotingRegressor(estimators=[('Ridge', Ridge()),
                                                         ('Lasso',
                                                          Lasso(alpha=0.001)),
                                                         ('KNN',
                                                          KNeighborsRegressor())]),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'KNN__n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7c0ff10ed0>,
                                        'Lasso__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7c0ff10c50>,
                                        'Ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7c0ff10050>},
                   scoring=make_scorer(custom_score), verbose=1)

8. Print best results of Random Search

In [76]:
# Best sampled hyper-parameters, followed by their mean cross-validated score.
print(search.best_params_, search.best_score_, sep='\n')

{'KNN__n_neighbors': 6, 'Lasso__alpha': 0.38234352018940804, 'Ridge__alpha': 0.18530636826913355}
0.0003215068526279799


9. Back test best model parameters

In [None]:
!pip3 install backtesting
from backtesting import Strategy, Backtest
from sklearn.model_selection import train_test_split

In [77]:
step = 1
lookback = 2

# Hyper-parameters copied from the random-search output printed above.
estimator1 = Ridge(alpha=0.18530636826913355)
estimator2 = Lasso(alpha=0.38234352018940804)
#estimator3 = ElasticNet(alpha=.001)
estimator4 = KNeighborsRegressor(n_neighbors=6)
models = [estimator1,
          estimator2,
          #estimator3,
          estimator4
          ]
estimator = VotingRegressor(estimators=[('Ridge', estimator1),
                                        ('Lasso', estimator2),
                                        ('KNN', estimator4),
                                        ])

# Skip the first `lookforward` hold-out rows (presumably so their dates do
# not overlap the window of the last training targets; TODO confirm).
# New names are used instead of overwriting X_test / y_test, so re-running
# this cell does not trim the hold-out set a second time.
X_backtest = X_test.iloc[abs(lookforward):]
y_backtest = y_test.iloc[abs(lookforward):]

# Flatten the one-column target to avoid scikit-learn's DataConversionWarning.
estimator.fit(X_train, np.ravel(y_train))
forecasted = estimator.predict(X_backtest)

# Reload the raw SPY prices and trim them the same way the modelling frame
# was trimmed. NOTE(review): the forecasts are attached by position, which
# assumes the last rows of SPY line up with the rows of X_backtest; verify
# that SPY and AGG cover the same dates.
data = csv_loader(path='/content/SPY.csv', index_col=True, flip=False)
data = data.drop(data.tail(lookforward).index)
data = data.drop(data.head(lookback).index)
# .copy() so the new column is written to an independent frame, not a slice.
data = data.iloc[-X_backtest.shape[0]:].copy()
data['forecastedValue'] = forecasted
prediction = data

class MyStrategy(Strategy):
    """Trade in the direction of the model forecast.

    Expects the backtest data to carry a 'forecastedValue' column: a negative
    forecast places a sell order, a positive one a buy order, and an exact
    zero places no order.
    """

    # Not read by any code visible in this notebook; kept so the class
    # attribute stays available.
    Data = prediction

    def init(self):
        """No indicators to prepare; defer to the base class."""
        super().init()

    def next(self):
        """Place an order according to the sign of the current bar's forecast."""
        # self.data.<column> holds the values up to the current bar, so [-1]
        # is the current bar. Indexing explicitly avoids depending on the
        # truthiness of an array comparison.
        forecast = self.data.forecastedValue[-1]
        if forecast < 0:
            self.sell()
        elif forecast > 0:
            self.buy()


# Backtest settings: start with 1000 in cash, do not fill orders on the
# signal bar's close, and let every new order replace the open position.
backtest_options = dict(cash=1000, trade_on_close=False, exclusive_orders=True)
bt = Backtest(prediction, MyStrategy, **backtest_options)

stats = bt.run()
print(stats)

  y = column_or_1d(y, warn=True)


Start                     2018-12-04 00:00:00
End                       2022-09-16 00:00:00
Duration                   1382 days 00:00:00
Exposure Time [%]                   99.790136
Equity Final [$]                  1615.470184
Equity Peak [$]                   1903.270224
Return [%]                          61.547018
Buy & Hold Return [%]               42.667899
Return (Ann.) [%]                   13.522017
Volatility (Ann.) [%]               21.849221
Sharpe Ratio                         0.618879
Sortino Ratio                        1.080323
Calmar Ratio                         0.852019
Max. Drawdown [%]                  -15.870565
Avg. Drawdown [%]                   -3.378431
Max. Drawdown Duration      373 days 00:00:00
Avg. Drawdown Duration       31 days 00:00:00
# Trades                                  951
Win Rate [%]                         54.46898
Best Trade [%]                        8.32795
Worst Trade [%]                     -8.799434
Avg. Trade [%]                    

In [78]:
# Plot the results of the backtest that was run in the previous cell.
bt.plot()

These results aren't as good as the results in the previous notebook. This is to be expected, since the model is no longer cheating by being evaluated on data it has already seen, but the strategy still beats buying and holding. Would I run this strategy? No, but that's a discussion for another time. For now, this concludes the three-part series on how to create a strategy using AI.

Some things to explore further: create more features, add more data sources, evaluate more models, evaluate more parameters, evaluate larger step intervals and lookback periods, evaluate how many cross-validation folds are optimal when taking the bias-variance trade-off into account, etc.