# Walk Forward: A Realistic Approach to Backtesting

In [None]:
???

![](<src/10_Table_Validation Methods.png>)

## Load the data

In [1]:
import pandas as pd

df = pd.read_excel('data/microsoft-linkedin-processed.xlsx' , index_col=0)
df.index = pd.to_datetime(df.index)
df = df.drop(columns='change_tomorrow_direction')
df

Unnamed: 0_level_0,Close,High,Low,Open,Volume,change_tomorrow
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-01,59.200001,60.150002,58.939999,60.110001,34542100,0.084387
2016-12-02,59.250000,59.470001,58.799999,59.080002,25515700,1.610763
2016-12-05,60.220001,60.590000,59.560001,59.700001,23552700,-0.450376
2016-12-06,59.950001,60.459999,59.799999,60.430000,19907000,2.313831
2016-12-07,61.369999,61.380001,59.799999,60.009998,30809000,-0.590068
...,...,...,...,...,...,...
2024-11-04,408.459991,410.420013,405.570007,409.799988,19672300,0.729111
2024-11-05,411.459991,414.899994,408.079987,408.369995,17626000,2.075301
2024-11-06,420.179993,420.450012,410.519989,412.420013,26681800,1.234046
2024-11-07,425.429993,426.850006,419.880005,421.279999,19901800,-0.683955


## Walk Forward Validation

### How `TimeSeriesSplit` works

In [2]:
from sklearn.model_selection import TimeSeriesSplit

In [3]:
ts = TimeSeriesSplit(test_size=200)

In [4]:
splits = ts.split(X=df)

In [5]:
split1= next(splits)

In [6]:
split1

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175, 176, 177

In [7]:
split2= next(splits)

In [8]:
split2

(array([   0,    1,    2, ..., 1195, 1196, 1197]),
 array([1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208,
        1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219,
        1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230,
        1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241,
        1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252,
        1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263,
        1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274,
        1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285,
        1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295, 1296,
        1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307,
        1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318,
        1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329,
        1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338,

In [9]:
list_df_train = []
list_df_test = []

for index_train, index_test in ts.split(df):
    list_df_train.append(df.iloc[index_train])
    list_df_test.append(df.iloc[index_test])

In [10]:
list_df_train[0]

Unnamed: 0_level_0,Close,High,Low,Open,Volume,change_tomorrow
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-01,59.200001,60.150002,58.939999,60.110001,34542100,0.084387
2016-12-02,59.250000,59.470001,58.799999,59.080002,25515700,1.610763
2016-12-05,60.220001,60.590000,59.560001,59.700001,23552700,-0.450376
2016-12-06,59.950001,60.459999,59.799999,60.430000,19907000,2.313831
2016-12-07,61.369999,61.380001,59.799999,60.009998,30809000,-0.590068
...,...,...,...,...,...,...
2020-11-11,216.550003,218.039993,212.199997,212.389999,29440800,-0.515225
2020-11-12,215.440002,219.110001,214.460007,217.210007,21593900,0.494200
2020-11-13,216.509995,217.419998,214.160004,216.360001,18621100,0.331447
2020-11-16,217.229996,217.740005,214.520004,214.869995,24953300,-1.291611


In [11]:
list_df_test[0]

Unnamed: 0_level_0,Close,High,Low,Open,Volume,change_tomorrow
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-18,211.080002,215.169998,210.929993,213.649994,28372800,0.630824
2020-11-19,212.419998,213.029999,209.929993,211.380005,24792700,-0.964874
2020-11-20,210.389999,213.289993,210.000000,212.199997,22843100,-0.133263
2020-11-23,210.110001,212.289993,208.160004,210.949997,25683500,1.753484
2020-11-24,213.860001,214.250000,208.860001,209.589996,33979700,0.004673
...,...,...,...,...,...,...
2021-08-30,303.589996,304.220001,301.059998,301.119995,16348100,-0.566447
2021-08-31,301.880005,304.500000,301.500000,304.420013,26285300,-0.016572
2021-09-01,301.829987,305.190002,301.489990,302.869995,18983800,-0.225799
2021-09-02,301.149994,303.359985,300.179993,302.200012,16285600,-0.003314


In [12]:
list_df_train[1]

Unnamed: 0_level_0,Close,High,Low,Open,Volume,change_tomorrow
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-01,59.200001,60.150002,58.939999,60.110001,34542100,0.084387
2016-12-02,59.250000,59.470001,58.799999,59.080002,25515700,1.610763
2016-12-05,60.220001,60.590000,59.560001,59.700001,23552700,-0.450376
2016-12-06,59.950001,60.459999,59.799999,60.430000,19907000,2.313831
2016-12-07,61.369999,61.380001,59.799999,60.009998,30809000,-0.590068
...,...,...,...,...,...,...
2021-08-30,303.589996,304.220001,301.059998,301.119995,16348100,-0.566447
2021-08-31,301.880005,304.500000,301.500000,304.420013,26285300,-0.016572
2021-09-01,301.829987,305.190002,301.489990,302.869995,18983800,-0.225799
2021-09-02,301.149994,303.359985,300.179993,302.200012,16285600,-0.003314


In [13]:
list_df_test[1]

Unnamed: 0_level_0,Close,High,Low,Open,Volume,change_tomorrow
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-09-07,300.179993,301.089996,298.200012,301.010010,17180400,0.009993
2021-09-08,300.209991,300.609985,297.470001,299.779999,15046800,-0.995792
2021-09-09,297.250000,302.140015,297.000000,300.820007,19927000,-0.520783
2021-09-10,295.709991,299.920013,295.380005,298.420013,19633400,0.430991
2021-09-13,296.989990,298.540009,294.079987,297.549988,23652900,0.933993
...,...,...,...,...,...,...
2022-06-15,251.759995,255.300003,246.419998,248.309998,33111700,-2.771765
2022-06-16,244.970001,247.419998,243.020004,245.979996,33169200,1.082169
2022-06-17,247.649994,250.500000,244.029999,244.699997,43084800,2.400099
2022-06-21,253.740005,254.750000,249.509995,250.259995,29928300,-0.240983


## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [14]:
y = df.change_tomorrow
X = df[['Open','High','Low','Close','Volume']]

In [15]:
list_df_train = []
list_df_test = []

for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]


### Simulate one computation of the ML model

- Compute the model
- Calculate predictions on the test set
- Evaluate how good the model is

In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)
model_dt.fit(X_train, y_train)

y_pred = model_dt.predict(X_test)
error_mse = mean_squared_error(y_test, y_pred)
error_mse

1.8405433037825438

### Add the procedure inside the for loop

In [17]:
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)

error_mse_list = []

for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]
    
    model_dt.fit(X_train, y_train)
    
    y_pred = model_dt.predict(X_test)
    error_mse = mean_squared_error(y_test, y_pred)
    
    error_mse_list.append(error_mse)

In [18]:
error_mse_list

[37.60386790948279,
 7.6388686681750775,
 5.651981628246065,
 4.638096856714091,
 1.8405433037825438]

In [19]:
import  numpy as np

In [20]:
np.mean(error_mse_list)

11.474671673280113

## Anchored Walk Forward evaluation in backtesting

![](<src/10_Table_Validation Methods.png>)

### Create a new strategy

In [21]:
from backtesting import Backtest, Strategy

In [22]:
df

Unnamed: 0_level_0,Close,High,Low,Open,Volume,change_tomorrow
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-01,59.200001,60.150002,58.939999,60.110001,34542100,0.084387
2016-12-02,59.250000,59.470001,58.799999,59.080002,25515700,1.610763
2016-12-05,60.220001,60.590000,59.560001,59.700001,23552700,-0.450376
2016-12-06,59.950001,60.459999,59.799999,60.430000,19907000,2.313831
2016-12-07,61.369999,61.380001,59.799999,60.009998,30809000,-0.590068
...,...,...,...,...,...,...
2024-11-04,408.459991,410.420013,405.570007,409.799988,19672300,0.729111
2024-11-05,411.459991,414.899994,408.079987,408.369995,17626000,2.075301
2024-11-06,420.179993,420.450012,410.519989,412.420013,26681800,1.234046
2024-11-07,425.429993,426.850006,419.880005,421.279999,19901800,-0.683955


In [23]:
bt = Backtest(df, ???, cash=10000, commission=.002, exclusive_orders=True)

SyntaxError: invalid syntax (3269686141.py, line 1)

In [24]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    n_train = 600
    coef_retrain = 200
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        X_train = self.data.df.iloc[:self.n_train, :-1]
        y_train = self.data.df.iloc[:self.n_train, -1]
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

In [25]:
class WalkForwardAnchored(Regression):
    def next(self):
        
        # we don't take any action and move on to the following day
        if len(self.data) < self.n_train:
            return
        
        # we retrain the model each 200 days
        if len(self.data) % self.coef_retrain == 0:
            X_train = self.data.df.iloc[:, :-1]
            y_train = self.data.df.iloc[:, -1]

            self.model.fit(X_train, y_train)

            super().next()
            
        else:
            
            super().next()

In [26]:
from backtesting import Backtest
bt = Backtest(df, WalkForwardAnchored, cash=10000, commission=.002, exclusive_orders=True)

In [27]:
import multiprocessing as mp
mp.set_start_method('fork')

In [28]:
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff



Unnamed: 0,limit_buy,limit_sell,Return [%]
3,1,-5,127.955592
8,2,-5,106.830663
0,0,-6,100.136719
9,2,-4,93.961834
7,2,-6,80.012382
12,3,-5,79.778398
4,1,-4,79.487604
13,3,-4,68.207233
2,0,-2,67.877456
20,5,-5,63.633014


## Unanchored Walk Forward

### Create a library of strategies

### Create the unanchored walk forward class

![](<src/10_Table_Validation Methods.png>)

### Import the strategy and perform the backtest

In [34]:
%load_ext autoreload
%autoreload 2

In [36]:
import strategies

In [37]:
strategies.WalkForwardUnanchored

strategies.WalkForwardUnanchored

In [38]:
bt_unanchored = Backtest(df, strategies.WalkForwardUnanchored, cash=10000, commission=.002, exclusive_orders=True)

stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

Unnamed: 0,limit_buy,limit_sell,Return [%]
7,2,-6,46.134435
0,0,-6,37.798327
...,...,...,...
13,3,-5,-86.571062
14,3,-4,-86.836675


### Interpret the strategies' performance

In [39]:
bt.plot(filename='reports_backtesting/walk_forward_anchored.html')

In [40]:
bt_unanchored.plot(filename='reports_backtesting/walk_forward_unanchored.html')

## Practice to master the knowledge

Work on the challenge with another dataset:

1. Learn the <a>mental models</a> to solve the challenge faster.
2. Complete the <a href="10D_Walk Forward Regression.ipynb">notebook</a>.