# Walk Forward: A Realistic Approach to Backtesting

In [None]:
???

![](<src/10_Table_Validation Methods.png>)

## Load the data

In [None]:
import pandas as pd

# Load the pre-processed price history; the first spreadsheet column becomes the index.
df = pd.read_excel('data/microsoft-linkedin-processed.xlsx', index_col=0)
# Convert the index labels to timestamps.
df.index = pd.to_datetime(df.index)

# Discard the direction label; the numeric `change_tomorrow` column is kept.
df = df.drop('change_tomorrow_direction', axis='columns')
df

Unnamed: 0_level_0,Close,High,Low,Open,Volume,change_tomorrow
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-01,59.200001,60.150002,58.939999,60.110001,34542100,0.084387
2016-12-02,59.250000,59.470001,58.799999,59.080002,25515700,1.610763
2016-12-05,60.220001,60.590000,59.560001,59.700001,23552700,-0.450376
2016-12-06,59.950001,60.459999,59.799999,60.430000,19907000,2.313831
2016-12-07,61.369999,61.380001,59.799999,60.009998,30809000,-0.590068
...,...,...,...,...,...,...
2024-11-04,408.459991,410.420013,405.570007,409.799988,19672300,0.729111
2024-11-05,411.459991,414.899994,408.079987,408.369995,17626000,2.075301
2024-11-06,420.179993,420.450012,410.519989,412.420013,26681800,1.234046
2024-11-07,425.429993,426.850006,419.880005,421.279999,19901800,-0.683955


## Walk Forward Validation

### How `TimeSeriesSplit` works

In [13]:
from sklearn.model_selection import TimeSeriesSplit

In [14]:
# Five chronological folds (scikit-learn's default count, spelled out here),
# each evaluated on a 200-row test window.
ts = TimeSeriesSplit(
    n_splits=5,
    test_size=200,
)

In [15]:
# Materialise every fold as a pair of DataFrames: one list with the training
# slices and one with the matching test slices, in the order the splitter yields them.
list_df_train = [df.iloc[rows_train] for rows_train, _ in ts.split(df)]
list_df_test = [df.iloc[rows_test] for _, rows_test in ts.split(df)]


## Machine Learning Model

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [16]:
# Target: the value the model has to predict.
y = df['change_tomorrow']
# Explanatory variables: the columns the prediction is calculated from.
X = df.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']]

In [17]:
# Walk through every chronological split. The four names are overwritten on each
# pass, so once the loop finishes they hold the last fold, which is what the
# single-model example in the next section is fitted and evaluated on.
#
# The previous version also reset `list_df_train` / `list_df_test` to empty lists
# without ever filling them, which wiped the folds collected earlier; those dead
# assignments are gone.
for index_train, index_test in ts.split(y):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]


### Simulate one computation of the ML model

- Compute the model
- Calculate predictions on the test set
- Evaluate how good the model is

In [18]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Depth-limited regression tree with a fixed seed.
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)

# `fit` returns the estimator itself, so training and prediction are chained.
y_pred = model_dt.fit(X_train, y_train).predict(X_test)

# Mean squared error of the predictions on the test rows.
error_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
error_mse

1.8405433037825438

In [19]:
# Repeat fit / predict / score on every chronological split and print each fold's MSE.
for index_train, index_test in ts.split(y):
    X_train, X_test = X.iloc[index_train], X.iloc[index_test]
    y_train, y_test = y.iloc[index_train], y.iloc[index_test]
    y_pred = model_dt.fit(X_train, y_train).predict(X_test)
    error_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    print(error_mse)


37.60386790948279
7.6388686681750775
5.651981628246065
4.638096856714091
1.8405433037825438


### Add the procedure inside the for loop

In [20]:
# One estimator object is reused; it is refitted on every fold.
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42)

# Collects the test-set MSE of each fold, in split order.
error_mse_list = []

for index_train, index_test in ts.split(df):
    X_train, X_test = X.iloc[index_train], X.iloc[index_test]
    y_train, y_test = y.iloc[index_train], y.iloc[index_test]
    y_pred = model_dt.fit(X_train, y_train).predict(X_test)
    error_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    error_mse_list.append(error_mse)

error_mse_list

[37.60386790948279,
 7.6388686681750775,
 5.651981628246065,
 4.638096856714091,
 1.8405433037825438]

## Anchored Walk Forward evaluation in backtesting

![](<src/10_Table_Validation Methods.png>)

### Create a new strategy

In [22]:
from backtesting import Backtest, Strategy

In [None]:
bt = Backtest(df, ???, cash=10000, commission=.002, exclusive_orders=True)

In [198]:
class Regression(Strategy):
    """Strategy that trades on a decision tree's forecast for the next bar.

    The model is fitted once in `init` on the first `n_train` rows. All columns
    except the last are used as inputs and the last column as the target.
    """

    # A forecast above this value triggers `self.buy()` when nothing is held.
    limit_buy = 1
    # A forecast below this value triggers `self.sell()` when a position is held.
    limit_sell = -5
    # Number of leading rows used for the initial fit.
    n_train = 600
    # Not read by this class; `WalkForwardAnchored` uses it as its refit interval.
    coef_retrain = 1

    def init(self):
        """Create the model, reset the holding flag and fit on the first `n_train` rows."""
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False

        history = self.data.df.iloc[:self.n_train]
        self.model.fit(X=history.iloc[:, :-1], y=history.iloc[:, -1])

    def next(self):
        """Forecast from the latest row and buy or sell when a threshold is crossed."""
        # Double brackets keep the latest row two-dimensional for `predict`.
        explanatory_today = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]

        if not self.already_bought:
            # Nothing held: only a sufficiently high forecast causes an action.
            if forecast_tomorrow > self.limit_buy:
                self.buy()
                self.already_bought = True
        elif forecast_tomorrow < self.limit_sell:
            # Position held and the forecast dropped below the sell threshold.
            self.sell()
            self.already_bought = False

In [199]:
class WalkForwardAnchored(Regression):
    """Regression strategy whose model is refitted on every row seen before the current one.

    The training slice always starts at the first row, so it grows as the
    backtest advances.
    """

    def next(self):
        """Refit the model when a refit is due, then run `Regression.next`."""
        n_bars = len(self.data)

        # Take no action until at least `n_train` bars are available.
        if n_bars < self.n_train:
            return

        # Refit whenever the bar count is a multiple of `coef_retrain`, using
        # every row except the current one.
        if n_bars % self.coef_retrain == 0:
            past = self.data.df.iloc[:-1]
            self.model.fit(past.iloc[:, :-1], past.iloc[:, -1])

        # The trading decision is the same whether or not a refit happened.
        super().next()

In [200]:
from backtesting import Backtest
# Backtest of the anchored walk-forward strategy over the full DataFrame.
bt = Backtest(
    df,
    WalkForwardAnchored,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)

In [201]:
# Run the backtest; as the cell's last expression, the returned statistics are displayed.
bt.run()

Start                     2016-12-01 00:00:00
End                       2024-11-08 00:00:00
Duration                   2899 days 00:00:00
Exposure Time [%]                    69.86987
Equity Final [$]                 27343.568474
Equity Peak [$]                  30046.927849
Return [%]                         173.435685
Buy & Hold Return [%]              613.750005
Return (Ann.) [%]                   13.526919
Volatility (Ann.) [%]               28.092707
Sharpe Ratio                          0.48151
Sortino Ratio                        0.808984
Calmar Ratio                         0.342173
Max. Drawdown [%]                  -39.532349
Avg. Drawdown [%]                   -3.976396
Max. Drawdown Duration      524 days 00:00:00
Avg. Drawdown Duration       31 days 00:00:00
# Trades                                   35
Win Rate [%]                        57.142857
Best Trade [%]                      86.827056
Worst Trade [%]                    -23.559127
Avg. Trade [%]                    

In [202]:
import multiprocessing as mp

# Best-effort request for the 'fork' start method (presumably for the
# optimisation cells that follow -- confirm). Only the two expected failures
# are ignored, so anything else (including KeyboardInterrupt) still surfaces:
#   RuntimeError -- a start method was already set, e.g. this cell was re-run
#   ValueError   -- 'fork' is not available on this platform
try:
    mp.set_start_method('fork')
except (RuntimeError, ValueError):
    pass

In [None]:
# Search limit_buy in 0..5 and limit_sell in -6..-1 for the highest 'Return [%]',
# using the 'skopt' method with at most 500 tries and a fixed seed.
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy=range(0, 6),
    limit_sell=range(-6, 0),
    maximize='Return [%]',
    method='skopt',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
)

# Flatten the heatmap into a table and list the best parameter pairs first.
dff = heatmap.reset_index().sort_values('Return [%]', ascending=False)
dff

## Unanchored Walk Forward

### Create a library of strategies

`strategies.py`

### Create the unanchored walk forward class

![](<src/10_Table_Validation Methods.png>)

### Import the strategy and perform the backtest

In [139]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [156]:
import strategies

In [165]:
# Display the class to confirm that the imported module exposes it.
strategies.WalkForwardUnanchored

strategies.WalkForwardUnanchored

In [166]:
# Same backtest settings as the anchored run, with the unanchored strategy class.
bt_unanchored = Backtest(
    df,
    strategies.WalkForwardUnanchored,
    cash=10000,
    commission=.002,
    exclusive_orders=True,
)
bt_unanchored.run()

Train start 2016-12-01 00:00:00, train end 2020-02-05 00:00:00
predict for DatetimeIndex(['2020-02-06'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-07'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-05 00:00:00, train end 2020-02-07 00:00:00
predict for DatetimeIndex(['2020-02-10'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-11'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-07 00:00:00, train end 2020-02-11 00:00:00
predict for DatetimeIndex(['2020-02-12'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-13'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-09 00:00:00, train end 2020-02-13 00:00:00
predict for DatetimeIndex(['2020-02-14'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-18'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-13 00:00:00, train end 2020-02-18 00:00:00
predict for DatetimeIndex(['2020-02-19'], dtype='datetime64[n

Start                     2016-12-01 00:00:00
End                       2024-11-08 00:00:00
Duration                   2899 days 00:00:00
Exposure Time [%]                    59.85986
Equity Final [$]                 17108.322813
Equity Peak [$]                  18797.922422
Return [%]                          71.083228
Buy & Hold Return [%]              613.750005
Return (Ann.) [%]                    7.007336
Volatility (Ann.) [%]               25.850868
Sharpe Ratio                         0.271068
Sortino Ratio                         0.42543
Calmar Ratio                         0.220138
Max. Drawdown [%]                  -31.831535
Avg. Drawdown [%]                   -5.396139
Max. Drawdown Duration      415 days 00:00:00
Avg. Drawdown Duration       41 days 00:00:00
# Trades                                   23
Win Rate [%]                        47.826087
Best Trade [%]                      52.797366
Worst Trade [%]                    -13.077067
Avg. Trade [%]                    

In [167]:


# Search limit_buy in 0..5 and limit_sell in -6..-1 for the highest 'Return [%]',
# using the 'skopt' method with at most 500 tries and a fixed seed.
stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(
    limit_buy=range(0, 6),
    limit_sell=range(-6, 0),
    maximize='Return [%]',
    method='skopt',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
)

# Flatten the heatmap into a table and list the best parameter pairs first.
dff = heatmap.reset_index().sort_values('Return [%]', ascending=False)
dff

Train start 2016-12-01 00:00:00, train end 2020-02-05 00:00:00
predict for DatetimeIndex(['2020-02-06'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-07'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-05 00:00:00, train end 2020-02-07 00:00:00
predict for DatetimeIndex(['2020-02-10'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-11'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-07 00:00:00, train end 2020-02-11 00:00:00
predict for DatetimeIndex(['2020-02-12'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-13'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-09 00:00:00, train end 2020-02-13 00:00:00
predict for DatetimeIndex(['2020-02-14'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-18'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-13 00:00:00, train end 2020-02-18 00:00:00
predict for DatetimeIndex(['2020-02-19'], dtype='datetime64[n



Train start 2016-12-01 00:00:00, train end 2020-02-05 00:00:00
predict for DatetimeIndex(['2020-02-06'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-07'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-05 00:00:00, train end 2020-02-07 00:00:00
predict for DatetimeIndex(['2020-02-10'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-11'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-07 00:00:00, train end 2020-02-11 00:00:00
predict for DatetimeIndex(['2020-02-12'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-13'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-09 00:00:00, train end 2020-02-13 00:00:00
predict for DatetimeIndex(['2020-02-14'], dtype='datetime64[ns]', freq=None) 
predict for DatetimeIndex(['2020-02-18'], dtype='datetime64[ns]', freq=None) 
Train start 2016-12-13 00:00:00, train end 2020-02-18 00:00:00
predict for DatetimeIndex(['2020-02-19'], dtype='datetime64[n

Unnamed: 0,limit_buy,limit_sell,Return [%]
4,1,-4,123.025389
7,2,-6,110.347857
0,0,-6,87.912057
1,0,-3,83.533842
6,1,-1,81.966584
2,0,-2,79.118351
9,2,-4,75.40419
8,2,-5,75.310698
3,1,-5,71.083228
5,1,-3,70.550506


### Interpret the strategies' performance

- Both anchored and unanchored backtesting

In [168]:
# Plot both backtests, passing an .html target path for each report.
# NOTE(review): presumably the `reports_backtesting/` directory must already exist -- confirm.
bt.plot(filename="reports_backtesting/10A_anchored.html")
bt_unanchored.plot(filename="reports_backtesting/10A_UNanchored.html")






## Practice to master the knowledge

Work on the challenge with another dataset:

1. Learn the <a>mental models</a> to solve the challenge faster.
2. Complete the <a href="10C_Walk Forward Regression.ipynb">notebook</a>.