# Challenge: Walk Forward on Other Datasets

## Download data from `yfinance`

In [13]:
import yfinance as yf

ticker = 'AAPL'
df = yf.download(ticker)
df = df.droplevel('Ticker', axis=1)
df

[*********************100%***********************]  1 of 1 completed


Price,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-12-12 00:00:00+00:00,0.098834,0.128348,0.128906,0.128348,0.128348,469033600
1980-12-15 00:00:00+00:00,0.093678,0.121652,0.122210,0.121652,0.122210,175884800
1980-12-16 00:00:00+00:00,0.086802,0.112723,0.113281,0.112723,0.113281,105728000
1980-12-17 00:00:00+00:00,0.088951,0.115513,0.116071,0.115513,0.115513,86441600
1980-12-18 00:00:00+00:00,0.091530,0.118862,0.119420,0.118862,0.118862,73449600
...,...,...,...,...,...,...
2024-11-22 00:00:00+00:00,229.869995,229.869995,230.720001,228.059998,228.059998,38168300
2024-11-25 00:00:00+00:00,232.869995,232.869995,233.250000,229.740005,231.460007,90152800
2024-11-26 00:00:00+00:00,235.059998,235.059998,235.570007,233.330002,233.330002,45986200
2024-11-27 00:00:00+00:00,234.929993,234.929993,235.690002,233.809998,234.470001,33498400


## Preprocess the data

### Filter the date range

In [14]:
df = df.loc['2018-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [15]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [16]:
df = df.dropna().copy()
df

Price,Adj Close,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-02 00:00:00+00:00,40.524345,43.064999,43.075001,42.314999,42.540001,102223600,-0.017427
2018-01-03 00:00:00+00:00,40.517284,43.057499,43.637501,42.990002,43.132500,118071600,0.462350
2018-01-04 00:00:00+00:00,40.705486,43.257500,43.367500,43.020000,43.134998,89738400,1.125740
2018-01-05 00:00:00+00:00,41.168941,43.750000,43.842499,43.262501,43.360001,94640000,-0.372848
2018-01-08 00:00:00+00:00,41.016014,43.587502,43.902500,43.482498,43.587502,82271200,-0.011478
...,...,...,...,...,...,...,...
2024-11-21 00:00:00+00:00,228.520004,228.520004,230.160004,225.710007,228.880005,42108300,0.587284
2024-11-22 00:00:00+00:00,229.869995,229.869995,230.720001,228.059998,228.059998,38168300,1.288272
2024-11-25 00:00:00+00:00,232.869995,232.869995,233.250000,229.740005,231.460007,90152800,0.931678
2024-11-26 00:00:00+00:00,235.059998,235.059998,235.570007,233.330002,233.330002,45986200,-0.055338


## Machine Learning modelling

### Separate the data

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [17]:
y = df.change_tomorrow
X = df[['Open','High','Low','Close','Volume']]

### Time Series Split

In [18]:
from sklearn.model_selection import TimeSeriesSplit

ts = TimeSeriesSplit(test_size=200)

### Compute and evaluate model in a for loop

1. Separate the data in train and test
2. Compute the model on the train set
3. Evaluate the model (mse) on the test set
4. Append the errors (mse) in an empty list

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model_dt = RandomForestRegressor(max_depth=15, random_state=42)

error_mse_list = []

for index_train, index_test in ts.split(df):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_test, y_test = X.iloc[index_test], y.iloc[index_test]
    
    model_dt.fit(X_train, y_train)
    
    y_pred = model_dt.predict(X_test)
    error_mse = mean_squared_error(y_test, y_pred)
    
    error_mse_list.append(error_mse)

In [20]:
error_mse_list

[11.19479212345935,
 4.721375921393135,
 4.171941283365242,
 1.7910142888472342,
 2.3740338363782114]

## Anchored Walk Forward evaluation in backtesting

![](<src/10_Table_Validation Methods.png>)

### Create a new strategy

In [21]:
from backtesting import Backtest, Strategy

In [22]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    n_train = 600
    coef_retrain = 200
    
    def init(self):
        self.model = RandomForestRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        X_train = self.data.df.iloc[:self.n_train, :-1]
        y_train = self.data.df.iloc[:self.n_train, -1]
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :-1]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

In [25]:
class WalkForwardAnchored(Regression):
    def next(self):
        
        # we don't take any action and move on to the following day
        if len(self.data) < self.n_train:
            return
        
        # we retrain the model each 200 days
        if len(self.data) % self.coef_retrain == 0:
            X_train = self.data.df.iloc[:-1, :-1]
            y_train = self.data.df.iloc[:-1, -1]

            self.model.fit(X_train, y_train)

            super().next()
            
        else:
            
            super().next()

### Run the backtest with optimization

In [26]:
from backtesting import Backtest
bt = Backtest(df, WalkForwardAnchored, cash=10000, commission=.002, exclusive_orders=True)

In [29]:
import multiprocessing as mp
try:
  mp.set_start_method('fork')
except:
  pass

In [28]:
stats_skopt, heatmap, optimize_result = bt.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff



Unnamed: 0,limit_buy,limit_sell,Return [%]
0,0,-6,193.307088
1,0,-3,186.678923
3,1,-5,113.858505
4,1,-4,113.858505
5,1,-3,113.858505
7,2,-6,113.858505
8,2,-5,113.858505
9,2,-4,113.858505
10,2,-3,113.858505
15,3,-2,0.0


## Unanchored Walk Forward

### Create a library of strategies

[strategies.py](strategies.py)

### Create the unanchored walk forward class

In the previously created library

![](<src/10_Table_Validation Methods.png>)

### Import the strategy and perform the backtest with optimization

In [30]:
%load_ext autoreload
%autoreload 2

In [31]:
import strategies

START


In [32]:
strategies.WalkForwardUnanchored

strategies.WalkForwardUnanchored

In [33]:
bt_unanchored = Backtest(df, strategies.WalkForwardUnanchored, cash=10000, commission=.002, exclusive_orders=True)

stats_skopt, heatmap, optimize_result = bt_unanchored.optimize(
    limit_buy = range(0, 6), limit_sell = range(-6, 0),
    maximize='Return [%]',
    max_tries=500,
    random_state=42,
    return_heatmap=True,
    return_optimization=True,
    method='skopt'
    )

dff = heatmap.reset_index()
dff = dff.sort_values('Return [%]', ascending=False)
dff

Train start 2018-01-02 00:00:00+00:00, train end 2020-05-19 00:00:00+00:00
predict for DatetimeIndex(['2020-05-20 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-03 00:00:00+00:00, train end 2020-05-20 00:00:00+00:00
predict for DatetimeIndex(['2020-05-21 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-04 00:00:00+00:00, train end 2020-05-21 00:00:00+00:00
predict for DatetimeIndex(['2020-05-22 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-05 00:00:00+00:00, train end 2020-05-22 00:00:00+00:00
predict for DatetimeIndex(['2020-05-26 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-08 00:00:00+00:00, train end 2020-05-26 00:00:00+00:00
predict for DatetimeIndex(['2020-05-27 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-09 00:00:00+00:00, train end 2020-05-27 00:00:00+0



Train start 2018-01-02 00:00:00+00:00, train end 2020-05-19 00:00:00+00:00
predict for DatetimeIndex(['2020-05-20 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-03 00:00:00+00:00, train end 2020-05-20 00:00:00+00:00
predict for DatetimeIndex(['2020-05-21 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-04 00:00:00+00:00, train end 2020-05-21 00:00:00+00:00
predict for DatetimeIndex(['2020-05-22 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-05 00:00:00+00:00, train end 2020-05-22 00:00:00+00:00
predict for DatetimeIndex(['2020-05-26 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-08 00:00:00+00:00, train end 2020-05-26 00:00:00+00:00
predict for DatetimeIndex(['2020-05-27 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-09 00:00:00+00:00, train end 2020-05-27 00:00:00+0



Train start 2018-01-02 00:00:00+00:00, train end 2020-05-19 00:00:00+00:00
predict for DatetimeIndex(['2020-05-20 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-03 00:00:00+00:00, train end 2020-05-20 00:00:00+00:00
predict for DatetimeIndex(['2020-05-21 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-04 00:00:00+00:00, train end 2020-05-21 00:00:00+00:00
predict for DatetimeIndex(['2020-05-22 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-05 00:00:00+00:00, train end 2020-05-22 00:00:00+00:00
predict for DatetimeIndex(['2020-05-26 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-08 00:00:00+00:00, train end 2020-05-26 00:00:00+00:00
predict for DatetimeIndex(['2020-05-27 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-09 00:00:00+00:00, train end 2020-05-27 00:00:00+0



predict for DatetimeIndex(['2020-05-22 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-05 00:00:00+00:00, train end 2020-05-22 00:00:00+00:00
predict for DatetimeIndex(['2020-05-26 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-08 00:00:00+00:00, train end 2020-05-26 00:00:00+00:00
predict for DatetimeIndex(['2020-05-27 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-09 00:00:00+00:00, train end 2020-05-27 00:00:00+00:00
predict for DatetimeIndex(['2020-05-28 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-10 00:00:00+00:00, train end 2020-05-28 00:00:00+00:00
predict for DatetimeIndex(['2020-05-29 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-11 00:00:00+00:00, train end 2020-05-29 00:00:00+00:00
predict for DatetimeIndex(['2020-06-01 00:00:00+00:00'], dtype='dateti



predict for DatetimeIndex(['2020-05-20 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-03 00:00:00+00:00, train end 2020-05-20 00:00:00+00:00
predict for DatetimeIndex(['2020-05-21 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-04 00:00:00+00:00, train end 2020-05-21 00:00:00+00:00
predict for DatetimeIndex(['2020-05-22 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-05 00:00:00+00:00, train end 2020-05-22 00:00:00+00:00
predict for DatetimeIndex(['2020-05-26 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-08 00:00:00+00:00, train end 2020-05-26 00:00:00+00:00
predict for DatetimeIndex(['2020-05-27 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-09 00:00:00+00:00, train end 2020-05-27 00:00:00+00:00
predict for DatetimeIndex(['2020-05-28 00:00:00+00:00'], dtype='dateti



Train start 2018-01-02 00:00:00+00:00, train end 2020-05-19 00:00:00+00:00
predict for DatetimeIndex(['2020-05-20 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-03 00:00:00+00:00, train end 2020-05-20 00:00:00+00:00
predict for DatetimeIndex(['2020-05-21 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-04 00:00:00+00:00, train end 2020-05-21 00:00:00+00:00
predict for DatetimeIndex(['2020-05-22 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-05 00:00:00+00:00, train end 2020-05-22 00:00:00+00:00
predict for DatetimeIndex(['2020-05-26 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-08 00:00:00+00:00, train end 2020-05-26 00:00:00+00:00
predict for DatetimeIndex(['2020-05-27 00:00:00+00:00'], dtype='datetime64[ns, UTC]', name='Date', freq=None) 
Train start 2018-01-09 00:00:00+00:00, train end 2020-05-27 00:00:00+0

Unnamed: 0,limit_buy,limit_sell,Return [%]
10,2,-6,217.466594
0,0,-6,215.089072
1,0,-5,199.159512
5,1,-5,179.516707
26,5,-6,168.407417
11,2,-5,143.376174
15,3,-6,133.33487
16,3,-5,122.144604
21,4,-6,101.735567
22,4,-5,87.30462


### Interpret the strategies' performance

In [34]:
bt.plot(filename='reports_backtesting/walk_forward_anchored.html')




In [35]:
bt_unanchored.plot(filename='reports_backtesting/walk_forward_unanchored.html')




## Course Conclusion

Watch video → [Next steps]()