# Challenge: Backtest on Other Datasets

## Download data from `yfinance`

In [93]:
import yfinance as yf
import pandas as pd
import numpy as np

In [94]:
df = yf.download(tickers='NFLX')
df.index = pd.to_datetime(df.index)
df = df.droplevel('Ticker', axis=1)

[*********************100%***********************]  1 of 1 completed


## Preprocess the data

### Filter the date range

- Since 1 year ago at least

In [96]:
df = df.loc["2010-01-01":].copy()
df

Price,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04 00:00:00+00:00,7.640000,7.640000,7.961429,7.565714,7.931429,17239600
2010-01-05 00:00:00+00:00,7.358571,7.358571,7.657143,7.258571,7.652857,23753100
2010-01-06 00:00:00+00:00,7.617143,7.617143,7.672857,7.197143,7.361429,23290400
2010-01-07 00:00:00+00:00,7.485714,7.485714,7.757143,7.462857,7.731429,9955400
2010-01-08 00:00:00+00:00,7.614286,7.614286,7.742857,7.465714,7.498571,8180900
...,...,...,...,...,...,...
2024-11-08 00:00:00+00:00,795.039978,795.039978,799.059998,788.650024,797.359985,1910600
2024-11-11 00:00:00+00:00,805.440002,805.440002,806.820007,795.570007,795.900024,2399200
2024-11-12 00:00:00+00:00,819.500000,819.500000,820.369995,803.750000,807.500000,2814100
2024-11-13 00:00:00+00:00,830.469971,830.469971,836.989990,820.539978,822.619995,2763700


### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [97]:
df["change_tomorrow"] = df["Adj Close"].pct_change(-1)*-100
df

Price,Adj Close,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 00:00:00+00:00,7.640000,7.640000,7.961429,7.565714,7.931429,17239600,-3.824504
2010-01-05 00:00:00+00:00,7.358571,7.358571,7.657143,7.258571,7.652857,23753100,3.394607
2010-01-06 00:00:00+00:00,7.617143,7.617143,7.672857,7.197143,7.361429,23290400,-1.755734
2010-01-07 00:00:00+00:00,7.485714,7.485714,7.757143,7.462857,7.731429,9955400,1.688563
2010-01-08 00:00:00+00:00,7.614286,7.614286,7.742857,7.465714,7.498571,8180900,-0.131502
...,...,...,...,...,...,...,...
2024-11-08 00:00:00+00:00,795.039978,795.039978,799.059998,788.650024,797.359985,1910600,1.291223
2024-11-11 00:00:00+00:00,805.440002,805.440002,806.820007,795.570007,795.900024,2399200,1.715680
2024-11-12 00:00:00+00:00,819.500000,819.500000,820.369995,803.750000,807.500000,2814100,1.320935
2024-11-13 00:00:00+00:00,830.469971,830.469971,836.989990,820.539978,822.619995,2763700,0.680492


#### Drop rows with any missing data

In [99]:
df = df.dropna()
df

Price,Adj Close,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 00:00:00+00:00,7.640000,7.640000,7.961429,7.565714,7.931429,17239600,-3.824504
2010-01-05 00:00:00+00:00,7.358571,7.358571,7.657143,7.258571,7.652857,23753100,3.394607
2010-01-06 00:00:00+00:00,7.617143,7.617143,7.672857,7.197143,7.361429,23290400,-1.755734
2010-01-07 00:00:00+00:00,7.485714,7.485714,7.757143,7.462857,7.731429,9955400,1.688563
2010-01-08 00:00:00+00:00,7.614286,7.614286,7.742857,7.465714,7.498571,8180900,-0.131502
...,...,...,...,...,...,...,...
2024-11-07 00:00:00+00:00,796.539978,796.539978,798.219971,779.940002,781.369995,3295100,-0.188670
2024-11-08 00:00:00+00:00,795.039978,795.039978,799.059998,788.650024,797.359985,1910600,1.291223
2024-11-11 00:00:00+00:00,805.440002,805.440002,806.820007,795.570007,795.900024,2399200,1.715680
2024-11-12 00:00:00+00:00,819.500000,819.500000,820.369995,803.750000,807.500000,2814100,1.320935


#### Change sign

Did the stock go up or down?

In [100]:
df.change_tomorrow = np.where(df.change_tomorrow>0,1,-1)
df

Price,Adj Close,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 00:00:00+00:00,7.640000,7.640000,7.961429,7.565714,7.931429,17239600,-1
2010-01-05 00:00:00+00:00,7.358571,7.358571,7.657143,7.258571,7.652857,23753100,1
2010-01-06 00:00:00+00:00,7.617143,7.617143,7.672857,7.197143,7.361429,23290400,-1
2010-01-07 00:00:00+00:00,7.485714,7.485714,7.757143,7.462857,7.731429,9955400,1
2010-01-08 00:00:00+00:00,7.614286,7.614286,7.742857,7.465714,7.498571,8180900,-1
...,...,...,...,...,...,...,...
2024-11-07 00:00:00+00:00,796.539978,796.539978,798.219971,779.940002,781.369995,3295100,-1
2024-11-08 00:00:00+00:00,795.039978,795.039978,799.059998,788.650024,797.359985,1910600,1
2024-11-11 00:00:00+00:00,805.440002,805.440002,806.820007,795.570007,795.900024,2399200,1
2024-11-12 00:00:00+00:00,819.500000,819.500000,820.369995,803.750000,807.500000,2814100,1


## Compute Machine Learning model

Proposal: Random Forest within `ensemble` module of `sklearn` library

In [101]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=7, random_state=42)


In [102]:
y = df.change_tomorrow
X = df.drop(columns="change_tomorrow")

In [103]:
model.fit(X=X, y=y)
model.score(X,y)

0.7095136290753608

## Backtesting

### Create the Strategy

In [104]:
from backtesting import Strategy, Backtest


In [105]:
class ClassificationUP(Strategy):
  def init(self):
    self.model = model
    self.already_bought = False

  def next(self):
    today = self.data.df.iloc[[-1],:]
    forecast = self.model.predict(today)[0]

    if forecast == 1 and self.already_bought == False:
      self.buy()
      self.already_bought = True
    elif forecast == -1 and self.already_bought == True:
      self.sell()
      self.already_bought = False
    else:
      pass


### Run the Backtest

In [106]:
bt = Backtest( data=X, strategy=ClassificationUP, cash=10000,   commission=.002, exclusive_orders=True)

### Show the report in a DataFrame

In [107]:
results = bt.run()

In [108]:
results.to_frame(name="Values").loc[:'Return [%]']

Unnamed: 0,Values
Start,2010-01-04 00:00:00+00:00
End,2024-11-13 00:00:00+00:00
Duration,5427 days 00:00:00
Exposure Time [%],99.946553
Equity Final [$],118895535100481.109375
Equity Peak [$],118895535100481.109375
Return [%],1188955350904.811035


## Plot the backtest report

> Don't worry about this new tool just yet, we will explain in a future chapter how to interpret the following chart.

In [16]:
import os
try:
  os.mkdir("my-reports")
except:
  pass

In [17]:

bt.plot(filename="./my-reports/plot-me.html")




## How to invest based on the numerical increase?

> Instead of the direction (UP or DOWN)

Next chapter → [Backtesting with Regression Models]()

Classification Model | Regression Model
-|-
![](src/pred_classification.png) | ![](src/pred_regression.png)

Classification Strategy | Regression Strategy
-|-
![](src/res_classification.png) | ![](src/res_regression.png)

In [109]:
df["change_tomorrow"] = df["Adj Close"].pct_change(-1)*-100
df=df.dropna()
df

Price,Adj Close,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 00:00:00+00:00,7.640000,7.640000,7.961429,7.565714,7.931429,17239600,-3.824504
2010-01-05 00:00:00+00:00,7.358571,7.358571,7.657143,7.258571,7.652857,23753100,3.394607
2010-01-06 00:00:00+00:00,7.617143,7.617143,7.672857,7.197143,7.361429,23290400,-1.755734
2010-01-07 00:00:00+00:00,7.485714,7.485714,7.757143,7.462857,7.731429,9955400,1.688563
2010-01-08 00:00:00+00:00,7.614286,7.614286,7.742857,7.465714,7.498571,8180900,-0.131502
...,...,...,...,...,...,...,...
2024-11-06 00:00:00+00:00,780.210022,780.210022,781.489990,757.380005,771.500000,3690800,2.050111
2024-11-07 00:00:00+00:00,796.539978,796.539978,798.219971,779.940002,781.369995,3295100,-0.188670
2024-11-08 00:00:00+00:00,795.039978,795.039978,799.059998,788.650024,797.359985,1910600,1.291223
2024-11-11 00:00:00+00:00,805.440002,805.440002,806.820007,795.570007,795.900024,2399200,1.715680


In [21]:
df

Price,Adj Close,Close,High,Low,Open,Volume,change_tomorrow
2020-01-02 00:00:00+00:00,329.809998,329.809998,329.980011,324.779999,326.100006,4485800,-1.199756
2020-01-03 00:00:00+00:00,325.899994,325.899994,329.859985,325.529999,326.779999,3806900,2.956851
2020-01-06 00:00:00+00:00,335.829987,335.829987,336.359985,321.200012,323.119995,5663100,-1.535899
2020-01-07 00:00:00+00:00,330.750000,330.750000,336.700012,330.299988,336.470001,4703200,2.508403
2020-01-08 00:00:00+00:00,339.260010,339.260010,342.700012,331.049988,331.489990,7104500,-1.072516
...,...,...,...,...,...,...,...
2024-11-06 00:00:00+00:00,780.210022,780.210022,781.489990,757.380005,771.500000,3690800,2.050111
2024-11-07 00:00:00+00:00,796.539978,796.539978,798.219971,779.940002,781.369995,3295100,-0.188670
2024-11-08 00:00:00+00:00,795.039978,795.039978,799.059998,788.650024,797.359985,1910600,1.291223
2024-11-11 00:00:00+00:00,805.440002,805.440002,806.820007,795.570007,795.900024,2399200,1.715680


In [42]:
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()

In [116]:
X = df.drop(columns="change_tomorrow")
y = df.change_tomorrow


In [152]:
cut_off = "2022-12-31"
X_train = X.loc[:cut_off].copy()
X_test = X.loc[cut_off:].copy()
y_train = y.loc[:cut_off].copy()
y_test = y.loc[cut_off:].copy()
X_train


Price,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04 00:00:00+00:00,7.640000,7.640000,7.961429,7.565714,7.931429,17239600
2010-01-05 00:00:00+00:00,7.358571,7.358571,7.657143,7.258571,7.652857,23753100
2010-01-06 00:00:00+00:00,7.617143,7.617143,7.672857,7.197143,7.361429,23290400
2010-01-07 00:00:00+00:00,7.485714,7.485714,7.757143,7.462857,7.731429,9955400
2010-01-08 00:00:00+00:00,7.614286,7.614286,7.742857,7.465714,7.498571,8180900
...,...,...,...,...,...,...
2022-12-23 00:00:00+00:00,294.959991,294.959991,298.459991,291.910004,296.179993,4251100
2022-12-27 00:00:00+00:00,284.170013,284.170013,293.570007,282.130005,293.190002,5778100
2022-12-28 00:00:00+00:00,276.880005,276.880005,285.190002,273.410004,281.920013,5964400
2022-12-29 00:00:00+00:00,291.119995,291.119995,295.500000,281.010010,283.179993,9588500


In [153]:
model_lr.fit(X_train,y_train)

In [154]:
predict_lr_train  = model_lr.predict(X_train)
predict_lr_test  = model_lr.predict(X_test)

In [155]:
predict_lr_test-y_test

Date
2023-01-03 00:00:00+00:00   -4.900728
2023-01-04 00:00:00+00:00   -0.436254
2023-01-05 00:00:00+00:00   -1.747569
2023-01-06 00:00:00+00:00   -0.378153
2023-01-09 00:00:00+00:00   -3.500693
                               ...   
2024-11-06 00:00:00+00:00   -3.414942
2024-11-07 00:00:00+00:00   -0.443943
2024-11-08 00:00:00+00:00   -1.737540
2024-11-11 00:00:00+00:00   -2.080149
2024-11-12 00:00:00+00:00   -2.061801
Name: change_tomorrow, Length: 469, dtype: float64

In [156]:
from sklearn.svm import SVR
model_svr = SVR()
model_svr.fit(X_train,y_train)
predict_svr_train = model_svr.predict(X_train)
predict_svr_test = model_svr.predict(X_test)

In [157]:
from sklearn.tree import DecisionTreeRegressor
model_dtr = DecisionTreeRegressor(max_depth=15)
model_dtr.fit(X_train,y_train)
predict_dtr_train = model_dtr.predict(X_train)
predict_dtr_test = model_dtr.predict(X_test)

In [158]:
from sklearn.ensemble import RandomForestRegressor
model_rfr = RandomForestRegressor(max_depth=15)
model_rfr.fit(X_train,y_train)
predict_rfr_train = model_rfr.predict(X_train)
predict_rfr_test = model_rfr.predict(X_test)

In [159]:
from sklearn.ensemble import GradientBoostingRegressor
model_gbr = GradientBoostingRegressor(max_depth=15)
model_gbr.fit(X_train,y_train)
predict_gbr_train = model_gbr.predict(X_train)
predict_gbr_test = model_gbr.predict(X_test)

In [160]:
from sklearn.metrics import mean_squared_error

mse_lr_train = mean_squared_error(y_train,predict_lr_train)
mse_svr_train = mean_squared_error(y_train,predict_svr_train)
mse_dtr_train = mean_squared_error(y_train,predict_dtr_train)
mse_rfr_train = mean_squared_error(y_train,predict_rfr_train)
mse_gbr_train = mean_squared_error(y_train,predict_gbr_train)

mse_lr_test = mean_squared_error(y_test,predict_lr_test)
mse_svr_test = mean_squared_error(y_test,predict_svr_test)
mse_dtr_test = mean_squared_error(y_test,predict_dtr_test)
mse_rfr_test = mean_squared_error(y_test,predict_rfr_test)
mse_gbr_test = mean_squared_error(y_test,predict_gbr_test)

In [173]:
print(f" MSEs Train \n \
      MSE LR: {mse_lr_train}\n \
      MSE SVR: {mse_svr_train}\n \
      MSE DTR: {mse_dtr_train}\n \
      MSE RFR: {mse_rfr_train}\n \
      MSE GBR: {mse_gbr_train}\n \
      ")
print(f" MSEs Test \n \
      MSE LR: {mse_lr_test}\n \
      MSE SVR: {mse_svr_test}\n \
      MSE DTR: {mse_dtr_test}\n \
      MSE RFR: {mse_rfr_test}\n \
      MSE GBR: {mse_gbr_test}\n \
      ")

 MSEs Train 
       MSE LR: 11.349741353239015
       MSE SVR: 11.427990619011311
       MSE DTR: 5.736519452694331
       MSE RFR: 5.371789699093109
       MSE GBR: 0.46847286904556545
       
 MSEs Test 
       MSE LR: 4.632450952829542
       MSE SVR: 4.438294840344792
       MSE DTR: 7.377020580595589
       MSE RFR: 5.22183358566174
       MSE GBR: 6.3993999205158225
       


In [186]:
temp = pd.DataFrame()
temp["change"]=y_test
temp["lr"]=predict_lr_test
temp["svr"]=predict_svr_test
temp["dtr"]=predict_dtr_test
temp["rfr"]=predict_rfr_test
temp["gbr"]=predict_gbr_test
temp

Unnamed: 0_level_0,change,lr,svr,dtr,rfr,gbr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-03 00:00:00+00:00,4.673408,-0.227320,0.049957,0.176642,-0.003184,1.547065
2023-01-04 00:00:00+00:00,0.093642,-0.342612,0.056903,0.176642,0.337229,1.578241
2023-01-05 00:00:00+00:00,1.853898,0.106329,0.053282,0.176642,-0.198177,-0.554157
2023-01-06 00:00:00+00:00,-0.120562,-0.498715,0.055418,0.176642,-0.265588,-0.763590
2023-01-09 00:00:00+00:00,3.776636,0.275944,0.049960,0.176642,-0.172642,-0.105112
...,...,...,...,...,...,...
2024-11-06 00:00:00+00:00,2.050111,-1.364831,0.054255,-1.417844,-1.053731,-1.264162
2024-11-07 00:00:00+00:00,-0.188670,-0.632613,0.056028,-1.417844,-1.117850,-1.591159
2024-11-08 00:00:00+00:00,1.291223,-0.446317,0.064578,-1.417844,-1.237360,-2.053439
2024-11-11 00:00:00+00:00,1.715680,-0.364469,0.061142,-1.417844,-1.355018,-2.077743


In [232]:
class RegressionStrategy(Strategy):
  def init(self):
    self.model = model_svr
    self.already_bought = False
    self.min_increase = 0
    self.min_decrease = 0

  def next(self):
    today = self.data.df.iloc[[-1],:]
    forecast = self.model.predict(today)[0]

    if forecast > self.min_increase and self.already_bought == False:
      self.buy()
      self.already_bought = True
    elif forecast < self.min_decrease and self.already_bought == True:
      self.sell()
      self.already_bought = False
    else:
      pass

In [239]:
bt = Backtest( data=X_test, strategy=RegressionStrategy, cash=10000,   commission=.005, exclusive_orders=True)

In [240]:
results = bt.run()

In [241]:
results

Start                     2023-01-03 00:00...
End                       2024-11-12 00:00...
Duration                    679 days 00:00:00
Exposure Time [%]                   99.573561
Equity Final [$]                     25966.88
Equity Peak [$]                      25966.88
Return [%]                           159.6688
Buy & Hold Return [%]              177.843691
Return (Ann.) [%]                   66.983404
Volatility (Ann.) [%]               58.069289
Sharpe Ratio                         1.153508
Sortino Ratio                         3.42712
Calmar Ratio                         2.454809
Max. Drawdown [%]                  -27.286602
Avg. Drawdown [%]                   -4.504358
Max. Drawdown Duration      126 days 00:00:00
Avg. Drawdown Duration       19 days 00:00:00
# Trades                                    1
Win Rate [%]                            100.0
Best Trade [%]                     161.720712
Worst Trade [%]                    161.720712
Avg. Trade [%]                    

In [242]:
bt.plot(filename="./my-reports/nflx.html")



