# Challenge: Overfitting on Other Datasets

## Download data from `yfinance`

In [17]:
import yfinance as yf

ticker = 'META'
df = yf.download(ticker)
df = df.droplevel('Ticker', axis=1)
df

[*********************100%***********************]  1 of 1 completed


Price,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-05-18,38.115242,38.230000,45.000000,38.000000,42.049999,573576400
2012-05-21,33.927845,34.029999,36.660000,33.000000,36.529999,168192700
2012-05-22,30.906942,31.000000,33.590000,30.940001,32.610001,101786600
2012-05-23,31.903942,32.000000,32.500000,31.360001,31.370001,73600000
2012-05-24,32.930851,33.029999,33.209999,31.770000,32.950001,50237200
...,...,...,...,...,...,...
2024-11-20,565.520020,565.520020,566.349976,554.200012,562.929993,9797300
2024-11-21,563.090027,563.090027,570.000000,549.049988,569.520020,11154700
2024-11-22,559.140015,559.140015,563.830017,554.590027,563.549988,9164000
2024-11-25,565.109985,565.109985,572.590027,556.390015,562.099976,13599800


## Preprocess the data

### Filter the date range

- Since 1 year ago at least

In [18]:
df = df.loc['2020-01-01':].copy()

### Create the target variable

#### Percentage change

- Percentage change on `Adj Close` for tomorrow

In [19]:
df['change_tomorrow'] = df['Adj Close'].pct_change(-1)
df.change_tomorrow = df.change_tomorrow * -1
df.change_tomorrow = df.change_tomorrow * 100

#### Remove rows with any missing data

In [20]:
df = df.dropna().copy()
df

Price,Adj Close,Close,High,Low,Open,Volume,change_tomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02,209.150269,209.779999,209.789993,206.270004,206.750000,12077100,-0.531936
2020-01-03,208.043610,208.669998,210.399994,206.949997,207.210007,11188400,1.848539
2020-01-06,211.961807,212.600006,212.779999,206.520004,206.699997,17058900,0.215894
2020-01-07,212.420410,213.059998,214.580002,211.750000,212.820007,14912400,1.003633
2020-01-08,214.573944,215.220001,216.240005,212.610001,213.000000,13475000,1.410910
...,...,...,...,...,...,...,...
2024-11-19,561.090027,561.090027,561.429993,550.599976,551.859985,9522400,0.783349
2024-11-20,565.520020,565.520020,566.349976,554.200012,562.929993,9797300,-0.431546
2024-11-21,563.090027,563.090027,570.000000,549.049988,569.520020,11154700,-0.706444
2024-11-22,559.140015,559.140015,563.830017,554.590027,563.549988,9164000,1.056426


## Machine Learning modelling

### Feature selection

1. Target: which variable do you want to predict?
2. Explanatory: which variables will you use to calculate the prediction?

In [21]:
y = df.change_tomorrow
X = df.drop(columns='change_tomorrow')

### Train test split

In [22]:
n_days = len(df.index)
n_days

1234

In [23]:
n_days_split = int(n_days*0.7)
n_days_split

863

In [24]:
X_train, y_train = X.iloc[:n_days_split], y.iloc[:n_days_split]
X_test, y_test = X.iloc[n_days_split:], y.iloc[n_days_split:]

### Fit the model on train set

In [25]:
from sklearn.tree import DecisionTreeRegressor

In [26]:
model_dt_split = DecisionTreeRegressor(max_depth=15, random_state=42)

In [27]:
model_dt_split.fit(X=X_train, y=y_train)

### Evaluate model

#### On test set

In [28]:
from sklearn.metrics import mean_squared_error

y_pred_test = model_dt_split.predict(X=X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred_test)

10.129031765825912

#### On train set

In [29]:
y_pred_train = model_dt_split.predict(X=X_train)
mean_squared_error(y_true=y_train, y_pred=y_pred_train)

2.7810105264719

## Backtesting

In [30]:
from backtesting import Backtest, Strategy

### Create the `Strategy`

In [31]:
class Regression(Strategy):
    limit_buy = 1
    limit_sell = -5
    
    def init(self):
        self.model = DecisionTreeRegressor(max_depth=15, random_state=42)
        self.already_bought = False
        
        self.model.fit(X=X_train, y=y_train)

    def next(self):
        explanatory_today = self.data.df.iloc[[-1], :]
        forecast_tomorrow = self.model.predict(explanatory_today)[0]
        
        if forecast_tomorrow > self.limit_buy and self.already_bought == False:
            self.buy()
            self.already_bought = True
        elif forecast_tomorrow < self.limit_sell and self.already_bought == True:
            self.sell()
            self.already_bought = False
        else:
            pass

### Run the backtest on `test` data

In [55]:
bt_test = Backtest(X_test, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

In [56]:
results = bt_test.run(limit_buy=4, limit_sell=-0)

df_results_test = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'Out of Sample (Test)'}, axis=1)
df_results_test

Unnamed: 0,Out of Sample (Test)
Start,2023-06-07 00:00:00
End,2024-11-25 00:00:00
Duration,537 days 00:00:00
Exposure Time [%],81.940701
Equity Final [$],0.0
Equity Peak [$],11765.654802
Return [%],-100.0


### Run the backtest on `train` data

In [57]:
bt_train = Backtest(X_train, Regression,
              cash=10000, commission=.002, exclusive_orders=True)

results = bt_train.run(limit_buy=5, limit_sell=-0)

df_results_train = results.to_frame(name='Values').loc[:'Return [%]']\
    .rename({'Values':'In Sample (Train)'}, axis=1)
df_results_train

Unnamed: 0,In Sample (Train)
Start,2020-01-02 00:00:00
End,2023-06-06 00:00:00
Duration,1251 days 00:00:00
Exposure Time [%],94.322132
Equity Final [$],13472.542663
Equity Peak [$],37529.1817
Return [%],34.725427


### Compare both backtests

#### In the same DataFrame

- HINT: Concatenate the previous `DataFrames`

In [58]:
import pandas as pd

In [59]:
df_results = pd.concat([df_results_train, df_results_test], axis=1)
df_results

Unnamed: 0,In Sample (Train),Out of Sample (Test)
Start,2020-01-02 00:00:00,2023-06-07 00:00:00
End,2023-06-06 00:00:00,2024-11-25 00:00:00
Duration,1251 days 00:00:00,537 days 00:00:00
Exposure Time [%],94.322132,81.940701
Equity Final [$],13472.542663,0.0
Equity Peak [$],37529.1817,11765.654802
Return [%],34.725427,-100.0


#### Plot both backtest reports

In [60]:
bt_test.plot(filename='reports_backtesting/regression_test_set.html')
bt_train.plot(filename='reports_backtesting/regression_train_set.html')

## How to solve the overfitting problem?

> Walk Forward Validation as a realistic approach to backtesting.

Next tutorial → [Walk Forward Validation]()

![](<src/10_Table_Validation Methods.png>)