In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
import joblib 
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
path = r'../dataset/dataset_v2.csv'    
df = pd.read_csv(path, parse_dates=['Date'])
df = df.sort_values("Date")
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Date,Close,Volume,CPIAUCSL,FEDFUNDS,sp500,usd_index,Open_prev,High_prev,Low_prev
0,2000-08-31,278.299988,0.0,172.7,6.5,1517.680054,112.599998,273.899994,273.899994,273.899994
1,2000-09-01,277.0,0.0,173.6,6.52,1520.77002,111.419998,274.799988,278.299988,274.799988
2,2000-09-05,275.799988,2.0,173.6,6.52,1507.079956,112.410004,277.0,277.0,277.0
3,2000-09-06,274.200012,0.0,173.6,6.52,1492.25,114.120003,275.799988,275.799988,275.799988
4,2000-09-07,274.0,125.0,173.6,6.52,1502.51001,113.650002,274.200012,274.200012,274.200012


2. Feature engineering 

In [4]:
for i in range (2,11): 
    df[f'Close_{i}th_day'] = df['Close'].shift(i)

df.head()

Unnamed: 0,Date,Close,Volume,CPIAUCSL,FEDFUNDS,sp500,usd_index,Open_prev,High_prev,Low_prev,Close_2th_day,Close_3th_day,Close_4th_day,Close_5th_day,Close_6th_day,Close_7th_day,Close_8th_day,Close_9th_day,Close_10th_day
0,2000-08-31,278.299988,0.0,172.7,6.5,1517.680054,112.599998,273.899994,273.899994,273.899994,,,,,,,,,
1,2000-09-01,277.0,0.0,173.6,6.52,1520.77002,111.419998,274.799988,278.299988,274.799988,,,,,,,,,
2,2000-09-05,275.799988,2.0,173.6,6.52,1507.079956,112.410004,277.0,277.0,277.0,278.299988,,,,,,,,
3,2000-09-06,274.200012,0.0,173.6,6.52,1492.25,114.120003,275.799988,275.799988,275.799988,277.0,278.299988,,,,,,,
4,2000-09-07,274.0,125.0,173.6,6.52,1502.51001,113.650002,274.200012,274.200012,274.200012,275.799988,277.0,278.299988,,,,,,


In [5]:
df = df.dropna()

df.head()

Unnamed: 0,Date,Close,Volume,CPIAUCSL,FEDFUNDS,sp500,usd_index,Open_prev,High_prev,Low_prev,Close_2th_day,Close_3th_day,Close_4th_day,Close_5th_day,Close_6th_day,Close_7th_day,Close_8th_day,Close_9th_day,Close_10th_day
10,2000-09-15,272.299988,0.0,173.6,6.52,1465.810059,115.940002,272.399994,272.399994,272.399994,272.799988,272.899994,273.100006,273.299988,274.0,274.200012,275.799988,277.0,278.299988
11,2000-09-18,271.399994,0.0,173.6,6.52,1444.51001,116.0,272.299988,272.299988,272.299988,272.399994,272.799988,272.899994,273.100006,273.299988,274.0,274.200012,275.799988,277.0
12,2000-09-19,271.899994,0.0,173.6,6.52,1459.900024,116.110001,271.399994,271.399994,271.399994,272.299988,272.399994,272.799988,272.899994,273.100006,273.299988,274.0,274.200012,275.799988
13,2000-09-20,269.0,0.0,173.6,6.52,1451.339966,116.410004,271.899994,271.899994,271.899994,271.399994,272.299988,272.399994,272.799988,272.899994,273.100006,273.299988,274.0,274.200012
14,2000-09-21,270.299988,0.0,173.6,6.52,1449.050049,115.269997,269.0,269.0,269.0,271.899994,271.399994,272.299988,272.399994,272.799988,272.899994,273.100006,273.299988,274.0


In [None]:
def pct_change_lags_greedy_search(df):
    pct_change_lags = []
    r2_scores = []
    mse_scores = []
    lag = 1

    while True: 
        pct_change_lags.append(lag)

        #--- add lags to dataframe
        for lag in pct_change_lags:
            df[f'Close_pct_change{lag}'] = df['Close'].pct_change(periods=lag) * 100
        
        df = df.dropna()

        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)

        target = ['Close']
        features = [col for col in df.columns if col not in ['Date', target]]

        #--- scale the values 
        scaler = MinMaxScaler()
        df[features] = scaler.fit_transform(df[features])

        #--- split the dataset
        X = df[features]
        y = df[target]

        split_index = int(len(df) * 0.8)

        X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
        y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

        #--- train the model
        xgb = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
        )

        xgb.fit(X_train, y_train)

        #--- evaluate the model 
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        mse_scores.append(mse)
        r2_scores.append(r2)

        if lag>=100:
            break

        lag += 2

    plt.figure(figsize(24, 12))
    plt.title('Gridy search for knee in percentage lags and scores')
    plt.plot(pct_change_lags, mse_scores, label='mse score', color='orange')
    plt.plot(pct_change_lags, r2_score, label='r2 score', color='blue')
    plt.xlabel('Number of lags for percentage change')
    plt.ylabel('Scores')
    plt.legend()
    plt.grid(True)
    plt.show()

In [17]:
pct_change_lags_greedy_search(df)

ValueError: Input X contains infinity or a value too large for dtype('float64').