In [63]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.metrics import mean_squared_error

plt.style.use('fivethirtyeight')

df = yf.download("TSLA", start="2019-01-01", end="2020-12-31")

df.index = pd.to_datetime(df.index)
df.index


[*********************100%***********************]  1 of 1 completed


DatetimeIndex(['2019-01-02', '2019-01-03', '2019-01-04', '2019-01-07',
               '2019-01-08', '2019-01-09', '2019-01-10', '2019-01-11',
               '2019-01-14', '2019-01-15',
               ...
               '2020-12-16', '2020-12-17', '2020-12-18', '2020-12-21',
               '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-28',
               '2020-12-29', '2020-12-30'],
              dtype='datetime64[ns]', name='Date', length=504, freq=None)

In [64]:
# train_df = df.loc[df.index < '2020-09-01']
# test_df = df.loc[df.index >= '2020-09-01']

# fig, ax = plt.subplots(figsize=(15, 5))
# train_df['Close'].plot(ax=ax, label='Training')
# test_df['Close'].plot(ax=ax, label='Test')
# ax.axvline(x='2020-09-01', c='black', linestyle='--')
# ax.legend(['Training Set', 'Testing Set'])
# plt.show()


In [79]:
from sklearn.model_selection import TimeSeriesSplit

tss = TimeSeriesSplit(n_splits=5, test_size=30, gap=1)
df = df.sort_index()

In [84]:
def create_features(df):
    df = df.copy()
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['day'] = df.index.day
    df['dayofweek'] = df.index.dayofweek
    return df

features = ['year', 'month', 'day', 'dayofweek', 'Open', 'High', 'Low', 'Volume']
target = 'Close'

In [97]:
# fig, axs = plt.subplots(5, 1, figsize=(15, 15), sharex=True)

fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df):
    train = create_features(df.iloc[train_idx])
    test = create_features(df.iloc[val_idx])

    X_train = train[features]
    y_train = train[target]

    X_test = test[features]
    y_test = test[target]

    reg = xgb.XGBRegressor(
        base_score=0.5,
        booster='gbtree',
        n_estimators=2000,
        objective='reg:tweedie', 
        learning_rate=0.01,
        early_stopping_rounds=100,
        max_depth=3,
        )
    reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
    y_pred = reg.predict(X_test)
    preds.append(y_pred)
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)

    # train['Close'].plot(ax=axs[f old], label='Training Set', title=f'Data Train / Test Split Fold {fold}')
    # test['Close'].plot(ax=axs[fold], label='Test Set')
    # axs[fold].axvline(test.index.min(), color='black', ls='--')
    # fold += 1

[0]	validation_0-tweedie-nloglik@1.5:73.25605	validation_1-tweedie-nloglik@1.5:197.04252
[100]	validation_0-tweedie-nloglik@1.5:32.77532	validation_1-tweedie-nloglik@1.5:80.46598
[200]	validation_0-tweedie-nloglik@1.5:21.99253	validation_1-tweedie-nloglik@1.5:44.32280
[300]	validation_0-tweedie-nloglik@1.5:20.02668	validation_1-tweedie-nloglik@1.5:35.91794
[400]	validation_0-tweedie-nloglik@1.5:19.74082	validation_1-tweedie-nloglik@1.5:34.08120
[500]	validation_0-tweedie-nloglik@1.5:19.70149	validation_1-tweedie-nloglik@1.5:33.68142
[600]	validation_0-tweedie-nloglik@1.5:19.69609	validation_1-tweedie-nloglik@1.5:33.57441
[700]	validation_0-tweedie-nloglik@1.5:19.69532	validation_1-tweedie-nloglik@1.5:33.53750
[800]	validation_0-tweedie-nloglik@1.5:19.69519	validation_1-tweedie-nloglik@1.5:33.52274
[900]	validation_0-tweedie-nloglik@1.5:19.69515	validation_1-tweedie-nloglik@1.5:33.51783
[1000]	validation_0-tweedie-nloglik@1.5:19.69514	validation_1-tweedie-nloglik@1.5:33.51421
[1100]	val

In [96]:
scores

[15.108224164428881,
 17.1029217373159,
 14.501139992307396,
 3.136722442640417,
 48.509716510089966]

In [67]:
# df.loc[(df.index >= '2019-01-01') & (df.index <= '2019-01-08')]['Close'].plot()

In [69]:
# # train_df['Date'] = pd.to_datetime(train_df.index)
# # test_df['Date'] = pd.to_datetime(test_df.index)

# features = ['year', 'month', 'day', 'dayofweek', 'Open', 'High', 'Low', 'Volume']
# target = 'Close'
# X_train = create_features(train_df)[features] 
# y_train = train_df[target]
# X_test = create_features(test_df)[features]
# y_test = test_df[target]

# reg = xgb.XGBRegressor(n_estimators=2000, learning_rate=0.01)
# reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=50, verbose=100)

In [70]:
# fi = pd.DataFrame(data=reg.feature_importances_, index=reg.feature_names_in_, columns=['importance'])
# fi.plot(kind='barh', figsize=(10, 10))

In [71]:
# test_df['Prediction'] = reg.predict(X_test)
# df = df.merge(test_df[['Prediction']], how='left', left_index=True, right_index=True)

In [72]:
# ax = df[['Close']].plot(figsize=(15, 5))
# df[['Prediction']].plot(ax=ax, style='.')
# ax.legend(["Actual", "Predicted"])
# plt.show()

In [73]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,20.406668,21.008667,19.920000,20.674667,20.674667,174879000
2019-01-03,20.466667,20.626667,19.825333,20.024000,20.024000,104478000
2019-01-04,20.400000,21.200001,20.181999,21.179333,21.179333,110911500
2019-01-07,21.448000,22.449333,21.183332,22.330667,22.330667,113268000
2019-01-08,22.797333,22.934000,21.801332,22.356667,22.356667,105127500
...,...,...,...,...,...,...
2020-12-23,210.733337,217.166672,207.523331,215.326660,215.326660,99519000
2020-12-24,214.330002,222.029999,213.666672,220.589996,220.589996,68596800
2020-12-28,224.836670,227.133331,220.266663,221.229996,221.229996,96835800
2020-12-29,220.333328,223.300003,218.333328,221.996674,221.996674,68732400
