In [7]:
import pandas as pd
import yfinance
from pycaret import regression
import numpy as np

In [10]:
ibm = yfinance.download("IBM",period="max")
ibm = ibm["Adj Close"]
ibm = ibm.reset_index()
ibm["Date"] = pd.to_datetime(ibm["Date"])
ibm["Day"] = [i.day for i in ibm["Date"]]
ibm["Month"] = [i.month for i in ibm["Date"]]
ibm["Year"] = [i.year for i in ibm["Date"]]
ibm["Series"] = np.arange(1,len(ibm)+1)
ibm.drop("Date",axis=1,inplace=True,errors="ignore")
ibm = ibm[["Series","Year","Month","Day","Adj Close"]]
ibm

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Series,Year,Month,Day,Adj Close
0,1,1962,1,2,1.633034
1,2,1962,1,3,1.647309
2,3,1962,1,4,1.630892
3,4,1962,1,5,1.598775
4,5,1962,1,8,1.568797
...,...,...,...,...,...
15353,15354,2022,12,28,140.020004
15354,15355,2022,12,29,141.059998
15355,15356,2022,12,30,140.889999
15356,15357,2023,1,3,141.550003


In [11]:
ibm.to_csv("ibm.csv")

In [12]:
train = ibm[ibm["Year"]<2021]
test = ibm[ibm["Year"]>=2021]
print(train.shape)
print(test.shape)

(14853, 5)
(505, 5)


In [14]:
s = regression.setup(data = train,
          test_data = test,
          target = 'Adj Close',
          fold_strategy = 'timeseries',
          numeric_features = ['Year','Month','Series'],
          transform_target = True,
          use_gpu = True,
          session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Adj Close
2,Target type,Regression
3,Data shape,"(15358, 5)"
4,Train data shape,"(14853, 5)"
5,Test data shape,"(505, 5)"
6,Numeric features,3
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


In [15]:
best = regression.compare_models(sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,9.0514,257.469,10.9834,-1.7053,0.3305,0.2489,1.355
et,Extra Trees Regressor,9.1753,258.5864,11.1292,-1.8305,0.333,0.2526,0.281
gbr,Gradient Boosting Regressor,9.4575,276.7635,11.3069,-1.863,0.3355,0.256,0.33
rf,Random Forest Regressor,9.6104,269.6266,11.4247,-1.9242,0.3327,0.2576,0.228
knn,K Neighbors Regressor,9.7111,273.0527,11.4487,-2.0305,0.3366,0.2648,0.249
dt,Decision Tree Regressor,9.7963,273.2396,11.6013,-1.9655,0.3341,0.2592,0.036
ada,AdaBoost Regressor,11.0127,464.3936,12.6931,-2.7624,0.377,0.2814,0.136
lasso,Lasso Regression,17.3741,1139.3095,19.8388,-15.6005,0.4101,0.4342,0.047
en,Elastic Net,17.3957,1142.2235,19.8625,-15.6365,0.41,0.4345,0.022
omp,Orthogonal Matching Pursuit,17.4534,1149.835,19.9853,-15.63,0.4094,0.4326,0.021


In [27]:
future_dates = pd.date_range(start = '2023-01-05', end = '2025-01-01', freq = 'B')
future_df = pd.DataFrame()
future_df["Day"] = [i.day for i in future_dates]
future_df['Month'] = [i.month for i in future_dates]
future_df['Year'] = [i.year for i in future_dates] 
future_df['Series'] = np.arange(15358,(15358+len(future_dates)))
future_df

Unnamed: 0,Day,Month,Year,Series
0,5,1,2023,15358
1,6,1,2023,15359
2,9,1,2023,15360
3,10,1,2023,15361
4,11,1,2023,15362
...,...,...,...,...
515,26,12,2024,15873
516,27,12,2024,15874
517,30,12,2024,15875
518,31,12,2024,15876


In [24]:
future_df.dtypes

Day       int64
Month     int64
Year      int64
Series    int32
dtype: object

In [28]:
predictions_future = regression.predict_model(best,data=future_df)
predictions_future

Unnamed: 0,Series,Year,Month,Day,prediction_label
0,15358.0,2023.0,1.0,5,3.841648
1,15359.0,2023.0,1.0,6,3.841648
2,15360.0,2023.0,1.0,9,3.841648
3,15361.0,2023.0,1.0,10,3.841648
4,15362.0,2023.0,1.0,11,3.841648
...,...,...,...,...,...
515,15873.0,2024.0,12.0,26,3.826010
516,15874.0,2024.0,12.0,27,3.826010
517,15875.0,2024.0,12.0,30,3.826010
518,15876.0,2024.0,12.0,31,3.826010


In [29]:
import plotly.express as px
concat_df = pd.concat([ibm,predictions_future], axis=0)
concat_df_i = pd.date_range(start='1962-01-02', end = '2025-01-01', freq = 'B')
concat_df.set_index(concat_df_i, inplace=True)
fig = px.line(concat_df, x=concat_df.index, y=["Adj Close", "Label"], template = 'plotly_dark')
fig.show()

ValueError: Length mismatch: Expected 15878 rows, received array of length 16437

In [30]:
concat_df_i

DatetimeIndex(['1962-01-02', '1962-01-03', '1962-01-04', '1962-01-05',
               '1962-01-08', '1962-01-09', '1962-01-10', '1962-01-11',
               '1962-01-12', '1962-01-15',
               ...
               '2024-12-19', '2024-12-20', '2024-12-23', '2024-12-24',
               '2024-12-25', '2024-12-26', '2024-12-27', '2024-12-30',
               '2024-12-31', '2025-01-01'],
              dtype='datetime64[ns]', length=16437, freq='B')