In [None]:
!gdown --id 1CFVbGfUKTUWocGDm8gjzJj0MijKzhCD8

Downloading...
From: https://drive.google.com/uc?id=1CFVbGfUKTUWocGDm8gjzJj0MijKzhCD8
To: /content/launchit_test
100% 18.4M/18.4M [00:00<00:00, 57.7MB/s]


In [None]:
import feather
import numpy as np
import pandas as pd

In [None]:
df = pd.read_feather('launchit_test')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,TCKR
0,2019-12-31,87.300003,87.300003,64.904999,71.699997,71.699997,33947.0,MKDTY
1,2020-01-02,72.900002,72.900002,62.25,65.550003,65.550003,13607.0,MKDTY
2,2020-01-03,63.299999,66.75,63.299999,65.25,65.25,4427.0,MKDTY
3,2020-01-06,65.550003,72.0,63.75,67.650002,67.650002,9933.0,MKDTY
4,2020-01-07,68.25,72.419998,68.099998,72.0,72.0,8347.0,MKDTY


In [None]:
df.dtypes

Date         datetime64[ns]
Open                float64
Close               float64
Adj Close           float64
Volume              float64
CA12                float64
dtype: object

In [None]:
# create 12 month close average
df['CA12'] = df['Adj Close'].rolling(2).mean()
# plot the df and CA
import plotly.express as px
fig = px.line(df, x="Date", y=["Adj Close", "CA12"], template = 'plotly_dark')
fig.show()

In [None]:
# extract month and year from dates
df['Month'] = [i.month for i in df['Date']]
df['Year'] = [i.year for i in df['Date']]
# create a sequence of numbers
df['Series'] = np.arange(1,len(df)+1)
# drop unnecessary columns and re-arrange
df.drop(['Date', 'CA12'], axis=1, inplace=True)
df = df[['Series', 'Year', 'Month', 'Adj Close']] 
# check the head of the dataset
df.head()

Unnamed: 0,Series,Year,Month,Adj Close
0,1,2019,12,71.699997
1,2,2020,1,65.550003
2,3,2020,1,65.25
3,4,2020,1,67.650002
4,5,2020,1,72.0


In [None]:
# import the regression module
from pycaret.regression import *
# initialize setup
s = setup(data = df, target = 'Adj Close', 
          data_split_shuffle = False,
          fold_strategy = 'timeseries', fold = 3, 
          transform_target = True, session_id = 123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Adj Close
2,Original Data,"(670047, 4)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(469032, 13)"


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=['Series'],
                                      ml_usecase='regression',
                                      numerical_features=[], target='Adj Close',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                ('clust

In [None]:
best = compare_models()
# best = compare_models(sort = 'MAE')

In [None]:
prediction_holdout = predict_model(best)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=PowerTransformedTargetRegressor(base_estimator=None, learning_rate=1.0,
                                loss='linear', n_estimators=50,
                                power_transformer_method='box-cox',
                                power_transformer_standardize=True,
                                random_state=123,
                                regressor=AdaBoostRegressor(base_estimator=None,
                                                            learning_rate=1.0,
                                                            loss='linear',
                                                            n_estimators=50,
                                                            random_state=123)), probability_threshold=None, encoded_labels=True, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.REGRESSION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:log

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,AdaBoost Regressor,8.8644,563.7703,23.7438,-0.0014,0.7312,1.1276


In [None]:
# generate predictions on the original dataset
predictions = predict_model(best, data=df)
# add a date column in the dataset
# predictions['Date'] = pd.date_range(start='2019-01-01', end = '2022-11-04', freq = 'MS')
# line plot
fig = px.line(predictions, x='Month', y=["Adj Close", "Label"], template = 'plotly_dark')
# add a vertical rectange for test-set separation
fig.add_vrect(x0="2019-01-01", x1="2022-11-01", fillcolor="grey", opacity=0.25, line_width=0)
fig.show()

In [None]:
final_best = finalize_model(best)

INFO:logs:Initializing finalize_model()
INFO:logs:finalize_model(estimator=PowerTransformedTargetRegressor(base_estimator=None, learning_rate=1.0,
                                loss='linear', n_estimators=50,
                                power_transformer_method='box-cox',
                                power_transformer_standardize=True,
                                random_state=123,
                                regressor=AdaBoostRegressor(base_estimator=None,
                                                            learning_rate=1.0,
                                                            loss='linear',
                                                            n_estimators=50,
                                                            random_state=123)), fit_kwargs=None, groups=None, model_only=True, display=None, experiment_custom_tags=None, return_train_score=False)
INFO:logs:Finalizing PowerTransformedTargetRegressor(base_estimator=None, learning_rate=1.0,
  

In [None]:
future_dates = pd.date_range(start = '2022-11-20', end = '2023-02-25', freq = 'MS')
future_df = pd.DataFrame()
future_df['Month'] = [i.month for i in future_dates]
future_df['Year'] = [i.year for i in future_dates]    
future_df['Series'] = np.arange(145,(145+len(future_dates)))
future_df.head()

Unnamed: 0,Month,Year,Series
0,12,2022,145
1,1,2023,146
2,2,2023,147


In [None]:
predictions_future = predict_model(final_best, data=future_df)
predictions_future.head()

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=PowerTransformedTargetRegressor(base_estimator=None, learning_rate=1.0,
                                loss='linear', n_estimators=50,
                                power_transformer_method='box-cox',
                                power_transformer_standardize=True,
                                random_state=123,
                                regressor=AdaBoostRegressor(base_estimator=None,
                                                            learning_rate=1.0,
                                                            loss='linear',
                                                            n_estimators=50,
                                                            random_state=123)), probability_threshold=None, encoded_labels=True, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.REGRESSION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:log

Unnamed: 0,Month,Year,Series,Label
0,12,2022,145,8.20994
1,1,2023,146,8.677079
2,2,2023,147,8.505181


In [None]:
concat_df = pd.concat([df,predictions_future], axis=0)
concat_df_i = pd.date_range(start='2019-01-01', end = '2023-02-25', freq = 'MS')
concat_df.set_index(concat_df_i, inplace=True)
fig = px.line(concat_df, x=concat_df.index, y=["Adj Close", "Label"], template = 'plotly_dark')
fig.show()

ValueError: ignored

In [None]:
df.tail()

Unnamed: 0,Series,Year,Month,Adj Close
670042,670043,2022,11,10.23
670043,670044,2022,11,10.25
670044,670045,2022,11,10.27
670045,670046,2022,11,10.23
670046,670047,2022,11,10.28


In [None]:
import sklearn

sklearn.__version__

'0.23.2'

In [None]:
df = df.drop(columns=["High","Low"])

In [None]:
!pip uninstall scikit-learn -y

Found existing installation: scikit-learn 0.23.2
Uninstalling scikit-learn-0.23.2:
  Successfully uninstalled scikit-learn-0.23.2


In [None]:
!pip install --user -U scikit-learn==0.23.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.23.2


In [None]:
# Install PyCaret.

!pip install --pre pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import dependencies
from pycaret.regression import *
# from pycaret.classification import *
# import pandas as pd
# import numpy as np 
# import psycopg2
# from sklearn.metrics import balanced_accuracy_score
# # from sklearn.metrics import confusion_matrix
# from imblearn.metrics import classification_report_imbalanced
# import imblearn
# from config import db_password

In [None]:
 # PyCaret Regression Setup command:
reg = setup(data = df, target = 'Adj Close')

Unnamed: 0,Description,Value
0,session_id,790
1,Target,Adj Close
2,Original Data,"(670047, 7)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(469032, 21)"


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[], ml_usecase='regression',
                                      numerical_features=[], target='Adj Close',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_strateg...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                ('cluster_all', 'passthrough'),
             

In [None]:
compare_models()

INFO:logs:Initializing compare_models()
INFO:logs:compare_models(include=None, fold=None, round=4, cross_validation=True, sort=R2, n_select=1, budget_time=None, turbo=True, errors=ignore, fit_kwargs=None, groups=None, experiment_custom_tags=None, probability_threshold=None, verbose=True, display=None, exclude=None)
INFO:logs:Checking exceptions
INFO:logs:Preparing display monitor
INFO:logs:Preparing display monitor


IntProgress(value=0, description='Processing: ', max=94)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,0.5193,8.1697,2.6113,0.9919,0.0505,0.0275,151.549
lr,Linear Regression,0.6581,11.456,3.0265,0.9886,0.0809,0.1249,0.501
lasso,Lasso Regression,0.6628,11.456,3.03,0.9886,0.0825,0.1289,1.113
ridge,Ridge Regression,0.6576,11.4544,3.026,0.9886,0.0817,0.1315,0.063
en,Elastic Net,0.6627,11.4559,3.0299,0.9886,0.0824,0.1288,0.914
lar,Least Angle Regression,0.6576,11.4544,3.026,0.9886,0.0817,0.1315,0.075
omp,Orthogonal Matching Pursuit,0.6593,11.4564,3.0266,0.9886,0.0812,0.126,0.073
br,Bayesian Ridge,0.6576,11.4544,3.026,0.9886,0.0817,0.1314,0.344
dt,Decision Tree Regressor,0.662,12.3262,3.3477,0.9876,0.069,0.0345,2.251
knn,K Neighbors Regressor,6.5464,472.8584,21.7349,0.5167,0.5397,1.0069,4.157


INFO:logs:Initializing Linear Regression
INFO:logs:Total runtime is 3.5758813222249346e-05 minutes
INFO:logs:Initializing create_model()
INFO:logs:create_model(estimator=lr, fold=KFold(n_splits=10, random_state=None, shuffle=False), round=4, cross_validation=True, predict=True, fit_kwargs={}, groups=None, refit=False, verbose=False, system=False, metrics=None, experiment_custom_tags=None, add_to_model_list=True, probability_threshold=None, display=<pycaret.internal.Display.Display object at 0x7fc0d4a56b10>, return_train_score=False, kwargs={})
INFO:logs:Checking exceptions
INFO:logs:Importing libraries
INFO:logs:Copying training dataset
INFO:logs:Defining folds
INFO:logs:Declaring metric variables
INFO:logs:Importing untrained model
INFO:logs:Linear Regression Imported succesfully
INFO:logs:Starting cross validation
INFO:logs:Cross validating with KFold(n_splits=10, random_state=None, shuffle=False), n_jobs=-1
INFO:logs:Calculating mean and std
INFO:logs:Creating metrics dataframe
INFO