In [None]:
!pip install --pre pycaret

In [None]:
!gdown --id 1cijWpm7_S80zrRUJsBhTFk9btQYXuRHv

In [14]:
# read the csv file
import pandas as pd

data = pd.read_csv('launchit.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,TCKR,sector,industry,country,growth_rate,target_median_price,target_mean_price,target_high_price
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
2,2019-12-23,8.79,8.79,7.25,7.81,7.81,117400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
3,2019-12-24,7.5,7.64,6.0,6.41,6.41,102800.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0
4,2019-12-26,6.42,7.72,6.42,7.41,7.41,78400.0,INDO,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,15.0


In [15]:
data.dtypes

Date                   datetime64[ns]
Open                          float64
High                          float64
Low                           float64
Close                         float64
Adj Close                     float64
Volume                        float64
TCKR                           object
sector                         object
industry                       object
country                        object
growth_rate                   float64
target_median_price           float64
target_mean_price             float64
target_high_price             float64
dtype: object

In [None]:
# # data['industry'] = pd.to_numeric(data['industry'],errors = 'coerce')

# # Label Encode non-numeric columns
# from sklearn.preprocessing import LabelEncoder

# le = LabelEncoder()
# data['industry'] = le.fit_transform(data['industry'])

In [16]:
# combine TCKR and target_mean_price column as time_series
data['TCKR'] = ['TCKR_' + str(i) for i in data['TCKR']]
data['target_mean_price'] = ['target_mean_price_' + str(i) for i in data['target_mean_price']]
data['time_series'] = data[['TCKR', 'target_mean_price']].apply(lambda x: '_'.join(x), axis=1)
data.drop(['TCKR', 'target_mean_price'], axis=1, inplace=True)

In [None]:
data.sample(10)

In [17]:
# extract features from date
data['month'] = [i.month for i in data['Date']]
data['year'] = [i.year for i in data['Date']]
data['day_of_week'] = [i.dayofweek for i in data['Date']]
data['day_of_year'] = [i.dayofyear for i in data['Date']]
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sector,industry,country,growth_rate,target_median_price,target_high_price,time_series,month,year,day_of_week,day_of_year
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400.0,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,TCKR_INDO_target_mean_price_15.0,12,2019,3,353
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000.0,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,TCKR_INDO_target_mean_price_15.0,12,2019,4,354
2,2019-12-23,8.79,8.79,7.25,7.81,7.81,117400.0,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,TCKR_INDO_target_mean_price_15.0,12,2019,0,357
3,2019-12-24,7.5,7.64,6.0,6.41,6.41,102800.0,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,TCKR_INDO_target_mean_price_15.0,12,2019,1,358
4,2019-12-26,6.42,7.72,6.42,7.41,7.41,78400.0,Energy,Oil & Gas E&P,Indonesia,1.206,15.0,15.0,TCKR_INDO_target_mean_price_15.0,12,2019,3,360


In [18]:
# check the unique time_series
data['time_series'].nunique()

547

In [None]:
# plot multiple time series with moving avgs in a loop
import plotly.express as px
for i in data['time_series'].unique():
    subset = data[data['time_series'] == i]
    subset['moving_average'] = subset['Adj Close'].rolling(60).mean()
    fig = px.line(subset, x="Date", y=["Adj Close","moving_average"], title = i, template = 'plotly_dark')
    fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [20]:
from tqdm import tqdm
from pycaret.regression import *

all_ts = data['time_series'].unique()

all_results = []
final_model = {}

for i in tqdm(all_ts):
    
    df_subset = data[data['time_series'] == i]
    
    # initialize setup from pycaret.regression
    s = setup(df_subset, target = 'Adj Close', train_size = 0.80,
              data_split_shuffle = False, fold_strategy = 'timeseries', fold = 3,
              ignore_features = ['Date', 'TCKR' 'time_series','target_high_price','target_median_price','High','Low'],
              verbose = False, session_id = 123)
    
    # compare all models and select best one based on MAE
    best_model = compare_models(sort = 'MAE', verbose=False)
    
    # capture the compare result grid and store best model in list
    p = pull().iloc[0:1]
    p['time_series'] = str(i)
    all_results.append(p)
    
    # finalize model i.e. fit on entire data including test set
    f = finalize_model(best_model)
    
    # attach final model to a dictionary
    final_model[i] = f
    
    # save transformation pipeline and model as pickle file 
    save_model(f, model_name='trained_models/' + str(i), verbose=False)


  0%|          | 0/547 [00:37<?, ?it/s]


FileNotFoundError: ignored

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !pip install --upgrade gupload


# save transformation pipeline and model as pickle file 
# save_model(f, model_name='trained_models/' + str(i), verbose=False)

In [None]:
concat_results = pd.concat(all_results,axis=0)
concat_results.sample(10)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),time_series
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.3267,TCKR_ULCC_target_mean_price_16.0
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.33,TCKR_FOUR_target_mean_price_56.0
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.54,TCKR_DNB_target_mean_price_20.0
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.4233,TCKR_AZEK_target_mean_price_25.0
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.3233,TCKR_FEMY_target_mean_price_10.5
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.3167,TCKR_FTCI_target_mean_price_6.0
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.34,TCKR_BEKE_target_mean_price_20.35
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.3267,TCKR_KARO_target_mean_price_585.0
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.3667,TCKR_ARHS_target_mean_price_12.0
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.32,TCKR_PLMR_target_mean_price_75.0


In [None]:
# create a date range from 2013 to 2019
all_dates = pd.date_range(start='2019-01-01', end = '2022-11-10', freq = 'D')
# create empty dataframe
score_df = pd.DataFrame()
# add columns to dataset
score_df['Date'] = all_dates
score_df['Month'] = [i.month for i in score_df['Date']]
score_df['Year'] = [i.year for i in score_df['Date']]
score_df['Day_of_Week'] = [i.dayofweek for i in score_df['Date']]
score_df['Day_of_Year'] = [i.dayofyear for i in score_df['Date']]
score_df.head()

Unnamed: 0,Date,Month,Year,Day_of_Week,Day_of_Year
0,2019-01-01,1,2019,1,1
1,2019-01-02,1,2019,2,2
2,2019-01-03,1,2019,3,3
3,2019-01-04,1,2019,4,4
4,2019-01-05,1,2019,5,5


In [None]:
score_df.sample(25)

Unnamed: 0,Date,Month,Year,Day_of_Week,Day_of_Year
760,2021-01-30,1,2021,5,30
1377,2022-10-09,10,2022,6,282
1228,2022-05-13,5,2022,4,133
256,2019-09-14,9,2019,5,257
1186,2022-04-01,4,2022,4,91
1132,2022-02-06,2,2022,6,37
615,2020-09-07,9,2020,0,251
532,2020-06-16,6,2020,1,168
299,2019-10-27,10,2019,6,300
1199,2022-04-14,4,2022,3,104


SyntaxError: ignored

In [None]:
from pycaret.regression import load_model, predict_model
all_score_df = []
for i in tqdm(data['time_series'].unique()):
    l = load_model('trained_models/' + str(i), verbose=False)
    p = predict_model(l, data=score_df)
    p['time_series'] = i
    all_score_df.append(p)
concat_df = pd.concat(all_score_df, axis=0)
concat_df.head()