https://pycaret.gitbook.io/docs/learn-pycaret/official-blog/multiple-time-series-forecasting-with-pycaret

In [1]:
from pycaret.regression import *

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import category_encoders as ce

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import matplotlib
import seaborn as sns
import plotly.express as px
%matplotlib inline
matplotlib.rcParams['font.family'] = 'Malgun Gothic'
# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRegressor, XGBRFRegressor
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.base import ClassifierMixin

# PyTorch
# import torch
# from torch.utils.data import Dataset, DataLoader, TensorDataset
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# from torch.autograd import Variable
# from torch.nn import Parameter
# from torch import Tensor
# from torch.utils.data import DataLoader

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

# Utility
import os
import time
import datetime
import random
import warnings; warnings.filterwarnings("ignore")
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean

# from bayes_opt import BayesianOptimization
# from num2words import num2words
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
inter = pd.read_csv('../data/international_trade.csv')
submit = pd.read_csv('../data/sample_submission.csv')

# train.drop(columns = 'ID', inplace = True)
# test.drop(columns = 'ID', inplace = True)

In [3]:
train

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0


In [4]:
train = train.rename(columns={'supply(kg)':'supply','price(원/kg)':'price'})
train.columns

Index(['ID', 'timestamp', 'item', 'corporation', 'location', 'supply',
       'price'],
      dtype='object')

In [5]:
# prepare the data
'''
0. item, location. corporation을 LabelEncoding한다.
1. item, location, corporation 을 time-series 컬럼으로 결합한다.
2. timestamp로부터 month, year, day, dayofweek 를 추출
'''

data = train.copy()
test_ = test.copy()
data['timestamp'] = pd.to_datetime(data['timestamp'])
test_['timestamp'] = pd.to_datetime(test_['timestamp'])

# 0.
encoders = {}
for col in ['item','location','corporation']:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    test_[col] = encoder.transform(test_[col])
    encoders[col] = encoder

# 1.
data['item_'] = ['item_' + str(i) for i in data['item']]
data['location_'] = ['location_' + str(i) for i in data['location']]
data['corporation_'] = ['corporation_' + str(i) for i in data['corporation']]
data['time_series'] = data[['item_'
                            , 'corporation_','location_'
                           ]].apply(lambda x: '_'.join(x), axis=1)
data.drop(['item_', 'corporation_','location_'], axis=1, inplace=True)
test_['item_'] = ['item_' + str(i) for i in test_['item']]
test_['location_'] = ['location_' + str(i) for i in test_['location']]
test_['corporation_'] = ['corporation_' + str(i) for i in test_['corporation']]
test_['time_series'] = test_[['item_'
                            , 'corporation_','location_'
                           ]].apply(lambda x: '_'.join(x), axis=1)
test_.drop(['item_', 'corporation_','location_'], axis=1, inplace=True)


# 2.
data['year'] = data['timestamp'].dt.year
data['month'] = data['timestamp'].dt.month
data['day'] = data['timestamp'].dt.day
data['dow'] = data['timestamp'].dt.dayofweek
data['doy'] = data['timestamp'].dt.dayofyear
test_['year'] = test_['timestamp'].dt.year
test_['month'] = test_['timestamp'].dt.month
test_['day'] = test_['timestamp'].dt.day
test_['dow'] = test_['timestamp'].dt.dayofweek
test_['doy'] = test_['timestamp'].dt.dayofyear

data.head(3)


Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,time_series,year,month,day,dow,doy
0,TG_A_J_20190101,2019-01-01,4,0,0,0.0,0.0,item_4_corporation_0_location_0,2019,1,1,1,1
1,TG_A_J_20190102,2019-01-02,4,0,0,0.0,0.0,item_4_corporation_0_location_0,2019,1,2,2,2
2,TG_A_J_20190103,2019-01-03,4,0,0,60601.0,1728.0,item_4_corporation_0_location_0,2019,1,3,3,3


In [6]:
test_.head(3)

Unnamed: 0,ID,timestamp,item,corporation,location,time_series,year,month,day,dow,doy
0,TG_A_J_20230304,2023-03-04,4,0,0,item_4_corporation_0_location_0,2023,3,4,5,63
1,TG_A_J_20230305,2023-03-05,4,0,0,item_4_corporation_0_location_0,2023,3,5,6,64
2,TG_A_J_20230306,2023-03-06,4,0,0,item_4_corporation_0_location_0,2023,3,6,0,65


In [30]:
categorical_cols = ['month','dow','item','corporation','location','year','day']
numeric_cols = ['doy']
timestamp_col = 'timestamp'
target_col = 'price'
ignore_cols = ['ID','timestamp','supply','time_series']

all_results = []
final_models = {}
# all_ts = data['time_series'].unique()

# for i in tqdm(all_ts):
# df_subset = data[data['time_series']==i]
s = setup(data=data,
          target=target_col,
          session_id=123,
          train_size=0.8,
          categorical_features=categorical_cols,
          ignore_features=ignore_cols,
          numeric_features=numeric_cols,
          normalize=True,
          normalize_method='minmax',
          fold_strategy='timeseries',
          fold=5,
          fold_shuffle=False,
          data_split_shuffle=False,
          transform_target=True,
          transform_target_method='yeo-johnson',
          remove_outliers=False,
          remove_multicollinearity=True,
          multicollinearity_threshold=0.9,
          verbose=True,
          use_gpu = True
         )

best_top3_model = compare_models(sort='RMSE',verbose=True,n_select=3)

# p = pull().iloc[0:1]
# p['time_series'] = str(i)
# all_results.append(p)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: gfx1035, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,506.5697,1156438.3785,929.4698,0.0045,3.2432,0.6683,21.804
gbr,Gradient Boosting Regressor,512.4335,1179779.6599,944.7057,-0.0778,3.3027,0.726,3.484
br,Bayesian Ridge,554.6063,1291865.6932,968.917,-0.0246,3.4095,0.7616,0.624
ridge,Ridge Regression,554.4996,1291984.977,968.9541,-0.0246,3.4087,0.7617,0.636
lightgbm,Light Gradient Boosting Machine,500.2465,1231787.0288,981.5754,-0.3351,3.1482,0.7309,1.768
xgboost,Extreme Gradient Boosting,501.5418,1305940.7281,991.519,-0.2206,2.984,0.7421,1.44
huber,Huber Regressor,570.974,1489914.0934,1024.5214,-0.0872,3.353,0.8462,0.86
ada,AdaBoost Regressor,575.409,1600951.3765,1039.5426,-0.0517,2.7644,0.804,1.044
omp,Orthogonal Matching Pursuit,642.4143,1699336.7127,1070.2506,-0.1125,3.1994,0.9059,0.592
rf,Random Forest Regressor,521.3894,1469223.7506,1073.7201,-0.9211,3.105,0.763,1.97


In [11]:
best_model

[<catboost.core.CatBoostRegressor at 0x286ff4de190>,
 GradientBoostingRegressor(random_state=123),
 LGBMRegressor(device='gpu', n_jobs=-1, random_state=123)]

In [13]:
blended = blend_models(best_top3_model, optimize='RMSE')

f = finalize_model(blended)

save_model(f, model_name=f'../models/pycaret_regression_itemseries/pycaret_regression')

Processing:   0%|          | 0/6 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
best = best_model[0]
cbg_final = finalize_model(best)
save_model(cbg_final, model_name='../models/pycaret_regression_itemseries/pycaret_cbg')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('target_transformation',
                  TransformerWrapperWithInverse(transformer=TargetTransformer(estimator=PowerTransformer(standardize=False)))),
                 ('numerical_imputer',
                  TransformerWrapper(include=['doy'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['month', 'dow', 'item',
                                              'corporation'...
                  TransformerWrapper(include=['day'],
                                     transformer=TargetEncoder(cols=[],
                                                               handle_missing='return_nan'))),
                 ('remove_multicollinearity',
                  TransformerWrapper(exclude=[],
                                     transformer=RemoveMulticollinearity(threshold=0.9))),
                 ('normalize', TransformerWrapp

In [17]:
test_features = ['item','corporation','location','year','month','day','dow','doy']

test_set = test_[test_features]
test_set

Unnamed: 0,item,corporation,location,year,month,day,dow,doy
0,4,0,0,2023,3,4,5,63
1,4,0,0,2023,3,5,6,64
2,4,0,0,2023,3,6,0,65
3,4,0,0,2023,3,7,1,66
4,4,0,0,2023,3,8,2,67
...,...,...,...,...,...,...,...,...
1087,3,5,0,2023,3,27,0,86
1088,3,5,0,2023,3,28,1,87
1089,3,5,0,2023,3,29,2,88
1090,3,5,0,2023,3,30,3,89


In [18]:
p = predict_model(cbg_final, test_set)
p

Unnamed: 0,item,corporation,location,year,month,day,dow,doy,prediction_label
0,4,0,0,2023,3,4,5,63,964.534937
1,4,0,0,2023,3,5,6,64,0.358099
2,4,0,0,2023,3,6,0,65,2084.704619
3,4,0,0,2023,3,7,1,66,3325.690601
4,4,0,0,2023,3,8,2,67,2367.090843
...,...,...,...,...,...,...,...,...,...
1087,3,5,0,2023,3,27,0,86,292.244225
1088,3,5,0,2023,3,28,1,87,257.668657
1089,3,5,0,2023,3,29,2,88,225.003791
1090,3,5,0,2023,3,30,3,89,343.138650


In [25]:
p['prediction_label'].describe()

count     1092.000000
mean      1031.005397
std       2330.399645
min         -0.787849
25%          1.931603
50%        117.565071
75%        933.201151
max      23006.055945
Name: prediction_label, dtype: float64

In [21]:
p_ = p['prediction_label'].apply(lambda x:x if x>=0 else 0)
p_.describe()

count     1092.000000
mean      1031.026131
std       2330.390462
min          0.000000
25%          1.931603
50%        117.565071
75%        933.201151
max      23006.055945
Name: prediction_label, dtype: float64

In [22]:
submit['answer'] = p_.values
submit

Unnamed: 0,ID,answer
0,TG_A_J_20230304,964.534937
1,TG_A_J_20230305,0.358099
2,TG_A_J_20230306,2084.704619
3,TG_A_J_20230307,3325.690601
4,TG_A_J_20230308,2367.090843
...,...,...
1087,RD_F_J_20230327,292.244225
1088,RD_F_J_20230328,257.668657
1089,RD_F_J_20230329,225.003791
1090,RD_F_J_20230330,343.138650


In [23]:
submit.set_index('ID').to_csv('../data/pycaret_231115_1946.csv')


Unnamed: 0_level_0,answer
ID,Unnamed: 1_level_1
TG_A_J_20230304,4374.717606
TG_A_J_20230305,0.000000
TG_A_J_20230306,3531.892550
TG_A_J_20230307,2978.573759
TG_A_J_20230308,3317.646750
...,...
RD_F_J_20230327,424.031780
RD_F_J_20230328,418.857285
RD_F_J_20230329,389.916084
RD_F_J_20230330,383.990342


In [27]:
ag = pd.read_csv('../data/autogloun.csv')
ag

Unnamed: 0,ID,answer,answer_plus,answer_plus_multi,answer_plus_multi_devide,answer_plus_multi_trade,plus feature,multi feautre,devide feature,trade feature,trade multi
0,TG_A_J_20230304,3303.832109,3348.149784,3266.106169,3230.151070,3344.764245,44.32,37.73,35.96,40.93,78.66
1,TG_A_J_20230305,619.871556,600.211059,618.999558,649.469790,593.595174,19.66,0.87,30.47,26.28,25.40
2,TG_A_J_20230306,3094.916454,3158.511358,3459.401166,3337.368953,3173.494423,63.59,364.48,122.03,78.58,285.91
3,TG_A_J_20230307,3384.701295,3437.620075,3386.925661,3397.044875,3406.580379,52.92,2.22,10.12,21.88,19.65
4,TG_A_J_20230308,3338.841519,3379.721990,3358.584199,3317.912097,3345.909894,40.88,19.74,40.67,7.07,12.67
...,...,...,...,...,...,...,...,...,...,...,...
1087,RD_F_J_20230327,520.106794,527.559868,522.657351,511.037501,517.385892,7.45,2.55,11.62,2.72,5.27
1088,RD_F_J_20230328,519.110925,522.948168,522.876663,516.228913,515.669217,3.84,3.77,6.65,3.44,7.21
1089,RD_F_J_20230329,517.419009,522.287128,520.389423,512.846080,516.285062,4.87,2.97,7.54,1.13,4.10
1090,RD_F_J_20230330,494.706201,505.508347,495.119578,487.530243,495.280268,10.80,0.41,7.59,0.57,0.16


In [29]:
ag.describe()

Unnamed: 0,answer,answer_plus,answer_plus_multi,answer_plus_multi_devide,answer_plus_multi_trade,plus feature,multi feautre,devide feature,trade feature,trade multi
count,1092.0,1092.0,1092.0,1092.0,1092.0,1092.0,1092.0,1092.0,1092.0,1092.0
mean,1421.202798,1438.365355,1415.431645,1418.816889,1422.737775,38.547344,25.515027,28.478874,23.78076,28.513526
std,1412.453814,1447.251722,1427.645944,1415.086684,1427.921318,44.595641,36.796387,30.340998,26.628342,41.8632
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,180.327346,169.228262,171.744338,194.210905,163.074052,5.5825,1.6375,6.5275,4.5275,3.61
50%,621.344333,607.666449,610.829444,631.67715,614.768402,23.635,11.005,20.36,15.745,12.89
75%,2645.799602,2702.538301,2667.632997,2651.983045,2670.51545,57.695,36.8475,40.6825,35.19,34.2775
max,4983.580942,5072.72655,5077.583515,5028.13702,5022.837998,536.62,364.48,254.16,322.98,350.78


In [31]:
ag.answer.dtype

dtype('float64')