In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import calendar
import lightgbm as lgb
from lightgbm import LGBMRegressor
from itertools import permutations, combinations
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
import warnings
warnings.filterwarnings('ignore')
from prophet import Prophet
import torch

Importing plotly failed. Interactive plots will not work.


In [2]:
num_seed_tr = 3
num_seed_hp = 1
splits_hp = 5
splits_tr = 10
seed_hp = 42
basic_seed = 42
num_trial = 20

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
international_trade_csv = pd.read_csv('data/international_trade.csv')
submission_df = pd.read_csv('data/sample_submission.csv')
# train_df = train_df.replace([np.inf, -np.inf, np.nan, -np.nan], 0)
# test_df = test_df.replace([np.inf, -np.inf, np.nan, -np.nan], 0)

In [4]:
dic_train = {}
for idx, time in enumerate(train_df['timestamp'].unique()):
    time = time.replace('-', '')
    dic_train[time] = idx+1
    
dic_test = {}
for idx, time in enumerate(test_df['timestamp'].unique()):
    time = time.replace('-', '')
    dic_test[time] = idx+1524

In [5]:
#시계열 특성을 학습에 반영하기 위해 timestamp를 월, 일, 시간으로 나눕니다
train_df['year'] = train_df['timestamp'].apply(lambda x : int(x[0:4]))
train_df['month'] = train_df['timestamp'].apply(lambda x : int(x[5:7]))
train_df['day'] = train_df['timestamp'].apply(lambda x : int(x[8:10]))
train_df['weekday'] = train_df['timestamp'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d').weekday())
train_df['prod_ID'] = train_df['ID'].apply(lambda x: x[0:6])
train_df['d'] = train_df['ID'].apply(lambda x: f'd_{dic_train[x[7:]]}')
# train_df.drop(['supply(kg)', 'timestamp'], axis=1, inplace=True)

test_df['year'] = test_df['timestamp'].apply(lambda x : int(x[0:4]))
test_df['month'] = test_df['timestamp'].apply(lambda x : int(x[5:7]))
test_df['day'] = test_df['timestamp'].apply(lambda x : int(x[8:10]))
test_df['weekday'] = test_df['timestamp'].apply(lambda x : datetime.strptime(x, '%Y-%m-%d').weekday())
test_df['prod_ID'] = test_df['ID'].apply(lambda x: x[0:6])
test_df['d'] = test_df['ID'].apply(lambda x: f'd_{dic_test[x[7:]]}')
# test_df.drop(['timestamp'], axis=1, inplace=True)

이동 평균

In [6]:
train_df.rename(columns={'price(원/kg)':'price', 'supply(kg)': 'supply'}, inplace=True)

In [7]:
def get_moving_average(df):
    for win in [1, 2, 4, 8]:
        df['rm_diff_price_{}'.format(win)] = df[['item', 'corporation', 'location', 'price']].groupby(
            ['item', 'corporation', 'location'])['price'].transform(lambda x : x.rolling(win).mean())
        df['rm_diff_price_{}'.format(win)] = ((df['price'] - df['rm_diff_price_{}'.format(win)]
                                                  )/df['price']).round(3)
    return df
train_df = get_moving_average(train_df)

In [8]:
def lags_wins(df):
    lags = [7, 14, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        df[lag_col] = df[["prod_ID","price"]].groupby("prod_ID")["price"].shift(lag)

    wins = [7, 14, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            df[f"rmean_{lag}_{win}"] = df[["prod_ID", lag_col]].groupby("prod_ID")[lag_col].transform(lambda x : x.rolling(win).mean())
    return df
train_df = lags_wins(train_df)
train_df = train_df.replace([np.inf, -np.inf, np.nan, -np.nan], 0)

In [9]:
test_df['price'] = 0
test_df[['rm_diff_price_1', 'rm_diff_price_2', 'rm_diff_price_4',
       'rm_diff_price_8', 'lag_7', 'lag_14', 'lag_28', 'rmean_7_7',
       'rmean_14_7', 'rmean_28_7', 'rmean_7_14', 'rmean_14_14', 'rmean_28_14',
       'rmean_7_28', 'rmean_14_28', 'rmean_28_28']] = 0.0
test_df = pd.concat([train_df, test_df])

In [10]:
train_df = train_df.astype({'ID': 'category', 'item': 'category', 'corporation': 'category', 'location': 'category',
                            'year': 'category', 'month': 'category', 'day': 'category', 'weekday': 'category'})
test_df = test_df.astype({'ID': 'category', 'item': 'category', 'corporation': 'category', 'location': 'category',
                            'year': 'category', 'month': 'category', 'day': 'category', 'weekday': 'category'})

In [11]:
valid_df = train_df.copy()
# for day in range(1496, 1524):
#     valid_df.loc[valid_df['d']==f'd_{day}', ['price', 'rm_diff_price_1', 'rm_diff_price_2', 'rm_diff_price_4',
#        'rm_diff_price_8', 'lag_7', 'lag_14', 'lag_28', 'rmean_7_7',
#        'rmean_14_7', 'rmean_28_7', 'rmean_7_14', 'rmean_14_14', 'rmean_28_14',
#        'rmean_7_28', 'rmean_14_28', 'rmean_28_28']] = 0

# LightGBM

In [12]:
category_cols = ['item','corporation', 'location', 'year', 'month', 'day', 'weekday']
useless_cols = ['ID', 'prod_ID', 'd', 'supply', 'timestamp', 'price']
train_cols = train_df.columns[~train_df.columns.isin(useless_cols)]
df = train_df.copy()
days_train = ['d_'+str(c) for c in range(1, 1496)]
days_val = ['d_'+str(c) for c in range(1496, 1524)]
df = df.replace([np.inf, -np.inf, np.nan, -np.nan], 0)
#df.iloc[:, -9:] = df.iloc[:, -9:].fillna(0.0)
X_train = df[df['d'].isin(days_train)==True][train_cols]
Y_train = df[df['d'].isin(days_train)==True]["price"]
X_val_df = df[df['d'].isin(days_val)==True]
X_val_df["timestamp"] = pd.to_datetime(X_val_df["timestamp"])
X_val_dff = pd.DataFrame()
for delta in range(0, 28):
    day = datetime(2023, 2, 4) + timedelta(days=delta)
    vl = X_val_df.loc[X_val_df.timestamp == day]
    X_val_dff = pd.concat([X_val_dff, vl])

X_val = X_val_dff[train_cols]
Y_val = X_val_dff["price"]

In [13]:
def lgb_objective(trial: Trial) -> float:
    score_hp = []
    for seed_hp in np.random.randint(0, 1000, num_seed_hp):
        params_lgb = {
            "random_state": seed_hp,
            "verbosity": -1,
            "n_estimators": 10000,
            "objective": "tweedie",
            "metric": "rmse",
            "learning_rate": trial.suggest_loguniform("learning_rate", 5e-3, 5e-2), # default=0.1, range=[0,1]
            "max_depth": trial.suggest_int("max_depth", 4, 10), # default=-1
            "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-2, 1e+2), # default=0
            "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-2, 1e+2), # default=0
            "num_leaves": trial.suggest_int("num_leaves", 31, 5000), # default=31, range=(1,130172]
            "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.3, 1.0), # feature_fraction, default=1
            "subsample": trial.suggest_uniform("subsample", 0.3, 1.0), # bagging_fraction, default=1, range=[0,1]
            "subsample_freq": trial.suggest_int("subsample_freq", 1, 20), # bagging_freq, default=0
            "min_child_samples": trial.suggest_int("min_child_samples", 1, 40), # min_data_in_leaf, default=20 
#             "max_bin": trial.suggest_int("max_bin", 100, 500),
        }

        kfold = KFold(n_splits=splits_hp, random_state=seed_hp, shuffle=True)
        cv = np.zeros(X_train.shape[0])

        for n, (train_idx, val_idx) in enumerate(kfold.split(X_train, Y_train)):

            x_train, x_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_train, y_val = Y_train.iloc[train_idx].values, Y_train.iloc[val_idx].values

            lgbmodel = LGBMRegressor(**params_lgb)
                                                                                           
            lgbmodel.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=30, verbose=-1) 
            cv[val_idx] = lgbmodel.predict(x_val)
            
        cv = np.where(cv<100, 0, cv)
        score_hp.append(mean_squared_error(Y_train, cv)**0.5)
        print(f'Seed{seed_hp} RMSE: {mean_squared_error(Y_train, cv)**0.5}')
    
    np.mean(score_hp)
    return np.mean(score_hp)

In [None]:
sampler = TPESampler(seed=basic_seed)
lgb_study = optuna.create_study(study_name="lgb_parameter_opt", direction="minimize", sampler=sampler)
lgb_study.optimize(lgb_objective, n_trials=num_trial)

lgb_best_hyperparams = lgb_study.best_trial.params
lgb_base_hyperparams = {'n_estimators':10000}
lgb_best_hyperparams.update(lgb_base_hyperparams)

# with open('../pkl/lgb_best_hyperparams.pickle', 'wb') as fw:
#     pickle.dump(lgb_best_hyperparams, fw)
print("The best hyperparameters are:\n", lgb_best_hyperparams)

In [14]:
lgb_best_hyperparams = {'metric': 'rmse', 'learning_rate': 0.01, 
                        'objective': 'rmse', 'boost_from_average': False,
                        'n_estimators': 10000}

In [None]:
optuna.visualization.matplotlib.plot_slice(lgb_study);

In [None]:
optuna.visualization.matplotlib.plot_param_importances(lgb_study);

In [15]:
train_data = lgb.Dataset(X_train, label = Y_train, categorical_feature=category_cols)
valid_data = lgb.Dataset(X_val, label = Y_val, categorical_feature=category_cols)

lgbmodel = lgb.train(lgb_best_hyperparams, train_data, valid_sets=[valid_data],
                     early_stopping_rounds=200, verbose_eval=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3900
[LightGBM] [Info] Number of data points in the train set: 58305, number of used features: 22
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 863.186
[200]	valid_0's rmse: 543.209
[300]	valid_0's rmse: 451.403
[400]	valid_0's rmse: 420.445
[500]	valid_0's rmse: 400.215
[600]	valid_0's rmse: 393.525
[700]	valid_0's rmse: 394.139
[800]	valid_0's rmse: 392.468
[900]	valid_0's rmse: 390.233
[1000]	valid_0's rmse: 388.475
[1100]	valid_0's rmse: 386.394
[1200]	valid_0's rmse: 384.923
[1300]	valid_0's rmse: 383.429
[1400]	valid_0's rmse: 381.393
[1500]	valid_0's rmse: 380.537
[1600]	valid_0's rmse: 379.971
[1700]	valid_0's rmse: 378.738
[1800]	valid_0's rmse: 376.612
[1900]	valid_0's rmse: 376.471
[2000]	valid_0's rmse: 376.326
[2100]	valid_0's rmse: 376.184
[2200]	valid_0's rmse: 375.746
[2300]	valid_0's rmse: 374.821
[2400]	valid_0's rmse: 374.621
[2500]	valid_0's r

## Predict

In [30]:
cols = [f"F{i}" for i in range(1,29)]
valid_df["timestamp"] = pd.to_datetime(valid_df["timestamp"])
for time_delta in range(0, 28):
    day = datetime(2023, 2, 4) + timedelta(days=time_delta)
    print("Predict", day.date())
    tst = valid_df[(valid_df.timestamp >= day - timedelta(days=60)) & (valid_df.timestamp <= day)].copy()
    lags_wins(tst)
    tst = tst.replace([np.inf, -np.inf, np.nan, -np.nan], 0)
    tst = tst.loc[tst.timestamp == day , train_cols]
    valid_df.loc[valid_df.timestamp == day, "price"] = lgbmodel.predict(tst)
    del(tst)
valid_pred_y_lgb = np.zeros((39, 1))
for i in range(0, 28):
    valid_pred_y_lgb = np.hstack([
        valid_pred_y_lgb, valid_df.loc[valid_df.timestamp == (datetime(2023, 2, 4) + timedelta(days=i)), 'price'].values.reshape(-1, 1)])
valid_pred_y_lgb = np.delete(valid_pred_y_lgb, 0, axis=1)

Predict 2023-02-04
Predict 2023-02-05
Predict 2023-02-06
Predict 2023-02-07
Predict 2023-02-08
Predict 2023-02-09
Predict 2023-02-10
Predict 2023-02-11
Predict 2023-02-12
Predict 2023-02-13
Predict 2023-02-14
Predict 2023-02-15
Predict 2023-02-16
Predict 2023-02-17
Predict 2023-02-18
Predict 2023-02-19
Predict 2023-02-20
Predict 2023-02-21
Predict 2023-02-22
Predict 2023-02-23
Predict 2023-02-24
Predict 2023-02-25
Predict 2023-02-26
Predict 2023-02-27
Predict 2023-02-28
Predict 2023-03-01
Predict 2023-03-02
Predict 2023-03-03


In [24]:
cols = [f"F{i}" for i in range(1,29)]
test_df["timestamp"] = pd.to_datetime(test_df["timestamp"])
for time_delta in range(0, 28):
    day = datetime(2023, 3, 4) + timedelta(days=time_delta)
    print("Predict", day.date())
    tst = test_df[(test_df.timestamp >= day - timedelta(days=60)) & (test_df.timestamp <= day)].copy()
    lags_wins(tst)
    tst = tst.replace([np.inf, -np.inf, np.nan, -np.nan], 0)
    tst = tst.loc[tst.timestamp == day , train_cols]
    test_df.loc[test_df.timestamp == day, "price"] = lgbmodel.predict(tst)
    del(tst)
test_pred_y_lgb = np.zeros((39, 1))
for i in range(0, 28):
    test_pred_y_lgb = np.hstack([
        test_pred_y_lgb, test_df.loc[test_df.timestamp == (datetime(2023, 2, 4) + timedelta(days=i)), 'price'].values.reshape(-1, 1)])
test_pred_y_lgb = np.delete(test_pred_y_lgb, 0, axis=1)

Predict 2023-03-04
Predict 2023-03-05
Predict 2023-03-06
Predict 2023-03-07
Predict 2023-03-08
Predict 2023-03-09
Predict 2023-03-10
Predict 2023-03-11
Predict 2023-03-12
Predict 2023-03-13
Predict 2023-03-14
Predict 2023-03-15
Predict 2023-03-16
Predict 2023-03-17
Predict 2023-03-18
Predict 2023-03-19
Predict 2023-03-20
Predict 2023-03-21
Predict 2023-03-22
Predict 2023-03-23
Predict 2023-03-24
Predict 2023-03-25
Predict 2023-03-26
Predict 2023-03-27
Predict 2023-03-28
Predict 2023-03-29
Predict 2023-03-30
Predict 2023-03-31


# 시계열 모델

## Simple Moving Average

In [35]:
valid_df_pivot = valid_df.pivot(index=['item', 'corporation', 'location'], columns='d', values='price').reset_index()
test_df_pivot = test_df.pivot(index=['item', 'corporation', 'location'], columns='d', values='price').reset_index()
valid_df_pivot = valid_df_pivot.replace([np.inf, -np.inf, np.nan, -np.nan], 0)
test_df_pivot = test_df_pivot.replace([np.inf, -np.inf, np.nan, -np.nan], 0)
valid_df_pivot = valid_df_pivot[train_df.d.unique()]
test_df_pivot = test_df_pivot[test_df.d.unique()]

In [36]:
train_dataset = valid_df_pivot.iloc[:, -28*4:-28]
val_dataset = valid_df_pivot.iloc[:, -28:]
train_data = test_df_pivot.iloc[:, -28-365:-28]
val_data = test_df_pivot.iloc[:, -28:]

In [37]:
period = 30
prediction_sma = train_dataset.iloc[:, -period:].copy() 
for i in range(len(val_dataset.loc[0])): 
    prediction_sma['F'+str(i+1)] = prediction_sma.iloc[:, -period:].mean(axis=1) 
prediction_sma = prediction_sma[['F'+str(i+1) for i in range(len(val_dataset.loc[0]))]] 
sma_rmse = mean_squared_error(prediction_sma, val_dataset)**0.5
print('rmse:', sma_rmse)

rmse: 988.7071339446336


In [38]:
period = 28
test_pred_y_sma = train_data.iloc[:, -period:].copy() 
for i in range(len(val_data.loc[0])): 
    test_pred_y_sma['F'+str(i+1)] = test_pred_y_sma.iloc[:, -period:].mean(axis=1) 
test_pred_y_sma = test_pred_y_sma[['F'+str(i+1) for i in range(len(val_data.loc[0]))]].values

## Prophet

In [39]:
train = df[df['d'].isin(days_train)==True]
prophet_data = train.rename(columns={'timestamp': 'ds', 'price': 'y'})
prophet_data = prophet_data[['ID', 'ds', 'y']]
prophet_data['ID'] = prophet_data['ID'].str.replace(r'_\d{8}$', '', regex=True)

In [40]:
pred_list = []  
for code in prophet_data['ID'].unique():
    d = prophet_data[prophet_data['ID'] == code].reset_index().drop(['ID'], axis=1).sort_values('ds')
    model = Prophet(
        growth = 'linear',
        seasonality_mode = 'additive',
        yearly_seasonality = True,
        weekly_seasonality = True,
        daily_seasonality = True,
#         holidays = True,
#         changepoint_prior_scale = 0.1
                   )
    model.fit(d)
    future = pd.DataFrame()
    future['ds'] = pd.date_range(start='2023-02-04', periods=28, freq='D') 
    forecast = model.predict(future)   
    pred_y = forecast['yhat'].values
    pred_code = [str(code)] * len(pred_y)
    for y_val, id_val in zip(pred_y, pred_code):
        pred_list.append({'ID': id_val, 'y': y_val})
pred = pd.DataFrame(pred_list)
valid_pred_y_pro = np.where(pred['y']<0, 0, pred['y'])

15:15:43 - cmdstanpy - INFO - Chain [1] start processing
15:15:43 - cmdstanpy - INFO - Chain [1] done processing
15:15:44 - cmdstanpy - INFO - Chain [1] start processing
15:15:44 - cmdstanpy - INFO - Chain [1] done processing
15:15:44 - cmdstanpy - INFO - Chain [1] start processing
15:15:44 - cmdstanpy - INFO - Chain [1] done processing
15:15:44 - cmdstanpy - INFO - Chain [1] start processing
15:15:44 - cmdstanpy - INFO - Chain [1] done processing
15:15:45 - cmdstanpy - INFO - Chain [1] start processing
15:15:45 - cmdstanpy - INFO - Chain [1] done processing
15:15:45 - cmdstanpy - INFO - Chain [1] start processing
15:15:45 - cmdstanpy - INFO - Chain [1] done processing
15:15:46 - cmdstanpy - INFO - Chain [1] start processing
15:15:46 - cmdstanpy - INFO - Chain [1] done processing
15:15:46 - cmdstanpy - INFO - Chain [1] start processing
15:15:46 - cmdstanpy - INFO - Chain [1] done processing
15:15:46 - cmdstanpy - INFO - Chain [1] start processing
15:15:46 - cmdstanpy - INFO - Chain [1]

In [41]:
pred_list = []  
for code in prophet_data['ID'].unique():
    d = prophet_data[prophet_data['ID'] == code].reset_index().drop(['ID'], axis=1).sort_values('ds')
    model = Prophet(
        growth = 'linear',
        seasonality_mode = 'additive',
        yearly_seasonality = True,
        weekly_seasonality = True,
        daily_seasonality = True,
#         holidays = True,
#         changepoint_prior_scale = 0.1
                   )
    model.fit(d)
    future = pd.DataFrame()
    future['ds'] = pd.date_range(start='2023-03-04', periods=28, freq='D') 
    forecast = model.predict(future)   
    pred_y = forecast['yhat'].values
    pred_code = [str(code)] * len(pred_y)
    for y_val, id_val in zip(pred_y, pred_code):
        pred_list.append({'ID': id_val, 'y': y_val})
pred = pd.DataFrame(pred_list)
test_pred_y_pro = np.where(pred['y']<0, 0, pred['y'])

15:15:59 - cmdstanpy - INFO - Chain [1] start processing
15:16:00 - cmdstanpy - INFO - Chain [1] done processing
15:16:00 - cmdstanpy - INFO - Chain [1] start processing
15:16:00 - cmdstanpy - INFO - Chain [1] done processing
15:16:00 - cmdstanpy - INFO - Chain [1] start processing
15:16:00 - cmdstanpy - INFO - Chain [1] done processing
15:16:01 - cmdstanpy - INFO - Chain [1] start processing
15:16:01 - cmdstanpy - INFO - Chain [1] done processing
15:16:01 - cmdstanpy - INFO - Chain [1] start processing
15:16:01 - cmdstanpy - INFO - Chain [1] done processing
15:16:01 - cmdstanpy - INFO - Chain [1] start processing
15:16:01 - cmdstanpy - INFO - Chain [1] done processing
15:16:02 - cmdstanpy - INFO - Chain [1] start processing
15:16:02 - cmdstanpy - INFO - Chain [1] done processing
15:16:02 - cmdstanpy - INFO - Chain [1] start processing
15:16:02 - cmdstanpy - INFO - Chain [1] done processing
15:16:03 - cmdstanpy - INFO - Chain [1] start processing
15:16:03 - cmdstanpy - INFO - Chain [1]

In [None]:
class moving_avg(torch.nn.Module):
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = torch.nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        x = torch.cat([front, x, end], dim=1)
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x

class series_decomp(torch.nn.Module):
    def __init__(self, kernel_size):
        super(series_decomp, self).__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        moving_mean = self.moving_avg(x)
        residual = x - moving_mean
        return moving_mean, residual 
        
class LTSF_DLinear(torch.nn.Module):
    def __init__(self, window_size, forcast_size, kernel_size, individual, feature_size):
        super(LTSF_DLinear, self).__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.decompsition = series_decomp(kernel_size)
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear_Seasonal = torch.nn.ModuleList()
            self.Linear_Trend = torch.nn.ModuleList()
            for i in range(self.channels):
                self.Linear_Trend.append(torch.nn.Linear(self.window_size, self.forcast_size))
                self.Linear_Trend[i].weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))
                self.Linear_Seasonal.append(torch.nn.Linear(self.window_size, self.forcast_size))
                self.Linear_Seasonal[i].weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))
        else:
            self.Linear_Trend = torch.nn.Linear(self.window_size, self.forcast_size)
            self.Linear_Trend.weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))
            self.Linear_Seasonal = torch.nn.Linear(self.window_size,  self.forcast_size)
            self.Linear_Seasonal.weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))

    def forward(self, x):
        trend_init, seasonal_init = self.decompsition(x)
        trend_init, seasonal_init = trend_init.permute(0,2,1), seasonal_init.permute(0,2,1)
        if self.individual:
            trend_output = torch.zeros([trend_init.size(0), trend_init.size(1), self.forcast_size], dtype=trend_init.dtype).to(trend_init.device)
            seasonal_output = torch.zeros([seasonal_init.size(0), seasonal_init.size(1), self.forcast_size], dtype=seasonal_init.dtype).to(seasonal_init.device)
            for idx in range(self.channels):
                trend_output[:, idx, :] = self.Linear_Trend[idx](trend_init[:, idx, :])
                seasonal_output[:, idx, :] = self.Linear_Seasonal[idx](seasonal_init[:, idx, :])                
        else:
            trend_output = self.Linear_Trend(trend_init)
            seasonal_output = self.Linear_Seasonal(seasonal_init)
        x = seasonal_output + trend_output
        return x.permute(0,2,1)
    
class LTSF_NLinear(torch.nn.Module):
    def __init__(self, window_size, forcast_size, individual, feature_size):
        super(LTSF_NLinear, self).__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear = torch.nn.ModuleList()
            for i in range(self.channels):
                self.Linear.append(torch.nn.Linear(self.window_size, self.forcast_size))
        else:
            self.Linear = torch.nn.Linear(self.window_size, self.forcast_size)

    def forward(self, x):
        seq_last = x[:,-1:,:].detach()
        x = x - seq_last
        if self.individual:
            output = torch.zeros([x.size(0), self.forcast_size, x.size(2)],dtype=x.dtype).to(x.device)
            for i in range(self.channels):
                output[:,:,i] = self.Linear[i](x[:,:,i])
            x = output
        else:
            x = self.Linear(x.permute(0,2,1)).permute(0,2,1)
        x = x + seq_last
        return x

In [None]:
def standardization(train_df, test_df, not_col, target):
    train_df_ = train_df.copy()
    test_df_ = test_df.copy()
    col =  [col for col in list(train_df.columns) if col not in [not_col]]
    mean_list = []
    std_list = []
    for x in col:
        mean, std = train_df_.agg(["mean", "std"]).loc[:,x]
        mean_list.append(mean)
        std_list.append(std)
        train_df_.loc[:, x] = (train_df_[x] - mean) / std
        test_df_.loc[:, x] = (test_df_[x] - mean) / std
    return train_df_, test_df_, mean_list[col.index(target)], std_list[col.index(target)]

def time_slide_df(df, window_size, forcast_size, date, target):
    df_ = df.copy()
    data_list = []
    dap_list = []
    date_list = []
    for idx in range(0, df_.shape[0]-window_size-forcast_size+1):
        x = df_.loc[idx:idx+window_size-1, target].values.reshape(window_size, 1)
        y = df_.loc[idx+window_size:idx+window_size+forcast_size-1, target].values
        date_ = df_.loc[idx+window_size:idx+window_size+forcast_size-1, date].values
        data_list.append(x)
        dap_list.append(y)
        date_list.append(date_)
    return np.array(data_list, dtype='float32'), np.array(dap_list, dtype='float32'), np.array(date_list)

class Data(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
        
### Univariable ###
### 데이터 셋 생성 ###
window_size = 72
forcast_size= 24
batch_size = 32
targets = '전력사용량(kWh)'
date = 'date_time'

train_df_fe, test_df_fe, mean_, std_ = standardization(train_df, test_df, 'date_time', targets)
train_x, train_y, train_date = time_slide_df(train_df_fe, window_size, forcast_size, date, targets)
test_x, test_y, test_date = time_slide_df(test_df_fe, window_size, forcast_size, date, targets)

train_ds = Data(train_x[:1000], train_y[:1000])
valid_ds = Data(train_x[1000:], train_y[1000:])
test_ds = Data(test_x, test_y)

train_dl = DataLoader(train_ds, batch_size = batch_size, shuffle=True,)
valid_dl = DataLoader(valid_ds, batch_size = train_x[1000:].shape[0], shuffle=False)
test_dl  = DataLoader(test_ds,  batch_size = test_x.shape[0], shuffle=False)


### 모델 학습 ###
train_loss_list = []
valid_loss_list = []
test_loss_list = []
epoch = 50
lr = 0.001
DLinear_model = LTSF_DLinear(
                            window_size=window_size,
                            forcast_size=forcast_size,
                            kernel_size=25,
                            individual=False,
                            feature_size=1,
                            )
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(DLinear_model.parameters(), lr=lr)
max_loss = 999999999

for epoch in tqdm(range(1, epoch+1)):
    loss_list = []
    DLinear_model.train()
    for batch_idx, (data, target) in enumerate(train_dl):
        optimizer.zero_grad()
        output = DLinear_model(data)
        loss = criterion(output, target.unsqueeze(-1))
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())    
    train_loss_list.append(np.mean(loss_list))

    DLinear_model.eval()
    with torch.no_grad():
        for data, target in valid_dl:
            output = DLinear_model(data)
            valid_loss = criterion(output, target.unsqueeze(-1))
            valid_loss_list.append(valid_loss)
        
        for data, target in test_dl:
            output = DLinear_model(data)
            test_loss = criterion(output, target.unsqueeze(-1))
            test_loss_list.append(test_loss)

    if valid_loss < max_loss:
        torch.save(DLinear_model, 'DLinear_model.pth')
        max_loss = valid_loss
        print("valid_loss={:.3f}, test_los{:.3f}, Model Save".format(valid_loss, test_loss))
        dlinear_best_epoch = epoch
        dlinear_best_train_loss = np.mean(loss_list)
        dlinear_best_valid_loss = np.mean(valid_loss.item())
        dlinear_best_test_loss = np.mean(test_loss.item())

    print("epoch = {}, train_loss : {:.3f}, valid_loss : {:.3f}, test_loss : {:.3f}".format(epoch, np.mean(loss_list), valid_loss, test_loss))

In [None]:
prediction_arima = []
for row in tqdm(train_dataset.values):
    arima = sm.tsa.statespace.SARIMAX(row, order=(1,1,1), seasonal_order=(1, 1, 1, 12)).fit()
    prediction_arima.append(arima.predict(0, 27, typ='levels'))
# prediction_arima = np.where(np.array(prediction_arima)<0, 0, prediction_arima)
sma_rmse = mean_squared_error(prediction_arima, val_dataset)**0.5
print('rmse:', sma_rmse)

In [42]:
valid_pred_y_sma = prediction_sma.values.copy().reshape(-1)
valid_pred_y_lgb = valid_pred_y_lgb.reshape(-1)
valid_pred_y_pro = valid_pred_y_pro.reshape(-1)
val_dataset = val_dataset.values.reshape(-1)

In [None]:
valid_pred_y_sma = np.where(valid_pred_y_sma<100, 0, valid_pred_y_sma)
valid_pred_y_lgb = np.where(valid_pred_y_lgb<100, 0, valid_pred_y_lgb)
valid_pred_y_pro = np.where(valid_pred_y_pro<100, 0, valid_pred_y_pro)

In [43]:
# candidate = np.arange(0, 1000)
candidate = [0, 1, 2, 3, 4, 5, 6]
permute = permutations(candidate, 3)
score = {}
for i in tqdm(list(permute)):
    pred_permute = (
                    valid_pred_y_sma * i[0] +
                    valid_pred_y_lgb * i[1] +
                    valid_pred_y_pro * i[2]
                   )
    score[i] = mean_squared_error(val_dataset, pred_permute/sum(i))**0.5

score = dict(sorted(score.items(), key=lambda x: x[1], reverse=False)[:5])
score

  0%|          | 0/210 [00:00<?, ?it/s]

{(5, 1, 0): 962.7758764835003,
 (6, 1, 0): 962.8106687907305,
 (4, 1, 0): 964.8079755242035,
 (3, 1, 0): 972.3701421974372,
 (6, 2, 0): 972.3701421974372}

In [44]:
final_rmse = mean_squared_error(
    valid_pred_y_sma * 0.8 +
    valid_pred_y_lgb * 0.2,
    val_dataset) ** 0.5
final_rmse

964.8079755242035

In [45]:
final_rmse = mean_squared_error(
    valid_pred_y_sma,
    val_dataset) ** 0.5
final_rmse

988.7071339446336

In [46]:
final_rmse = mean_squared_error(
    valid_pred_y_lgb,
    val_dataset) ** 0.5
final_rmse

1558.4144884769391

In [47]:
final_rmse = mean_squared_error(
    valid_pred_y_pro,
    val_dataset) ** 0.5
final_rmse

1574.9352873349228

In [48]:
test_pred_y_sma = test_pred_y_sma.reshape(-1)
test_pred_y_lgb = test_pred_y_lgb.reshape(-1)
test_pred_y_pro = test_pred_y_pro.reshape(-1)

In [49]:
test_pred_y_sma

array([2058.60714286, 2050.09311224, 2123.3107234 , ..., 2809.38628044,
       2758.47150474, 2715.55977277])

In [50]:
test_pred_y_pro

array([3984.74076715, 1434.12877025, 4582.75220801, ...,  403.58865146,
        396.12781505,  390.02126666])

In [51]:
test_pred_y_lgb

array([1864.,    0., 1837., ...,  574.,  523.,  529.])

In [58]:
pred_test = (
    test_pred_y_sma * 0 + 
    test_pred_y_lgb * 1 + 
    test_pred_y_pro * 0
            )

In [60]:
submission_df['answer'] = pred_test
submission_df.to_csv('3.csv',index=False)

In [61]:
submission_df

Unnamed: 0,ID,answer
0,TG_A_J_20230304,1864.0
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,1837.0
3,TG_A_J_20230307,1595.0
4,TG_A_J_20230308,1747.0
...,...,...
1087,RD_F_J_20230327,468.0
1088,RD_F_J_20230328,531.0
1089,RD_F_J_20230329,574.0
1090,RD_F_J_20230330,523.0
