In [1]:
import sys

# Add the parent directory to the import path; presumably this is what lets
# the `src.*` imports below resolve when the notebook runs from a subfolder.
sys.path.append("..")

import warnings

# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation warnings and pandas/sklearn diagnostics.
warnings.simplefilter("ignore")

from sklearn.model_selection import train_test_split
import pandas as pd

from src.data_preprocess.preprocessor import (
    DataPreprocessor,
    get_label2item,
)
from src.predict.predictor import PredictorHandler


# Lift pandas' display limits so DataFrames are rendered without column or
# row truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

## シミュレーションの設定

In [2]:
# Which dataset to load and which predictor implementation to use.
dataset_name = "breakfast"
predictor_name = "linear_regression"

# Keyword arguments forwarded to the preprocessing step.
data_settings = dict(
    store_num=25027,
    category="COLD CEREAL",
    manufacturer="PRIVATE LABEL",
    target_col="UNITS",
    base_cols=["YEAR", "MONTH", "DAY", "PRICE", "DESCRIPTION", "UNITS"],
    master_cols=["YEAR", "MONTH", "DAY"],
    num_of_prices=5,
)

# Fractions of the rows assigned to the training and test splits.
train_size = 0.5
test_size = 1 - train_size

# Features to use for each item: here every item is explained only by its
# own price column. If no mapping is given (item2features = None), the other
# items' prices and the year/month/day columns are used instead.
item2features = {
    item_name: [f"PRICE_{item_name}"]
    for item_name in (
        "PL BT SZ FRSTD SHRD WHT",
        "PL HONEY NUT TOASTD OATS",
        "PL RAISIN BRAN",
    )
}

## データの前処理

In [3]:
# Load the dataset and run the preprocessing pipeline with the settings above.
dp = DataPreprocessor(dataset_name)
processed_df = dp.preprocess(**data_settings)

# Build the metadata needed by the later cells: target columns (selected by
# the target-column prefix), feature columns, and the label -> item mapping.
target_cols = dp.get_target_cols(
    prefix=data_settings["target_col"],
)
feature_cols = dp.get_feature_cols(
    target_cols=target_cols,
)
label2item = get_label2item(
    target_cols=target_cols,
)
items = [*label2item.values()]

[I 220828 13:06:31 preprocessor:34] # of rows [raw data]: 524950
[I 220828 13:06:32 preprocessor:36] # of rows [processed data]: 156


In [4]:
# Split into training and test data. shuffle=False keeps the rows in their
# existing order, so the leading `train_size` fraction becomes train_df and
# the remaining rows become test_df.
train_df, test_df = train_test_split(
    processed_df, train_size=train_size, test_size=test_size, shuffle=False
)
# Renumber the test rows from 0. Rebinding the name (instead of
# inplace=True) avoids mutating a frame derived from processed_df, which can
# trigger SettingWithCopyWarning, and keeps the cell safe to re-run.
test_df = test_df.reset_index(drop=True)

In [5]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,YEAR,MONTH,DAY,PRICE_PL BT SZ FRSTD SHRD WHT,PRICE_PL HONEY NUT TOASTD OATS,PRICE_PL RAISIN BRAN,UNITS_PL BT SZ FRSTD SHRD WHT,UNITS_PL HONEY NUT TOASTD OATS,UNITS_PL RAISIN BRAN
0,2009,1,14,1.98,1.85,1.88,100,50,69
1,2009,1,21,1.97,1.87,1.88,100,66,47
2,2009,1,28,1.99,1.88,1.88,129,59,63
3,2009,2,4,1.99,1.89,1.88,122,54,59
4,2009,2,11,1.94,1.85,1.87,113,20,61


In [6]:
# Preview the first rows of the test split (index was reset to start at 0).
test_df.head()

Unnamed: 0,YEAR,MONTH,DAY,PRICE_PL BT SZ FRSTD SHRD WHT,PRICE_PL HONEY NUT TOASTD OATS,PRICE_PL RAISIN BRAN,UNITS_PL BT SZ FRSTD SHRD WHT,UNITS_PL HONEY NUT TOASTD OATS,UNITS_PL RAISIN BRAN
0,2010,7,14,2.3,1.84,1.86,73,40,55
1,2010,7,21,2.3,1.88,1.87,75,60,98
2,2010,7,28,2.36,1.87,1.88,79,65,54
3,2010,8,4,2.35,1.88,1.87,82,61,74
4,2010,8,11,1.95,1.71,1.7,176,52,106


## 予測モデルの構築

In [8]:
# Keyword arguments common to both predictor handlers.
_shared_kwargs = dict(label2item=label2item, predictor_name=predictor_name)

# Build the prediction models for the training data; the test split is
# passed along as well.
train_predictors = PredictorHandler(
    prefix="train",
    train_df=train_df,
    test_df=test_df,
    **_shared_kwargs,
)
# Results are stored under data/results/realworld/predict.
train_predictors.run(item2features)

# Build the prediction models for the test data: the test split is supplied
# as this handler's training frame and no separate test frame is given.
test_predictors = PredictorHandler(
    prefix="test",
    train_df=test_df,
    **_shared_kwargs,
)
# Results are stored under data/results/realworld/predict.
test_predictors.run(item2features)

[I 220828 13:04:42 predictor:123] RMSE for train data [PL BT SZ FRSTD SHRD WHT]: 25.4
[I 220828 13:04:42 predictor:127] R^2 for train data [PL BT SZ FRSTD SHRD WHT]: 0.27
[I 220828 13:04:43 predictor:123] RMSE for test data [PL BT SZ FRSTD SHRD WHT]: 32.0
[I 220828 13:04:43 predictor:127] R^2 for test data [PL BT SZ FRSTD SHRD WHT]: 0.13
[I 220828 13:04:44 predictor:123] RMSE for train data [PL HONEY NUT TOASTD OATS]: 18.4
[I 220828 13:04:44 predictor:127] R^2 for train data [PL HONEY NUT TOASTD OATS]: 0.21
[I 220828 13:04:44 predictor:123] RMSE for test data [PL HONEY NUT TOASTD OATS]: 26.5
[I 220828 13:04:44 predictor:127] R^2 for test data [PL HONEY NUT TOASTD OATS]: -0.79
[I 220828 13:04:45 predictor:123] RMSE for train data [PL RAISIN BRAN]: 17.7
[I 220828 13:04:45 predictor:127] R^2 for train data [PL RAISIN BRAN]: 0.34
[I 220828 13:04:46 predictor:123] RMSE for test data [PL RAISIN BRAN]: 26.0
[I 220828 13:04:46 predictor:127] R^2 for test data [PL RAISIN BRAN]: -0.42
[I 220828 

<Figure size 432x288 with 0 Axes>

In [9]:
train_predictors.result

defaultdict(<function src.predict.predictor.PredictorHandler.__init__.<locals>.<lambda>()>,
            {'rmse': defaultdict(dict,
                         {'train': {'PL BT SZ FRSTD SHRD WHT': 25.4,
                           'PL HONEY NUT TOASTD OATS': 18.4,
                           'PL RAISIN BRAN': 17.7,
                           'mean': 20.5},
                          'test': {'PL BT SZ FRSTD SHRD WHT': 32.0,
                           'PL HONEY NUT TOASTD OATS': 26.5,
                           'PL RAISIN BRAN': 26.0,
                           'mean': 28.17}}),
             'r2': defaultdict(dict,
                         {'train': {'PL BT SZ FRSTD SHRD WHT': 0.27,
                           'PL HONEY NUT TOASTD OATS': 0.21,
                           'PL RAISIN BRAN': 0.34,
                           'mean': 0.27},
                          'test': {'PL BT SZ FRSTD SHRD WHT': 0.13,
                           'PL HONEY NUT TOASTD OATS': -0.79,
                           'PL 