### Forecasting Mini-Course Sales
特徴量生成 + 基本的に2020年は削除  
https://www.kaggle.com/code/gauthamupadhyaya/mini-course-sales
- 学習は2021年を予測
- <b>国、店舗、製品を分けて学習してみる</b>
- 前年の売上げを特徴量として組み込んでみる

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
import make_graph    # 自前
from bokeh.plotting import figure, output_notebook, show
output_notebook()

In [3]:
df_train = pd.read_csv("./data/train.csv", parse_dates=["date"])
df_test = pd.read_csv("./data/test.csv", parse_dates=["date"])
df_subm = pd.read_csv("./data/sample_submission.csv")

In [4]:
# make_graph.trends(df_train, "num_sold", 1200, 300, "graph0.html", "origin")

In [5]:
CSP = {col: list(df_train[col].unique()) for col in ["country", "store", "product"]}
CSP

{'country': ['Argentina', 'Canada', 'Estonia', 'Japan', 'Spain'],
 'store': ['Kaggle Learn', 'Kaggle Store', 'Kagglazon'],
 'product': ['Using LLMs to Improve Your Coding',
  'Using LLMs to Train More LLMs',
  'Using LLMs to Win Friends and Influence People',
  'Using LLMs to Win More Kaggle Competitions',
  'Using LLMs to Write Better']}

In [6]:
def select_csp(df, idx):
    """インデックス指定で各名称取得、DataFrameフィルタ"""
    if len(idx) != 3:
        return (), pd.DataFrame()
    df_ret = df.copy()
    for i, col in zip(idx, ["country", "store", "product"]):
        if i < 0:
            continue
        idx_buf = i
        if i >= len(CSP[col]):
            print(f"[error] 指定インデックスが範囲外です。 ({col}: {CSP[col]})")
            idx_buf = 0
        df_ret = df_ret[df_ret[col] == CSP[col][idx_buf]]

    sel = {col: list(df_ret[col].unique()) for col in ["country", "store", "product"]}
    return sel, df_ret

select_csp(df_train, [0,0,0])[0]

{'country': ['Argentina'],
 'store': ['Kaggle Learn'],
 'product': ['Using LLMs to Improve Your Coding']}

In [7]:
def divide_csp(df):
    """分割"""
    df_arr = []
    csp_arr = []
    for idxc in range(len(CSP["country"])):
        for idxs in range(len(CSP["store"])):
            for idxp in range(len(CSP["product"])):
                csp, df_buf = select_csp(df, [idxc, idxs, idxp])
                df_arr.append(df_buf)
                csp_arr.append([csp[param][0] for param in ["country", "store", "product"]])
    return csp_arr, df_arr

csp_arr, df_trains = divide_csp(df_train)
print(csp_arr[3])
df_trains[3].sample(3)

['Argentina', 'Kaggle Learn', 'Using LLMs to Win More Kaggle Competitions']


Unnamed: 0,id,date,country,store,product,num_sold
126753,126753,2021-08-18,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,20
121878,121878,2021-06-14,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,20
107253,107253,2020-12-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,17


In [9]:
# def make_dummies(df_temp):
#     """国、店舗、製品情報をダミー変数化"""
#     df_dummy = pd.get_dummies(df_temp[["country", "store", "product"]]).astype(np.uint8)
#     df_temp = pd.merge(df_temp, df_dummy, left_index=True, right_index=True)
#     return df_temp

In [8]:
def datetime_features(df_temp):
    """時間情報の特徴量生成"""
    df_temp['year'] = df_temp['date'].dt.year
    df_temp['month'] = df_temp['date'].dt.month
    df_temp['day'] = df_temp['date'].dt.day
    df_temp['dayofyear'] = df_temp['date'].dt.day_of_year
    df_temp['dayofweek'] = df_temp['date'].dt.dayofweek
    df_temp['quarter'] = df_temp['date'].dt.quarter
    # df_temp['weekofyear'] = df_temp['date'].dt.weekofyear  # 見つからない
    df_temp['weekofyear'] = df_temp['date'].apply(lambda x: x.isocalendar()[1])
    df_temp['is_weekend'] = (df_temp['dayofweek'] >= 5).astype(np.int8)
    df_temp['is_month_start'] = df_temp['date'].dt.is_month_start.astype(np.int8)
    df_temp['is_month_end'] = df_temp['date'].dt.is_month_end.astype(np.int8)
    df_temp['monday'] = df_temp['date'].dt.weekday.eq(0).astype(np.uint8)
    df_temp['tuesday'] = df_temp['date'].dt.weekday.eq(1).astype(np.uint8)
    df_temp['wednesday'] = df_temp['date'].dt.weekday.eq(2).astype(np.uint8)
    df_temp['thursday'] = df_temp['date'].dt.weekday.eq(3).astype(np.uint8)
    df_temp['friday'] = df_temp['date'].dt.weekday.eq(4).astype(np.uint8)
    df_temp['saturday'] = df_temp['date'].dt.weekday.eq(5).astype(np.uint8)
    df_temp['sunday'] = df_temp['date'].dt.weekday.eq(6).astype(np.uint8)
    df_temp['is_quarter_end'] = df_temp['date'].dt.is_quarter_end.astype(np.uint8)
    df_temp['is_quarter_start'] = df_temp['date'].dt.is_quarter_start.astype(np.uint8)
    df_temp['is_year_end'] = df_temp['date'].dt.is_year_end.astype(np.uint8)
    df_temp['is_year_start'] = df_temp['date'].dt.is_year_start.astype(np.uint8)
    return df_temp

In [9]:
def seasonality_features(df_temp):
    df_temp['month_sin'] = np.sin(2*np.pi*df_temp.month/12)
    df_temp['month_cos'] = np.cos(2*np.pi*df_temp.month/12)
    df_temp['day_sin'] = np.sin(2*np.pi*df_temp.day/24)    # 24 -> 30 -> 24
    df_temp['day_cos'] = np.cos(2*np.pi*df_temp.day/24)
    return df_temp

In [10]:
def make_features(df_temp):
    """全特徴量を追加"""
    # df_temp = make_dummies(df_temp)
    df_temp = datetime_features(df_temp)
    df_temp = seasonality_features(df_temp)
    return df_temp

In [11]:
# 学習用特徴量生成
df_train_feats = [make_features(df_buf) for df_buf in df_trains]
df_train_feats[5].sample(3)

Unnamed: 0,id,date,country,store,product,num_sold,year,month,day,dayofyear,...,saturday,sunday,is_quarter_end,is_quarter_start,is_year_end,is_year_start,month_sin,month_cos,day_sin,day_cos
97130,97130,2020-07-19,Argentina,Kaggle Store,Using LLMs to Improve Your Coding,39,2020,7,19,201,...,0,1,0,0,0,0,-0.5,-0.866025,-0.965926,0.258819
78080,78080,2019-11-08,Argentina,Kaggle Store,Using LLMs to Improve Your Coding,44,2019,11,8,312,...,0,0,0,0,0,0,-0.5,0.866025,0.866025,-0.5
105755,105755,2020-11-11,Argentina,Kaggle Store,Using LLMs to Improve Your Coding,37,2020,11,11,316,...,0,0,0,0,0,0,-0.5,0.866025,0.258819,-0.965926


グラフで確認

In [8]:
# make_graph.trends(df_train, "day_sin", 1200, 300, "graph_tmp.html", "day_sin")
# make_graph.check_feature(df_train, "day_sin")

In [10]:
csp, df_buf = select_csp(df_train, [1, 0, -1])
print(csp)

{'country': ['Canada'], 'store': ['Kaggle Learn'], 'product': ['Using LLMs to Improve Your Coding', 'Using LLMs to Train More LLMs', 'Using LLMs to Win Friends and Influence People', 'Using LLMs to Win More Kaggle Competitions', 'Using LLMs to Write Better']}


In [11]:
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]}"
p = make_graph.make_trend(df_buf, "num_sold", "product", 900, 300, ttl)
show(p)

各モデルの性能確認

In [12]:
import time
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression, ElasticNet
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, cross_val_score
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import PartialDependenceDisplay
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [13]:
def get_smape(actual, forecast):
    """自前で評価値(SMAPE:https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error)産出"""
    # smape = return 100/len(actual) * np.sum(2 * np.abs(forecast - actual) / (np.abs(forecast) + np.abs(actual)))
    smape = np.sum(np.abs(forecast - actual)) / np.sum(forecast + actual)
    return smape

In [14]:
# 2020年抜いて、学習とテスト分離
remove_cols = ["id", "date", "country", "store", "product"]
XY_trains, XY_valids = [], []
# 各々で戻せないので、スケーリングは無し
for df_buf in df_train_feats:
    train = df_buf[df_buf["year"].isin([2017, 2018, 2019])].drop(columns=remove_cols)
    valid = df_buf[df_buf["year"] == 2021].drop(columns=remove_cols)

    XY_trains.append([train.drop(columns=["num_sold"]), train["num_sold"].values])
    XY_valids.append([valid.drop(columns=["num_sold"]), valid["num_sold"].values])

In [15]:
df_test.sample(3)

Unnamed: 0,id,date,country,store,product
20476,157426,2022-10-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs
21877,158827,2022-10-19,Japan,Kaggle Store,Using LLMs to Win Friends and Influence People
1135,138085,2022-01-16,Argentina,Kagglazon,Using LLMs to Improve Your Coding


In [16]:
# テスト用特徴量生成
csp_arr, df_tests = divide_csp(df_test)    # (国、店舗、製品)で分割
X_tests = [make_features(df_buf).drop(columns=remove_cols) for df_buf in df_tests]
print(len(X_tests), csp_arr[5])
X_tests[5].head(3)

75 ['Argentina', 'Kaggle Store', 'Using LLMs to Improve Your Coding']


Unnamed: 0,year,month,day,dayofyear,dayofweek,quarter,weekofyear,is_weekend,is_month_start,is_month_end,...,saturday,sunday,is_quarter_end,is_quarter_start,is_year_end,is_year_start,month_sin,month_cos,day_sin,day_cos
5,2022,1,1,1,5,1,52,1,1,0,...,1,0,0,1,0,1,0.5,0.866025,0.258819,0.965926
80,2022,1,2,2,6,1,52,1,0,0,...,0,1,0,0,0,0,0.5,0.866025,0.5,0.866025
155,2022,1,3,3,0,1,1,0,0,0,...,0,0,0,0,0,0,0.5,0.866025,0.707107,0.707107


In [17]:
regressors = [LinearRegression(), 
              Lasso(), 
              DecisionTreeRegressor(), 
              Ridge(), 
              ElasticNet(), 
              RandomForestRegressor(), 
              XGBRegressor(), 
              CatBoostRegressor(), 
              LGBMRegressor(), 
              GradientBoostingRegressor(), 
              HistGradientBoostingRegressor()]

In [18]:
len(csp_arr)

75

In [19]:
# 国、店舗、製品で、学習させ、SMAPE上位のモデルを確認
scores = []
idx_csp = 0
for XY_train, XY_valid in zip(XY_trains[:2], XY_valids[:2]):
    X_train, Y_train = XY_train
    X_valid, Y_valid = XY_valid
    for model in regressors:
        start = time.time()
        model.fit(X_train, Y_train)
        train_time = time.time() - start
        y_pred = model.predict(X_valid)
        name = str(model)[:-2]
        name = "XGBRegressor" if "XGBRegressor" in name else name
        name = "CatBoostRegressor" if "CatBoostRegressor" in name else name
        print(f"{name}, {train_time:.3f}s")
        scores.append([idx_csp, name, train_time, 
                    explained_variance_score(Y_valid, y_pred),
                    mean_squared_error(Y_valid, y_pred, squared=False), 
                    mean_absolute_error(Y_valid, y_pred),
                    r2_score(Y_valid, y_pred),
                    get_smape(Y_valid, y_pred)])
    idx_csp += 1
df_score = pd.DataFrame(scores, columns=["idx_csp", "model", "train_time", "explained_variance", "mae", "rmse", "r2", "smape"])

LinearRegression, 0.003s
Lasso, 0.004s
DecisionTreeRegressor, 0.014s
Ridge, 0.004s
ElasticNet, 0.004s
RandomForestRegressor, 0.435s
XGBRegressor, 0.095s
Learning rate set to 0.041534
0:	learn: 7.1668103	total: 54.4ms	remaining: 54.3s
1:	learn: 6.9452616	total: 57.8ms	remaining: 28.8s
2:	learn: 6.7443427	total: 61.3ms	remaining: 20.4s
3:	learn: 6.5479970	total: 66.8ms	remaining: 16.6s
4:	learn: 6.3590384	total: 70ms	remaining: 13.9s
5:	learn: 6.1948407	total: 73.2ms	remaining: 12.1s
6:	learn: 6.0177121	total: 76.4ms	remaining: 10.8s
7:	learn: 5.8527577	total: 79.3ms	remaining: 9.84s
8:	learn: 5.7066316	total: 82.8ms	remaining: 9.12s
9:	learn: 5.5710690	total: 85.8ms	remaining: 8.49s
10:	learn: 5.4255934	total: 88.8ms	remaining: 7.99s
11:	learn: 5.2826019	total: 92ms	remaining: 7.58s
12:	learn: 5.1513475	total: 95.3ms	remaining: 7.23s
13:	learn: 5.0231365	total: 98.2ms	remaining: 6.92s
14:	learn: 4.9028688	total: 101ms	remaining: 6.64s
15:	learn: 4.7867931	total: 104ms	remaining: 6.4s
16

In [20]:
df_score.sort_values(["idx_csp", "smape"], ascending=True)

Unnamed: 0,idx_csp,model,train_time,explained_variance,mae,rmse,r2,smape
9,0,GradientBoostingRegressor,0.117699,0.754416,2.960738,2.316807,0.539363,0.04138
10,0,HistGradientBoostingRegressor,0.175828,0.761326,2.918502,2.337536,0.552412,0.041729
8,0,LGBMRegressor,0.220678,0.754304,2.942355,2.352911,0.545065,0.042005
6,0,XGBRegressor,0.094774,0.754921,2.944647,2.389187,0.544356,0.042657
7,0,CatBoostRegressor,1.021694,0.758475,2.991119,2.40801,0.529861,0.043058
5,0,RandomForestRegressor,0.434615,0.723947,3.092886,2.43137,0.497326,0.043468
2,0,DecisionTreeRegressor,0.014428,0.599416,3.468054,2.750685,0.367981,0.049196
4,0,ElasticNet,0.003614,0.593371,6.463536,5.834304,-1.195326,0.111818
1,0,Lasso,0.00393,0.587546,11.327712,10.975791,-5.742844,0.233353
3,0,Ridge,0.003675,0.704093,15.61638,15.435031,-11.815027,0.36253


In [21]:
def disp_scores(Y_test, test_pred):
    print("Test Scores")
    print("\tExplained variance:", explained_variance_score(Y_test, test_pred))
    print("\tMean absolute error:", mean_absolute_error(Y_test, test_pred))
    print("\tMean squared error:", mean_squared_error(Y_test, test_pred))
    print("\tR2 score:", r2_score(Y_test, test_pred))
    print("\tSMAPE score:", get_smape(Y_test, test_pred))

In [21]:
# def get_prediction(train, valid, test, model):
#     """各予測値を返す"""
#     # スケーリングして学習させたので、逆変換する
#     return [minmax.inverse_transform(model.predict(x).reshape(-1,1)) for x in [train, valid, test]]

# def learning(model, X_train, Y_train, X_valid, Y_valid, X):
#     """学習、予測値出力"""
#     model.fit(X_train, Y_train)
#     valid_pred = model.predict(X_valid)
#     disp_scores(Y_valid, valid_pred)
#     train_pred, valid_pred, pred = get_prediction(X_train, X_valid, X, model)
#     return (train_pred, valid_pred, pred)

In [22]:
def get_prediction(train, valid, test, model):
    """各予測値を返す"""
    # 小さい値で差分が出るのを抑えるため、四捨五入してみる
    ret_arr = []
    for x in [train, valid, test]:
        prediction = model.predict(x)
        ret_arr.append(((prediction * 2 + 1) // 2).astype(int))
    return ret_arr

def learning(model, X_train, Y_train, X_valid, Y_valid, X):
    """学習、予測値出力"""
    model.fit(X_train, Y_train)
    train_pred, valid_pred, pred = get_prediction(X_train, X_valid, X, model)
    disp_scores(Y_valid, valid_pred)
    return (train_pred, valid_pred, pred)

In [23]:
## LightGBM
lgbm = LGBMRegressor(data_sample_strategy='goss', max_depth=15, n_estimators=150, num_leaves=31, force_row_wise=True)    # warning 抑制
train_pred, valid_pred, pred = learning(lgbm, X_train, Y_train, X_valid, Y_valid, X)

NameError: name 'X' is not defined

In [34]:
## Category Boosting
catb = CatBoostRegressor()
train_pred, valid_pred, pred = learning(catb, X_train, Y_train, X_valid, Y_valid, X)

Learning rate set to 0.082161
0:	learn: 0.1279366	total: 3.18ms	remaining: 3.17s
1:	learn: 0.1183730	total: 5.88ms	remaining: 2.94s
2:	learn: 0.1095623	total: 8.63ms	remaining: 2.87s
3:	learn: 0.1015776	total: 11.4ms	remaining: 2.83s
4:	learn: 0.0942607	total: 14.2ms	remaining: 2.82s
5:	learn: 0.0875941	total: 16.8ms	remaining: 2.78s
6:	learn: 0.0816091	total: 19.5ms	remaining: 2.76s
7:	learn: 0.0761291	total: 22.1ms	remaining: 2.74s
8:	learn: 0.0711956	total: 24.8ms	remaining: 2.73s
9:	learn: 0.0666927	total: 27.6ms	remaining: 2.73s
10:	learn: 0.0625684	total: 30.3ms	remaining: 2.72s
11:	learn: 0.0588600	total: 33.1ms	remaining: 2.73s
12:	learn: 0.0554737	total: 35.9ms	remaining: 2.72s
13:	learn: 0.0523696	total: 38.6ms	remaining: 2.72s
14:	learn: 0.0496212	total: 41.3ms	remaining: 2.71s
15:	learn: 0.0471692	total: 43.9ms	remaining: 2.7s
16:	learn: 0.0449425	total: 46.9ms	remaining: 2.71s
17:	learn: 0.0429512	total: 49.7ms	remaining: 2.71s
18:	learn: 0.0411569	total: 52.5ms	remaining:

In [35]:
## RandomForest
rf = RandomForestRegressor()
train_pred, valid_pred, pred = learning(rf, X_train, Y_train, X_valid, Y_valid, X)

  return fit_method(estimator, *args, **kwargs)


Test Scores
	Explained variance: 0.9810310838091447
	Mean absolute error: 0.011069923240055365
	Mean squared error: 0.0004159436278943716
	R2 score: 0.9799908321390576
	SMAPE score: 0.5494684788808257


In [36]:
## Ridge
ridge = Ridge()
train_pred, valid_pred, pred = learning(ridge, X_train, Y_train, X_valid, Y_valid, X)

Test Scores
	Explained variance: 0.753603614376493
	Mean absolute error: 0.05032981394611865
	Mean squared error: 0.005122359608950401
	R2 score: 0.7535864324248567
	SMAPE score: 0.19799265465438717


In [22]:
## Lasso()
idx_csp = 0
print(csp_arr[idx_csp])
X_train, Y_train = XY_trains[idx_csp]
X_valid, Y_valid = XY_valids[idx_csp]
lasso = Lasso()
train_pred, valid_pred, pred = learning(lasso, X_train, Y_train, X_valid, Y_valid, X_tests[idx_csp])

['Argentina', 'Kaggle Learn', 'Using LLMs to Improve Your Coding']
Test Scores
	Explained variance: 0.31728827532410653
	Mean absolute error: 0.11138609038918636
	Mean squared error: 0.02094464506716574
	R2 score: 0.2559905157787117
	SMAPE score: 0.22061323777529598


In [41]:
XY_valids[idx_csp][1]

array([[0.65384615],
       [0.53846154],
       [0.65384615],
       [0.34615385],
       [0.30769231],
       [0.30769231],
       [0.38461538],
       [0.26923077],
       [0.30769231],
       [0.42307692],
       [0.26923077],
       [0.23076923],
       [0.11538462],
       [0.23076923],
       [0.34615385],
       [0.26923077],
       [0.42307692],
       [0.15384615],
       [0.19230769],
       [0.15384615],
       [0.23076923],
       [0.23076923],
       [0.26923077],
       [0.34615385],
       [0.19230769],
       [0.15384615],
       [0.23076923],
       [0.11538462],
       [0.15384615],
       [0.30769231],
       [0.26923077],
       [0.15384615],
       [0.11538462],
       [0.15384615],
       [0.15384615],
       [0.11538462],
       [0.34615385],
       [0.30769231],
       [0.15384615],
       [0.19230769],
       [0.11538462],
       [0.11538462],
       [0.11538462],
       [0.30769231],
       [0.30769231],
       [0.11538462],
       [0.15384615],
       [0.269

In [36]:
# 可視化用 DataFrame 作成
cols = ["id", "date", "country", "store", "product"]
c, s, p = csp_arr[idx_csp]
# オリジナル
df_plt = df_train[cols].copy()
df_plt = df_plt[(df_plt["country"]==c) & (df_plt["store"]==s) & (df_plt["product"]==p)]
df_plt["num_sold"] = df_train["num_sold"]
df_plt["kind"] = "orig"
# 学習結果
df_train_pred = pd.merge(X_train, df_train, left_index=True, right_index=True)[cols]
df_train_pred["num_sold"] = train_pred
df_train_pred["kind"] = "train"
# 検証結果
df_valid_pred = pd.merge(X_valid, df_train, left_index=True, right_index=True)[cols]
df_valid_pred["num_sold"] = valid_pred
df_valid_pred["kind"] = "test"
# 実際の予測
df_pred = pd.merge(df_test, X_tests[idx_csp], left_index=True, right_index=True)[cols]
df_pred["num_sold"] = pred
df_pred["kind"] = "pred"

df_plt = pd.concat([df_plt, df_train_pred, df_valid_pred, df_pred]).reset_index(drop=True)
df_plt.shape

(3651, 7)

In [37]:
df_plt.sample(5)

Unnamed: 0,id,date,country,store,product,num_sold,kind
32,2400,2017-02-02,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,34.0,orig
3050,119250,2021-05-10,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,330.045392,test
36,2700,2017-02-06,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,30.0,orig
2710,66300,2019-06-04,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,334.36439,train
2366,40500,2018-06-25,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,337.992348,train


In [38]:
ttl = f"{csp_arr[idx_csp]} (Lasso)"
p = make_graph.make_trend(df_plt, "num_sold", "kind", 900, 300, ttl)
show(p)

↓ 値低い製品も悪くない気がするけど、、、他にズレているところがあるのか？  
　各々で学習させてみる？

In [40]:
df_buf = df_plt[df_plt["kind"]=="test"].drop(columns=["kind"])
df_buf.rename(columns={"num_sold": "num_sold_test"}, inplace=True)
df_buf = pd.merge(df_buf, df_plt[df_plt["kind"]=="orig"].drop(columns=["kind"]))

In [58]:
buf = []
for c in df_buf["country"].unique():
    for s in df_buf["store"].unique():
        for p in df_buf["product"].unique():
            df_p = df_buf[(df_buf["country"]==c) & (df_buf["store"]==s) & (df_buf["product"]==p)]
            rmse = mean_squared_error(df_p["num_sold"], df_p["num_sold_test"], squared=False)
            smape = get_smape(df_p["num_sold"], df_p["num_sold_test"])
            buf.append([c, s, p, rmse, smape])
df_mse = pd.DataFrame(buf, columns=["country", "store", "product", "rmse", "smape"])            

In [59]:
df_mse.sort_values("smape")

Unnamed: 0,country,store,product,rmse,smape
4,Argentina,Kaggle Learn,Using LLMs to Write Better,55.950176,-5.187059
67,Spain,Kaggle Store,Using LLMs to Win Friends and Influence People,59.447323,-2.872823
37,Estonia,Kaggle Store,Using LLMs to Win Friends and Influence People,98.392282,-1.561222
62,Spain,Kaggle Learn,Using LLMs to Win Friends and Influence People,86.613494,-1.395996
32,Estonia,Kaggle Learn,Using LLMs to Win Friends and Influence People,126.236302,-1.216974
...,...,...,...,...,...
72,Spain,Kagglazon,Using LLMs to Win Friends and Influence People,152.508713,0.518106
12,Argentina,Kagglazon,Using LLMs to Win Friends and Influence People,98.745449,0.662363
9,Argentina,Kaggle Store,Using LLMs to Write Better,34.885151,0.901769
3,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,40.905185,4.288972


In [97]:
CSP

{'country': ['Argentina', 'Canada', 'Estonia', 'Japan', 'Spain'],
 'store': ['Kaggle Learn', 'Kaggle Store', 'Kagglazon'],
 'product': ['Using LLMs to Improve Your Coding',
  'Using LLMs to Train More LLMs',
  'Using LLMs to Win Friends and Influence People',
  'Using LLMs to Win More Kaggle Competitions',
  'Using LLMs to Write Better']}

In [99]:
csp, df_buf = select_csp(df_plt, [0, 0, 2])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} (RandomForest)"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [100]:
csp, df_buf = select_csp(df_plt, [1, 2, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} (RandomForest)"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

↑　絶対値が大きいと、RMSEとしては悪い方向になるかも？

In [29]:
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]}"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [60]:
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} (LGBM)"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

submit

In [45]:
print(df_test.shape, df_subm.shape, len(pred))

(27375, 5) (27375, 2) 27375


In [61]:
# df_subm = pd.merge(df_subm.drop(columns=["num_sold"]), df_test[["id", "num_sold"]].copy(), how="left")
# df_subm["num_sold"] = pred
df_subm["num_sold"] = ((pred * 2 + 1) // 2).astype(int)    # 四捨五入int
df_subm["num_sold"].isna().sum()

0

In [62]:
df_subm

Unnamed: 0,id,num_sold
0,136950,40
1,136951,39
2,136952,1
3,136953,42
4,136954,32
...,...,...
27370,164320,752
27371,164321,722
27372,164322,97
27373,164323,696


In [63]:
df_subm.to_csv("./submission.csv", index=False)

`Public score: 50.61907` year入れて、CatBoost。でもまだ50点台。。。  
`Public score: 50.35902` RandomForest。CatBoostより下振れしてなさそうに見えたけど。。
