### Forecasting Mini-Course Sales
特徴量生成 + ~~基本的に2020年は削除~~ (https://www.kaggle.com/code/gauthamupadhyaya/mini-course-sales)  
2020年3-6月は何かで代用したい (https://www.kaggle.com/code/yeoyunsianggeremie/s3e19-catboost-smoothing-post-processing)
 
- 学習は2021年を予測
- 国、店舗、製品を分けて学習してみる
- <b>前年の売上げを特徴量として組み込んでみる</b> (2017年はNanか、学習から外す) (Nanのままだと学習できない。。)
- (train, valid, test ラベルを付けて、全部くっつけて特徴量を生成してみる)

In [1]:
import numpy as np
import pandas as pd
from holidays import CountryHoliday
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [14]:
CountryHoliday(country="JPN", years=2022)

{datetime.date(2022, 1, 1): '元日', datetime.date(2022, 1, 10): '成人の日', datetime.date(2022, 2, 11): '建国記念の日', datetime.date(2022, 2, 23): '天皇誕生日', datetime.date(2022, 3, 21): '春分の日', datetime.date(2022, 4, 29): '昭和の日', datetime.date(2022, 5, 3): '憲法記念日', datetime.date(2022, 5, 4): 'みどりの日', datetime.date(2022, 5, 5): 'こどもの日', datetime.date(2022, 7, 18): '海の日', datetime.date(2022, 8, 11): '山の日', datetime.date(2022, 9, 19): '敬老の日', datetime.date(2022, 9, 23): '秋分の日', datetime.date(2022, 10, 10): 'スポーツの日', datetime.date(2022, 11, 3): '文化の日', datetime.date(2022, 11, 23): '勤労感謝の日'}

In [2]:
import make_graph    # 自前
from bokeh.plotting import figure, output_notebook, show
output_notebook()

In [3]:
df_train = pd.read_csv("./data/train.csv", parse_dates=["date"])
df_test = pd.read_csv("./data/test.csv", parse_dates=["date"])
df_subm = pd.read_csv("./data/sample_submission.csv")

In [4]:
df_test["num_sold"] = np.nan
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)

df_all['year'] = df_all['date'].dt.year
df_all['month'] = df_all['date'].dt.month
df_all['day'] = df_all['date'].dt.day

df_all["type"] = "train"
df_all["type"] = np.where(df_all["year"]==2021, "valid", df_all["type"])
df_all["type"] = np.where(df_all["year"]==2022, "test", df_all["type"])

In [5]:
print(df_all.shape)
df_all.head(5)

(164325, 10)


Unnamed: 0,id,date,country,store,product,num_sold,year,month,day,type
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63.0,2017,1,1,train
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66.0,2017,1,1,train
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9.0,2017,1,1,train
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59.0,2017,1,1,train
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49.0,2017,1,1,train


In [6]:
# 2020年3-6月を置換 (2020年も含めて平均してみる)
df_buf = df_all[(df_all["year"]<2021) & (df_all["month"].isin([3,4,5,6]))]
df_buf = df_buf.groupby(by=["month", "day", "country", "store", "product"])["num_sold"].mean().reset_index()
df_buf["year"] = 2020
df_buf = pd.merge(df_all[["id", "year", "month", "day", "country", "store", "product"]], df_buf).sort_values("id")
df_all.loc[((df_all["year"]==2020) & (df_all["month"].isin([3,4,5,6]))), "num_sold"] = df_buf["num_sold"].values    # df_allはソート済なので、そのまま置換
print(df_all.shape)

(164325, 10)


In [76]:
# make_graph.trends(df_all, "num_sold", 1200, 300, "graph1.html", "コロナ置換")

In [7]:
CSP = {col: list(df_all[col].unique()) for col in ["country", "store", "product"]}
CSP

{'country': ['Argentina', 'Canada', 'Estonia', 'Japan', 'Spain'],
 'store': ['Kaggle Learn', 'Kaggle Store', 'Kagglazon'],
 'product': ['Using LLMs to Improve Your Coding',
  'Using LLMs to Train More LLMs',
  'Using LLMs to Win Friends and Influence People',
  'Using LLMs to Win More Kaggle Competitions',
  'Using LLMs to Write Better']}

In [8]:
def select_csp(df, idx):
    """インデックス指定で各名称取得、DataFrameフィルタ"""
    if len(idx) != 3:
        return (), pd.DataFrame()
    df_ret = df.copy()
    for i, col in zip(idx, ["country", "store", "product"]):
        if i < 0:
            continue
        idx_buf = i
        if i >= len(CSP[col]):
            print(f"[error] 指定インデックスが範囲外です。 ({col}: {CSP[col]})")
            idx_buf = 0
        df_ret = df_ret[df_ret[col] == CSP[col][idx_buf]]

    sel = {col: list(df_ret[col].unique()) for col in ["country", "store", "product"]}
    return sel, df_ret

In [9]:
csp, df_buf = select_csp(df_all, [-1, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]}"
p = make_graph.make_trend(df_buf, "num_sold", "country", 900, 300, ttl)
show(p)

In [9]:
def make_dummies(df_temp):
    """国、店舗、製品情報をダミー変数化"""
    df_dummy = pd.get_dummies(df_temp[["country", "store", "product"]]).astype(np.uint8)
    df_temp = pd.merge(df_temp, df_dummy, left_index=True, right_index=True)
    return df_temp

In [10]:
def datetime_features(df_temp):
    """時間情報の特徴量生成"""
    # df_temp['month'] = df_temp['date'].dt.month
    # df_temp['day'] = df_temp['date'].dt.day
    # df_temp['year'] = df_temp['date'].dt.year
    df_temp['dayofyear'] = df_temp['date'].dt.day_of_year
    df_temp['dayofweek'] = df_temp['date'].dt.dayofweek
    df_temp['quarter'] = df_temp['date'].dt.quarter
    # df_temp['weekofyear'] = df_temp['date'].dt.weekofyear  # 見つからない
    df_temp['weekofyear'] = df_temp['date'].apply(lambda x: x.isocalendar()[1])
    df_temp['is_weekend'] = (df_temp['dayofweek'] >= 5).astype(np.int8)
    df_temp['is_month_start'] = df_temp['date'].dt.is_month_start.astype(np.int8)
    df_temp['is_month_end'] = df_temp['date'].dt.is_month_end.astype(np.int8)
    df_temp['monday'] = df_temp['date'].dt.weekday.eq(0).astype(np.uint8)
    df_temp['tuesday'] = df_temp['date'].dt.weekday.eq(1).astype(np.uint8)
    df_temp['wednesday'] = df_temp['date'].dt.weekday.eq(2).astype(np.uint8)
    df_temp['thursday'] = df_temp['date'].dt.weekday.eq(3).astype(np.uint8)
    df_temp['friday'] = df_temp['date'].dt.weekday.eq(4).astype(np.uint8)
    df_temp['saturday'] = df_temp['date'].dt.weekday.eq(5).astype(np.uint8)
    df_temp['sunday'] = df_temp['date'].dt.weekday.eq(6).astype(np.uint8)
    df_temp['is_quarter_end'] = df_temp['date'].dt.is_quarter_end.astype(np.uint8)
    df_temp['is_quarter_start'] = df_temp['date'].dt.is_quarter_start.astype(np.uint8)
    df_temp['is_year_end'] = df_temp['date'].dt.is_year_end.astype(np.uint8)
    df_temp['is_year_start'] = df_temp['date'].dt.is_year_start.astype(np.uint8)    
    return df_temp

In [11]:
def seasonality_features(df_temp):
    df_temp['month_sin'] = np.sin(2*np.pi*df_temp.month/12)
    df_temp['month_cos'] = np.cos(2*np.pi*df_temp.month/12)
    df_temp['day_sin'] = np.sin(2*np.pi*df_temp.day/24)    # 24 -> 30 -> 24
    df_temp['day_cos'] = np.cos(2*np.pi*df_temp.day/24)
    return df_temp

In [12]:
def set_prev_sold(df_temp):
    """前年売上げ (2020年3-6月を除く)"""
    df_prev = df_temp[["year", "month", "day", "country", "store", "product", "num_sold"]].copy()
    df_prev["year"] += 1
    df_prev.rename(columns={"num_sold": "prev_sold"}, inplace=True)
    df_temp = pd.merge(df_temp, df_prev, how="left")
    df_temp["prev_sold"] = np.where(df_temp["year"]==2017, df_temp["num_sold"], df_temp["prev_sold"])

    # # df_temp['month'] = df_temp['date'].dt.month
    # # df_temp['day'] = df_temp['date'].dt.day
    # # df_temp['year'] = df_temp['date'].dt.year
    
    # if len(df_temp[df_temp["year"]==2022]) > 0:    # テストデータ用
    #     df_prev = df_train[["year", "month", "day", "country", "store", "product", "num_sold"]].copy()
    #     # df_prev['month'] = df_prev['date'].dt.month
    #     # df_prev['day'] = df_prev['date'].dt.day
    #     # df_prev['year'] = df_prev['date'].dt.year
    #     df_prev = df_prev[df_prev['year']==2021]
    #     df_prev["year"] = 2022
    # else:                                          # 学習データ用
    #     df_prev = df_temp[["year", "month", "day", "country", "store", "product", "num_sold"]].copy()
    #     buf_arr = []
    #     for y in range(2018, 2022):    # 2017は初年なので前年は無し
    #         df_buf = df_prev[df_prev["year"]==y-1].copy()
    #         if y == 2021:
    #             df_buf = df_prev[df_prev["year"]==2019].copy()    # 2020年は問題年なので、その前年を使用
    #         df_buf["year"] = y
    #         buf_arr.append(df_buf)
    #     df_prev = pd.concat(buf_arr)
    # df_prev.rename(columns={"num_sold": "prev_sold"}, inplace=True)
    # df_temp = pd.merge(df_temp, df_prev, how="left")
    return df_temp


In [13]:
print(df_train.shape, df_test.shape, len(df_train)+len(df_test))

(136950, 6) (27375, 6) 164325


In [14]:
def make_features(df_temp):
    """全特徴量を追加"""
    df_temp = make_dummies(df_temp)
    df_temp = datetime_features(df_temp)
    df_temp = seasonality_features(df_temp)
    df_temp = set_prev_sold(df_temp)
    return df_temp

# 特徴量生成
df_feat = make_features(df_all)
print(df_feat.shape)
df_feat.head(3)

(164325, 46)


Unnamed: 0,id,date,country,store,product,num_sold,year,month,day,type,...,sunday,is_quarter_end,is_quarter_start,is_year_end,is_year_start,month_sin,month_cos,day_sin,day_cos,prev_sold
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63.0,2017,1,1,train,...,1,0,1,0,1,0.5,0.866025,0.258819,0.965926,63.0
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66.0,2017,1,1,train,...,1,0,1,0,1,0.5,0.866025,0.258819,0.965926,66.0
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9.0,2017,1,1,train,...,1,0,1,0,1,0.5,0.866025,0.258819,0.965926,9.0


In [34]:
csp, df_buf = select_csp(df_feat, [1, 0, -1])
print(csp)

{'country': ['Canada'], 'store': ['Kaggle Learn'], 'product': ['Using LLMs to Improve Your Coding', 'Using LLMs to Train More LLMs', 'Using LLMs to Win Friends and Influence People', 'Using LLMs to Win More Kaggle Competitions', 'Using LLMs to Write Better']}


In [35]:
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]}"
p = make_graph.make_trend(df_buf, "num_sold", "product", 900, 300, ttl)
show(p)

各モデルの性能確認

In [16]:
import time
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression, ElasticNet
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, cross_val_score
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import PartialDependenceDisplay
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [17]:
def get_smape(actual, forecast):
    """自前で評価値(SMAPE:https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error)産出"""
    smape = 100/len(actual) * np.sum(2 * np.abs(forecast - actual) / (np.abs(forecast) + np.abs(actual)))
    # smape = np.sum(np.abs(forecast - actual)) / np.sum(forecast + actual)
    return smape

In [18]:
# 学習とテスト用分離
remove_cols = ["id", "date", "country", "store", "product", "num_sold", "type"]
train = df_feat[df_feat["type"]=="train"].dropna(subset=["prev_sold"])    # 2020年は閏年のため、Nanになってる
# 2020年3-6月を抜いてみる
train = train[~((train["year"]==2020) & (train["month"].isin([3,4,5,6])))]
valid = df_feat[df_feat["type"]=="valid"].dropna(subset=["prev_sold"])

X_train = train.drop(columns=remove_cols)
y_train = train["num_sold"].values
X_valid = valid.drop(columns=remove_cols)
y_valid = valid["num_sold"].values
X_test = df_feat[df_feat["type"]=="test"].drop(columns=remove_cols)

In [19]:
print(X_train.shape, X_valid.shape, X_test.shape)
X_train.head(3)

(100350, 39) (27375, 39) (27375, 39)


Unnamed: 0,year,month,day,country_Argentina,country_Canada,country_Estonia,country_Japan,country_Spain,store_Kagglazon,store_Kaggle Learn,...,sunday,is_quarter_end,is_quarter_start,is_year_end,is_year_start,month_sin,month_cos,day_sin,day_cos,prev_sold
0,2017,1,1,1,0,0,0,0,0,1,...,1,0,1,0,1,0.5,0.866025,0.258819,0.965926,63.0
1,2017,1,1,1,0,0,0,0,0,1,...,1,0,1,0,1,0.5,0.866025,0.258819,0.965926,66.0
2,2017,1,1,1,0,0,0,0,0,1,...,1,0,1,0,1,0.5,0.866025,0.258819,0.965926,9.0


In [20]:
regressors = [LinearRegression(), 
              Lasso(), 
              DecisionTreeRegressor(), 
              Ridge(), 
              ElasticNet(), 
              RandomForestRegressor(), 
              XGBRegressor(), 
              CatBoostRegressor(), 
              LGBMRegressor(), 
              GradientBoostingRegressor(), 
              HistGradientBoostingRegressor()]

In [21]:
scores = []
for model in regressors:
    start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start
    # start = time.time()
    y_pred = model.predict(X_valid)
    y_pred = ((y_pred * 2 + 1) // 2).astype(int)    # 四捨五入int
    # predict_time = time.time()-start
    name = str(model)[:-2]
    name = "XGBRegressor" if "XGBRegressor" in name else name
    name = "CatBoostRegressor" if "CatBoostRegressor" in name else name
    print(f"{name}, {train_time:.3f}s")
    scores.append([name, train_time, 
                   explained_variance_score(y_valid, y_pred),
                   mean_squared_error(y_valid, y_pred, squared=False), 
                   mean_absolute_error(y_valid, y_pred),
                   r2_score(y_valid, y_pred),
                   get_smape(y_valid, y_pred)])
df_score = pd.DataFrame(scores, columns=["model", "train_time", "explained_variance", "mae", "rmse", "r2", "smape"])

LinearRegression, 0.087s
Lasso, 0.196s
DecisionTreeRegressor, 0.760s
Ridge, 0.022s
ElasticNet, 0.257s
RandomForestRegressor, 47.856s
XGBRegressor, 2.194s
Learning rate set to 0.084805
0:	learn: 167.4842140	total: 54ms	remaining: 53.9s
1:	learn: 154.2625354	total: 60.6ms	remaining: 30.2s
2:	learn: 142.3092713	total: 67.8ms	remaining: 22.5s
3:	learn: 131.1646474	total: 75.7ms	remaining: 18.9s
4:	learn: 121.0300647	total: 83.6ms	remaining: 16.6s
5:	learn: 111.7904919	total: 91.7ms	remaining: 15.2s
6:	learn: 103.2343038	total: 99.7ms	remaining: 14.1s
7:	learn: 95.4478435	total: 108ms	remaining: 13.4s
8:	learn: 88.3394700	total: 116ms	remaining: 12.8s
9:	learn: 81.9295615	total: 125ms	remaining: 12.3s
10:	learn: 76.0405839	total: 133ms	remaining: 11.9s
11:	learn: 70.7071718	total: 141ms	remaining: 11.6s
12:	learn: 65.8882733	total: 149ms	remaining: 11.3s
13:	learn: 61.4313458	total: 158ms	remaining: 11.2s
14:	learn: 57.4156239	total: 166ms	remaining: 10.9s
15:	learn: 53.7213124	total: 172ms

In [22]:
df_score.sort_values("smape", ascending=True)

Unnamed: 0,model,train_time,explained_variance,mae,rmse,r2,smape
5,RandomForestRegressor,47.855569,0.954789,49.909469,29.018411,0.936804,18.704594
2,DecisionTreeRegressor,0.760361,0.944322,54.105152,30.874009,0.925732,19.825019
6,XGBRegressor,2.193843,0.953657,50.533631,29.761498,0.935213,20.560654
10,HistGradientBoostingRegressor,0.650098,0.950837,52.202021,30.586995,0.930864,20.634752
8,LGBMRegressor,0.330847,0.951308,51.902889,30.436639,0.931655,20.645809
7,CatBoostRegressor,3.829769,0.954931,49.96617,29.831452,0.93666,21.053776
9,GradientBoostingRegressor,9.294339,0.955029,50.95955,30.850192,0.934116,26.667663
4,ElasticNet,0.25738,0.952583,52.384247,32.312986,0.930381,34.331037
1,Lasso,0.196065,0.952823,53.394022,33.915507,0.927671,39.461358
0,LinearRegression,0.086945,0.952952,54.116967,35.240365,0.925699,46.232841


In [23]:
def disp_scores(Y_test, test_pred):
    print("Test Scores")
    print("\tExplained variance:", explained_variance_score(Y_test, test_pred))
    print("\tMean absolute error:", mean_absolute_error(Y_test, test_pred))
    print("\tMean squared error:", mean_squared_error(Y_test, test_pred))
    print("\tR2 score:", r2_score(Y_test, test_pred))
    print("\tSMAPE score:", get_smape(Y_test, test_pred))

In [24]:
def get_prediction(train, valid, test, model):
    """各予測値を返す"""
    # # スケーリングして学習させたので、逆変換する
    # return [minmax.inverse_transform(model.predict(x).reshape(-1,1)) for x in [train, valid, test]]
    # 小さい値で差分が出るのを抑えるため、四捨五入してみる
    ret_arr = []
    for x in [train, valid, test]:
        prediction = model.predict(x)
        ret_arr.append(((prediction * 2 + 1) // 2).astype(int))
    return ret_arr

def learning(model, X_train, Y_train, X_valid, Y_valid, X):
    """学習、予測値出力"""
    model.fit(X_train, Y_train)
    train_pred, valid_pred, pred = get_prediction(X_train, X_valid, X, model)
    disp_scores(Y_valid, valid_pred)
    return (train_pred, valid_pred, pred)

In [25]:
def get_plot_dataframe(X_train, X_valid, train_pred, valid_pred, pred):
    """学習とテスト確認用 DataFrame 作成"""
    cols = ["id", "date", "country", "store", "product"]
    # オリジナル
    df_plt = df_train[cols].copy()
    df_plt["num_sold"] = df_train["num_sold"]
    df_plt["kind"] = "orig"
    # 学習結果
    df_train_pred = pd.merge(X_train, df_train, left_index=True, right_index=True)[cols]
    df_train_pred["num_sold"] = train_pred
    df_train_pred["kind"] = "train"
    # 検証結果
    df_valid_pred = pd.merge(X_valid, df_train, left_index=True, right_index=True)[cols]
    df_valid_pred["num_sold"] = valid_pred
    df_valid_pred["kind"] = "valid"
    # 実際の予測
    df_pred = df_test.copy()
    df_pred["num_sold"] = pred
    df_pred["kind"] = "test"

    df_plt = pd.concat([df_plt, df_train_pred, df_valid_pred, df_pred]).reset_index(drop=True)
    # print(df_plt.shape)
    return df_plt

In [26]:
## LightGBM
model_name = "LightGBM"
lgbm = LGBMRegressor(data_sample_strategy='goss', max_depth=15, n_estimators=150, num_leaves=31, force_row_wise=True)    # warning 抑制
train_pred, valid_pred, pred = learning(lgbm, X_train, y_train, X_valid, y_valid, X_test)

[LightGBM] [Info] Total Bins 746
[LightGBM] [Info] Number of data points in the train set: 100350, number of used features: 39
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 164.975635


Test Scores
	Explained variance: 0.9513631089390558
	Mean absolute error: 30.362995433789955
	Mean squared error: 2678.2847488584475
	R2 score: 0.9320509451919344
	SMAPE score: 20.65814319485536


In [27]:
df_plt = get_plot_dataframe(X_train, X_valid, train_pred, valid_pred, pred)
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} ({model_name})"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [28]:
## XGB
model_name = "XGB"
xgb = XGBRegressor()
train_pred, valid_pred, pred = learning(xgb, X_train, y_train, X_valid, y_valid, X_test)

Test Scores
	Explained variance: 0.9536565105872613
	Mean absolute error: 29.761497716894976
	Mean squared error: 2553.647890410959
	R2 score: 0.9352130274646887
	SMAPE score: 20.560653790656556


In [29]:
df_plt = get_plot_dataframe(X_train, X_valid, train_pred, valid_pred, pred)
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} ({model_name})"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [64]:
## Category Boosting
model_name = "CatBoost"
catb = CatBoostRegressor()
train_pred, valid_pred, pred = learning(catb, X_train, Y_train, X_valid, Y_valid, X)

Learning rate set to 0.082161
0:	learn: 169.2600607	total: 7.67ms	remaining: 7.67s
1:	learn: 156.6074469	total: 10.7ms	remaining: 5.34s
2:	learn: 144.9509373	total: 13.5ms	remaining: 4.5s
3:	learn: 134.3871611	total: 16.3ms	remaining: 4.07s
4:	learn: 124.7068696	total: 19.1ms	remaining: 3.81s
5:	learn: 115.8870236	total: 21.9ms	remaining: 3.63s
6:	learn: 107.9688713	total: 24.8ms	remaining: 3.51s
7:	learn: 100.7188619	total: 27.5ms	remaining: 3.41s
8:	learn: 94.1917450	total: 30.3ms	remaining: 3.34s
9:	learn: 88.2344217	total: 33.2ms	remaining: 3.29s
10:	learn: 82.7779772	total: 36ms	remaining: 3.24s
11:	learn: 77.8717469	total: 38.8ms	remaining: 3.19s
12:	learn: 73.3917039	total: 41.6ms	remaining: 3.15s
13:	learn: 69.2849315	total: 44.1ms	remaining: 3.11s
14:	learn: 65.6488053	total: 46.8ms	remaining: 3.07s
15:	learn: 62.4048266	total: 49.5ms	remaining: 3.04s
16:	learn: 59.4589145	total: 52.2ms	remaining: 3.02s
17:	learn: 56.8244631	total: 54.9ms	remaining: 3s
18:	learn: 54.4505914	to

In [68]:
df_plt = get_plot_dataframe(X_train, X_valid, train_pred, valid_pred, pred)
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} ({model_name})"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [30]:
## RandomForest
model_name = "RandomForest"
rf = RandomForestRegressor()
train_pred, valid_pred, pred = learning(rf, X_train, y_train, X_valid, y_valid, X_test)

Test Scores
	Explained variance: 0.9549164562785248
	Mean absolute error: 29.0009497716895
	Mean squared error: 2483.53099543379
	R2 score: 0.9369919185037411
	SMAPE score: 18.742160492388567


In [31]:
df_plt = get_plot_dataframe(X_train, X_valid, train_pred, valid_pred, pred)
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} ({model_name})"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [36]:
## Ridge
ridge = Ridge()
train_pred, valid_pred, pred = learning(ridge, X_train, Y_train, X_valid, Y_valid, X)

Test Scores
	Explained variance: 0.753603614376493
	Mean absolute error: 0.05032981394611865
	Mean squared error: 0.005122359608950401
	R2 score: 0.7535864324248567
	SMAPE score: 0.19799265465438717


In [37]:
pred

array([[  84.9970054 ],
       [  81.38259867],
       [-111.84305776],
       ...,
       [ 294.41778338],
       [ 452.87481392],
       [ 435.45568248]])

↓ 値低い製品も悪くない気がするけど、、、他にズレているところがあるのか？  
　各々で学習させてみる？

In [32]:
df_buf = df_plt[df_plt["kind"]=="test"].drop(columns=["kind"])
df_buf.rename(columns={"num_sold": "num_sold_test"}, inplace=True)
df_buf = pd.merge(df_buf, df_plt[df_plt["kind"]=="orig"].drop(columns=["kind"]))

In [35]:
df_plt.head(3)

Unnamed: 0,id,date,country,store,product,num_sold,kind
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63,orig
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66,orig
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9,orig


In [33]:
buf = []
for c in df_buf["country"].unique():
    for s in df_buf["store"].unique():
        for p in df_buf["product"].unique():
            df_p = df_buf[(df_buf["country"]==c) & (df_buf["store"]==s) & (df_buf["product"]==p)]
            rmse = mean_squared_error(df_p["num_sold"], df_p["num_sold_test"], squared=False)
            smape = get_smape(df_p["num_sold"], df_p["num_sold_test"])
            buf.append([c, s, p, rmse, smape])
df_mse = pd.DataFrame(buf, columns=["country", "store", "product", "rmse", "smape"])            

In [34]:
df_mse.sort_values("smape")

Unnamed: 0,country,store,product,rmse,smape


In [97]:
CSP

{'country': ['Argentina', 'Canada', 'Estonia', 'Japan', 'Spain'],
 'store': ['Kaggle Learn', 'Kaggle Store', 'Kagglazon'],
 'product': ['Using LLMs to Improve Your Coding',
  'Using LLMs to Train More LLMs',
  'Using LLMs to Win Friends and Influence People',
  'Using LLMs to Win More Kaggle Competitions',
  'Using LLMs to Write Better']}

In [75]:
# csp, df_buf = select_csp(df_plt, [0, 0, 2])
csp, df_buf = select_csp(df_plt, [2, 0, 0])
# df_buf["num_sold"] = df_buf["num_sold"].astype(int)
# df_buf["num_sold"] = ((df_buf["num_sold"] * 2 + 1) // 2).astype(int)    # 四捨五入int
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} ({model_name})"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [100]:
csp, df_buf = select_csp(df_plt, [1, 2, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} (RandomForest)"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

↑　絶対値が大きいと、RMSEとしては悪い方向になるかも？

In [29]:
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]}"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

In [60]:
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} (LGBM)"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

submit

In [36]:
print(df_test.shape, df_subm.shape, len(pred))

(27375, 6) (27375, 2) 27375


In [37]:
# df_subm = pd.merge(df_subm.drop(columns=["num_sold"]), df_test[["id", "num_sold"]].copy(), how="left")
df_subm["num_sold"] = pred
# df_subm["num_sold"] = ((pred * 2 + 1) // 2).astype(int)    # 四捨五入int
df_subm["num_sold"].isna().sum()

0

In [38]:
df_subm

Unnamed: 0,id,num_sold
0,136950,32
1,136951,31
2,136952,4
3,136953,32
4,136954,27
...,...,...
27370,164320,693
27371,164321,739
27372,164322,114
27373,164323,652


In [39]:
df_subm.to_csv("./submission.csv", index=False)

`Public score: 50.61907` year入れて、CatBoost。でもまだ50点台。。。  
`Public score: 50.35902` RandomForest。CatBoostより下振れしてなさそうに見えたけど。。
