### Forecasting Mini-Course Sales
- https://www.kaggle.com/code/akioonodera/ps-3-19-lgbm-reg  
- <a href="https://www.kaggle.com/code/zhukovoleksiy/ps-s3e19-eda-simple-solution">[PS S3E19] EDA + Simple Solution</a> ← コレ

In [1]:
# Misc
import numpy as np
import pandas as pd
import random
import os
from copy import deepcopy
from functools import partial
from itertools import combinations
import random
import gc
import holidays
from datetime import datetime, timedelta
# from tqdm.notebook import tqdm
from tqdm import tqdm
from holidays import CountryHoliday

# Import libraries for Ploting
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder, CatBoostEncoder
from sklearn.preprocessing import LabelEncoder
# from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE
from sklearn.metrics import mean_absolute_error
from collections import defaultdict
from sklearn.model_selection import cross_validate
from sklearn.ensemble import StackingRegressor
from typing import List

# Improt libraries for Deep Learning
# import tensorflow as tf
# from tensorflow import keras

# Import libraries for Hypertuning
# import optuna

#Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, LassoCV
from sklearn.linear_model import PassiveAggressiveRegressor, ARDRegression, RidgeCV, ElasticNetCV
from sklearn.linear_model import TheilSenRegressor, RANSACRegressor, HuberRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.cross_decomposition import PLSRegression
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

# Useful line of code to set the display option so we could see all the columns in pd dataframe
pd.set_option('display.max_columns', None)

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
df_train = pd.read_csv("./data/train.csv", parse_dates=["date"], index_col=[0])
df_test = pd.read_csv("./data/test.csv", parse_dates=["date"], index_col=[0])
submission = pd.read_csv("./data/sample_submission.csv")

# Set columns
target_col = 'num_sold'
cat_cols = ['country', 'store', 'product']

# Repalece "Using LLMs to"
df_train['product'] = df_train['product'].str.replace('Using LLMs to ', '')
df_test['product'] = df_test['product'].str.replace('Using LLMs to ', '')

print(f'[INFO] Shapes:'
      f'\n train: {df_train.shape}'
      f'\n test: {df_test.shape}\n')

print(f'[INFO] Any missing values:'
      f'\n train: {df_train.isna().any().any()}'
      f'\n test: {df_test.isna().any().any()}\n')

df_train.head()

[INFO] Shapes:
 train: (136950, 5)
 test: (27375, 4)

[INFO] Any missing values:
 train: False
 test: False



Unnamed: 0_level_0,date,country,store,product,num_sold
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2017-01-01,Argentina,Kaggle Learn,Improve Your Coding,63
1,2017-01-01,Argentina,Kaggle Learn,Train More LLMs,66
2,2017-01-01,Argentina,Kaggle Learn,Win Friends and Influence People,9
3,2017-01-01,Argentina,Kaggle Learn,Win More Kaggle Competitions,59
4,2017-01-01,Argentina,Kaggle Learn,Write Better,49


---

In [3]:
df_train["num_sold"] = np.log(df_train["num_sold"])

In [5]:
def feature_engineering(df):
    
    df.date = pd.to_datetime(df.date)
    df['year'] = df.date.dt.year
    df['month'] = df.date.dt.month
    df['dayofmonth'] = df.date.dt.day
    df['dayofweek'] = df.date.dt.dayofweek
    df['dayname'] = df.date.dt.strftime('%A')
    df['dayofyear'] = df.date.dt.dayofyear
    
    return df

def fix_covid(df):
    # Fix Covid effect
    df_copy = df.copy()
    df_copy = df_copy[df_copy['date'].between('2020-04-01', '2020-04-30')]
    df_copy['num_sold'] = df_copy['num_sold'] * 1.2 # try different values between 1.0 and 1.3
    df[df['date'].between('2020-04-01', '2020-04-30')] = df_copy
    
    df_copy = df_train.copy()
    df_copy = df_copy[df_copy['date'].between('2020-05-01', '2020-05-31')]
    df_copy['num_sold'] = df_copy['num_sold'] * 1.1 # try different values between 1.0 and 1.15
    df[df['date'].between('2020-05-01', '2020-05-31')] = df_copy
    
    return df

def get_holidays(df):
    years_list = [2017, 2018, 2019, 2020, 2021, 2022]

    holiday_BE = holidays.CountryHoliday('BE', years = years_list)
    holiday_FR = holidays.CountryHoliday('FR', years = years_list)
    holiday_DE = holidays.CountryHoliday('DE', years = years_list)
    holiday_IT = holidays.CountryHoliday('IT', years = years_list)
    holiday_PL = holidays.CountryHoliday('PL', years = years_list)
    holiday_ES = holidays.CountryHoliday('ES', years = years_list)

    holiday_dict = holiday_BE.copy()
    holiday_dict.update(holiday_FR)
    holiday_dict.update(holiday_DE)
    holiday_dict.update(holiday_IT)
    holiday_dict.update(holiday_PL)
    holiday_dict.update(holiday_ES)

    df['holiday_name'] = df['date'].map(holiday_dict)
    df['is_holiday'] = np.where(df['holiday_name'].notnull(), 1, 0)
    df['holiday_name'] = df['holiday_name'].fillna('Not Holiday')
    df = df.drop(columns=["holiday_name"])
    
    return df

In [6]:
# def periodic_spline_transformer(period, n_splines=None, degree=3):
#     """
#     Kaynak: https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html
#     """
    
#     if n_splines is None:
#         n_splines = period
#     n_knots = n_splines + 1  # periodic and include_bias is True
#     return SplineTransformer(
#         degree=degree,
#         n_knots=n_knots,
#         knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
#         extrapolation="periodic",
#         include_bias=True)

In [7]:
# def seasonality_spline_features(hours=np.arange(1,32)):
#     hour_df = pd.DataFrame(np.linspace(1, 32, 32).reshape(-1, 1),columns=["dayofmonth"])
#     splines = periodic_spline_transformer(32, n_splines=4).fit_transform(hour_df)
#     splines_df = pd.DataFrame(splines,columns=[f"spline_{i}" for i in range(splines.shape[1])])
#     splines_df =pd.concat([pd.Series(hours,name='dayofmonth'), splines_df], axis="columns")
    
#     return splines_df

In [8]:
def seasonality_features(df):
    df['month_sin'] = np.sin(2*np.pi*df.month/12)
    df['month_cos'] = np.cos(2*np.pi*df.month/12)
    df['day_sin'] = np.sin(2*np.pi*df.dayofmonth/24)
    df['day_cos'] = np.cos(2*np.pi*df.dayofmonth/24)
    return df

In [9]:
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

df_train = seasonality_features(df_train)
df_test = seasonality_features(df_test)

df_train = get_holidays(df_train)
df_test = get_holidays(df_test)

df_train = fix_covid(df_train)

print(f'[INFO] Shapes:'
      f'\n train: {df_train.shape}'
      f'\n test: {df_test.shape}\n')

print(f'[INFO] Any missing values:'
      f'\n train: {df_train.isna().any().any()}'
      f'\n test: {df_test.isna().any().any()}\n')

df_train.head()

[INFO] Shapes:
 train: (136950, 16)
 test: (27375, 15)

[INFO] Any missing values:
 train: False
 test: False



Unnamed: 0_level_0,date,country,store,product,num_sold,year,month,dayofmonth,dayofweek,dayname,dayofyear,month_sin,month_cos,day_sin,day_cos,is_holiday
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,2017-01-01,Argentina,Kaggle Learn,Improve Your Coding,4.143135,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1
1,2017-01-01,Argentina,Kaggle Learn,Train More LLMs,4.189655,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1
2,2017-01-01,Argentina,Kaggle Learn,Win Friends and Influence People,2.197225,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1
3,2017-01-01,Argentina,Kaggle Learn,Win More Kaggle Competitions,4.077537,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1
4,2017-01-01,Argentina,Kaggle Learn,Write Better,3.89182,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1


In [10]:
#Get list of major holidays for ['Argentina' 'Canada' 'Estonia' 'Japan' 'Spain'] and append to df 

def GetHolidays(df):
    
    # Analyzing country specific holidays:-
    Holidays = pd.DataFrame(columns = ['date', 'holiday_name', 'country'])
    country_list = df.country.unique()
    min_year = df.year.min()
    max_year = df.year.max()
    number_of_days = (df['date'].max() - df['date'].min()).days + 1
    date_list = [df['date'].min() + timedelta(days=day) for day in range(number_of_days)]
    
    if min_year == max_year:
        years = [min_year]
    else:
        years = np.arange(min_year, max_year, 1)
        
    for country in tqdm(country_list):
        for h in CountryHoliday(country, years = years).items():   
            i=len(Holidays)
            Holidays.loc[i,'date']=h[0]
            Holidays.loc[i,'holiday_name']=h[1]
            Holidays.loc[i,'country']=country
    #print(Holidays)
    Holidays['isHoliday'] = 1

    # Merge on unique combinations of date and country
    date_country = df[['date', 'country']].drop_duplicates().reset_index(drop=True)
    date_country['date_str'] = date_country['date'].astype(str)
    Holidays['date_str'] = Holidays['date'].astype(str)
    date_country_holidays = pd.merge(date_country, Holidays[['date_str', 'country', 'isHoliday','holiday_name']], how='left', on=['date_str', 'country'])

    # Merge back to the original DataFrame
    df = pd.merge(df, date_country_holidays[['date', 'country', 'isHoliday','holiday_name']], how='left', on=['date', 'country'])
    df['isHoliday'] = df['isHoliday'].fillna(0).astype(int)
    df['holiday_name'] = df['holiday_name'].fillna('Not Holiday')
    
    #Add Weekend
    #df['Friday']=(df['dayname']=='Friday').astype(int) 
    #df['Saturday']=(df['dayname']=='Saturday').astype(int) 
    #df['Sunday']=(df['dayname']=='Sunday').astype(int)  
    return df,Holidays

In [11]:
df_train, TrainHolidays = GetHolidays(df_train)
df_test, TestHolidays = GetHolidays(df_test)

100%|██████████| 5/5 [00:00<00:00, 58.01it/s]
100%|██████████| 5/5 [00:00<00:00, 175.80it/s]


In [12]:
#df_train = df_train.loc[df_train['year'] != 2020]
X_test = df_test.reset_index(drop=True)

X_train_ = df_train[df_train.date < "2021-01-01"].drop(columns = ["num_sold"])
y_train_ = df_train[df_train.date < "2021-01-01"]["num_sold"].copy()
X_val = df_train[df_train.date >= "2021-01-01"].drop(columns = ["num_sold"])
y_val = df_train[df_train.date >= "2021-01-01"]["num_sold"].copy()

X_train = df_train.drop(columns=["num_sold"]).reset_index(drop=True)
y_train = df_train["num_sold"].reset_index(drop=True)

# Create an instance of LabelEncoder
le = LabelEncoder()

X_train['store'] = le.fit_transform(X_train['store'])
X_train['product'] = le.fit_transform(X_train['product'])

X_test['store'] = le.fit_transform(X_test['store'])
X_test['product'] = le.fit_transform(X_test['product'])

print(f"X_train shape :{X_train.shape} , y_train shape :{y_train.shape}")
print(f"X_test shape :{X_test.shape}")

X_train.head()

X_train shape :(136950, 17) , y_train shape :(136950,)
X_test shape :(27375, 17)


Unnamed: 0,date,country,store,product,year,month,dayofmonth,dayofweek,dayname,dayofyear,month_sin,month_cos,day_sin,day_cos,is_holiday,isHoliday,holiday_name
0,2017-01-01,Argentina,1,0,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1,1,Año Nuevo
1,2017-01-01,Argentina,1,1,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1,1,Año Nuevo
2,2017-01-01,Argentina,1,2,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1,1,Año Nuevo
3,2017-01-01,Argentina,1,3,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1,1,Año Nuevo
4,2017-01-01,Argentina,1,4,2017,1,1,6,Sunday,1,0.5,0.866025,0.258819,0.965926,1,1,Año Nuevo


---

In [13]:
def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [14]:
# cat_params = {
#     'n_estimators': 195,
#     'learning_rate': 0.07725732658711602,
#     'depth': 7,
#     'l2_leaf_reg': 8.601133541582584,
#     'subsample': 0.4279526734063217,
#     'colsample_bylevel': 0.6767696482697301,
#     'random_state': 42,
#     'verbose': False
# }

cat_params={
    #'n_estimators': 295,
    'learning_rate': 0.07725732658711602,
    'depth': 15,
    'l2_leaf_reg': 8.601133541582584,
    'subsample': 0.4279526734063217,
    'colsample_bylevel': 0.6767696482697301,
    "random_state":42,
    'verbose':False}


#model = lgb.LGBMRegressor(**lgb_optuna)

cat_features = ["country", "store", "product", "dayname", "holiday_name"]
catboost_reg = CatBoostRegressor(**cat_params, cat_features = cat_features)
catboost_reg.fit(X_train_, y_train_, eval_set=(X_val, y_val), silent=True)

training_predictions = np.exp(catboost_reg.predict(X_val))

In [15]:
training_predictions.sum()

4564827.999391194

In [16]:
print("SMAPE score Jan to Mar 2021:",smape(np.exp(y_val)[:6750],training_predictions[:6750]))
print("SMAPE score Apr to Dec 2021:",smape(np.exp(y_val)[6750:],training_predictions[6750:]))

SMAPE score Jan to Mar 2021: 10.682869041003775
SMAPE score Apr to Dec 2021: 16.64263729649153


---
確認

In [17]:
import make_graph    # 自前
from bokeh.plotting import figure, output_notebook, show
output_notebook()

In [22]:
CSP = {col: list(df_train[col].unique()) for col in ["country", "store", "product"]}

def select_csp(df, idx):
    """インデックス指定で各名称取得、DataFrameフィルタ"""
    if len(idx) != 3:
        return (), pd.DataFrame()
    df_ret = df.copy()
    for i, col in zip(idx, ["country", "store", "product"]):
        if i < 0:
            continue
        idx_buf = i
        if i >= len(CSP[col]):
            print(f"[error] 指定インデックスが範囲外です。 ({col}: {CSP[col]})")
            idx_buf = 0
        df_ret = df_ret[df_ret[col] == CSP[col][idx_buf]]

    sel = {col: list(df_ret[col].unique()) for col in ["country", "store", "product"]}
    return sel, df_ret

In [25]:
def get_plot_dataframe(X_train, X_valid, train_pred, valid_pred, pred):
    """学習とテスト確認用 DataFrame 作成"""
    cols = ["date", "country", "store", "product"]
    # オリジナル
    df_plt = df_train[cols + ["num_sold"]].copy()
    df_plt["kind"] = "orig"
    # 学習結果
    df_train_pred = X_train[cols].copy()
    df_train_pred["num_sold"] = train_pred
    df_train_pred["kind"] = "train"
    # 検証結果
    df_valid_pred = X_valid[cols].copy()
    df_valid_pred["num_sold"] = valid_pred
    df_valid_pred["kind"] = "valid"
    # 実際の予測
    df_pred = df_test.copy()
    df_pred["num_sold"] = pred
    df_pred["kind"] = "test"

    df_plt = pd.concat([df_plt, df_train_pred, df_valid_pred, df_pred]).reset_index(drop=True)
    df_plt["num_sold"] = np.exp(df_plt["num_sold"])
    # print(df_plt.shape)
    return df_plt

In [31]:
X_train_.tail(3)

Unnamed: 0,date,country,store,product,year,month,dayofmonth,dayofweek,dayname,dayofyear,month_sin,month_cos,day_sin,day_cos,is_holiday,isHoliday,holiday_name
109572,2020-12-31,Spain,Kagglazon,Win Friends and Influence People,2020,12,31,3,Thursday,366,-2.449294e-16,1.0,0.965926,-0.258819,0,0,Not Holiday
109573,2020-12-31,Spain,Kagglazon,Win More Kaggle Competitions,2020,12,31,3,Thursday,366,-2.449294e-16,1.0,0.965926,-0.258819,0,0,Not Holiday
109574,2020-12-31,Spain,Kagglazon,Write Better,2020,12,31,3,Thursday,366,-2.449294e-16,1.0,0.965926,-0.258819,0,0,Not Holiday


In [32]:
train_pred = catboost_reg.predict(X_train_)
valid_pred = catboost_reg.predict(X_val)
pred = catboost_reg.predict(X_test)

df_plt = get_plot_dataframe(X_train_, X_val, train_pred, valid_pred, pred)
csp, df_buf = select_csp(df_plt, [0, 0, 0])
ttl = f"{[csp[k][0] for k in csp.keys() if len(csp[k]) < 2]} (CatBoost)"
p = make_graph.make_trend(df_buf, "num_sold", "kind", 900, 300, ttl)
show(p)

---

In [20]:
cat_features = ["country", "store", "product", "dayname", "holiday_name"]
catboost_reg = CatBoostRegressor(**cat_params, cat_features = cat_features)
catboost_reg.fit(X_train, y_train, silent=True)

pred = np.exp(catboost_reg.predict(X_test))
X_test["pred"] = pred

In [21]:
#https://www.kaggle.com/code/iqbalsyahakbar/ps3e19-time-series-for-beginners

def multipliers(predictors, prediction, canada = 1, japan = 1, spain = 1, estonia = 1, argentina = 1):
    prediction[predictors.country == 'Canada'] *= canada
    prediction[predictors.country == 'Japan'] *= japan
    prediction[predictors.country == 'Spain'] *= spain
    prediction[predictors.country == 'Estonia'] *= estonia
    prediction[predictors.country == 'Argentina'] *= argentina
    return prediction

In [22]:
X_test["pred"] = np.round(multipliers(X_test, X_test["pred"] * 1.5, .58, .76, 1.01, 1.08, 2.8))

In [3]:
import numpy as np
1.5 * np.array([.58, .76, 1.01, 1.08, 2.8])

array([0.87 , 1.14 , 1.515, 1.62 , 4.2  ])

In [23]:
submission['num_sold'] = X_test["pred"]
submission.to_csv('submission_.csv', index=False)
submission

Unnamed: 0,id,num_sold
0,136950,16.0
1,136951,16.0
2,136952,8.0
3,136953,16.0
4,136954,15.0
...,...,...
27370,164320,10.0
27371,164321,10.0
27372,164322,7.0
27373,164323,10.0
