In [None]:
import numpy as np
import pandas as pd

In [None]:
def load_data(path):
    """Read a CSV file into a DataFrame, parsing its 'date' column as datetimes.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The file's contents with the ``date`` column parsed as datetimes.
    """
    date_columns = ['date']
    frame = pd.read_csv(path, parse_dates=date_columns)
    return frame

In [None]:
# Load the competition's train and test CSVs from the Kaggle input directory;
# load_data parses the 'date' column of each file as datetimes.
train = load_data('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test = load_data('/kaggle/input/tabular-playground-series-jan-2022/test.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Summarize the training frame: columns, dtypes and non-null counts.
train.info()

In [None]:
# Same summary for the test frame, to compare its columns and dtypes with train.
test.info()

In [None]:
# Columns handed to CatBoost as categorical features through params["cat_features"].
# "holiday_name" is not in the raw data; it is created later by engineer().
cat_features = ["country", "store", "product", "holiday_name"]

In [None]:
# Number of training rows per country.
train["country"].value_counts()

In [None]:
# Number of training rows per store.
train["store"].value_counts()

In [None]:
# Number of training rows per product.
train["product"].value_counts()

# Outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Box plot of the target column (num_sold) to look for extreme values.
sns.boxplot(y="num_sold", data=train)
plt.tight_layout()
plt.show()

In [None]:
# Training rows whose num_sold exceeds 1800, largest first.
# NOTE(review): the 1800 cut-off is presumably read off the box plot above — confirm.
(
    train
    .loc[train["num_sold"] > 1800]
    .sort_values("num_sold", ascending=False)
)

# Feature Engineering

In [None]:
import holidays

# Calendar years for which the holiday calendars are pre-populated (2015-2019).
years = list(range(2015, 2020))

# (country name used as the holiday_dict key, country code passed to `holidays`).
# NOTE(review): the names presumably match the values of the `country` column — confirm.
country_list = [("Finland", "FI"), ("Norway", "NO"), ("Sweden", "SE")]

# One holiday calendar per country, keyed by country name; used by get_holiday_name().
holiday_dict = {
    name: holidays.CountryHoliday(code, years=years)
    for name, code in country_list
}

In [None]:
def get_holiday_name(row):
    """Return the holiday name for a row's country and date, or "NA" if none.

    Looks the row up in the module-level ``holiday_dict`` (country name ->
    holiday calendar).

    Parameters
    ----------
    row : mapping-like
        Must provide ``row["country"]`` and ``row["date"]`` (e.g. a DataFrame
        row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        The value stored in the calendar for that date, or the sentinel
        string "NA" when the lookup fails.
    """
    try:
        return holiday_dict[row["country"]][row["date"]]
    # Only lookup failures mean "no holiday". A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    #   KeyError              - unknown country, or the date is not in the calendar.
    #   TypeError/ValueError  - presumably raised by the calendar for keys it cannot
    #                           interpret as a date (e.g. missing values); kept so
    #                           such rows still map to "NA" as before.
    except (KeyError, TypeError, ValueError):
        return "NA"

In [None]:
def engineer(df):
    """Build the model feature frame from a raw train/test frame.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime ``date`` column plus ``country`` and ``row_id``
        columns. Any other columns are passed through unchanged.

    Returns
    -------
    pd.DataFrame
        The input columns minus ``row_id`` and ``date``, followed by:
        ``time_step`` (whole days since 2015-01-01), ``year``, then for each of
        ``quarter``, ``month``, ``week``, ``day``, ``day_of_year`` and
        ``day_of_week`` the raw value and its ``_sin`` / ``_cos`` encoding,
        then ``is_weekend``, ``holiday_name`` and ``is_holiday``.
    """
    df = df.copy()
    dates = df['date']

    # Whole days elapsed since the reference date. `.dt.days` replaces
    # `.astype('timedelta64[D]').astype(np.int)`: `np.int` was removed in
    # NumPy 1.24 and pandas 2.x rejects casts to day-resolution timedeltas.
    t0 = pd.Timestamp('2015-01-01')
    df['time_step'] = (dates - t0).dt.days

    df['year'] = dates.dt.year

    # (column name, values, period used for the sin/cos encoding).
    # The periods are kept exactly as before, including 52 for the ISO week
    # (which can reach 53), 31 for the day of month and 366 for the day of year.
    calendar_parts = [
        ('quarter', dates.dt.quarter, 4),
        ('month', dates.dt.month, 12),
        # `Series.dt.week` was removed in pandas 2.0; isocalendar().week gives
        # the same ISO week number but as UInt32, so cast back to int64.
        ('week', dates.dt.isocalendar().week.astype('int64'), 52),
        ('day', dates.dt.day, 31),
        ('day_of_year', dates.dt.day_of_year, 366),
        ('day_of_week', dates.dt.weekday, 7),
    ]
    for name, values, period in calendar_parts:
        angle = values * (2 * np.pi / period)
        df[name] = values
        df[f'{name}_sin'] = np.sin(angle)
        df[f'{name}_cos'] = np.cos(angle)

    # weekday is 0 (Monday) .. 6 (Sunday), so >= 5 selects Saturday and Sunday.
    df["is_weekend"] = df['day_of_week'] >= 5

    # get_holiday_name only reads "country" and "date", so pass just those two
    # columns instead of building a full-width row object for every record.
    df['holiday_name'] = df[['country', 'date']].apply(get_holiday_name, axis=1)
    df['is_holiday'] = np.where(df['holiday_name'] != "NA", 1, 0)

    # Drop the identifier and the raw date; the date is fully described by the
    # derived features above.
    return df.drop(columns=["row_id", "date"])

In [None]:
# Feature frame for the training data (engineer() works on a copy of `train`).
train_proc = engineer(train)

In [None]:
# Separate the engineered training frame into the feature matrix and the target.
X = train_proc.drop(columns="num_sold")
y = train_proc["num_sold"]

# CatBoost Model

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Base CatBoost settings shared by cross-validation and the final fit.
# Training minimizes MAPE while SMAPE is the metric read back during evaluation.
params = dict(
    loss_function="MAPE",
    eval_metric="SMAPE",
    cat_features=cat_features,
    iterations=1800,
    random_seed=63,
)

# Cross-Validation

In [None]:
def cv(new_params, years=(2015, 2016, 2017, 2018), verbose=False):
    """Leave-one-year-out cross-validation of a CatBoost configuration.

    For each year in ``years`` the rows of the module-level ``X`` / ``y`` with
    that ``year`` form the validation fold and all other rows form the training
    fold. A ``CatBoostRegressor`` is built from the module-level ``params``,
    overridden by ``new_params``, and fitted with the validation fold as
    ``eval_set`` and ``use_best_model=True``.

    Parameters
    ----------
    new_params : dict
        Parameter overrides applied on top of ``params`` via ``set_params``.
    years : iterable of int, optional
        Years to hold out, one fold each. Defaults to 2015-2018.
    verbose : bool, optional
        When True, print the best iteration and score of every fold and the
        final average. Defaults to False (silent).

    Returns
    -------
    float
        Mean over folds of the best validation SMAPE reported by CatBoost.

    Raises
    ------
    ValueError
        If ``years`` is empty.

    Notes
    -----
    NOTE(review): the score of each fold is the best value seen on that same
    fold, so it is likely optimistic relative to a truly unseen year — confirm
    this is acceptable when it is only used to rank configurations.
    """
    scores = []

    for year in years:
        is_val = X['year'] == year

        model = CatBoostRegressor(**params)
        model.set_params(**new_params)
        model.fit(
            X[~is_val],
            y[~is_val],
            eval_set=(X[is_val], y[is_val]),
            use_best_model=True,
            verbose=False,
        )

        score = model.get_best_score()["validation"]["SMAPE"]
        scores.append(score)

        if verbose:
            print(f"Fold: {year}")
            print("-" * 80)
            print(f"Best iteration: {model.get_best_iteration()}")
            print(f"Best score: {score}")
            print()

    if not scores:
        raise ValueError("cv() needs at least one validation year")

    avg_score = sum(scores) / len(scores)

    if verbose:
        print("-" * 80)
        print(f"Avg score: {avg_score}")
        print("-" * 80)
        print()

    return avg_score

# Grid Search

In [None]:
from itertools import product

def get_grid_permutations(grid_dict):
    """Expand a parameter grid into every combination of its values.

    Parameters
    ----------
    grid_dict : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the value lists,
        each mapping every parameter name to one chosen value. Keys keep the
        order of ``grid_dict``; the last parameter varies fastest.
    """
    names = list(grid_dict)
    return [dict(zip(names, combo)) for combo in product(*grid_dict.values())]

In [None]:
%%time

# Exhaustive grid search: every combination of the values below (3 x 3 x 3 = 27)
# is scored with cv(), and the combination with the lowest average score wins.
grid = {'depth': [10, 11, 12],
        'l2_leaf_reg': [3, 4, 5],
        'border_count': [512, 768, 1024]}

grid_permutations = get_grid_permutations(grid)

# Lower is better, so start from +inf; the first combination is only a
# placeholder until a real score has been recorded.
best_score = float("inf")
best_params = grid_permutations[0]

for permutation in grid_permutations:
    print(permutation)
    print("-" * 80)
    
    score = cv(permutation)

    print(f"CV score: {score}")
    print()
    
    # Strict '<' keeps the earliest combination when scores tie.
    if score < best_score:
        best_score = score
        best_params = permutation

print("-" * 80)
print(f"Best score: {best_score}")
print(f"Best params: {best_params}")
print("-" * 80)
print()

## Fit Model with Best Parameters

In [None]:
# Refit on the full training set: base params overridden by the grid-search winner.
# No eval_set is passed here, unlike in cv(); verbose=200 sets CatBoost's logging period.
model = CatBoostRegressor(**params)
model.set_params(**best_params)
model.fit(X, y, verbose=200)

# Feature Importance

In [None]:
# Feature importances of the fitted model, returned in CatBoost's prettified form.
model.get_feature_importance(prettified=True)

# Submission

In [None]:
# Apply the same feature engineering to the test data.
test_X = engineer(test)

In [None]:
# Predict num_sold for the engineered test rows.
test_preds = model.predict(test_X)

In [None]:
# Pair each test row_id with its prediction and write the submission file
# (index=False keeps the DataFrame index out of the CSV).
output = pd.DataFrame({"row_id": test["row_id"], "num_sold": test_preds})
output.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as a final sanity check.
output