In [None]:
from catboost import CatBoostRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
import random
import os

In [None]:
def seed_everything(seed=42):
    """Seed the random number sources used by this notebook.

    Exports ``PYTHONHASHSEED`` to the process environment and seeds both
    Python's ``random`` module and NumPy's global generator with the same
    value.

    Args:
        seed: Integer seed applied everywhere (default ``42``).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the same integer, so apply them in one pass.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

In [None]:
# Single seed for the whole notebook: passed to seed_everything here and
# reused as `random_state` by the split and the CatBoost models below.
SEED = 42

seed_everything(SEED)

In [None]:
# Read the training CSV, then move the `row_id` column out of the frame
# (`pop` removes it from train_df in place and returns it as a Series).
train_path = "../input/tabular-playground-series-jan-2022/train.csv"
train_df = pd.read_csv(train_path)
train_ids = train_df.pop("row_id")

<h1>Feature Engineering</h1>

In [None]:
# Feature engineering for the training frame, written as a single chain:
#   1. parse `date` into datetimes,
#   2. derive numeric `day` / `month` / `year` columns from it,
#   3. drop the original `date` column,
#   4. one-hot encode the remaining non-numeric columns with get_dummies.
# `assign` evaluates its keyword arguments in order, so the later lambdas see
# the already-parsed `date` column.
train_df = (
    train_df
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        day=lambda frame: frame["date"].dt.day,
        month=lambda frame: frame["date"].dt.month,
        year=lambda frame: frame["date"].dt.year,
    )
    .drop("date", axis=1)
    .pipe(pd.get_dummies, prefix=None)
)

In [None]:
# Separate the regression target: `pop` removes `num_sold` from train_df in
# place, leaving only feature columns, and returns it as a Series.
targets = train_df.pop("num_sold")

<h1>Train & Validation Split</h1>

In [None]:
# Ordered hold-out split: with shuffle=False the first half of the rows becomes
# the training set and the second half the validation set. `random_state` has
# no effect when shuffling is disabled; it is kept for parity with the rest of
# the notebook's seeding.
train_data, validation_data, train_targets, validation_targets = train_test_split(
    train_df,
    targets,
    test_size=0.5,
    shuffle=False,
    random_state=SEED,
)

<h1>Modeling & Training</h1>

In [None]:
# CatBoost configuration for the single hold-out model.
model_parameters = dict(
    learning_rate=1e-3,
    iterations=1000,
    verbose=250,                 # log every 250 iterations
    early_stopping_rounds=100,   # stop when the eval metric stalls for 100 rounds
    random_state=SEED,
    eval_metric="SMAPE",
)

# Build the regressor, then fit it while monitoring the validation half.
# `fit` returns the fitted estimator; re-assigning keeps the cell free of output.
model = CatBoostRegressor(**model_parameters)
model = model.fit(
    train_data,
    train_targets,
    eval_set=(validation_data, validation_targets),
)

<h1>Inferencing</h1>

In [None]:
def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``.
    Elements where both the target and the prediction are zero contribute 0
    to the mean instead of producing a division by zero.

    Args:
        y_true: Array-like of ground-truth values (list, ndarray or Series).
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        The mean SMAPE over all elements as a float.
    """
    # Work on plain float arrays so lists and Series with arbitrary indexes
    # are handled positionally and uniformly.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Use the magnitude of BOTH terms; without abs() on y_true a negative
    # target could cancel the prediction and wrongly zero the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with, and no runtime warning is emitted.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff)

In [None]:
# Read the test CSV, then move the `row_id` column out of the frame
# (`pop` removes it from test_df in place); the ids are needed for submission.
test_path = "../input/tabular-playground-series-jan-2022/test.csv"
test_df = pd.read_csv(test_path)
test_ids = test_df.pop("row_id")

In [None]:
# Apply the same feature engineering to the test frame as to the training one:
# parse `date`, derive `day` / `month` / `year`, drop `date`, then one-hot
# encode the remaining non-numeric columns. `assign` evaluates its keyword
# arguments in order, so the later lambdas see the already-parsed `date` column.
test_df = (
    test_df
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        day=lambda frame: frame["date"].dt.day,
        month=lambda frame: frame["date"].dt.month,
        year=lambda frame: frame["date"].dt.year,
    )
    .drop("date", axis=1)
    .pipe(pd.get_dummies, prefix=None)
)

In [None]:
# Predict on the engineered test features with the model currently bound to
# `model`.
test_predictions = model.predict(test_df)

In [None]:
def make_submission(ids, predictions, path="submission.csv"):
    """Assemble a submission table and write it to disk as CSV.

    Args:
        ids: Sequence of row identifiers, stored in the ``row_id`` column.
        predictions: Sequence of predicted values, stored in ``num_sold``.
        path: Destination of the CSV file (written without the index).

    Returns:
        The ``pandas.DataFrame`` that was written.

    Raises:
        AssertionError: If ``ids`` and ``predictions`` differ in length.
    """
    n_ids, n_predictions = len(ids), len(predictions)
    assert n_ids == n_predictions, f"Lengths of `ids` ({n_ids}) and `predictions` ({n_predictions}) aren't the same."

    submission = pd.DataFrame({"row_id": ids, "num_sold": predictions})
    submission.to_csv(path, index=False)
    return submission

In [None]:
# Write the current test predictions to submission_05.csv; the returned frame
# is the cell's displayed output.
make_submission(test_ids, test_predictions, path="submission_05.csv")

In [None]:
# Time-series cross-validation: train one CatBoost model per fold, score it on
# that fold's validation slice with SMAPE, and average the per-fold predictions
# on the test set.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of train_df are in chronological order - confirm against the CSV.
n_folds = 10
strategy = TimeSeriesSplit(n_splits=n_folds)
folds = strategy.split(train_df, targets)

# The hyper-parameters are identical for every fold, so build them once.
model_parameters = {
    "learning_rate": 1e-3,
    "iterations": 1000,
    "verbose": 250,
    "early_stopping_rounds": 100,
    "random_state": SEED,
    "eval_metric": "SMAPE",
}

scores = []
# Running sum of the per-fold test predictions; divided by n_folds afterwards.
test_predictions = np.zeros(len(test_df))
for i, (train_indexes, validation_indexes) in enumerate(folds):
    # The splitter yields positional indices, so select with .iloc on both the
    # features and the targets (plain `targets[...]` would do a label lookup
    # and is only correct while the index is the default RangeIndex).
    train_data = train_df.iloc[train_indexes]
    validation_data = train_df.iloc[validation_indexes]
    train_targets = targets.iloc[train_indexes]
    validation_targets = targets.iloc[validation_indexes]

    model = CatBoostRegressor(**model_parameters).fit(
        train_data,
        train_targets,
        eval_set=(validation_data, validation_targets),
    )

    validation_predictions = model.predict(validation_data)
    validation_score = SMAPE(validation_targets, validation_predictions)
    scores.append(validation_score)

    # Predictions of this fold's model on the test set (these are test
    # predictions, not out-of-fold predictions).
    fold_test_predictions = model.predict(test_df)
    test_predictions += fold_test_predictions

# Average the accumulated test predictions over all folds.
test_predictions /= n_folds

scores = np.array(scores)

In [None]:
# Mean of the per-fold validation SMAPE scores (displayed as the cell output).
scores.mean()

In [None]:
# Write the current test predictions to submission_10folds.csv; the returned
# frame is the cell's displayed output.
make_submission(test_ids, test_predictions, path="submission_10folds.csv")