# Introduction

In this simple model, we'll train CatBoost with the hyperparameters listed in the Model Building section. We'll use the continuous features `cont0`–`cont13` and the categorical features `cat0`–`cat9`. Details below.

# Import Data

In [None]:
import pandas as pd
import numpy as np

# Single place to change when the competition data lives somewhere else.
DATA_DIR = "../input/tabular-playground-series-feb-2021"

# Read the raw competition tables; later cells refer to them as `train` and `test`.
train = pd.read_csv("{}/train.csv".format(DATA_DIR))
test = pd.read_csv("{}/test.csv".format(DATA_DIR))

# Preview only the first rows instead of rendering the whole frame.
train.head()

# Category Conversion

For CatBoost, setting categorical columns to be of type `category` speeds up training.

In [None]:
# Cast the ten categorical columns (cat0 .. cat9) to pandas `category` dtype in
# both frames. Each frame is converted on its own, so the category set of a
# column is derived from the values present in that frame.
categorical_columns = ["cat{}".format(i) for i in range(10)]

for frame in (train, test):
    for column in categorical_columns:
        frame[column] = frame[column].astype("category")

# Features

In [None]:
# Names of the categorical input columns: "cat0" through "cat9".
cat_features = ["cat{}".format(idx) for idx in range(10)]

# Names of the continuous input columns: "cont0" through "cont13".
cont_features = ["cont{}".format(idx) for idx in range(14)]

# Model Building

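We'll train one CatBoost model per fold of a 10-fold cross-validation. The out-of-fold predictions are used to estimate the overall RMSE on the training set, and the final submission is the average of the 10 models' predictions on the test set.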

In [None]:
import gc

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

n_folds = 10

# Shuffled K-fold with a fixed seed so the fold assignment is reproducible.
skf = KFold(n_splits=n_folds, random_state=2021, shuffle=True)

# One out-of-fold prediction slot per training row; every row is filled exactly
# once, by the model that did not see it during training.
train_oof = np.zeros(len(train))
# Accumulates the mean of the per-fold predictions on the test set.
test_preds = np.zeros(len(test))

# Model inputs: categorical columns first, then continuous ones.
features = cat_features + cont_features

cat_params = {
    "cat_features": cat_features,
    "verbose": 500,
    "eval_metric": "RMSE",
    "loss_function": "RMSE",
    "random_state": 2021,
    # Upper bound on boosting rounds; training is expected to stop earlier via
    # the early-stopping settings below and in `fit`.
    "num_boost_round": 20000,
    "od_type": "Iter",
    "od_wait": 200,
    # NOTE: requires a GPU runtime; device 0 is used.
    "task_type": "GPU",
    "devices": "0",
    "max_depth": 4,
    "learning_rate": 0.010155932673991064,
    "l2_leaf_reg": 9.083564967810792,
    "bagging_temperature": 1.5604104823748561,
    "penalties_coefficient": 2.0037171127502633,
    "grow_policy": "Depthwise",
}

# NOTE: created empty and never written to in this cell.
importances = pd.DataFrame()

for fold, (train_index, valid_index) in enumerate(skf.split(train, train["target"])):
    print("-------> Fold {} <--------".format(fold + 1))

    # Split features and target into this fold's training and validation parts.
    x_train_features = train.iloc[train_index][features]
    x_valid_features = train.iloc[valid_index][features]
    y_train = train["target"].iloc[train_index]
    y_valid = train["target"].iloc[valid_index]

    model = CatBoostRegressor(**cat_params)
    model.fit(
        x_train_features,
        y_train,
        eval_set=[(x_valid_features, y_valid)],
        verbose=500,
        early_stopping_rounds=200,
    )

    # Held-out predictions go into the OOF vector; test predictions are
    # averaged across folds by adding 1/n_folds of each model's output.
    oof_preds = model.predict(x_valid_features)
    train_oof[valid_index] = oof_preds
    test_preds += model.predict(test[features]) / n_folds
    print("")

# RMSE computed as sqrt(MSE) so it does not depend on the `squared=` keyword,
# which newer scikit-learn releases no longer accept.
oof_rmse = float(np.sqrt(mean_squared_error(train["target"], train_oof)))
print("--> Overall results for out of fold predictions")
print(": RMSE = {}".format(oof_rmse))

# Generate Submission

In [None]:
# Pair each test-row id with its fold-averaged prediction.
preds = test_preds.tolist()
test_ids = test["id"].tolist()

# Two-column table: `id` and the predicted `target`.
submission = pd.DataFrame({"id": test_ids, "target": preds})
# Fixed the misspelled output name ("sumbission.csv").
submission.to_csv("submission.csv", index=False)