In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Packages

In [None]:
def submit(model, test_features, test_ids, filename):
    """Predict the loss for the test set and write it as a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(features)``.
    test_features : object
        Passed unchanged to ``model.predict``.
    test_ids : array-like
        One identifier per prediction; written to the ``id`` column.
    filename : str or path-like
        Destination of the CSV file (written without the index column).
    """
    # np.asarray accepts whatever the predictor returns (ndarray, list, Series,
    # (n, 1) column); ravel flattens it to the 1-D column the CSV needs.
    loss_pred = np.asarray(model.predict(test_features)).ravel()
    submission = pd.DataFrame({"id": test_ids, "loss": loss_pred})
    submission.to_csv(filename, index=False)

# Common Functions

In [None]:
# Read the competition's training and test tables into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    )
)

# EDA

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
# Preview the first rows of the test table.
test_data.head()

In [None]:
# Print a concise summary of the training table (columns, dtypes, non-null counts).
train_data.info()

In [None]:
# Descriptive statistics per column, transposed so each column becomes a row.
train_data.describe().transpose()

In [None]:
# (number of rows, number of columns) of the training table.
train_data.shape

In [None]:
# Pairwise correlation between all columns of the training table.
# The "id" column has not been dropped yet at this point, so it is included.
corr_score = train_data.corr()

In [None]:
# Correlation of every column with the "loss" target, largest value first.
corr_score["loss"].sort_values(ascending = False)

# Data Preprocessing

### Drop Id Column

In [None]:
# Remove the "id" column from both tables in place. The training ids are
# discarded; the test ids are kept because the submission file needs them.
del train_data["id"]
test_ids = test_data.pop("id")

In [None]:
# Column-wise mean and standard deviation over every row of train_data
# (computed before the train/validation split, so validation rows are included).
train_mean, train_std = train_data.mean(), train_data.std()

In [None]:
# Pull the "loss" entries out of the statistics so train_mean / train_std hold
# feature columns only; the target's own mean and std are kept separately.
train_target_mean, train_targets_std = train_mean.pop("loss"), train_std.pop("loss")

### Train Validation Split

In [None]:
# Fraction of the training rows held out for validation (used as test_size below).
validation_split = 0.2

In [None]:
# Hold out `validation_split` of the rows for validation. Both frames still
# contain the "loss" column at this point.
# random_state pins the shuffle so the split, and every score derived from it,
# is reproducible across kernel restarts.
train_features, validation_features = train_test_split(
    train_data, test_size=validation_split, random_state=42
)

In [None]:
# Detach the "loss" target from each split: pop() removes the column from the
# feature frame in place and returns it.
train_targets = train_features.pop("loss")
validation_targets = validation_features.pop("loss")

### Data Scaling

In [None]:
# Toggle for standardising the features with the training mean / std.
should_scale = False
if should_scale:
    # Apply the same statistics (train_mean / train_std) to every split.
    train_features = (train_features - train_mean) / train_std
    validation_features = (validation_features - train_mean) / train_std
    # The test table is named test_data; the previous reference to `test`
    # raised NameError as soon as scaling was switched on.
    test_features = (test_data - train_mean) / train_std
    print(test_features.head())
    print(train_features.head())
    print(validation_features.head())
else:
    # No scaling: the model consumes the raw test table.
    test_features = test_data

### Model Development

#### Using Catboost

In [None]:
import catboost
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Start the wall-clock timer for the hyperparameter search.
begin = time.time()
# Candidate values per hyperparameter. Insertion order matters: it is the order
# in which train_catboost tunes the parameters.
parameters = dict(
    depth=[6, 7, 8],
    learning_rate=[0.08, 0.1],
    iterations=[300, 350],
)
def train_catboost(hyperparameters, X_train, X_val, y_train, y_val):
    """Tune a CatBoost regressor one hyperparameter at a time.

    The keys of ``hyperparameters`` are visited in order. For each key every
    candidate value is tried while the other parameters stay at their current
    best value (initially the first entry of each list). The candidate with
    the lowest validation RMSE becomes the new best value for that key.

    Parameters
    ----------
    hyperparameters : dict
        Must contain the keys ``"iterations"``, ``"learning_rate"`` and
        ``"depth"``, each mapping to a list of candidate values.
    X_train, y_train
        Data passed to ``CatBoostRegressor.fit``.
    X_val, y_val
        Data used to compute the validation RMSE of each candidate.

    Returns
    -------
    tuple
        ``(best_cat, best_score, best_parameters)``: the fitted model with the
        lowest RMSE seen, that RMSE, and a dict with the selected value of
        ``iterations``, ``learning_rate`` and ``depth``.
    """
    keys = hyperparameters.keys()
    # Every parameter starts at the first candidate in its list.
    best_index = {key: 0 for key in keys}
    best_cat = None
    # inf (rather than a large finite sentinel) guarantees that any finite
    # score is accepted, however large the target's scale.
    best_score = float("inf")
    # (iterations, learning_rate, depth) -> (fitted model, RMSE). Each round
    # revisits the incumbent configuration; reusing the stored result avoids a
    # redundant fit and keeps the returned model consistent with the
    # returned parameters.
    evaluated = {}
    for key in keys:
        print("Find best parameter for %s" %(key))
        best_parameter = None
        temp_best = float("inf")
        for (key_index, item) in enumerate(hyperparameters[key]):
            # Vary only `key`; hold the other parameters at their current best.
            iterations = hyperparameters["iterations"][best_index["iterations"]] if key != "iterations" else item
            learning_rate = hyperparameters["learning_rate"][best_index["learning_rate"]] if key != "learning_rate" else item
            depth = hyperparameters["depth"][best_index["depth"]] if key != "depth" else item
            print("Train with iterations: %d learning_rate: %.2f depth:%d"%(iterations, learning_rate, depth))
            config = (iterations, learning_rate, depth)
            if config not in evaluated:
                cat = catboost.CatBoostRegressor(
                    iterations = iterations,
                    learning_rate = learning_rate,
                    depth = depth
                )
                cat.fit(X_train, y_train, verbose=False)
                y_pred = cat.predict(X_val)
                evaluated[config] = (cat, np.sqrt(mean_squared_error(y_val, y_pred)))
            cat, score = evaluated[config]
            print("RMSE: %.2f"%(score))
            if score < temp_best:
                # Best candidate so far for the parameter being tuned.
                temp_best = score
                best_index[key] = key_index
                best_parameter = item
            if score < best_score:
                # Best model seen across the whole search.
                best_score = score
                best_cat = cat
        print("Best Parameter for %s: "%(key), best_parameter)
    best_parameters = {
        "iterations": hyperparameters["iterations"][best_index["iterations"]],
        "learning_rate": hyperparameters["learning_rate"][best_index["learning_rate"]],
        "depth": hyperparameters["depth"][best_index["depth"]]
    }
    return best_cat, best_score, best_parameters
# Run the search on the hold-out split, report the winner and write its
# test-set predictions as a submission file.
best_cat, best_score, best_parameters = train_catboost(
    parameters, train_features, validation_features, train_targets, validation_targets
)
print("Best CatBoost Model: ", best_cat)
# train_catboost scores candidates with sqrt(mean_squared_error), i.e. RMSE.
print("Best RMSE: ", best_score)
elapsed = time.time() - begin
print("Elapsed time: ", elapsed)
submit(best_cat, test_features, test_ids, "submission.csv")

In [None]:
from sklearn.model_selection import KFold
# Re-train the tuned configuration on 5 shuffled folds of train_data. Every
# fold prints its validation RMSE and writes its own submission file.
# random_state pins the shuffle so fold membership, scores and submissions are
# reproducible from one run to the next.
fold_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_indices, val_indices) in enumerate(fold_splitter.split(train_data), start=1):
    print("Training with Fold %d"%(fold))
    fold_train = train_data.iloc[train_indices]
    fold_val = train_data.iloc[val_indices]
    # Separate features from the "loss" target without mutating the fold frames.
    X_train, y_train = fold_train.drop(columns="loss"), fold_train["loss"]
    X_val, y_val = fold_val.drop(columns="loss"), fold_val["loss"]
    if should_scale:
        X_train = (X_train - train_mean) / train_std
        X_val = (X_val - train_mean) / train_std
    cat = catboost.CatBoostRegressor(
        iterations = best_parameters["iterations"],
        learning_rate = best_parameters["learning_rate"],
        depth = best_parameters["depth"]
    )
    cat.fit(X_train, y_train, verbose=False)
    y_pred = cat.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print("RMSE: %.2f"%(score))
    submit(cat, test_features, test_ids, "submission_fold%d.csv"%(fold))

# **Thank You**