In [None]:
# import standard libraries
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import optuna
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics
import seaborn as sns
from functools import partial

# show all columns when displaying DataFrames.
# The fully qualified option name is used because the bare 'max_columns'
# pattern is ambiguous in newer pandas releases (it also matches a Styler
# option), which makes set_option raise an OptionError.
pd.set_option('display.max_columns', None)

In [None]:
# load the competition files: training set, test set and the sample submission
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-aug-2021/test.csv')
sample = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# the 'id' column is removed from both the training and the test frame
train, test = (frame.drop(columns='id') for frame in (train, test))

# Simple EDA

In [None]:
# histograms with KDE for every column of the training frame.
# The subplot grid is derived from the actual column list instead of being
# hard-coded to a fixed number of features plus the target, so the cell keeps
# working if columns are added or removed.
plot_cols = train.columns.tolist()
n_grid_cols = 4
# ceiling division; at least one row so the figure height stays positive
n_grid_rows = max(1, -(-len(plot_cols) // n_grid_cols))

plt.figure(figsize=(24, 6 * n_grid_rows))
for i, col in enumerate(plot_cols):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.histplot(train[col], kde=True)
plt.show()

In [None]:
# pairwise correlation of all training columns, drawn as a heat map
corr = train.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, ax=ax)
plt.show()

In [None]:
# for each column, print the ten other columns with the highest (signed)
# correlation to it
cols = train.columns.tolist()
for col in cols:
    print(col)
    # Drop the column's own entry by label instead of assuming it is the
    # first element after sorting: with a tie at 1.0 (or NaN correlations)
    # a positional skip could remove a different column.
    print(corr[col].drop(labels=col).sort_values(ascending=False).head(10))
    print('=======================')

In [None]:
# split the training frame into a target vector and a feature matrix,
# both as NumPy arrays
y = train['loss'].to_numpy()
X = train.drop(columns='loss').to_numpy()

# CatBoost with Optimized Hyperparameters

In [None]:
# CatBoost hyperparameters, unpacked into the regressor constructor below
params = dict(
    depth=5,
    grow_policy="Lossguide",
    l2_leaf_reg=29.6703901792061,
    random_strength=0.00156814441573572,
)


# K-fold cross-validation setup
n_splits = 5
kf = model_selection.KFold(n_splits=n_splits)

# per-fold scores, filled in by the training loop
scores_train, scores_valid = [], []

# prediction buffers: one slot per training row (validation predictions)
# and one slot per test row (test predictions accumulated over the folds)
preds_valid_array = np.zeros(X.shape[0])
preds_test_array = np.zeros(test.shape[0])

# Cross-validation loop: for every fold, fit a CatBoost regressor on the
# training part, predict the training part, the held-out part and the test
# set, store the predictions and record the RMSE of the fold.
for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    # A fresh model per fold. use_best_model and early stopping are evaluated
    # on the eval_set handed to fit() below.
    model = CatBoostRegressor(
        **params,
        learning_rate=0.03,
        iterations=10000,
        loss_function='RMSE',
        eval_metric='RMSE',
        use_best_model=True,
        early_stopping_rounds=100,
        task_type="GPU",
        devices='0:1'
    )

    # use_best_model is already set on the constructor, so it is not repeated here
    model.fit(
        x_train, y=y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=100
    )

    # all predictions are clipped to the interval [0, 50]
    preds_train = np.clip(model.predict(x_train), 0, 50)
    preds_valid = np.clip(model.predict(x_valid), 0, 50)
    preds_test = np.clip(model.predict(test), 0, 50)

    # each row is in the validation part of exactly one fold, so a plain
    # assignment fills the out-of-fold buffer; test predictions are averaged
    preds_valid_array[valid_idx] = preds_valid
    preds_test_array += preds_test / n_splits

    # Scoring failures (e.g. non-finite predictions) are reported instead of
    # being silently discarded; NaN is recorded so that the score lists stay
    # aligned with the folds and the problem is visible in the summary.
    try:
        score_train = np.sqrt(metrics.mean_squared_error(y_train, preds_train))
        score_valid = np.sqrt(metrics.mean_squared_error(y_valid, preds_valid))
    except ValueError as err:
        print(f"Fold {fold+1}: RMSE could not be computed ({err})")
        score_train = score_valid = np.nan
    else:
        print(score_valid)
    scores_train.append(score_train)
    scores_valid.append(score_valid)

# cross-validation summary: mean and sample standard deviation (ddof=1)
# of the per-fold scores
train_mean, train_std = np.mean(scores_train), np.std(scores_train, ddof=1)
print('Mean train score =', train_mean, 'STD train =', train_std)
valid_mean, valid_std = np.mean(scores_valid), np.std(scores_valid, ddof=1)
print('Mean valid score =', valid_mean, 'STD valid =', valid_std)

# save the validation predictions as a single 'loss' column
oof_frame = pd.DataFrame({'loss': preds_valid_array})
oof_frame.to_csv('catboost1_valid.csv', index=False)

# write the test predictions into the second column of the sample
# submission frame and save it
sample.iloc[:, 1] = preds_test_array
sample.to_csv('catboost1_test.csv', index=False)