# Library imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import itertools
import time

# Exploratory data analysis

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2021/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2021/test.csv',
    )
)

In [None]:
# Summary statistics for the columns of the training table (displayed as the cell output).
train_df.describe()

In [None]:
# Summary statistics for the columns of the test table, for comparison with the training table.
test_df.describe()

In [None]:
# Pairwise correlation matrix over the columns of the training table.
# The `id` and `target` columns have not been dropped yet, so they are included here.
train_df.corr()

In [None]:
# The same correlation matrix, rendered as a heatmap so strong positive and
# negative correlations stand out by colour.
corr_matrix = train_df.corr()
sns.heatmap(corr_matrix, cmap='coolwarm')

In [None]:
# Remove the row identifier, then separate the features (X) from the
# regression target (Y).
train_df = train_df.drop(columns=['id'])
X = train_df.drop(columns=['target'])
Y = train_df['target']

In [None]:
# Build the test feature matrix from everything except the row identifier.
# `target` is dropped as well when it is already present, so that re-running
# this cell after `test_df['target']` has been assigned does not leak that
# column into the features fed to the model.
X_test = test_df.drop(columns=['id']).drop(columns=['target'], errors='ignore')
# Placeholder column; it is overwritten with the model predictions later on.
test_df['target'] = np.nan

In [None]:
# Collect the feature columns that are stored with 32-bit precision.
float_cols_train = X.select_dtypes(include=['float32']).columns.tolist()
int_cols_train = X.select_dtypes(include=['int32']).columns.tolist()

# Widen them to 64-bit to avoid problems with the number of digits after the point.
X[float_cols_train] = X[float_cols_train].astype(np.float64)
X[int_cols_train] = X[int_cols_train].astype(np.int64)

In [None]:
# Find the 32-bit columns of the *test* features. The lists must be derived from
# X_test itself: X has already been upcast at this point, so inspecting X would
# always yield empty lists and leave X_test untouched.
float_cols_test = [c for c in X_test if X_test[c].dtype == "float32"]
int_cols_test = [c for c in X_test if X_test[c].dtype == "int32"]

# Upcast to avoid some problems with the number of digits after the point
X_test[float_cols_test] = X_test[float_cols_test].astype(np.float64)
X_test[int_cols_test] = X_test[int_cols_test].astype(np.int64)

In [None]:
# Feature generation for train: for every cont feature add its square, and for
# every pair of cont features add their product. Simple, but cheap to try.
# `s` and `f` are the first and last cont-feature indices (inclusive); the test
# feature-generation cell reuses them.
s = 1
f = 14
for i in range(s, f + 1):
    base_col = X[f'cont{i}']
    X[f'cont{i}_p2'] = base_col.pow(2)
    for j in range(i + 1, f + 1):
        X[f'{i}_{j}'] = base_col.mul(X[f'cont{j}'])
X.describe()

In [None]:
# Feature generation for test: repeat the train-time recipe (squares and pairwise
# products of the cont features) so the columns are created in the same order.
# Uses the index bounds `s` and `f` defined in the train feature-generation cell.
for i in range(s, f + 1):
    base_col = X_test[f'cont{i}']
    X_test[f'cont{i}_p2'] = base_col.pow(2)
    for j in range(i + 1, f + 1):
        X_test[f'{i}_{j}'] = base_col.mul(X_test[f'cont{j}'])
X_test.describe()

# Training

In this notebook I'm using CatBoost on the engineered features (squares and pairwise products of the `cont` columns) created in the EDA section above. As we usually say in Russia: "Not great, not terrible". But it works. The CatBoost hyperparameters are not final.

In [None]:
# Data split: hold out 10% of the training rows for validation.
# The seed is fixed so that the split, and therefore the validation score, is
# the same on every Restart & Run All instead of changing from run to run.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

In [None]:
# Sanity check: look at the first few training targets produced by the split.
Y_train.head()

In [None]:
# CatBoost regressor. Snapshots are enabled, so do not forget to delete the
# previous snapshot file before starting a fresh training run.
catboost_params = dict(
    iterations=20000,
    learning_rate=0.001,
    max_depth=8,
    # Overfitting detector settings.
    od_type='Iter',
    od_wait=1000,
    eval_metric='RMSE',
    save_snapshot=True,
    snapshot_file='snapshot.bkp',
)
reg = CatBoostRegressor(**catboost_params)

# Fit on the training split and evaluate on the held-out validation split.
# No categorical features are passed; logging is silenced and the training
# plot is requested instead.
reg.fit(X_train, Y_train, eval_set=(X_val, Y_val), logging_level='Silent', plot=True)

In [None]:
# Predict on the engineered test features and store the result in the `target` column of test_df.
test_df['target'] = reg.predict(X_test)

In [None]:
# Preview the first rows of the test table, now including the `target` column.
test_df.head()

In [None]:
# Keep only the two columns that go into the submission file.
res = test_df.loc[:, ['id', 'target']]

In [None]:
# Write the submission file. The file name typo ('submussion.csv') is fixed, and
# the row index is omitted with the documented boolean flag rather than None.
res.to_csv('submission.csv', index=False)