# Import libraries and load data

In [None]:
import pandas as pd
import numpy as np

# Read the competition's train, test and sample-submission CSV files in one
# pass; the tuple order below matches the names being assigned.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2021/train.csv",
        "../input/tabular-playground-series-jan-2021/test.csv",
        "../input/tabular-playground-series-jan-2021/sample_submission.csv",
    )
)
# Bare trailing expression so the notebook renders the training frame.
train

# Drop the ID column

In [None]:
# Remove the `id` column from both frames.
train = train.drop(columns=["id"])
test = test.drop(columns=["id"])
# Every remaining training column whose name contains "cont".
features = [name for name in train.columns if "cont" in name]
# Bare trailing expression so the notebook renders the test frame.
test

# Feature Engineering

In [None]:
# Feature-engineering switches: each flag enables one of the cells below.
fe = {
    "rankgauss": True,       # GaussRankScaler transform of the COLS features
    "stats": True,           # row-wise sum/var/mean/median/std/kurt/skew
    "gaussmix": False,       # per-feature GaussianMixture class labels
    "pca": True,             # PCA projection of the COLS features
    "tsne": True,            # t-SNE embeddings at several perplexities
    "umap": True,            # UMAP embedding
    "drop_original": False,  # drop the COLS features after engineering
}

In [None]:
# Stack train on top of test so each feature transform below is fitted on and
# applied to both sets at once; ignore_index gives a fresh 0..N-1 row index.
all_data = pd.concat([train, test], axis=0, ignore_index=True)
# The training rows come first in the concatenation, so the first len(train)
# entries of `target` are the labels. Deriving the count from `train` avoids
# hard-coding the dataset size.
n_train = len(train)
targets = all_data["target"].iloc[:n_train]
all_data = all_data.drop("target", axis=1)
# Names of the feature columns (those containing "cont"), captured before any
# engineered columns are appended.
COLS = [c for c in all_data.columns if "cont" in c]
# Bare trailing expression so the notebook renders the combined frame.
all_data

In [None]:
import tqdm

# Append one column per row-wise statistic computed across the COLS features.
if fe["stats"]:
    # COLS is fixed, so the source sub-frame can be taken once up front.
    base_features = all_data[COLS]
    for stat_name in tqdm.tqdm(["sum", "var", "mean", "median", "std", "kurt", "skew"]):
        # Look up the DataFrame reduction by name (e.g. DataFrame.sum).
        reducer = getattr(base_features, stat_name)
        all_data["cont_" + stat_name] = reducer(axis=1)

all_data

In [None]:
import sys
sys.path.append("../input/rank-gauss")
from gauss_rank_scaler import GaussRankScaler

# Append GaussRankScaler-transformed copies of the COLS features.
if fe["rankgauss"]:
    scaler = GaussRankScaler()
    rankgauss_feat = scaler.fit_transform(all_data[COLS])
    # One output column per transformed feature, named by position.
    n_rankgauss = rankgauss_feat.shape[1]
    rankgauss_names = [f"rankgauss_{idx}" for idx in range(n_rankgauss)]
    rankgauss_df = pd.DataFrame(data=rankgauss_feat, columns=rankgauss_names)
    all_data = pd.concat([all_data, rankgauss_df], axis="columns")
all_data

In [None]:
from sklearn.mixture import GaussianMixture

# Append a `<feature>_class` column per feature holding the Gaussian-mixture
# component each row is assigned to.
if fe["gaussmix"]:
    def get_gmm_class_feature(feat, n):
        """Fit an n-component 1-D GaussianMixture on column `feat` of the
        global `all_data` and store the predicted component labels in a new
        `{feat}_class` column (mutates `all_data` in place)."""
        # GaussianMixture expects a 2-D array, hence the single-column reshape.
        column = all_data[feat].values.reshape(-1, 1)
        gmm = GaussianMixture(n_components=n, random_state=42)
        gmm.fit(column)
        all_data[f'{feat}_class'] = gmm.predict(column)

    # Number of mixture components used for each feature, in processing order.
    gmm_components = {
        'cont1': 4,
        'cont2': 10,
        'cont3': 6,
        'cont4': 4,
        'cont5': 3,
        'cont6': 2,
        'cont7': 3,
        'cont8': 4,
        'cont9': 4,
        'cont10': 8,
        'cont11': 5,
        'cont12': 4,
        'cont13': 6,
        'cont14': 6,
    }
    for feat_name, n_comp in gmm_components.items():
        get_gmm_class_feature(feat_name, n_comp)

    # Names and positional indices of the newly created class columns.
    CLASS_COLS = [name for name in all_data.columns if "_class" in name]
    CLASS_COLS_IDX = [all_data.columns.get_loc(name) for name in CLASS_COLS]
    assert len(CLASS_COLS) > 0
all_data

In [None]:
from sklearn.decomposition import PCA

# Append PCA projections of the COLS features.
if fe["pca"]:
    base_features = all_data[COLS]
    # A fractional n_components lets PCA choose how many components to keep;
    # the resulting count is read back from pca.n_components_ below.
    pca = PCA(n_components=0.9, random_state=42)
    pca.fit(base_features)
    pca_feat = pca.transform(base_features)
    pca_names = [f"pca_cont{idx}" for idx in range(pca.n_components_)]
    pca_df = pd.DataFrame(data=pca_feat, columns=pca_names)
    all_data = pd.concat([all_data, pca_df], axis="columns")
    PCA_COLS = [name for name in all_data.columns if "pca" in name]
    assert len(PCA_COLS) > 0

all_data

In [None]:
from cuml import TSNE

# Append 2-D t-SNE embeddings of the COLS features, one pair of columns per
# perplexity value.
if fe["tsne"]:
    tsne_components = 2

    perplexity = [5, 10, 15, 20, 25]
    for per in perplexity:
        # n_neighbors is an integer count, but 3.5 * per is fractional for odd
        # perplexities (e.g. 17.5), so truncate explicitly rather than relying
        # on an implicit conversion inside the library.
        n_neighbors = int(3.5 * per)
        tsne = TSNE(n_components=tsne_components, perplexity=per, n_neighbors=n_neighbors)
        tsne_feat = tsne.fit_transform(all_data[COLS])
        tsne_df = pd.DataFrame(tsne_feat, columns=[f"tsne_{per}_{i}" for i in range(tsne_components)])
        all_data = pd.concat([all_data, tsne_df], axis=1)
    TSNE_COLS = [c for c in all_data.columns if "tsne" in c]
    # Same sanity check the PCA and UMAP cells perform on their new columns.
    assert len(TSNE_COLS) > 0
all_data

In [None]:
from cuml import UMAP

# Append a 10-dimensional UMAP embedding of the COLS features.
if fe["umap"]:
    umap_components = 10
    umap = UMAP(n_components=umap_components)
    umap_feat = umap.fit_transform(all_data[COLS])
    umap_names = [f"umap{dim}" for dim in range(umap_components)]
    umap_df = pd.DataFrame(data=umap_feat, columns=umap_names)
    all_data = pd.concat([all_data, umap_df], axis="columns")
    UMAP_COLS = [name for name in all_data.columns if "umap" in name]
    assert len(UMAP_COLS) > 0
all_data

In [None]:
# Optionally discard the columns listed in COLS, keeping only engineered ones.
if fe["drop_original"]:
    all_data = all_data.drop(columns=COLS)

In [None]:
# Undo the earlier train/test concatenation. `targets` holds exactly one label
# per training row, so its length is the split point; this avoids hard-coding
# the dataset size.
n_train = len(targets)
train = all_data.iloc[:n_train]
test = all_data.iloc[n_train:]
# Every column of the engineered frame is used as a model feature.
features = list(all_data.columns)

# Train the model
We train a CatBoost regressor on the features selected by the `fe` flags above (set every flag to `False` to test the model's power without any feature engineering).

In [None]:
# Render the combined (train + test) feature frame for a visual check.
all_data

In [None]:
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostRegressor

# Columns whose name contains "_class" are passed to CatBoost as categorical
# features and stored as int8.
class_columns = [name for name in train.columns if "_class" in name]
dtypes = dict.fromkeys(class_columns, "int8")
train = train.astype(dtypes)

# Hold out 10% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train,
    targets,
    test_size=0.1,
    random_state=42,
)
train_pool = Pool(X_train, y_train, cat_features=class_columns)
val_pool = Pool(X_val, y_val, cat_features=class_columns)

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK
from functools import partial

ITERATIONS = 5000  # upper bound on boosting rounds per model
MAX_EVALS = 150    # number of hyperopt trials

# Integer-valued CatBoost options. hp.quniform samples floats (e.g. 7.0), so
# these are cast before being handed to CatBoost.
_INT_PARAMS = ("max_depth", "min_data_in_leaf", "max_leaves")


def _cast_int_params(params):
    """Return a copy of `params` with the integer-valued options cast to int."""
    return {
        name: int(round(value)) if name in _INT_PARAMS else value
        for name, value in params.items()
    }


def _build_model(params, verbose):
    """Create the GPU CatBoost regressor used by both the search and the final fit."""
    return CatBoostRegressor(iterations=ITERATIONS, task_type="GPU", devices='0:1', grow_policy="Lossguide",
                             loss_function="RMSE", custom_metric="RMSE", eval_metric="RMSE",
                             verbose=verbose, **_cast_int_params(params))


def objective_func(params, train_pool, val_pool):
    """Hyperopt objective: train one model and report its best validation RMSE.

    Args:
        params: hyperparameters sampled from `space`.
        train_pool: CatBoost Pool used for fitting.
        val_pool: CatBoost Pool used for early stopping and scoring.

    Returns:
        Dict with the validation RMSE under "loss" and "status" set to STATUS_OK.
    """
    model = _build_model(params, verbose=1000)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=200, plot=False)
    loss = model.get_best_score()
    return {"loss": loss["validation"]["RMSE"], "status": STATUS_OK}


# Search space; each label matches the CatBoost keyword it is passed as.
space = {
    "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(1)),
    "max_depth": hp.quniform("max_depth", 6, 11, 1),
    "l2_leaf_reg": hp.quniform("l2_leaf_reg", 1, 5, 0.9),
    "bagging_temperature": hp.quniform("bagging_temperature", 0, 4, 0.9),
    "min_data_in_leaf": hp.quniform("min_data_in_leaf", 1, 42, 1),
    "max_leaves": hp.quniform("max_leaves", 2**4-1, 2**6-1, 1),
}


# Bind the data pools so hyperopt only has to supply `params`.
fn = partial(objective_func, train_pool=train_pool, val_pool=val_pool)
best_params = fmin(fn=fn, space=space, algo=tpe.suggest, max_evals=MAX_EVALS)


# Refit with the best hyperparameters found; this is the model used for the
# test predictions.
model = _build_model(best_params, verbose=200)
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=200, plot=True)

In [None]:
# Predict on the test features and write the result into the submission
# frame's `target` column.
test_predictions = model.predict(test)
sub["target"] = test_predictions
sub

In [None]:
# Save the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)