In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from pandas import DataFrame
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor

In [None]:
import warnings

# Install a blanket "ignore" filter so no warning category is shown from here on.
warnings.simplefilter(action='ignore')

In [None]:
# NOTE(review): KFOLD is not referenced by the cells visible here;
# get_base_model_preds carries its own split count.
KFOLD = 4

# Keyword arguments for the first XGBRegressor configuration (depth 7, no explicit eta).
xgb_params1 = dict(
    objective="reg:squarederror",
    n_estimators=40,
    max_depth=7,
    n_jobs=-1,
    seed=0,
    eval_metric="rmse",
    min_child_weight=4.0,
    verbosity=0,
)

# Keyword arguments for the second XGBRegressor configuration (depth 4, eta 0.2).
xgb_params2 = dict(
    objective="reg:squarederror",
    n_estimators=40,
    eta=0.2,
    max_depth=4,
    n_jobs=-1,
    seed=0,
    eval_metric="rmse",
    min_child_weight=4.0,
    verbosity=0,
)

# Keyword arguments intended for KNeighborsRegressor.
knn_params = dict(n_neighbors=6)

In [None]:
# Read the training set, the test set and the sample submission, in that order.
train_data, test_data, sub = map(
    pd.read_csv,
    (
        "../input/tabular-playground-series-jan-2021/train.csv",
        "../input/tabular-playground-series-jan-2021/test.csv",
        "../input/tabular-playground-series-jan-2021/sample_submission.csv",
    ),
)

In [None]:
# Pull out the regression target, then keep everything except "id" and "target" as features.
train_target = train_data["target"]
train = train_data.drop(columns=["id", "target"])

# Only the row identifier is dropped from the test frame.
test = test_data.drop(columns=["id"])

In [None]:
def _take_rows(data, index):
    """Select rows by position from a pandas object (``.iloc``) or a NumPy-style array."""
    return data.iloc[index] if hasattr(data, "iloc") else data[index]


def get_base_model_preds(clf, X_train, y_train, X_test, n_splits=5, random_state=48):
    """Compute out-of-fold and fold-averaged test predictions for one base model.

    The model is refit on each shuffled K-fold training split. Its predictions
    on the held-out split fill the out-of-fold column, and its predictions on
    ``X_test`` are averaged over all folds.

    Parameters
    ----------
    clf : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. If it has a ``.clf``
        attribute (as ``ClfBuilder`` does) that attribute is printed, otherwise
        the object itself is printed.
    X_train, y_train : pandas objects or NumPy arrays
        Training features and target; rows are selected by position.
    X_test : pandas object or NumPy array
        Features to predict for in every fold.
    n_splits : int, default 5
        Number of folds. It sizes the per-fold test buffer *and* configures
        ``KFold`` so the two can never disagree.
    random_state : int, default 48
        Seed for the shuffled ``KFold`` split.

    Returns
    -------
    (oof_valid, oof_test) : tuple of numpy.ndarray
        ``oof_valid`` has shape ``(len(X_train), 1)`` and ``oof_test`` has
        shape ``(len(X_test), 1)``.
    """
    # Wrappers keep the estimator in ``.clf``; fall back to the object itself otherwise.
    print(getattr(clf, "clf", clf))

    n_train, n_test = X_train.shape[0], X_test.shape[0]
    oof_valid = np.zeros((n_train, 1))
    # One slab of test predictions per fold, sized from the same n_splits as KFold.
    test_preds_per_fold = np.zeros((n_splits, n_test, 1))

    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
        print('[CV] {}/{}'.format(i+1, n_splits))
        X_fit = _take_rows(X_train, train_index)
        y_fit = _take_rows(y_train, train_index)
        X_holdout = _take_rows(X_train, valid_index)

        clf.fit(X_fit, y_fit)

        # Predictions are stored as single-column arrays so they can be stacked side by side.
        oof_valid[valid_index] = np.asarray(clf.predict(X_holdout)).reshape(-1, 1)
        test_preds_per_fold[i, :] = np.asarray(clf.predict(X_test)).reshape(-1, 1)

    # Average the per-fold test predictions into one column.
    oof_test = test_preds_per_fold.mean(axis=0)
    return oof_valid, oof_test

In [None]:
class ClfBuilder(object):
    """Thin wrapper that instantiates an estimator class and forwards calls to it.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) to instantiate.
    params : mapping or None, default None
        Keyword arguments passed to ``clf``. ``None`` means "no arguments".

    Attributes
    ----------
    clf : object
        The instantiated estimator.
    """

    def __init__(self, clf, params=None):
        # ``clf(**None)`` raises TypeError, so treat a missing mapping as empty kwargs.
        self.clf = clf(**(params or {}))

    def fit(self, X, y):
        """Fit the wrapped estimator on ``X`` and ``y``."""
        self.clf.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return ``predict_proba(X)`` of the wrapped estimator.

        Only works when the wrapped estimator itself implements ``predict_proba``.
        """
        return self.clf.predict_proba(X)

In [None]:
# Level-0 (base) regressors, each wrapped in ClfBuilder.
linear = ClfBuilder(clf=LinearRegression, params={})
xgb1 = ClfBuilder(clf=xgb.XGBRegressor, params=xgb_params1)
xgb2 = ClfBuilder(clf=xgb.XGBRegressor, params=xgb_params2)
# Seed the forest like the other models so repeated runs match; n_jobs=-1 mirrors the XGBoost settings.
rf   = ClfBuilder(clf=RandomForestRegressor, params={"random_state": 0, "n_jobs": -1})
# Pass the configured knn_params (n_neighbors) instead of falling back to library defaults.
knn  = ClfBuilder(clf=KNeighborsRegressor, params=knn_params)

In [None]:
# Run every base model through the same cross-validated prediction routine,
# in the order linear, xgb1, xgb2, rf, knn, and unpack each (valid, test) pair.
(
    (oof_valid_lr, oof_test_lr),
    (oof_valid_xgb1, oof_test_xgb1),
    (oof_valid_xgb2, oof_test_xgb2),
    (oof_valid_rf, oof_test_rf),
    (oof_valid_knn, oof_test_knn),
) = [
    get_base_model_preds(model, train, train_target, test)
    for model in (linear, xgb1, xgb2, rf, knn)
]

In [None]:
# Column order (lr, xgb1, xgb2, rf, knn) must be the same for train and test.
oof_valid_columns = [oof_valid_lr, oof_valid_xgb1, oof_valid_xgb2, oof_valid_rf, oof_valid_knn]
oof_test_columns = [oof_test_lr, oof_test_xgb1, oof_test_xgb2, oof_test_rf, oof_test_knn]

# Place the single-column prediction arrays side by side to form the meta-model inputs.
X_train_base = np.concatenate(oof_valid_columns, axis=1)
X_test_base = np.concatenate(oof_test_columns, axis=1)

In [None]:
# Level-1 (meta) model: an XGBRegressor built from the xgb_params1 settings.
final_xgb = xgb.XGBRegressor(**xgb_params1)

In [None]:
# Fit the meta-model on the stacked base-model predictions against the training target.
final_xgb.fit(X_train_base, train_target)

In [None]:
# Predict with the meta-model from the stacked base-model test predictions.
final_pred = final_xgb.predict(X_test_base)

In [None]:
# Store the meta-model predictions in the submission frame's "target" column.
sub["target"] = final_pred

In [None]:
# Write the submission to submit.csv without the DataFrame index.
sub.to_csv("submit.csv", index=False)

In [None]:
# "weight" importance of each meta-feature in the final model.
# The figure is bound to `fig` rather than `_`, which IPython uses for the last cell output.
fig, ax = plt.subplots(figsize=(12, 5))

xgb.plot_importance(
    final_xgb,
    ax=ax,
    importance_type='weight',
    title='Feature importance (weight)',  # distinguishes this chart from the gain chart
    show_values=True,
)

plt.show()

In [None]:
# "gain" importance of each meta-feature in the final model.
# The figure is bound to `fig` rather than `_`, which IPython uses for the last cell output.
fig, ax = plt.subplots(figsize=(12, 5))

xgb.plot_importance(
    final_xgb,
    ax=ax,
    importance_type='gain',
    title='Feature importance (gain)',  # distinguishes this chart from the weight chart
    show_values=True,
)

plt.show()

- f0 : Linear Regression (`linear`)
- f1 : XGBoost (optuna) (`xgb1`, using `xgb_params1`)
- f2 : XGBoost (`xgb2`, using `xgb_params2`)
- f3 : Random Forest (`rf`)
- f4 : KNN (k-Nearest Neighbors) (`knn`)