# Pseudo Labeling / 疑似ラベリング
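Pseudo-labeling is a semi-supervised learning method: a model trained on the labeled data predicts the unlabeled test set, and the model is then retrained on the labeled data plus those predictions used as labels. / 半教師あり学習の手法の一つ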

In [None]:
import os
import numpy as np
import pandas as pd
import random

In [None]:
# Read the labeled training table and the unlabeled test table.
train = pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")

# Remove the single row whose index label is 170514.
# NOTE(review): presumably an outlier row - confirm why it is excluded.
train = train.drop(index=[170514])

# Feature matrices exclude the "id" column (and the label for train).
X = train.drop(columns=["id", "target"]).to_numpy()
X_test = test.drop(columns="id").to_numpy()
# Explicit copy so the label array does not alias the DataFrame column.
y = train["target"].to_numpy(copy=True)

In [None]:
# Sanity check: show the shapes of the arrays built from the CSV files.
print(
    f"X.shape      : {X.shape}",
    f"y.shape      : {y.shape}",
    f"X_test.shape : {X_test.shape}",
    sep="\n",
)

# Modeling

In [None]:
from sklearn.model_selection import KFold
import xgboost as xgb
import lightgbm as lgb

# Seed shared by both parameter dictionaries
# (XGBoost "seed" and LightGBM "random_state").
SEED = 1380

In [None]:
# Hyperparameters handed to xgb.train for every fold in prediction_xgb.
params_xgb = {
    "booster": "gbtree",
    "objective": "reg:squarederror",    # squared-error regression
    "eval_metric": "rmse",              # metric reported on the eval sets
    "max_depth": 15,
    "eta": 0.005,                       # learning rate
    "gamma": 0.005346636874993822,      # minimum loss reduction to split
    "colsample_bytree": 0.5,            # column fraction sampled per tree
    "subsample": 0.7,                   # row fraction sampled per round
    "min_child_weight": 257,            # minimum hessian sum in a child
    "alpha": 0.01563,                   # L1 regularization
    "lambda": 0.003,                    # L2 regularization
    "tree_method": "hist",              # histogram-based split finding
    "seed": SEED
}

In [None]:
# Wrap the test features once; prediction_xgb reads this module-level
# DMatrix when predicting in every fold.
d_test = xgb.DMatrix(X_test)

In [None]:
def prediction_xgb(X, y, seed=None):
    """Train 5-fold XGBoost models on (X, y) and average their test predictions.

    Each fold is trained with early stopping on its held-out part, then
    predicts the module-level ``d_test`` using only the trees up to the best
    iteration. Hyperparameters come from the module-level ``params_xgb``.

    Args:
        X: 2-D NumPy feature array, one row per sample.
        y: 1-D NumPy target array aligned with the rows of ``X``.
        seed: ``random_state`` for the ``KFold`` shuffle. When ``None`` (the
            default) a seed is drawn with ``random.randint(0, 100)``, so each
            call uses a different split unless a seed is supplied.

    Returns:
        1-D NumPy array holding the mean of the per-fold predictions, one
        value per row of ``d_test``.
    """
    # Distinct local name so the module-level SEED constant is not shadowed.
    fold_seed = random.randint(0, 100) if seed is None else seed
    kf = KFold(n_splits=5, shuffle=True, random_state=fold_seed)

    fold_preds = []
    for tr_id, vl_id in kf.split(X, y):
        print("==================================================================")

        X_train, X_val = X[tr_id, :], X[vl_id, :]
        y_train, y_val = y[tr_id], y[vl_id]

        d_train = xgb.DMatrix(X_train, y_train)
        d_val = xgb.DMatrix(X_val, y_val)

        # Early stopping monitors the last entry of `evals`, i.e. "val".
        model = xgb.train(params=params_xgb,
                          dtrain=d_train,
                          num_boost_round=100000,
                          early_stopping_rounds=200,
                          verbose_eval=500,
                          evals=[(d_train, "train"), (d_val, "val")])

        # `ntree_limit` / `best_ntree_limit` are deprecated and were removed
        # in XGBoost 2.0; `iteration_range` is the supported way to predict
        # with the trees up to (and including) the best iteration.
        fold_preds.append(
            model.predict(d_test,
                          iteration_range=(0, model.best_iteration + 1))
        )

    # Element-wise mean over the folds.
    return np.mean(fold_preds, axis=0)

In [None]:
# First pass: train on the labeled data only; the returned test-set
# predictions serve as pseudo labels in the next step.
pred_xgb = prediction_xgb(X, y)

In [None]:
# Pseudo-labeled training set: test rows are stacked under the train rows
# and the first-pass predictions are appended to the true targets.
X_xgb = np.vstack((X, X_test))
y_xgb = np.hstack((y, pred_xgb))

In [None]:
# Second pass: retrain on labeled + pseudo-labeled rows and overwrite
# pred_xgb with the new test predictions.
# NOTE(review): the KFold inside prediction_xgb splits the combined array,
# so validation folds (and early stopping) also see pseudo-labeled rows.
pred_xgb = prediction_xgb(X_xgb, y_xgb)

# LightGBM

In [None]:
# Hyperparameters handed to lgb.train for every fold in prediction_lgb.
params_lgb = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.005,
    "num_leaves": 256,                          # maximum leaves per tree
    "bagging_fraction": 0.8206341150202605,     # row fraction used when bagging
    "feature_fraction": 0.5,                    # column fraction per tree
    "min_data_in_leaf": 100,
    "lambda_l1": 1.074622455507616e-05,         # L1 regularization
    "lambda_l2": 2.0521330798729704e-06,        # L2 regularization
    "min_data_per_group": 5,
    "max_depth": -1,                            # no depth limit
    "subsample_for_bin": 200000,
    "cat_smooth": 1.0,
    "min_sum_hessian_in_leaf": 0.001,
    "bagging_freq": 6,                          # re-bag every 6 iterations
    "min_gain_to_split": 0.0,
    "verbosity": -1,    # suppress warning and info messages
    "random_state": SEED
}

In [None]:
def prediction_lgb(X, y, seed=None):
    """Train 5-fold LightGBM models on (X, y) and average their test predictions.

    Each fold is trained with early stopping on its held-out part, then
    predicts the module-level ``X_test`` at the best iteration.
    Hyperparameters come from the module-level ``params_lgb``.

    Args:
        X: 2-D NumPy feature array, one row per sample.
        y: 1-D NumPy target array aligned with the rows of ``X``.
        seed: ``random_state`` for the ``KFold`` shuffle. When ``None`` (the
            default) a seed is drawn with ``random.randint(0, 100)``, so each
            call uses a different split unless a seed is supplied.

    Returns:
        1-D NumPy array holding the mean of the per-fold predictions, one
        value per row of ``X_test``.
    """
    # Distinct local name so the module-level SEED constant is not shadowed.
    fold_seed = random.randint(0, 100) if seed is None else seed
    kf = KFold(n_splits=5, shuffle=True, random_state=fold_seed)

    fold_preds = []
    for tr_id, vl_id in kf.split(X, y):
        print("=====================================================================")

        X_train, X_val = X[tr_id, :], X[vl_id, :]
        y_train, y_val = y[tr_id], y[vl_id]

        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

        # The `early_stopping_rounds=` / `verbose_eval=` keyword arguments
        # were removed from lgb.train in LightGBM 4.0; the callbacks below
        # are the supported equivalents (stop after 100 rounds without
        # improvement, log every 500 rounds).
        model = lgb.train(params=params_lgb,
                          train_set=lgb_train,
                          valid_sets=[lgb_train, lgb_val],
                          num_boost_round=10000,
                          callbacks=[lgb.early_stopping(stopping_rounds=100),
                                     lgb.log_evaluation(period=500)])

        fold_preds.append(
            model.predict(X_test, num_iteration=model.best_iteration)
        )

    # Element-wise mean over the folds.
    return np.mean(fold_preds, axis=0)

In [None]:
# First pass: train on the labeled data only; the returned test-set
# predictions serve as pseudo labels in the next step.
pred_lgb = prediction_lgb(X, y)

In [None]:
# Pseudo-labeled training set: test rows are stacked under the train rows
# and the first-pass predictions are appended to the true targets.
X_lgb = np.vstack((X, X_test))
y_lgb = np.hstack((y, pred_lgb))

In [None]:
# Second pass: retrain on labeled + pseudo-labeled rows and overwrite
# pred_lgb with the new test predictions.
# NOTE(review): the KFold inside prediction_lgb splits the combined array,
# so validation folds (and early stopping) also see pseudo-labeled rows.
pred_lgb = prediction_lgb(X_lgb, y_lgb)

# Submission

In [None]:
# Blend: unweighted row-wise mean of the XGBoost and LightGBM predictions.
pred = pd.DataFrame(
    {0: pd.Series(pred_xgb), 1: pd.Series(pred_lgb)}
).mean(axis=1)

In [None]:
# Load the sample submission and work on a copy so the template stays intact.
sample_sub = pd.read_csv("../input/tabular-playground-series-jan-2021/sample_submission.csv")
sub = sample_sub.copy()

In [None]:
# Write the blended predictions into the "target" column.
sub["target"] = pred
# Bare expression so the notebook renders the resulting frame.
sub

In [None]:
# Save the submission file without the DataFrame index column.
sub.to_csv("submission_pseudo.csv", index=False)