# xgb+lgb+tabpfn

## 実験結果
* xgb(特徴量選択)  
CV (balanced log loss): 0.3058027696858661
* lgb(特徴量選択)  
CV (balanced log loss): 0.31445132158950145
* TabPFN(特徴量選択)  
CV (balanced log loss): 0.430984415297508

## TabPFNのインストール
* 事前にダウンロードするファイル: <https://www.kaggle.com/datasets/carlmcbrideellis/tabpfn-019-whl>

In [1]:
# !pip install -q /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
# !mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from tabpfn import TabPFNClassifier
from sklearn.model_selection import StratifiedKFold
import optuna
import warnings
from imblearn.over_sampling import SMOTE # SMOTE
from sklearn.impute import KNNImputer # kNN Imputation
from sklearn.feature_selection import SelectKBest, f_classif# Feature Selection
# Data Encoder and Scaler
import category_encoders as encoders
from sklearn.preprocessing import LabelEncoder, RobustScaler
warnings.simplefilter('ignore')  # silence every warning for the rest of the session

# Execution environment switch: 'local' or 'kaggle'.
# It is read further down to pick the data directory and to decide whether
# a submission file is written.
env = 'local'
# env = 'kaggle'

In [3]:
def knn_imputer(train_df, test_df):
    '''Fill missing numerical values with a k-nearest-neighbours imputer.

    The imputer (``n_neighbors=5``) is fitted on the training rows only and
    then applied to the test rows. The numerical columns are every column of
    ``train_df`` except ``Id``, ``EJ`` and ``Class``.

    Args:
        train_df: Training frame containing ``Id``, ``EJ``, ``Class`` and the
            numerical feature columns.
        test_df: Test frame containing the same numerical feature columns.

    Returns:
        ``(train_df, test_df)`` as new frames whose non-numerical columns come
        first, followed by the imputed numerical columns. The input frames
        are not modified.
    '''
    imputer = KNNImputer(n_neighbors=5)

    numerical_columns = train_df.drop(['Id', 'EJ', 'Class'], axis=1).columns

    # Carry the source index over to the imputed frames: pd.concat(axis=1)
    # aligns on the index, so a fresh RangeIndex would misalign the rows of
    # any frame whose index is not 0..n-1 (e.g. after filtering or splitting).
    train_df_imputed = pd.DataFrame(
        imputer.fit_transform(train_df[numerical_columns]),
        columns=numerical_columns,
        index=train_df.index,
    )
    test_df_imputed = pd.DataFrame(
        imputer.transform(test_df[numerical_columns]),
        columns=numerical_columns,
        index=test_df.index,
    )

    # Swap the original numerical columns for their imputed counterparts.
    train_df = pd.concat([train_df.drop(numerical_columns, axis=1), train_df_imputed], axis=1)
    test_df = pd.concat([test_df.drop(numerical_columns, axis=1), test_df_imputed], axis=1)

    return train_df, test_df

def select_k_best(train_df, test_df, pvalue_upper_limit = 0.1, fscore_lower_limit = 5):
    '''Select numerical features by their ANOVA F-score and p-value.

    Missing values are first filled with ``knn_imputer``; the F-test
    (``f_classif``) is then computed on the imputed training data against
    ``Class``. A feature is kept when its p-value is at most
    ``pvalue_upper_limit`` AND its F-score is at least ``fscore_lower_limit``.

    Args:
        train_df: Training frame containing ``Id``, ``EJ``, ``Class`` and the
            numerical feature columns.
        test_df: Test frame; only passed through the imputation step.
        pvalue_upper_limit: Largest p-value a kept feature may have.
        fscore_lower_limit: Smallest F-score a kept feature may have.

    Returns:
        ``(features, drop_features)``: ``features`` is a DataFrame indexed by
        the kept feature names with columns ``F_value`` and ``p_value``,
        sorted by ``F_value`` in descending order; ``drop_features`` is the
        list of rejected feature names in their original column order.
    '''
    # Impute first: the F-test cannot be computed on missing values.
    train_df, test_df = knn_imputer(train_df, test_df)

    X_train = train_df.drop(['Id', 'EJ', 'Class'], axis=1)
    y_train = train_df['Class']

    # k equals the number of columns, so SelectKBest only serves to compute
    # the scores here; the actual selection uses the thresholds below.
    # Alternative score functions: f_regression / mutual_info_regression for
    # regression, chi2 / mutual_info_classif for classification.
    fs = SelectKBest(score_func=f_classif, k=len(X_train.columns))
    fs.fit(X_train, y_train.values)

    features = pd.DataFrame(
        {"F_value": fs.scores_, "p_value": fs.pvalues_},
        index=X_train.columns,
    )

    # NaN scores compare as False on both conditions and are therefore dropped.
    keep = (fs.pvalues_ <= pvalue_upper_limit) & (fs.scores_ >= fscore_lower_limit)
    drop_features = X_train.columns[~keep].tolist()

    features = features.loc[keep].sort_values("F_value", ascending=False)

    return features, drop_features

In [4]:
class Preprocessing:
    '''Preprocessing steps (imputation, outlier clipping, scaling) for the train/test frames.

    Every public method reads ``self.train_df`` / ``self.test_df`` and returns
    a new ``(train_df, test_df)`` pair. The stored frames are never modified
    in place; the caller decides whether to assign the result back.
    '''
    def __init__(self, train_df, test_df):
        '''Store the frames and derive the numerical feature columns.

        Args:
            train_df: Training frame; must contain ``Id``, ``EJ`` and ``Class``.
            test_df: Test frame holding the same numerical columns.
        '''
        self.train_df = train_df
        self.test_df = test_df
        # Every training column except Id, EJ and Class.
        self.numerical_columns = train_df.drop(['Id', 'EJ', 'Class'], axis=1).columns
        # Empty F-value / p-value table; no method of this class fills it in.
        self.features = pd.DataFrame(index=self.numerical_columns, columns=["F_value", "p_value"])

    def _replace_numerical(self, df, values):
        '''Return ``df`` with its numerical columns replaced by ``values``.

        The replacement keeps ``df``'s index so that the column-wise concat
        aligns row by row; the numerical columns end up after the others.
        '''
        replaced = pd.DataFrame(values, columns=self.numerical_columns, index=df.index)
        return pd.concat([df.drop(self.numerical_columns, axis=1), replaced], axis=1)

    def _iqr_bounds(self, df):
        '''Return per-column ``(lower, upper)`` bounds at 1.5 IQR beyond the quartiles of ``df``.'''
        first_quartiles = df[self.numerical_columns].quantile(0.25)
        third_quartiles = df[self.numerical_columns].quantile(0.75)
        iqr = third_quartiles - first_quartiles
        return first_quartiles - (iqr * 1.5), third_quartiles + (iqr * 1.5)

    def _clip(self, df, lower_bound, upper_bound):
        '''Return a copy of ``df`` whose numerical columns are clipped to the given bounds.'''
        # Work on a copy: assigning into ``df`` directly would silently alter
        # the frame the caller handed to the constructor.
        clipped = df.copy()
        clipped[self.numerical_columns] = clipped[self.numerical_columns].clip(lower_bound, upper_bound, axis=1)
        return clipped

    def knn_imputer(self):
        '''Impute missing numerical values with ``KNNImputer(n_neighbors=5)``.

        The imputer is fitted on the training frame and applied to both frames.

        Returns:
            ``(train_df, test_df)`` with the numerical columns imputed.
        '''
        imputer = KNNImputer(n_neighbors=5)

        train_values = imputer.fit_transform(self.train_df[self.numerical_columns])
        test_values = imputer.transform(self.test_df[self.numerical_columns])

        return (
            self._replace_numerical(self.train_df, train_values),
            self._replace_numerical(self.test_df, test_values),
        )

    def clip_outliers_ver1(self):
        '''Clip both frames to IQR bounds computed from the training frame.

        Returns:
            ``(train_df, test_df)`` copies with the numerical columns clipped.
        '''
        lower_bound, upper_bound = self._iqr_bounds(self.train_df)
        return (
            self._clip(self.train_df, lower_bound, upper_bound),
            self._clip(self.test_df, lower_bound, upper_bound),
        )

    def clip_outliers(self):
        '''Clip each frame to IQR bounds computed from that same frame.

        Unlike ``clip_outliers_ver1``, the training and test bounds are
        computed separately.

        Returns:
            ``(train_df, test_df)`` copies with the numerical columns clipped.
        '''
        clipped_df = [
            self._clip(df, *self._iqr_bounds(df))
            for df in (self.train_df, self.test_df)
        ]
        return clipped_df[0], clipped_df[1]

    def robust_scaler(self):
        '''Scale the numerical columns with ``RobustScaler``.

        The scaler is fitted on the training frame only and the same fitted
        statistics are applied to the test frame, so both frames end up on
        the same scale.

        Returns:
            ``(train_df, test_df)`` with the numerical columns scaled.
        '''
        scaler = RobustScaler()

        train_values = scaler.fit_transform(self.train_df[self.numerical_columns])
        # transform, not fit_transform: re-fitting on the test frame would
        # scale it with different centre/scale values than the training data.
        test_values = scaler.transform(self.test_df[self.numerical_columns])

        return (
            self._replace_numerical(self.train_df, train_values),
            self._replace_numerical(self.test_df, test_values),
        )
        
def preprocessing_pipeline(train_df, test_df):
    '''Run the enabled preprocessing steps and return the processed frames.

    Only the KNN imputation step is active; outlier clipping and robust
    scaling are present below but commented out.

    Returns:
        ``(train_df, test_df)`` as held by the ``Preprocessing`` instance
        after the enabled steps have run.
    '''
    pipeline = Preprocessing(train_df, test_df)

    # Step 1: missing-value imputation.
    imputed_train, imputed_test = pipeline.knn_imputer()
    pipeline.train_df = imputed_train
    pipeline.test_df = imputed_test

    # Disabled steps (outlier clipping, scaling):
    # pipeline.train_df, pipeline.test_df = pipeline.clip_outliers()
    # pipeline.train_df, pipeline.test_df = pipeline.robust_scaler()

    # print('selected features: \n{}'.format(pipeline.features))

    return pipeline.train_df, pipeline.test_df

# 評価基準
def balanced_log_loss(y_true, y_pred):
    '''Balanced logarithmic loss for binary labels.

    The log loss is averaged within each class first and the two class
    averages are then averaged, so both classes weigh the same regardless of
    how many samples they have.

    Args:
        y_true: Array-like of 0/1 labels.
        y_pred: Array-like of predicted probabilities of class 1, same length
            as ``y_true``. Values are paired with ``y_true`` by position.

    Returns:
        The balanced log loss as a float. The result is not finite when one
        of the two classes is absent from ``y_true`` (division by zero).
    '''
    y_true = np.asarray(y_true)
    # Work in float64: in float32 (as returned by XGBoost) the upper clip
    # bound 1 - 1e-15 rounds to exactly 1.0, so a prediction of 1.0 would
    # survive the clip and produce log(0) = -inf.
    y_pred = np.asarray(y_pred, dtype=np.float64)

    # Number of observations in each class.
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # Keep probabilities away from 0 and 1: p -> max(min(p, 1 - 1e-15), 1e-15).
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    loss_numerator = - (1/N_0) * np.sum((1 - y_true) * np.log(1-y_pred)) - (1/N_1) * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

def balanced_log_loss_eval(y_pred, dtrain):
    '''Evaluate the balanced log loss against the labels stored in ``dtrain``.

    Args:
        y_pred: Predicted probabilities.
        dtrain: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        The pair ``('balanced_log_loss', loss)``.
    '''
    return 'balanced_log_loss', balanced_log_loss(dtrain.get_label(), y_pred)

# Classの０，１の割合をそれぞれ計算
def calc_log_loss_weight(y_true):
    '''Return the inverse class frequencies ``(w0, w1)`` of a 0/1 label vector.

    Each weight is ``1 / (class count / number of samples)``, so the rarer
    class receives the larger weight.
    '''
    class_counts = np.bincount(y_true)
    n_samples = y_true.shape[0]
    weight_class0 = 1 / (class_counts[0] / n_samples)
    weight_class1 = 1 / (class_counts[1] / n_samples)
    return weight_class0, weight_class1

In [5]:
def xgb_training(X_train, y_train, X_valid, y_valid):
    '''Train an XGBoost booster on one fold and predict its validation part.

    Samples are weighted by the inverse frequency of their class, computed
    separately for the training and the validation part. Training uses
    ``CFG.xgb_params``, ``CFG.num_boost_round`` and
    ``CFG.early_stopping_rounds``.

    Args:
        X_train: Training features.
        y_train: Training labels (0/1); must support ``.map`` (pandas Series).
        X_valid: Validation features.
        y_valid: Validation labels (0/1); must support ``.map``.

    Returns:
        ``(model, valid_preds)``: the trained booster and its predictions for
        ``X_valid`` using the boosting rounds up to the best iteration.
    '''
    # Inverse class-frequency weights for each part.
    train_w0, train_w1 = calc_log_loss_weight(y_train)
    valid_w0, valid_w1 = calc_log_loss_weight(y_valid)

    xgb_train = xgb.DMatrix(data=X_train, label=y_train, weight=y_train.map({0: train_w0, 1: train_w1}))
    xgb_valid = xgb.DMatrix(data=X_valid, label=y_valid, weight=y_valid.map({0: valid_w0, 1: valid_w1}))

    model = xgb.train(
        CFG.xgb_params,
        dtrain = xgb_train,
        num_boost_round = CFG.num_boost_round,
        evals = [(xgb_train, 'train'), (xgb_valid, 'eval')],
        early_stopping_rounds = CFG.early_stopping_rounds,
        verbose_eval = False, # an integer n would print the evaluation metric every n rounds
    )

    # best_iteration is zero-based and the end of iteration_range is
    # exclusive, hence the +1. The former best_ntree_limit attribute is
    # deprecated (removed in XGBoost 2.0) and counts trees, not iterations.
    valid_preds = model.predict(xgb.DMatrix(X_valid), iteration_range=(0, model.best_iteration + 1))

    return model, valid_preds

In [6]:
def lgb_training(X_train, y_train, X_valid, y_valid):
    '''Train a LightGBM booster on one fold and predict its validation part.

    Samples are weighted by the inverse frequency of their class, computed
    separately for the training and the validation part. Training uses
    ``CFG.lgb_params``, ``CFG.num_boost_round`` and
    ``CFG.early_stopping_rounds``.

    Args:
        X_train: Training features.
        y_train: Training labels (0/1); must support ``.map`` (pandas Series).
        X_valid: Validation features.
        y_valid: Validation labels (0/1); must support ``.map``.

    Returns:
        ``(model, valid_preds)``: the trained booster and its predictions for
        ``X_valid``.
    '''
    # Inverse class-frequency weights for each part.
    train_w0, train_w1 = calc_log_loss_weight(y_train)
    valid_w0, valid_w1 = calc_log_loss_weight(y_valid)

    lgb_train = lgb.Dataset(X_train, y_train, weight=y_train.map({0: train_w0, 1: train_w1}))
    lgb_valid = lgb.Dataset(X_valid, y_valid, weight=y_valid.map({0: valid_w0, 1: valid_w1}))

    model = lgb.train(
        params = CFG.lgb_params,
        train_set = lgb_train,
        num_boost_round = CFG.num_boost_round,
        valid_sets = [lgb_train, lgb_valid],
        # Early stopping and logging are configured through callbacks: the
        # early_stopping_rounds / verbose_eval keyword arguments were removed
        # from lgb.train in LightGBM 4.0.
        callbacks = [
            lgb.early_stopping(CFG.early_stopping_rounds, verbose=False),
            lgb.log_evaluation(period=0),  # period=0 disables per-round logging
        ],
    )

    valid_preds = model.predict(X_valid)

    return model, valid_preds

In [7]:
def tabpfn_training(X_train, y_train, X_valid, y_valid):
    '''Fit a TabPFN classifier on one fold and predict its validation part.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels; accepted for signature parity with the
            other training functions but not used.

    Returns:
        ``(model, valid_preds)``: the fitted classifier and column 1 of its
        ``predict_proba`` output for ``X_valid``.
    '''
    classifier = TabPFNClassifier(device='cpu', N_ensemble_configurations=256)
    classifier.fit(X_train, y_train)

    # Keep only the second probability column.
    class_probabilities = classifier.predict_proba(X_valid)
    return classifier, class_probabilities[:, 1]

In [8]:
# pd.read_pickle('params/lgb_best_param.pkl').best_params

In [9]:
class CFG:
    '''Holds the configuration values shared by the training cells.'''
    # ---- shared settings ----
    num_boost_round = 926
    early_stopping_rounds = 98
    n_folds = 5     # number of cross-validation splits
    n_trials = 100  # number of hyper-parameter tuning trials
    seed = 1234
    learning_rate = 0.01
    boosting_type = "dart"

    # ---- XGBoost parameters ----
    xgb_params = {
        "objective": "binary:logistic",  # learning task
        "tree_method": "gpu_hist",
        "random_state": seed,
        "learning_rate": learning_rate,
        "eval_metric": "rmse",
        # values found by the parameter search
        "max_depth": 43,
        "colsample_bytree": 0.9270015786178574,
        "subsample": 1.0,
        "gamma": 0.9008025641267255,
        "lambda": 0.3150372040663734,
        "min_child_weight": 7,
    }

    # ---- LightGBM parameters ----
    lgb_params = {
        # searched parameters
        "verbosity": -1,  # controls how much training progress is printed
        "lambda_l1": 1.0760072734927809e-05,
        "lambda_l2": 0.17928637029753666,
        "num_leaves": 152,
        "feature_fraction": 0.6553649132473736,
        "bagging_fraction": 0.20105587614468057,
        "min_child_samples": 25,

        # fixed values
        "boosting_type": boosting_type,
        "objective": "binary",
        "learning_rate": learning_rate,
        "metric": "binary_logloss",
        "seed": seed,
        # "n_jobs": -1,  # -1 would use every available core
        "is_unbalance": True,  # set to True for imbalanced data
    }

In [10]:
# Load the data from the directory that matches the selected environment.
if env == 'local':
    BASE_DIR = '../../data'
elif env == 'kaggle':
    BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions/'
else:
    raise ValueError("Invalid environment. Set env as 'local' or 'kaggle'.")

train_df = pd.read_csv(f'{BASE_DIR}/train.csv')
# train_df = pd.read_csv(f'{BASE_DIR}/train_integerized.csv')
greeks_df = pd.read_csv(f'{BASE_DIR}/greeks.csv')
test_df = pd.read_csv(f'{BASE_DIR}/test.csv')
submission_df = pd.read_csv(f'{BASE_DIR}/sample_submission.csv')

# Feature selection: drop every numerical column rejected by the F-test
# thresholds from both frames.
# features, drop_features = select_k_best(train_df, test_df, pvalue_upper_limit = 0.1, fscore_lower_limit = 5)
features, drop_features = select_k_best(train_df, test_df, pvalue_upper_limit = 0.01, fscore_lower_limit = 3)
train_df = train_df.drop(drop_features, axis=1)
test_df = test_df.drop(drop_features, axis=1)

# Split the training data into explanatory variables and the target.
# (The former `y_train.columns = ['Class']` was removed: a Series has no
# columns, so it only attached a stray attribute and changed nothing.)
X_train = train_df.drop(['Id', 'EJ', 'Class'], axis=1)
y_train = train_df['Class']

In [11]:
# Stratified K-fold cross-validation of the XGBoost model.
kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)

# One trained model per fold.
models = []
# Validation score of each fold.
fold_scores = []

for fold_no, (fit_idx, holdout_idx) in enumerate(kfold.split(X_train, y_train), start=1):
    # Progress indicator.
    print(f'fold: {fold_no}')

    # Split this fold into its training and validation parts.
    X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    X_holdout, y_holdout = X_train.iloc[holdout_idx], y_train.iloc[holdout_idx]

    # Train the model and predict the validation part.
    fold_model, holdout_preds = xgb_training(X_fit, y_fit, X_holdout, y_holdout)

    # Score the fold and keep both the score and the model.
    fold_scores.append(balanced_log_loss(y_holdout, holdout_preds))
    models.append(fold_model)

# Average of the per-fold validation scores.
scores = sum(fold_scores)
cv_score = scores /  CFG.n_folds
print(f'xgb our out of folds CV score is {cv_score}')

fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
xgb our out of folds CV score is 0.3058027696858661


In [12]:
# Stratified K-fold cross-validation of the LightGBM model.
kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)

# One trained model per fold.
models = []
# Validation score of each fold.
fold_scores = []

for fold_no, (fit_idx, holdout_idx) in enumerate(kfold.split(X_train, y_train), start=1):
    # Progress indicator.
    print(f'fold: {fold_no}')

    # Split this fold into its training and validation parts.
    X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    X_holdout, y_holdout = X_train.iloc[holdout_idx], y_train.iloc[holdout_idx]

    # Train the model and predict the validation part.
    fold_model, holdout_preds = lgb_training(X_fit, y_fit, X_holdout, y_holdout)

    # Score the fold and keep both the score and the model.
    fold_scores.append(balanced_log_loss(y_holdout, holdout_preds))
    models.append(fold_model)

# Average of the per-fold validation scores.
scores = sum(fold_scores)
cv_score = scores /  CFG.n_folds
print(f'lgb our out of folds CV score is {cv_score}')

fold: 1
fold: 2
fold: 3
fold: 4
fold: 5
lgb our out of folds CV score is 0.31445132158950145


In [13]:
# Stratified K-fold cross-validation of the TabPFN model.
kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)

# One fitted model per fold.
models = []
# Validation score of each fold.
fold_scores = []

for fold_no, (fit_idx, holdout_idx) in enumerate(kfold.split(X_train, y_train), start=1):
    # Progress indicator.
    print(f'fold: {fold_no}')

    # Split this fold into its training and validation parts.
    X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    X_holdout, y_holdout = X_train.iloc[holdout_idx], y_train.iloc[holdout_idx]

    # Fit the model and predict the validation part.
    fold_model, holdout_preds = tabpfn_training(X_fit, y_fit, X_holdout, y_holdout)

    # Score the fold and keep both the score and the model.
    fold_scores.append(balanced_log_loss(y_holdout, holdout_preds))
    models.append(fold_model)

# Average of the per-fold validation scores.
scores = sum(fold_scores)
cv_score = scores /  CFG.n_folds
print(f'TabPFN our out of folds CV score is {cv_score}')

fold: 1
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 2
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 3
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 4
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 5
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
TabPFN our out of folds CV score is 0.430984415297508


In [14]:
# Build the submission file (only when running on Kaggle).
if env == 'kaggle':
    X_test = test_df.drop(['Id', 'EJ'], axis=1)

    # Average the class-1 probability predicted by every fold model.
    preds = np.zeros(len(X_test))
    for model in models:
        if hasattr(model, 'predict_proba'):
            # scikit-learn style classifiers (e.g. TabPFNClassifier): predict()
            # returns hard class labels, so the class-1 probability has to
            # come from predict_proba.
            pred = model.predict_proba(X_test)[:, 1]
        elif isinstance(model, xgb.Booster):
            # Native XGBoost boosters only accept a DMatrix; use the same
            # iteration range as during validation.
            pred = model.predict(xgb.DMatrix(X_test), iteration_range=(0, model.best_iteration + 1))
        else:
            # Other boosters (e.g. lgb.Booster) take the frame directly.
            pred = model.predict(X_test)
        preds += pred
    # Divide by the number of models actually averaged.
    test_pred = preds / len(models)

    # Write the submission in the column layout of the sample file.
    submission = pd.DataFrame(columns = submission_df.columns)
    submission['Id'] = test_df['Id']
    submission['class_0'] = 1 - test_pred
    submission['class_1'] = test_pred
    submission.to_csv('submission.csv',index=False)