# tabpfn_xgboost_mskf.ipynb
## 実験条件
* 学習時にgreeks.csvのEpsilon使用, テストデータでは訓練データのEpsilonの最大値+1とする
* 欠損値は中央値で補完
* greeks.csvのAlphaを予測、予測後にA->0, (B, G, D)->1に変換
* CVはMultilabelStratifiedKFoldで、Beta, Gamma, Deltaのクラス割合が同じになるように分割

## TabPFNのインストール
### 事前にダウンロードするファイル: 
* TabPFN: <https://www.kaggle.com/datasets/carlmcbrideellis/tabpfn-019-whl>
* MultilabelStratifiedKFold: <https://www.kaggle.com/datasets/tilii7/iterative-stratification-017>

In [70]:
# KaggleNotebookではコメントアウトを外す
# !pip install -q /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
# !mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/
# !pip install -q /kaggle/input/iterative-stratification-017/iterative_stratification-0.1.7-py3-none-any.whl

In [71]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
# model
from sklearn.base import BaseEstimator
import xgboost as xgb
from xgboost import XGBClassifier
# import lightgbm as lgb
from tabpfn import TabPFNClassifier
# over/under sampling
from imblearn.over_sampling import SMOTE # SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# Imputation
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif# Feature Selection
# import category_encoders as encoders
from sklearn.preprocessing import LabelEncoder, RobustScaler
# cross validation
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# others
from datetime import date, datetime
import optuna
import warnings
warnings.simplefilter('ignore')

# 環境を指定
env = 'local'
# env = 'kaggle'

## データの読み込み

In [72]:
# ディレクトリの指定
if env == 'local':
    BASE_DIR = '../../data'
elif env == 'kaggle':
    BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions/'
else:
    raise ValueError("Invalid environment. Set env as 'local' or 'kaggle'.")

# データの読み込み
train_df = pd.read_csv(f'{BASE_DIR}/train.csv')
# train_df = pd.read_csv(f'{BASE_DIR}/train_integerized.csv')
greeks_df = pd.read_csv(f'{BASE_DIR}/greeks.csv')
test_df = pd.read_csv(f'{BASE_DIR}/test.csv')
submission_df = pd.read_csv(f'{BASE_DIR}/sample_submission.csv')

# greeksと結合
train_df = pd.merge(train_df, greeks_df[['Id', 'Alpha', 'Epsilon']], on='Id', how='left')

greeksのAはClass0に、B, G, DはClass1に相当

In [73]:
greeks_df.Alpha.value_counts()

Alpha
A    509
B     61
G     29
D     18
Name: count, dtype: int64

## Epsilonを特徴量に追加

In [74]:
# 欠損値以外の日付をグレゴリオ暦の序数形式（1年1月1日を1とし、1日ずつ増やしていく）に変換
train_df.Epsilon[train_df.Epsilon != 'Unknown'] = train_df.Epsilon[train_df.Epsilon != 'Unknown']\
                                        .map(lambda x: datetime.strptime(x, '%m/%d/%Y').toordinal())
# 欠損値をnp.nanに変換
train_df.Epsilon[train_df.Epsilon == 'Unknown'] = np.nan

# 訓練データを説明変数と目的変数に分割
X_train = train_df.drop(['Id', 'EJ', 'Alpha', 'Class'], axis=1)
y_train = train_df[['Class', 'Alpha']]

# テストデータから数値データ以外を削除
X_test = test_df.drop(['Id', 'EJ'], axis=1)

# テストデータは訓練データの最大値+1とする
X_test['Epsilon'] = train_df.Epsilon.max()+1

## モデル、評価基準

In [75]:
class WeightedEns(BaseEstimator):
    def __init__(self, xgb_params):
        #xgb.fitだとパラメータをdict:paramで渡せない…
        self.models = [XGBClassifier(eta = xgb_params['eta'], gamma = xgb_params['gamma'], max_depth = xgb_params['max_depth'],
                           min_child_weight = xgb_params['min_child_weight'], max_delta_step = xgb_params['max_delta_step'],
                           subsample = xgb_params['subsample'], reg_lambda = xgb_params['reg_lambda'], reg_alpha = xgb_params['reg_alpha'],
                           tree_method = 'gpu_hist'),
                       TabPFNClassifier(N_ensemble_configurations=256,device='cuda:0')]
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        # self.imputer = KNNImputer(n_neighbors=50)
    
    def fit(self, X, y):
        classes, y = np.unique(y, return_inverse=True)
        self.classes_ = classes
        X = self.imputer.fit_transform(X)
        for i, model in enumerate(self.models):
            if i > 0:
                 model.fit(X,y)
            else:
                model.fit(X,y) # 決定木ではweightを考慮するようコードを変更
    
    def predict_proba(self, X):
        X = self.imputer.transform(X)
        ps = np.stack([model.predict_proba(X) for model in self.models])
        p = np.mean(ps,axis=0)
        class_0_est_instances = p[:,0].sum()
        others_est_instances = p[:,1:].sum()
        # we reweight the probs, since the loss is also balanced like this
        # our models out of the box optimize CE
        # with these changes they optimize balanced CE
        new_p = p * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(p.shape[1])]])
        new_p = new_p / np.sum(new_p,axis=1,keepdims=1)
        return np.concatenate((new_p[:,:1],np.sum(new_p[:,1:],1,keepdims=True)), 1)


# 評価基準
def balanced_log_loss(y_true, y_pred):
    N = len(y_true)

    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.maximum(np.minimum(y_pred, 1 - 1e-15), 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (1/N_0) * np.sum((1 - y_true) * np.log(1-y_pred)) - (1/N_1) * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

## Hyper parameter tuning by Optuna

epsilonはDate型でそのまま突っ込めない。考えるのめんどうだったので削ってチューニングした。許して

In [76]:
#欠損値処理
missing = X_train.isnull().sum()
missing = missing[missing>0]

#中央値
for k, v in missing.items():
    X_train[k] = X_train[k].fillna(X_train[k].mean())


In [77]:
# optunaでハイパーパラメータ選定

def objective(trial):
    eta =  trial.suggest_loguniform('eta', 1e-8, 1.0)
    gamma = trial.suggest_loguniform('gamma', 1e-8, 1.0)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_child_weight = trial.suggest_loguniform('min_child_weight', 1e-8, 1.0)
    max_delta_step = trial.suggest_loguniform('max_delta_step', 1e-8, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.0, 1.0)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.0, 1000.0)
    reg_alpha = trial.suggest_uniform('reg_alpha', 0.0, 1000.0)
    tree_method = 'gpu_hist'


    regr = XGBClassifier(eta = eta, gamma = gamma, max_depth = max_depth,
                           min_child_weight = min_child_weight, max_delta_step = max_delta_step,
                           subsample = subsample,reg_lambda = reg_lambda,reg_alpha = reg_alpha,
                           tree_method=tree_method)

    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    labels = greeks_df[['Beta', 'Gamma', 'Delta']] # クロスバリデーションの分割で考慮する特徴量
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, labels)):
        # 進行状況
        # print('fold: {}'.format(fold+1))
        # 訓練データを分割
        X_train_fold = X_train.iloc[train_index].drop(['Epsilon'], axis=1)
        y_train_fold = y_train['Alpha'].iloc[train_index]
        X_valid_fold = X_train.iloc[valid_index].drop(['Epsilon'], axis=1)
        y_valid_fold = y_train['Class'].iloc[valid_index]
        classes, y_train_unique = np.unique(y_train_fold, return_inverse=True)
        regr.fit(X_train_fold, y_train_unique)

        # imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        # X_valid_imputed = imputer.transform(X_valid_fold)
        valid_preds = regr.predict(X_valid_fold)
        # valid_proba = regr.predict_proba(X_valid_imputed)
        # valid_ploba = np.mean(valid_proba)
        # class_0_est_instances = valid_proba[:,0].sum()
        # others_est_instances = valid_proba[:,1:].sum()
        
        # print('y_valid')
        # print(y_valid_fold)
        # print('-----------------------------')
        # print('preds')
        # print(valid_preds)
        # exit(0)

        score = balanced_log_loss(y_valid_fold, valid_preds)
        scores.append(score)
    mean = np.array(scores).mean()
    print(mean)

    return mean
    

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print('XGB tuned parameters')
print(study.best_params)


[I 2023-08-06 16:15:34,663] A new study created in memory with name: no-name-a64bfcd2-4625-43b9-862f-ea63ffa55570
[I 2023-08-06 16:15:39,999] Trial 0 finished with value: 17.269388197455342 and parameters: {'eta': 1.3821865346164523e-06, 'gamma': 1.634088873011572e-08, 'max_depth': 3, 'min_child_weight': 3.956753947072637e-07, 'max_delta_step': 1.268616650059361e-06, 'subsample': 0.3962838313863627, 'reg_lambda': 794.254809222582, 'reg_alpha': 146.8652451957414}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:15:46,496] Trial 1 finished with value: 17.269388197455342 and parameters: {'eta': 1.3413832548800603e-06, 'gamma': 1.5426100105652803e-06, 'max_depth': 4, 'min_child_weight': 0.06253859325599373, 'max_delta_step': 3.0120574036957072e-06, 'subsample': 0.3828308128884884, 'reg_lambda': 428.08671443895906, 'reg_alpha': 201.70390187555455}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:15:53,263] Trial 2 finished with value: 17.269388197455342 and parameters: {'eta': 5.007378967741152e-06, 'gamma': 0.004036401679741396, 'max_depth': 15, 'min_child_weight': 0.5368315195787738, 'max_delta_step': 0.005557952893783737, 'subsample': 0.2910917767722875, 'reg_lambda': 265.62106277573673, 'reg_alpha': 704.8768600178968}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:00,713] Trial 3 finished with value: 17.269388197455342 and parameters: {'eta': 0.05359369107377943, 'gamma': 0.002990973818729527, 'max_depth': 10, 'min_child_weight': 0.035578175048848915, 'max_delta_step': 0.1541631687036376, 'subsample': 0.6364243002293828, 'reg_lambda': 361.74910314497964, 'reg_alpha': 256.51186717948246}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:07,986] Trial 4 finished with value: 17.269388197455342 and parameters: {'eta': 0.00011817243453086208, 'gamma': 4.479412066573753e-05, 'max_depth': 18, 'min_child_weight': 0.06357105975172321, 'max_delta_step': 0.0787911302697919, 'subsample': 0.723090210614953, 'reg_lambda': 865.1948155735496, 'reg_alpha': 934.5693669239641}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:14,421] Trial 5 finished with value: 17.269388197455342 and parameters: {'eta': 0.0007115654271370854, 'gamma': 1.449873220463718e-08, 'max_depth': 13, 'min_child_weight': 0.1534762811575135, 'max_delta_step': 0.023045145558144436, 'subsample': 0.033135086374274536, 'reg_lambda': 406.08777032681996, 'reg_alpha': 677.1342935765473}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:20,053] Trial 6 finished with value: 17.269388197455342 and parameters: {'eta': 4.3204685745588117e-07, 'gamma': 0.04564929283986358, 'max_depth': 20, 'min_child_weight': 0.43208266352263325, 'max_delta_step': 0.16855459873473563, 'subsample': 0.24812552108774122, 'reg_lambda': 90.95088189727952, 'reg_alpha': 615.6952502270718}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:25,759] Trial 7 finished with value: 17.269388197455342 and parameters: {'eta': 3.9563936088715893e-07, 'gamma': 4.915389319314051e-07, 'max_depth': 16, 'min_child_weight': 3.66822607451583e-06, 'max_delta_step': 0.09757822980289467, 'subsample': 0.32694685661473943, 'reg_lambda': 89.567981536261, 'reg_alpha': 554.0451430263194}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:31,423] Trial 8 finished with value: 17.269388197455342 and parameters: {'eta': 0.0520212684066823, 'gamma': 0.004471486704761978, 'max_depth': 10, 'min_child_weight': 5.147536375567691e-05, 'max_delta_step': 0.001981515881860499, 'subsample': 0.9172975904865281, 'reg_lambda': 854.8419983825303, 'reg_alpha': 466.28289833570545}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:36,858] Trial 9 finished with value: 17.269388197455342 and parameters: {'eta': 0.00028160121507781973, 'gamma': 0.023964476807219953, 'max_depth': 6, 'min_child_weight': 0.2786702942761442, 'max_delta_step': 2.3741996908175674e-07, 'subsample': 0.08737377238520794, 'reg_lambda': 826.020063072466, 'reg_alpha': 470.37829125710385}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:42,531] Trial 10 finished with value: 17.269388197455342 and parameters: {'eta': 3.684761818667293e-08, 'gamma': 0.8206318269437507, 'max_depth': 1, 'min_child_weight': 1.2195064621597469e-08, 'max_delta_step': 1.6504951146990437e-08, 'subsample': 0.5056798178754933, 'reg_lambda': 698.711267441412, 'reg_alpha': 6.973296988316179}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:48,562] Trial 11 finished with value: 17.269388197455342 and parameters: {'eta': 1.041513763106382e-08, 'gamma': 1.5392385029739426e-08, 'max_depth': 2, 'min_child_weight': 0.0023999808632574194, 'max_delta_step': 6.220712583191139e-06, 'subsample': 0.4670398352863473, 'reg_lambda': 596.9154931074187, 'reg_alpha': 134.56765960522372}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:16:54,119] Trial 12 finished with value: 17.269388197455342 and parameters: {'eta': 4.146736275117942e-06, 'gamma': 9.496455958014762e-07, 'max_depth': 5, 'min_child_weight': 1.6419751151016587e-06, 'max_delta_step': 2.1289806791761544e-05, 'subsample': 0.4353565954323878, 'reg_lambda': 574.279180609958, 'reg_alpha': 257.0260423428191}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:17:02,107] Trial 13 finished with value: 17.269388197455342 and parameters: {'eta': 7.820004523801013e-06, 'gamma': 5.487129740146904e-07, 'max_depth': 6, 'min_child_weight': 0.0022112385929691854, 'max_delta_step': 4.375200187485765e-06, 'subsample': 0.1914171950043087, 'reg_lambda': 990.1515449256201, 'reg_alpha': 1.2028615770454678}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:17:07,728] Trial 14 finished with value: 17.269388197455342 and parameters: {'eta': 2.1652103897637463e-07, 'gamma': 1.2531403709949482e-05, 'max_depth': 4, 'min_child_weight': 4.313701579740761e-08, 'max_delta_step': 0.00016991689866384748, 'subsample': 0.36146363854837427, 'reg_lambda': 493.7793352441969, 'reg_alpha': 262.3741349324312}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:17:12,881] Trial 15 finished with value: 17.269388197455342 and parameters: {'eta': 2.0922668933226628e-05, 'gamma': 8.781613787090188e-08, 'max_depth': 8, 'min_child_weight': 0.0014239036021564246, 'max_delta_step': 3.499445339550405e-07, 'subsample': 0.18494491199068486, 'reg_lambda': 662.4721197676786, 'reg_alpha': 351.9786422549097}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:17:18,582] Trial 16 finished with value: 17.269388197455342 and parameters: {'eta': 6.763533114470915e-07, 'gamma': 3.7053945947450733e-06, 'max_depth': 3, 'min_child_weight': 9.870379204322577e-05, 'max_delta_step': 0.00019686432740371497, 'subsample': 0.573798221644009, 'reg_lambda': 724.1543660709726, 'reg_alpha': 146.31015439781163}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:17:24,306] Trial 17 finished with value: 17.269388197455342 and parameters: {'eta': 5.86944272534717e-08, 'gamma': 7.821842411432067e-08, 'max_depth': 8, 'min_child_weight': 2.0038143917287247e-07, 'max_delta_step': 8.823197633038701e-07, 'subsample': 0.39906440438662044, 'reg_lambda': 534.816508047167, 'reg_alpha': 119.20214059226022}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:17:30,037] Trial 18 finished with value: 17.269388197455342 and parameters: {'eta': 3.440054962745186e-05, 'gamma': 0.00012241175409473677, 'max_depth': 8, 'min_child_weight': 1.3516484168564953e-05, 'max_delta_step': 3.3192398445227316e-08, 'subsample': 0.13546417786578235, 'reg_lambda': 446.2679268353866, 'reg_alpha': 371.94724333405065}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342


[I 2023-08-06 16:17:35,779] Trial 19 finished with value: 17.269388197455342 and parameters: {'eta': 1.316028470673865e-06, 'gamma': 9.398882182828203e-08, 'max_depth': 12, 'min_child_weight': 4.829016707774276e-07, 'max_delta_step': 2.8282564687428658e-05, 'subsample': 0.274595216278244, 'reg_lambda': 273.45602388953273, 'reg_alpha': 102.36716925844365}. Best is trial 0 with value: 17.269388197455342.


17.269388197455342
XGB tuned parameters
{'eta': 1.3821865346164523e-06, 'gamma': 1.634088873011572e-08, 'max_depth': 3, 'min_child_weight': 3.956753947072637e-07, 'max_delta_step': 1.268616650059361e-06, 'subsample': 0.3962838313863627, 'reg_lambda': 794.254809222582, 'reg_alpha': 146.8652451957414}


## CV

In [78]:
# 初期値
seed = 779292
folds = 10
labels = greeks_df[['Beta', 'Gamma', 'Delta']] # クロスバリデーションの分割で考慮する特徴量

# 各分割ごとのバリデーションスコアを格納
scores = 0
# モデルを保存
models = []
# クロスバリデーションの分割数を指定します
mskf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, labels)):
    # 進行状況
    print('fold: {}'.format(fold+1))
    # 訓練データを分割
    X_train_fold = X_train.iloc[train_index]
    y_train_fold = y_train['Alpha'].iloc[train_index]
    X_valid_fold = X_train.iloc[valid_index]
    y_valid_fold = y_train['Class'].iloc[valid_index]
    
    # モデルを訓練、予測を出力
    model = WeightedEns(xgb_params=study.best_params)
    model.fit(X_train_fold,y_train_fold)
    valid_preds = model.predict_proba(X_valid_fold)[:, 1]

    # 評価
    val_score = balanced_log_loss(y_valid_fold, valid_preds)
    # スコアを保存
    scores += val_score
    # モデルを保存
    models.append(model)
    
# クロスバリデーションの平均値を計算
cv_score = scores /  folds
print(f'our out of folds CV score is {cv_score}')

fold: 1
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 2
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 3
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 4
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 5
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 6
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 7
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 8
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 9
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
fold: 10
Loading model that can be used for inference only
Using a Transformer wit

## 提出

In [79]:
# 提出用に値を変換
if env == 'kaggle':
    # 予測
    # 各分割ごとのテストデータに対する予測値を格納
    preds = np.zeros(len(X_test))
    for i in range(len(models)):
        # pred = models[i].predict(xgb.DMatrix(test_df.drop(['Id', 'EJ'], axis=1)), iteration_range=(0, models[i].best_iteration))
        pred = models[i].predict(X_test)
        preds += pred
    test_pred = preds / folds

    # 提出
    submission = pd.DataFrame(columns = submission_df.columns)
    submission['Id'] = test_df['Id']
    submission['class_0'] = 1 - test_pred
    submission['class_1'] = test_pred
    submission.to_csv('submission.csv',index=False)