# tabpfn_xgboost_mskf.ipynb
## 実験条件
* 学習時にgreeks.csvのEpsilon使用, テストデータでは訓練データのEpsilonの最大値+1とする
* 欠損値は中央値で補完
* greeks.csvのAlphaを予測、予測後にA->0, (B, G, D)->1に変換
* CVはMultilabelStratifiedKFoldで、Beta, Gamma, Deltaのクラス割合が同じになるように分割
### 結果
* CV: 0.19097697170787517
## 変更点(2023-08-05 20:59)
* XGBClassifierのsample_weightを設定
* sample_weightを計算するためにAlphaをラベルエンコーディング
### 結果
* CV: 0.18313566432976108

## TabPFNのインストール
### 事前にダウンロードするファイル: 
* TabPFN: <https://www.kaggle.com/datasets/carlmcbrideellis/tabpfn-019-whl>
* MultilabelStratifiedKFold: <https://www.kaggle.com/datasets/tilii7/iterative-stratification-017>

In [1]:
# KaggleNotebookではコメントアウトを外す
# !pip install -q /kaggle/input/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
# !mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/
# !pip install -q /kaggle/input/iterative-stratification-017/iterative_stratification-0.1.7-py3-none-any.whl

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
# model
from sklearn.base import BaseEstimator
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
import lightgbm as lgb
from tabpfn import TabPFNClassifier
# over/under sampling
from imblearn.over_sampling import SMOTE # SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# Imputation
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif# Feature Selection
import category_encoders as encoders
from sklearn.preprocessing import LabelEncoder, RobustScaler
# cross validation
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# others
from datetime import date, datetime
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings
warnings.simplefilter('ignore')

# 環境を指定
env = 'local'
# env = 'kaggle'

## データの読み込み

In [3]:
# ディレクトリの指定
if env == 'local':
    BASE_DIR = '../../data'
elif env == 'kaggle':
    BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions/'
else:
    raise ValueError("Invalid environment. Set env as 'local' or 'kaggle'.")

# データの読み込み
train_df = pd.read_csv(f'{BASE_DIR}/train.csv')
# train_df = pd.read_csv(f'{BASE_DIR}/train_integerized.csv')
greeks_df = pd.read_csv(f'{BASE_DIR}/greeks.csv')
test_df = pd.read_csv(f'{BASE_DIR}/test.csv')
submission_df = pd.read_csv(f'{BASE_DIR}/sample_submission.csv')

# greeksと結合
train_df = pd.merge(train_df, greeks_df[['Id', 'Alpha', 'Epsilon']], on='Id', how='left')

greeksのAはClass0に、B, G, DはClass1に相当

In [4]:
greeks_df.Alpha.value_counts()

Alpha
A    509
B     61
G     29
D     18
Name: count, dtype: int64

## Epsilonを特徴量に追加

In [5]:
# 欠損値以外の日付をグレゴリオ暦の序数形式（1年1月1日を1とし、1日ずつ増やしていく）に変換
train_df.Epsilon[train_df.Epsilon != 'Unknown'] = train_df.Epsilon[train_df.Epsilon != 'Unknown']\
                                        .map(lambda x: datetime.strptime(x, '%m/%d/%Y').toordinal())
# 欠損値をnp.nanに変換
train_df.Epsilon[train_df.Epsilon == 'Unknown'] = np.nan

# 訓練データを説明変数と目的変数に分割
X_train = train_df.drop(['Id', 'EJ', 'Alpha', 'Class'], axis=1)
y_train = train_df[['Class', 'Alpha']]

# テストデータから数値データ以外を削除
X_test = test_df.drop(['Id', 'EJ'], axis=1)

# テストデータは訓練データの最大値+1とする
X_test['Epsilon'] = train_df.Epsilon.max()+1

## Alphaをラベルエンコーディング

In [6]:
from sklearn.preprocessing import LabelEncoder
# ラベルエンコーダを初期化
le = LabelEncoder()
# yを数値に変換
y_train['Alpha'] = le.fit_transform(y_train['Alpha'])

self.models = [XGBClassifier(eta=0.01267942116017827, 
                             gamma=0.0035099486546295838,
                             max_depth=10,
                             min_child_weight=0.00295968493319233,
                             max_delta_step=3.207138802851332e-05,
                             subsample=0.17568276820923612,
                             reg_lambda=76.82036507955048,
                             reg_alpha=812.280448836668,
                             n_estimators=100,
                             learning_rate=0.2,
                             colsample_bytree=0.85),

## モデル、評価基準

In [7]:
class WeightedEns(BaseEstimator):
    def __init__(self):
        self.models = [
                       XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),
                       # TabPFNClassifier(N_ensemble_configurations=256,device='cuda:0')
                      ]
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        # self.imputer = KNNImputer(n_neighbors=50)
    
    def fit(self, X, y, weights=None):
        classes, y = np.unique(y, return_inverse=True)
        self.classes_ = classes
        X = self.imputer.fit_transform(X)
        for i, model in enumerate(self.models):
            if isinstance(model, XGBClassifier):
                model.fit(X, y, sample_weight=weights) # 決定木ではweightを考慮する
            else:
                model.fit(X,y)
    def set_params(self, params):
        self.models[0].set_params(**params)
    
    def predict_proba(self, X):
        X = self.imputer.transform(X)
        ps = np.stack([model.predict_proba(X) for model in self.models])
        p = np.mean(ps,axis=0)
        class_0_est_instances = p[:,0].sum()
        others_est_instances = p[:,1:].sum()
        # we reweight the probs, since the loss is also balanced like this
        # our models out of the box optimize CE
        # with these changes they optimize balanced CE
        new_p = p * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(p.shape[1])]])
        new_p = new_p / np.sum(new_p,axis=1,keepdims=1)
        return np.concatenate((new_p[:,:1],np.sum(new_p[:,1:],1,keepdims=True)), 1)

# 評価基準
def balanced_log_loss(y_true, y_pred):
    N = len(y_true)

    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.maximum(np.minimum(y_pred, 1 - 1e-15), 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (1/N_0) * np.sum((1 - y_true) * np.log(1-y_pred)) - (1/N_1) * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

## Optuna

In [8]:
# 初期値
seed = 779292
folds = 10
labels = greeks_df[['Beta', 'Gamma', 'Delta']] # クロスバリデーションの分割で考慮する特徴量

def objective(trial):
    # 各分割ごとのバリデーションスコアを格納
    scores = 0
    # クロスバリデーションの分割数を指定
    mskf = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    
    # ハイパーパラメータのサーチ空間を定義
    params = {
        'n_estimators': 100,
        'learning_rate': 0.2,
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'lambda': trial.suggest_float('lambda', 1e-5, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-5, 1.0, log=True)
    }

    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, labels)):
        # 訓練データを分割
        X_train_fold = X_train.iloc[train_index]
        y_train_fold = y_train['Alpha'].iloc[train_index]
        X_valid_fold = X_train.iloc[valid_index]
        y_valid_fold = y_train['Class'].iloc[valid_index]
    
        # Alphaのラベルに重みづけをする
        sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_fold)
        model = WeightedEns()
        model.set_params(params)
        model.fit(X_train_fold, y_train_fold, weights=sample_weights)
        valid_preds = model.predict_proba(X_valid_fold)[:, 1]
        val_score = balanced_log_loss(y_valid_fold, valid_preds)
        scores += val_score

    cv_score = scores / folds
    return cv_score

In [9]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)

best_params = study.best_params
print(f'Best parameters: {best_params}')

[I 2023-08-06 23:51:28,054] A new study created in memory with name: no-name-7979bcc2-9489-4a1a-9a27-1c65548fd2b5
[I 2023-08-06 23:51:29,289] Trial 0 finished with value: 0.26177463899754655 and parameters: {'max_depth': 15, 'subsample': 0.6711446001746522, 'colsample_bytree': 0.7631265478895534, 'min_child_weight': 1, 'gamma': 0.8348372569231347, 'lambda': 1.0433568973294579e-05, 'alpha': 0.8717712304747285}. Best is trial 0 with value: 0.26177463899754655.
[I 2023-08-06 23:51:30,016] Trial 1 finished with value: 0.25711078502590257 and parameters: {'max_depth': 10, 'subsample': 0.5126948223029311, 'colsample_bytree': 0.6823664018187763, 'min_child_weight': 7, 'gamma': 0.8982501821871622, 'lambda': 3.2703765084741954e-05, 'alpha': 2.657848395181412e-05}. Best is trial 1 with value: 0.25711078502590257.
[I 2023-08-06 23:51:30,684] Trial 2 finished with value: 0.22893004423755065 and parameters: {'max_depth': 15, 'subsample': 0.5406215070528055, 'colsample_bytree': 0.9012967447121446, '

Best parameters: {'max_depth': 18, 'subsample': 0.5236088397410353, 'colsample_bytree': 0.9081020201822949, 'min_child_weight': 1, 'gamma': 0.018902786999403336, 'lambda': 0.002946024021403057, 'alpha': 0.21131772406300453}
