# tabpfn_xgboost.ipynb
## 実験条件
* MultilabelStratifiedKFoldだとめっちゃ過学習したから普通のKFoldに変更、一番スコアが高いモデルをテストデータの予測に使用
* 学習時にgreeks.csvのEpsilon使用, テストデータでは訓練データのEpsilonの最大値+1とする
* 欠損値は中央値で補完(少なくともCVはKNNImputerに比べて高くなった）
* greeks.csvのAlphaを予測、予測後にA->0, (B, G, D)->1に変換
* alphaの各クラスの割合が同じになるように分割
## モデル
* XGBoost(パラメータを調整)
* XGBoost(パラメータはデフォルト)
* TabPFN(N_ensemble_configurations=48)
* TabPFN(N_ensemble_configurations=128)
### 結果
* CV: 0.10938
* LB: 0.14

## TabPFNのインストール
### 事前にダウンロードするファイル: 
* TabPFN: <https://www.kaggle.com/datasets/carlmcbrideellis/tabpfn-019-whl>
* MultilabelStratifiedKFold: <https://www.kaggle.com/datasets/tilii7/iterative-stratification-017>

In [1]:
# KaggleNotebookではコメントアウトを外す

# MultilabelStratifiedKFold
# !pip install -q /kaggle/input/mskfold/iterative_stratification-0.1.7-py3-none-any.whl
# tabpfn
# !pip install -q /kaggle/input/tabpfn-srs/tabpfn-019-whl/tabpfn-0.1.9-py3-none-any.whl
# !mkdir /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/tabpfn-srs/tabpfn-019-whl/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
# model
from sklearn.base import BaseEstimator
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
import lightgbm as lgb
from tabpfn import TabPFNClassifier
# over/under sampling
from imblearn.over_sampling import SMOTE # SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# Imputation
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif# Feature Selection
import category_encoders as encoders
from sklearn.preprocessing import LabelEncoder, RobustScaler
# cross validation
from sklearn.model_selection import StratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import KFold as KF, GridSearchCV
# others
from datetime import date, datetime
from sklearn.preprocessing import LabelEncoder
import optuna
import warnings
warnings.simplefilter('ignore')

# 環境を指定
env = 'local'
# env = 'kaggle'

## データの読み込み

In [3]:
# ディレクトリの指定
if env == 'local':
    BASE_DIR = '../../data'
elif env == 'kaggle':
    BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions/'
else:
    raise ValueError("Invalid environment. Set env as 'local' or 'kaggle'.")

# データの読み込み
# train_df = pd.read_csv(f'{BASE_DIR}/train.csv')
train_df = pd.read_csv(f'{BASE_DIR}/train_integerized.csv')
greeks_df = pd.read_csv(f'{BASE_DIR}/greeks.csv')
test_df = pd.read_csv(f'{BASE_DIR}/test.csv')
submission_df = pd.read_csv(f'{BASE_DIR}/sample_submission.csv')

# greeksと結合
train_df = pd.merge(train_df, greeks_df[['Id', 'Alpha', 'Epsilon']], on='Id', how='left')

greeksのAはClass0に、B, G, DはClass1に相当

In [4]:
greeks_df.Alpha.value_counts()

Alpha
A    509
B     61
G     29
D     18
Name: count, dtype: int64

## Epsilonを特徴量に追加

In [5]:
# 欠損値以外の日付をグレゴリオ暦の序数形式（1年1月1日を1とし、1日ずつ増やしていく）に変換
train_df.Epsilon[train_df.Epsilon != 'Unknown'] = train_df.Epsilon[train_df.Epsilon != 'Unknown']\
                                        .map(lambda x: datetime.strptime(x, '%m/%d/%Y').toordinal())
# 欠損値をnp.nanに変換
train_df.Epsilon[train_df.Epsilon == 'Unknown'] = np.nan

# テストデータは訓練データの最大値+1とする
test_df['Epsilon'] = train_df.Epsilon.max()+1

In [6]:
# 訓練データを説明変数と目的変数に分割
y_train = train_df['Alpha']
train_df = train_df.drop(['Id', 'EJ', 'Alpha'], axis=1)
# テストデータから数値データ以外を削除
X_test = test_df.drop(['Id', 'EJ'], axis=1)

In [7]:
ros = RandomOverSampler(random_state=42)
train_df, y_train = ros.fit_resample(train_df, y_train)

In [8]:
train_df

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FR,FS,GB,GE,GF,GH,GI,GL,Class,Epsilon
0,49.0,1284724.5,9783.0,7231.0,1552.0,79.0,42.0,1557.0,790.5,516469.0,...,2997.5,14.0,1219.0,16439.0,458118.5,5949.0,27152.0,0.120343,1,737137.0
1,34.0,404448.0,9783.0,11937.0,1552.0,410.0,42.0,2145.0,175.0,687976.0,...,857.0,84.0,999.0,16439.0,6397248.0,7830.0,12493.0,21.978000,0,
2,110.0,1088887.0,9783.0,10449.0,1552.0,760.0,42.0,2035.0,175.0,642776.0,...,1682.0,177.0,3986.0,20061.0,3126876.5,7531.0,13683.0,0.196941,0,
3,59.0,1578368.5,13802.0,24899.0,1552.0,416.0,42.0,1754.0,175.0,521862.0,...,857.0,42.0,1992.0,18659.0,478798.0,10736.0,35184.0,0.155829,0,
4,89.0,1542582.0,9783.0,4554.0,1552.0,445.0,90.0,539.0,14535.0,716988.0,...,83623.0,18.0,1764.0,33079.0,1948873.0,12196.0,14099.0,0.096614,1,737509.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2031,555.0,2909244.0,97155.0,7360.0,1552.0,634.0,42.0,2751.0,602.0,458045.0,...,5318.0,10.0,2819.0,16439.0,6722726.0,12346.0,10639.0,21.978000,1,737038.0
2032,138.0,1543064.0,9783.0,19837.0,1552.0,592.5,42.0,1993.0,175.0,627263.0,...,857.0,15.0,1879.0,16439.0,2052247.0,9077.0,10191.0,21.978000,1,737202.0
2033,180.0,2808087.0,9783.0,14914.0,3335.0,641.0,42.0,1437.0,622.0,612339.0,...,1545.0,140.0,1812.0,16439.0,45137.0,4674.0,24367.0,1.782000,1,737125.0
2034,174.0,2914308.0,9783.0,6621.0,1552.0,484.0,71.0,1363.0,1050.5,602943.0,...,857.0,106.0,955.0,16439.0,2508018.0,11518.0,17034.5,0.092564,1,737112.0


## モデル、評価基準

In [9]:
from sklearn.model_selection import KFold as KF, GridSearchCV

In [10]:
class WeightedEns(BaseEstimator):
    def __init__(self):
        self.models = [
            XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.2, subsample=0.9, colsample_bytree=0.85),
            XGBClassifier(),
            TabPFNClassifier(N_ensemble_configurations=48,device='cuda:0'),
            TabPFNClassifier(N_ensemble_configurations=128,device='cuda:0')
        ]
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        # self.imputer = KNNImputer(n_neighbors=50)
    
    def fit(self, X, y, weights=None):
        classes, y = np.unique(y, return_inverse=True)
        self.classes_ = classes
        X = self.imputer.fit_transform(X)
        for i, model_ in enumerate(self.models):
            if isinstance(model_, XGBClassifier):
                # model_.fit(X, y, sample_weight=weights) # 決定木ではweightを考慮する
                model_.fit(X, y) # 決定木ではweightを考慮する
            elif model_ == self.models[2] or model_ == self.models[3]:
                model_.fit(X, y, overwrite_warning=True)
    
    def predict_proba(self, X):
        X = self.imputer.transform(X)
        ps = np.stack([model.predict_proba(X) for model in self.models])
        p = np.mean(ps,axis=0)
        # クラス0の予測確率の合計
        class_0_est_instances = p[:,0].sum()
        # クラス１の予測確率の合計
        others_est_instances = p[:,1:].sum()
        # 予測確率を合計値で割る
        new_p = p * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(p.shape[1])]])
        # 各サンプルの予測確率の合計が1になるように正規化
        new_p = new_p / np.sum(new_p,axis=1,keepdims=1)
        return new_p

# 評価基準
def balanced_log_loss(y_true, y_pred):
    N = len(y_true)

    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.maximum(np.minimum(y_pred, 1 - 1e-15), 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (1/N_0) * np.sum((1 - y_true) * np.log(1-y_pred)) - (1/N_1) * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

## CV

In [11]:
cv_inner = KF(n_splits = 5, shuffle=True, random_state=42)

In [12]:
def training(model, X, y):
    # 初期値
    preds = []
    best_loss = np.inf
    fold = 0
    folds = 5

    for train_index, valid_index in cv_inner.split(train_df):
        fold += 1
        # 訓練データを分割
        X_train_fold = train_df.drop('Class', axis=1).iloc[train_index]
        y_train_fold = y_train.iloc[train_index]
        X_valid_fold = train_df.drop('Class', axis=1).iloc[valid_index]
        y_valid_fold = train_df['Class'].iloc[valid_index]

        # Alphaのラベルに重みづけをする
        # sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_fold)
        
        # モデルを訓練、予測を出力
        model.fit(X_train_fold,y_train_fold)
        valid_preds = model.predict_proba(X_valid_fold)

        # 2値分類に変換
        probabilities = np.concatenate((valid_preds[:, :1], np.sum(valid_preds[:, 1:], 1, keepdims=True)), axis=1)
        p0 = probabilities[:, :1]
        
        y_p = np.empty((valid_preds.shape[0],))
        for i in range(valid_preds.shape[0]):
            if p0[i] >= 0.5:
                y_p[i] = False
            else:
                y_p[i] = True
        y_p = y_p.astype(int)

        # balanced_log_lossを計算
        loss = balanced_log_loss(y_valid_fold, y_p)

        # best_modelを更新
        if loss < best_loss:
            best_model = model
            best_loss = loss
            print('Best model saved during fold %d' % fold)

        preds.append(loss)
        print('Validation loss for fold %d: %.5f' % (fold, loss))

    # CVスコアを計算
    mean_loss = np.mean(preds)
    print('Mean validation loss over %d folds: %.5f' % (folds, mean_loss))
    return best_model

In [13]:
model = WeightedEns()

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [14]:
best_model = training(model, train_df, y_train)

Best model saved during fold 1
Validation loss for fold 1: 0.15419
Best model saved during fold 2
Validation loss for fold 2: 0.00000
Validation loss for fold 3: 0.00000
Validation loss for fold 4: 0.19189
Validation loss for fold 5: 0.20081
Mean validation loss over 5 folds: 0.10938


## 提出

In [15]:
# 提出用に値を変換
if env == 'kaggle':
    # 予測
    # 各分割ごとのテストデータに対する予測値を格納
    y_pred = best_model.predict_proba(X_test)

    # 2値分類の値に変換
    probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
    p0 = probabilities[:,:1]

    # 提出
    submission = pd.DataFrame(columns = submission_df.columns)
    submission['Id'] = test_df['Id']
    submission['class_0'] = p0
    submission['class_1'] = 1 - p0
    # submission.to_csv('submission.csv',index=False)