In [None]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold

In [None]:
# Load the preprocessed dataset; `stack` below reads and updates this
# module-level frame.
df = pd.read_feather("data/preprocessed/all_data")
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
def _fill_numeric_na(features, fill_value=-99999):
    """Fill NaN with ``fill_value`` in every non-categorical column of ``features``.

    Columns with the pandas ``category`` dtype are left untouched. The frame
    is modified in place and also returned for convenience.
    """
    # Compute the categorical column set once instead of once per column.
    categorical_cols = set(features.select_dtypes(include="category").columns)
    for col in features.columns:
        if col not in categorical_cols:
            features[col] = features[col].fillna(fill_value)
    return features


def stack(
    target,
    cat,
    num_classes,
    num_folds,
    num_boost_round,
    param,
    early_stopping_rounds=10,
):
    """Impute missing values of ``df[target]`` with K-fold XGBoost models.

    Rows of the module-level ``df`` where ``target`` is NaN form the
    prediction set; all other rows form the training set. One booster is
    trained per fold, the per-fold predictions for the missing rows are
    averaged, and the result is written back into ``df[target]`` in place.

    Parameters
    ----------
    target : str
        Name of the column to impute.
    cat : bool
        True when ``target`` is a pandas categorical column. The model is then
        trained on the category codes and the arg-max class of the averaged
        prediction is mapped back to the category value.
    num_classes : int
        Not used by the computation; kept so existing callers keep working.
    num_folds : int
        Number of ``KFold`` splits (shuffled, ``random_state=42``).
    num_boost_round : int
        Maximum number of boosting rounds per fold.
    param : dict
        Parameter dictionary passed to ``xgb.train``.
    early_stopping_rounds : int, optional
        Early-stopping patience passed to ``xgb.train``, evaluated on the
        fold's validation split. Defaults to 10.

    Returns
    -------
    None
        The function works by side effect on ``df``.
    """
    # Split rows into train / prediction sets depending on whether target is NaN.
    data_mask = df[target].isna()
    if not data_mask.any():
        # Nothing to impute (e.g. the cell was re-run after a previous fill).
        print(f"No missing values in '{target}'; nothing to impute.")
        return

    train_data = df[~data_mask].copy()
    test_data = df[data_mask].copy()

    # Features: everything except the identifier, the final label and the target.
    drop_cols = ["id", "country_destination", target]
    X_train = _fill_numeric_na(train_data.drop(columns=drop_cols))
    X_test = _fill_numeric_na(test_data.drop(columns=drop_cols))

    if cat:
        # Train on integer codes and remember the categories for decoding.
        y_train = train_data[target].cat.codes.values
        target_classes = train_data[target].cat.categories.tolist()
    else:
        y_train = train_data[target].values

    # The prediction matrix does not depend on the fold, so build it once.
    dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    test_fold_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        print(f"Fold {fold + 1}/{num_folds}")

        # Split this fold's training and validation data.
        X_tr, y_tr = X_train.iloc[train_idx], y_train[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train[val_idx]

        dtrain = xgb.DMatrix(data=X_tr, label=y_tr, enable_categorical=True)
        dval = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)

        # The last entry of the watchlist is the one used for early stopping.
        watchlist = [(dtrain, "train"), (dval, "eval")]

        model = xgb.train(
            params=param,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=watchlist,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=200,
        )

        # xgb.train returns the model from the last round, not the best one,
        # so restrict prediction to the best iteration when it is available.
        best_iteration = getattr(model, "best_iteration", None)
        if best_iteration is None:
            test_fold_pred = model.predict(dtest)
        else:
            test_fold_pred = model.predict(
                dtest, iteration_range=(0, best_iteration + 1)
            )
        test_fold_preds.append(test_fold_pred)

    # Average the per-fold predictions for the rows with a missing target.
    test_pred = np.mean(test_fold_preds, axis=0)

    if cat:
        # Pick the most probable class and map the code back to its category.
        test_pred_labels = np.argmax(test_pred, axis=1)
        test_pred = [target_classes[label] for label in test_pred_labels]

    df.loc[data_mask, target] = test_pred

In [None]:
# Impute missing `age` values: regression target, 6 folds, RMSE-monitored.
stack(
    "age",
    cat=False,
    num_classes=1,
    num_folds=6,
    num_boost_round=3000,
    param={
        # Tree construction and hardware.
        "tree_method": "hist", "device": "cuda",
        # Learning task and the metric reported on the watchlist.
        "objective": "reg:squarederror", "eval_metric": "rmse",
        # Learning rate and tree depth.
        "eta": 0.01, "max_depth": 7,
        # Row / column subsampling.
        "subsample": 0.7, "colsample_bytree": 0.3,
        # L2 / L1 regularisation.
        "lambda": 1.0, "alpha": 1.0,
        "nthread": 4,
    },
)

In [None]:
# Checkpoint the frame after the `age` imputation. Create the cache directory
# first so the notebook also runs end to end on a fresh checkout.
os.makedirs("cache", exist_ok=True)
df.to_feather("cache/stacked_age")

In [None]:
# Impute missing `gender` values: 3-class categorical target, 3 folds,
# multi-class log-loss monitored.
stack(
    "gender",
    cat=True,
    num_classes=3,
    num_folds=3,
    num_boost_round=1000,
    param={
        # Tree construction and hardware.
        "tree_method": "hist", "device": "cuda",
        # Learning task and the metric reported on the watchlist.
        "objective": "multi:softprob", "eval_metric": "mlogloss",
        # Learning rate and tree depth.
        "eta": 0.01, "max_depth": 6,
        # Row / column subsampling.
        "subsample": 0.7, "colsample_bytree": 0.3,
        # L2 / L1 regularisation.
        "lambda": 1.0, "alpha": 1.0,
        "nthread": 4,
        # Number of classes predicted by the softprob objective.
        "num_class": 3,
    },
)

In [None]:
# Checkpoint the frame after the `gender` imputation. Create the cache
# directory first so the notebook also runs end to end on a fresh checkout.
os.makedirs("cache", exist_ok=True)
df.to_feather("cache/stacked_age_gender")