In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold

In [2]:
# Read the preprocessed feature table from its Feather file and preview it.
all_data_path = "data/preprocessed/all_data"
df = pd.read_feather(all_data_path)
df

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_yearmonth,date_account_created_week,date_account_created_yearmonthweek,date_account_created_yearmonthday,timestamp_first_active_year,timestamp_first_active_month,...,device_type_flg_sum_Blackberry,device_type_flg_sum_Chromebook,device_type_flg_sum_Linux Desktop,device_type_flg_sum_Mac Desktop,device_type_flg_sum_Tablet,device_type_flg_sum_Windows Desktop,device_type_flg_sum_Windows Phone,device_type_flg_sum_iPad Tablet,device_type_flg_sum_iPhone,device_type_flg_sum_iPodtouch
0,gxn3p5htnn,2010,6,28,201006,26,20100626,20100628,2009,3,...,,,,,,,,,,
1,820tgsjxq7,2011,5,25,201105,21,20110521,20110525,2009,5,...,,,,,,,,,,
2,4ft3gnwmtx,2010,9,28,201009,39,20100939,20100928,2009,6,...,,,,,,,,,,
3,bjjt8pjhuk,2011,12,5,201112,49,20111249,20111205,2009,10,...,,,,,,,,,,
4,87mebub9p4,2010,9,14,201009,37,20100937,20100914,2009,12,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,,,89.0,,,4.0,
275543,zp8xfonng8,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,,,,,,,
275544,fa6260ziny,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,,,78.0,,,,
275545,87k0fy4ugm,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,11.0,,,,,4.0,


In [3]:
def _fill_numeric_na(features, fill_value=-99999):
    """Return `features` with NaNs in every non-categorical column set to `fill_value`."""
    # Resolve the categorical columns once instead of once per column.
    categorical_cols = set(features.select_dtypes(include="category").columns)
    fill_map = {
        col: fill_value for col in features.columns if col not in categorical_cols
    }
    # Columns absent from the mapping (the categorical ones) are left untouched.
    return features.fillna(fill_map)


def stack(target, cat, num_classes, num_folds, num_boost_round, param):
    """Impute the missing values of `target` in the global `df` with XGBoost.

    Rows where `df[target]` is present are used for training; rows where it is
    NaN are predicted and written back into `df` in place. One model is trained
    per shuffled K-fold split (random_state=42) and the predictions of all fold
    models on the missing rows are averaged.

    Args:
        target: Name of the column of `df` to impute.
        cat: If True, `target` must have a pandas categorical dtype; its
            category codes are used as labels and the arg-max class of the
            averaged predictions is written back as the category value.
            If False, the raw column values are used as labels and the
            averaged predictions are written back directly.
        num_classes: Not used by the implementation; kept so existing calls
            keep working.
        num_folds: Number of K-fold splits (one model per split).
        num_boost_round: Maximum number of boosting rounds per model; training
            stops early after 10 rounds without improvement.
        param: Parameter dict forwarded to `xgb.train`.

    Returns:
        None. `df` is modified in place.
    """
    # Rows with a missing target are the ones to predict; the rest are training data.
    data_mask = df[target].isna()
    if not data_mask.any():
        # Nothing to impute (e.g. the cell was re-run): skip the expensive training.
        print(f"No missing values in '{target}'; nothing to impute.")
        return

    train_data = df[~data_mask]
    test_data = df[data_mask]

    # Features: everything except the identifier, the final label and the target itself.
    non_feature_cols = ["id", "country_destination", target]
    X_train = _fill_numeric_na(train_data.drop(columns=non_feature_cols))
    X_test = _fill_numeric_na(test_data.drop(columns=non_feature_cols))

    if cat:
        # Train on integer category codes; keep the categories to decode predictions.
        y_train = train_data[target].cat.codes.values
        target_classes = train_data[target].cat.categories.tolist()
    else:
        y_train = train_data[target].values

    # The rows to predict are the same for every fold, so build their DMatrix once.
    dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    test_fold_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        print(f"Fold {fold + 1}/{num_folds}")

        # Split the labelled rows into this fold's training and validation parts.
        X_tr, y_tr = X_train.iloc[train_idx], y_train[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train[val_idx]

        dtrain = xgb.DMatrix(data=X_tr, label=y_tr, enable_categorical=True)
        dval = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)

        # The validation set is listed last so that it drives early stopping.
        watchlist = [(dtrain, "train"), (dval, "eval")]

        model = xgb.train(
            params=param,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=watchlist,
            early_stopping_rounds=10,
            verbose_eval=200,
        )

        # Predict with the best early-stopped iteration rather than relying on
        # the default, which may include the rounds trained after the optimum.
        best_iteration = getattr(model, "best_iteration", None)
        if best_iteration is None:
            test_fold_pred = model.predict(dtest)
        else:
            test_fold_pred = model.predict(
                dtest, iteration_range=(0, best_iteration + 1)
            )
        test_fold_preds.append(test_fold_pred)

    # Average the fold models' predictions for the rows with a missing target.
    test_pred = np.mean(test_fold_preds, axis=0)

    if cat:
        # Turn averaged class probabilities back into category values.
        test_pred_labels = np.argmax(test_pred, axis=1)
        test_pred = [target_classes[label] for label in test_pred_labels]

    df.loc[data_mask, target] = test_pred

In [4]:
# Regression settings for imputing `age` (histogram tree method on a CUDA device).
age_params = {
    "tree_method": "hist",
    "device": "cuda",
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.01,
    "max_depth": 7,
    "subsample": 0.7,
    "colsample_bytree": 0.3,
    "lambda": 1.0,
    "alpha": 1.0,
    "nthread": 4,
}

# Fill the missing ages in `df` using 6 folds of up to 3000 boosting rounds each.
stack(
    target="age",
    cat=False,
    num_classes=1,
    num_folds=6,
    num_boost_round=3000,
    param=age_params,
)

Fold 1/6
[0]	train-rmse:13.94179	eval-rmse:13.96369
[200]	train-rmse:7.56713	eval-rmse:7.73467
[400]	train-rmse:6.78723	eval-rmse:7.04897
[600]	train-rmse:6.59113	eval-rmse:6.93078
[800]	train-rmse:6.46378	eval-rmse:6.87590
[1000]	train-rmse:6.35841	eval-rmse:6.84928
[1084]	train-rmse:6.32031	eval-rmse:6.84321
Fold 2/6
[0]	train-rmse:13.93094	eval-rmse:14.01781
[200]	train-rmse:7.57370	eval-rmse:7.69655
[400]	train-rmse:6.80224	eval-rmse:6.99060
[600]	train-rmse:6.61422	eval-rmse:6.86670
[800]	train-rmse:6.48357	eval-rmse:6.80507
[1000]	train-rmse:6.37649	eval-rmse:6.77708
[1200]	train-rmse:6.29063	eval-rmse:6.75879
[1315]	train-rmse:6.24368	eval-rmse:6.75282
Fold 3/6
[0]	train-rmse:13.94041	eval-rmse:13.97036
[200]	train-rmse:7.56658	eval-rmse:7.71139
[400]	train-rmse:6.78836	eval-rmse:7.02298
[600]	train-rmse:6.58602	eval-rmse:6.91100
[800]	train-rmse:6.45013	eval-rmse:6.86192
[1000]	train-rmse:6.34635	eval-rmse:6.83867
[1083]	train-rmse:6.30646	eval-rmse:6.83226
Fold 4/6
[0]	train-r

In [5]:
# Create the output directory first so the save cannot fail after the long
# training run just because `cache/` does not exist yet.
stacked_age_path = Path("cache/stacked_age")
stacked_age_path.parent.mkdir(parents=True, exist_ok=True)
df.to_feather(stacked_age_path)

In [6]:
# Multi-class settings for imputing `gender` (3 classes, per-class probabilities).
gender_params = {
    "tree_method": "hist",
    "device": "cuda",
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "eta": 0.01,
    "max_depth": 6,
    "subsample": 0.7,
    "colsample_bytree": 0.3,
    "lambda": 1.0,
    "alpha": 1.0,
    "nthread": 4,
    "num_class": 3,
}

# Fill the missing genders in `df` using 3 folds of up to 1000 boosting rounds each.
stack(
    target="gender",
    cat=True,
    num_classes=3,
    num_folds=3,
    num_boost_round=1000,
    param=gender_params,
)

Fold 1/3
[0]	train-mlogloss:1.08590	eval-mlogloss:1.08594
[200]	train-mlogloss:0.27312	eval-mlogloss:0.27538
[400]	train-mlogloss:0.12624	eval-mlogloss:0.12928
[600]	train-mlogloss:0.08832	eval-mlogloss:0.09214
[800]	train-mlogloss:0.07053	eval-mlogloss:0.07548
[999]	train-mlogloss:0.06026	eval-mlogloss:0.06635
Fold 2/3
[0]	train-mlogloss:1.08590	eval-mlogloss:1.08589
[200]	train-mlogloss:0.27446	eval-mlogloss:0.27594
[400]	train-mlogloss:0.12736	eval-mlogloss:0.12978
[600]	train-mlogloss:0.08879	eval-mlogloss:0.09207
[800]	train-mlogloss:0.07086	eval-mlogloss:0.07514
[999]	train-mlogloss:0.06064	eval-mlogloss:0.06598
Fold 3/3
[0]	train-mlogloss:1.08591	eval-mlogloss:1.08587
[200]	train-mlogloss:0.27463	eval-mlogloss:0.27427
[400]	train-mlogloss:0.12814	eval-mlogloss:0.12815
[600]	train-mlogloss:0.08951	eval-mlogloss:0.09038
[800]	train-mlogloss:0.07178	eval-mlogloss:0.07378
[999]	train-mlogloss:0.06143	eval-mlogloss:0.06465


In [7]:
# Create the output directory first so the save cannot fail after training
# just because `cache/` does not exist yet.
stacked_age_gender_path = Path("cache/stacked_age_gender")
stacked_age_gender_path.parent.mkdir(parents=True, exist_ok=True)
df.to_feather(stacked_age_gender_path)