In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import xgboost as xgb

# Model Stacking

The original dataset contains a significant amount of missing data. Filling these gaps with predictions from base models, instead of discarding the affected rows or filling every missing value with a single default, preserves more of the information in the dataset. By stacking a series of base models (first for imputation, then for initial predictions) and consolidating their results, we can typically reduce overfitting, mitigate underfitting, and lower generalization error. In some cases, this approach can also capture subtle relationships in the data.

Here, I used two base models to fill in the missing values in the **age** and **gender** columns. Additionally, I experimented with a meta-model (binary models for each of the 12 `country_destination` categories) in a deprecated file, though it was ultimately not adopted.

All models in this process were trained using **XGBoost**.

In [2]:
# Load the preprocessed feature table; later cells read this notebook-global
# frame and stack() writes imputed values back into it in place.
df = pd.read_feather("data/preprocessed/all_data")
# Bare last expression so the notebook renders a (truncated) preview of the frame.
df

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_yearmonth,date_account_created_week,date_account_created_yearmonthweek,date_account_created_yearmonthday,timestamp_first_active_year,timestamp_first_active_month,...,device_type_flg_sum_Blackberry,device_type_flg_sum_Chromebook,device_type_flg_sum_Linux Desktop,device_type_flg_sum_Mac Desktop,device_type_flg_sum_Tablet,device_type_flg_sum_Windows Desktop,device_type_flg_sum_Windows Phone,device_type_flg_sum_iPad Tablet,device_type_flg_sum_iPhone,device_type_flg_sum_iPodtouch
0,gxn3p5htnn,2010,6,28,201006,26,20100626,20100628,2009,3,...,,,,,,,,,,
1,820tgsjxq7,2011,5,25,201105,21,20110521,20110525,2009,5,...,,,,,,,,,,
2,4ft3gnwmtx,2010,9,28,201009,39,20100939,20100928,2009,6,...,,,,,,,,,,
3,bjjt8pjhuk,2011,12,5,201112,49,20111249,20111205,2009,10,...,,,,,,,,,,
4,87mebub9p4,2010,9,14,201009,37,20100937,20100914,2009,12,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,,,89.0,,,4.0,
275543,zp8xfonng8,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,,,,,,,
275544,fa6260ziny,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,,,78.0,,,,
275545,87k0fy4ugm,2014,9,30,201409,39,20140939,20140930,2014,9,...,,,,11.0,,,,,4.0,


In [3]:
def _prepare_features(frame, target):
    """Build the model feature matrix from ``frame``.

    Drops the ``id`` and ``country_destination`` columns and the column being
    imputed, then replaces missing values in every non-categorical column with
    the sentinel ``-99999``. Categorical columns are left untouched.
    """
    features = frame.drop(columns=["id", "country_destination", target])
    # Resolve the categorical columns once instead of once per column.
    categorical_cols = set(features.select_dtypes(include="category").columns)
    for col in features.columns:
        if col not in categorical_cols:
            features[col] = features[col].fillna(-99999)
    return features


def stack(target, cat, num_classes, num_folds, num_boost_round, param):
    """Impute the missing values of ``df[target]`` with K-fold XGBoost models.

    Rows where ``target`` is present are split into ``num_folds`` shuffled
    folds (``random_state=42``). One booster is trained per fold, with early
    stopping (patience 10) evaluated on the held-out fold; every booster
    predicts the rows where ``target`` is missing and the per-fold predictions
    are averaged. The result is written into the notebook-global ``df`` in
    place.

    Parameters
    ----------
    target : str
        Name of the column in ``df`` to impute.
    cat : bool
        ``True`` when ``target`` is a pandas categorical column. The models are
        then trained on the category codes, and the averaged per-class scores
        are mapped back to category labels via ``argmax``. ``False`` trains on
        the raw values and writes the averaged predictions directly.
    num_classes : int
        Not used by the function; retained so existing calls keep working.
    num_folds : int
        Number of ``KFold`` splits (and therefore of boosters trained).
    num_boost_round : int
        Upper bound on boosting rounds per fold.
    param : dict
        Parameter dictionary forwarded unchanged to ``xgb.train``.

    Returns
    -------
    None
        ``df`` is modified in place. If ``target`` has no missing values the
        function returns immediately without training anything.
    """
    data_mask = df[target].isna()
    if not data_mask.any():
        # Nothing to impute (e.g. the cell was re-run after a previous fill):
        # skip the expensive training instead of predicting on an empty set.
        print(f"No missing values in '{target}'; nothing to impute.")
        return

    train_data = df[~data_mask]
    test_data = df[data_mask]

    X_train = _prepare_features(train_data, target)
    X_test = _prepare_features(test_data, target)

    if cat:
        # Train on integer codes; keep the labels to decode predictions later.
        y_train = train_data[target].cat.codes.values
        target_classes = train_data[target].cat.categories.tolist()
    else:
        y_train = train_data[target].values

    # The prediction matrix does not depend on the fold, so build it once.
    dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    test_fold_preds = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        print(f"Fold {fold + 1}/{num_folds}")

        X_tr, y_tr = X_train.iloc[train_idx], y_train[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train[val_idx]

        dtrain = xgb.DMatrix(data=X_tr, label=y_tr, enable_categorical=True)
        dval = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)

        # Early stopping monitors the last entry of the watchlist ("eval").
        watchlist = [(dtrain, "train"), (dval, "eval")]

        model = xgb.train(
            params=param,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=watchlist,
            early_stopping_rounds=10,
            verbose_eval=200,
        )

        test_fold_preds.append(model.predict(dtest))

    # Average the per-fold predictions (element-wise across folds).
    test_pred = np.mean(test_fold_preds, axis=0)

    if cat:
        # Pick the class with the highest averaged score and decode it.
        test_pred_labels = np.argmax(test_pred, axis=1)
        test_pred = [target_classes[label] for label in test_pred_labels]

    df.loc[data_mask, target] = test_pred

In [4]:
# Impute missing ages as a regression problem (cat=False) with 6-fold CV.
# 3000 is only an upper bound on rounds: stack() trains with early stopping,
# and the recorded folds below stop at roughly 1000-1250 rounds.
stack(
    target="age",
    cat=False,
    num_classes=1,
    num_folds=6,
    num_boost_round=3000,
    param={
        "tree_method": "hist",
        "device": "cuda",
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "eta": 0.01,
        "max_depth": 7,
        "subsample": 0.7,
        "colsample_bytree": 0.3,
        "lambda": 1.0,
        "alpha": 1.0,
        "nthread": 4,
    },
)

Fold 1/6
[0]	train-rmse:13.94145	eval-rmse:13.96361
[200]	train-rmse:7.50422	eval-rmse:7.66501
[400]	train-rmse:6.79030	eval-rmse:7.04216
[600]	train-rmse:6.59522	eval-rmse:6.92149
[800]	train-rmse:6.46732	eval-rmse:6.87289
[1000]	train-rmse:6.36818	eval-rmse:6.84756
[1036]	train-rmse:6.35305	eval-rmse:6.84597
Fold 2/6
[0]	train-rmse:13.93065	eval-rmse:14.01773
[200]	train-rmse:7.50795	eval-rmse:7.62767
[400]	train-rmse:6.79562	eval-rmse:6.98595
[600]	train-rmse:6.61002	eval-rmse:6.85965
[800]	train-rmse:6.48286	eval-rmse:6.80579
[1000]	train-rmse:6.37763	eval-rmse:6.77675
[1139]	train-rmse:6.32073	eval-rmse:6.76485
Fold 3/6
[0]	train-rmse:13.94011	eval-rmse:13.97032
[200]	train-rmse:7.50301	eval-rmse:7.64336
[400]	train-rmse:6.79027	eval-rmse:7.02365
[600]	train-rmse:6.59107	eval-rmse:6.90791
[800]	train-rmse:6.46101	eval-rmse:6.86002
[1000]	train-rmse:6.35745	eval-rmse:6.83645
[1200]	train-rmse:6.26512	eval-rmse:6.81930
[1244]	train-rmse:6.24823	eval-rmse:6.81784
Fold 4/6
[0]	train-r

In [5]:
# Checkpoint the frame after stack() has filled the missing "age" values in place.
df.to_feather("cache/stacked_age")

In [6]:
# Impute missing gender as a 3-class classification problem (cat=True) with 3-fold CV.
# NOTE(review): in the recorded run eval-mlogloss is still decreasing at round 999
# in every fold, so training ended at the 1000-round cap rather than by early
# stopping -- consider raising num_boost_round.
stack(
    target="gender",
    cat=True,
    num_classes=3,
    num_folds=3,
    num_boost_round=1000,
    param={
        "tree_method": "hist",
        "device": "cuda",
        "objective": "multi:softprob",
        "eval_metric": "mlogloss",
        "eta": 0.01,
        "max_depth": 6,
        "subsample": 0.7,
        "colsample_bytree": 0.3,
        "lambda": 1.0,
        "alpha": 1.0,
        "nthread": 4,
        "num_class": 3,
    },
)

Fold 1/3
[0]	train-mlogloss:1.08631	eval-mlogloss:1.08634
[200]	train-mlogloss:0.26677	eval-mlogloss:0.26889
[400]	train-mlogloss:0.12348	eval-mlogloss:0.12628
[600]	train-mlogloss:0.08676	eval-mlogloss:0.09045
[800]	train-mlogloss:0.06919	eval-mlogloss:0.07403
[999]	train-mlogloss:0.05875	eval-mlogloss:0.06469
Fold 2/3
[0]	train-mlogloss:1.08632	eval-mlogloss:1.08631
[200]	train-mlogloss:0.26874	eval-mlogloss:0.27025
[400]	train-mlogloss:0.12450	eval-mlogloss:0.12690
[600]	train-mlogloss:0.08696	eval-mlogloss:0.09032
[800]	train-mlogloss:0.06954	eval-mlogloss:0.07387
[999]	train-mlogloss:0.05931	eval-mlogloss:0.06466
Fold 3/3
[0]	train-mlogloss:1.08632	eval-mlogloss:1.08631
[200]	train-mlogloss:0.26924	eval-mlogloss:0.26895
[400]	train-mlogloss:0.12532	eval-mlogloss:0.12550
[600]	train-mlogloss:0.08783	eval-mlogloss:0.08904
[800]	train-mlogloss:0.07012	eval-mlogloss:0.07253
[999]	train-mlogloss:0.05971	eval-mlogloss:0.06333


In [7]:
# Checkpoint the frame again now that stack() has also filled the missing "gender" values.
df.to_feather("cache/stacked_age_gender")