In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold

In [2]:
country_levels = [
    "NDF",
    "US",
    "other",
    "FR",
    "IT",
    "GB",
    "ES",
    "CA",
    "DE",
    "NL",
    "AU",
    "PT",
]


def split_vals(df: pd.DataFrame, n: int):
    return df[:n].copy(), df[n:].copy()


def ndcg(actual, predicted, k=5):
    ndcg_scores = []

    for i in range(len(actual)):
        rel = [1 if p == actual[i] else 0 for p in predicted[i][:k]]
        dcg = sum((2 ** rel[j] - 1) / np.log2(j + 2) for j in range(k))
        ndcg = dcg
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)


def get_rank(arr):
    ranks = np.zeros_like(arr)
    ranks[np.argsort(arr)[::-1]] = np.arange(1, len(arr) + 1)
    return ranks

In [3]:
# train_data = pd.read_feather("data/preprocessed/train_data_with_binary_classification")
all_data = pd.read_feather("data/preprocessed/train_data_with_binary_classification")

train_data, test_data = split_vals(
    pd.read_feather("data/preprocessed/train_data_with_binary_classification"),
    int(np.round(len(all_data) * 0.7)),
)
train_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,timestamp_first_active_year,timestamp_first_active_month,timestamp_first_active_day,age_group,destination_distance_km,destination_area,...,Predother,PredFR,PredIT,PredGB,PredES,PredCA,PredDE,PredNL,PredAU,PredPT
0,gxn3p5htnn,2010,6,28,2009,3,19,-42424242.0,-42424242.0,-42424242.0,...,0.033862,0.000007,0.000007,0.000011,0.000007,0.000009,0.000008,0.000012,0.000009,0.000009
1,820tgsjxq7,2011,5,25,2009,5,23,7.0,-42424242.0,-42424242.0,...,0.105701,0.000008,0.000008,0.000012,0.000007,0.000009,0.000008,0.000012,0.000009,0.000009
2,4ft3gnwmtx,2010,9,28,2009,6,9,11.0,0.0,9826675.0,...,0.000096,0.000020,0.000007,0.000012,0.000009,0.000012,0.000012,0.000009,0.000022,0.000009
3,bjjt8pjhuk,2011,12,5,2009,10,31,8.0,-42424242.0,-42424242.0,...,0.090983,0.000007,0.000007,0.000012,0.000008,0.000009,0.000008,0.000012,0.000009,0.000009
4,87mebub9p4,2010,9,14,2009,12,8,8.0,0.0,9826675.0,...,0.000108,0.000023,0.000007,0.000013,0.000008,0.000012,0.000014,0.000009,0.000023,0.000010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149411,z7wxsdj1ic,2014,2,5,2014,2,5,5.0,-42424242.0,-42424242.0,...,0.239686,0.000007,0.000007,0.000010,0.000008,0.000009,0.000007,0.000011,0.000009,0.000008
149412,51r271bypu,2014,2,5,2014,2,5,-42424242.0,-42424242.0,-42424242.0,...,0.021907,0.000007,0.000007,0.000011,0.000008,0.000009,0.000008,0.000012,0.000009,0.000009
149413,43ylgrbn5y,2014,2,5,2014,2,5,9.0,-42424242.0,-42424242.0,...,0.054800,0.000008,0.000008,0.000012,0.000007,0.000009,0.000008,0.000012,0.000009,0.000009
149414,4aoaquwl4b,2014,2,5,2014,2,5,6.0,-42424242.0,-42424242.0,...,0.041238,0.000007,0.000007,0.000011,0.000007,0.000009,0.000008,0.000011,0.000009,0.000009


In [4]:
# test_data = pd.read_feather("data/preprocessed/test_data_with_binary_classification")
test_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,timestamp_first_active_year,timestamp_first_active_month,timestamp_first_active_day,age_group,destination_distance_km,destination_area,...,Predother,PredFR,PredIT,PredGB,PredES,PredCA,PredDE,PredNL,PredAU,PredPT
149416,vq0drmmgp9,2014,2,5,2014,2,5,8.0,0.0,9826675.0,...,0.000133,0.000022,0.000007,0.000013,0.000008,0.000012,0.000014,0.000009,0.000022,0.000010
149417,ypdn72ykr4,2014,2,5,2014,2,5,5.0,0.0,9826675.0,...,0.000110,0.000022,0.000007,0.000012,0.000008,0.000011,0.000015,0.000010,0.000021,0.000010
149418,g5xte8kiiv,2014,2,5,2014,2,5,7.0,-42424242.0,-42424242.0,...,0.036537,0.000007,0.000007,0.000011,0.000007,0.000009,0.000008,0.000011,0.000009,0.000009
149419,8tgo2vn9ga,2014,2,5,2014,2,5,5.0,0.0,9826675.0,...,0.000116,0.000019,0.000007,0.000012,0.000008,0.000011,0.000013,0.000008,0.000022,0.000010
149420,ufactt5flu,2014,2,5,2014,2,5,4.0,-42424242.0,-42424242.0,...,0.060734,0.000008,0.000007,0.000011,0.000008,0.000009,0.000008,0.000012,0.000009,0.000009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,zxodksqpep,2014,6,30,2014,6,30,6.0,-42424242.0,-42424242.0,...,0.116518,0.000007,0.000007,0.000012,0.000008,0.000009,0.000008,0.000012,0.000009,0.000009
213447,mhewnxesx9,2014,6,30,2014,6,30,-42424242.0,-42424242.0,-42424242.0,...,0.040946,0.000006,0.000007,0.000011,0.000007,0.000009,0.000008,0.000011,0.000009,0.000009
213448,6o3arsjbb4,2014,6,30,2014,6,30,6.0,-42424242.0,-42424242.0,...,0.258391,0.000008,0.000008,0.000012,0.000007,0.000009,0.000008,0.000012,0.000009,0.000009
213449,jh95kwisub,2014,6,30,2014,6,30,-42424242.0,-42424242.0,-42424242.0,...,0.006854,0.000006,0.000007,0.000011,0.000007,0.000009,0.000008,0.000011,0.000009,0.000009


In [5]:
num_folds = 2
num_classes = len(country_levels)

train_pred = np.zeros((len(train_data), num_classes))
train_rank = np.zeros((len(train_data), num_classes))
test_pred = np.zeros((len(test_data), num_classes))
test_rank = np.zeros((len(test_data), num_classes))

test_dm = xgb.DMatrix(data=test_data.drop(columns=["id", "country_destination"]).values)

In [6]:
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
best_iters = []

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_data)):
    print(f"Fold {fold_idx + 1}/{num_folds}")
    
    train_subset = train_data.iloc[train_idx]
    val_subset = train_data.iloc[val_idx]
    
    train_dm = xgb.DMatrix(
        data=train_subset.drop(columns=["id", "country_destination"]).values,
        label=train_subset["country_destination"]
        .map({k: i for i, k in enumerate(country_levels)})
        .values,
    )
    val_dm = xgb.DMatrix(
        data=val_subset.drop(columns=["id", "country_destination"]).values,
        label=val_subset["country_destination"]
        .map({k: i for i, k in enumerate(country_levels)})
        .values,
    )
    
    params = {
        "tree_method": "hist",
        "device": "cuda",
        "max_depth": 6,
        "eta": 0.22,
        "booster": "gbtree",
        "subsample": 0.6,
        "colsample_bytree": 0.6,
        "objective": "multi:softprob",
        "num_class": num_classes,
    }
    
    model = xgb.train(
        params=params,
        dtrain=train_dm,
        num_boost_round=200,
        early_stopping_rounds=30,
        evals=[(train_dm, "train"), (val_dm, "val")],
        verbose_eval=10,
    )

    best_iters.append(model.best_iteration)
    
    train_pred_fold = model.predict(val_dm, iteration_range=(0, model.best_iteration))
    train_pred[val_idx] = train_pred_fold
    train_rank[val_idx] = np.apply_along_axis(get_rank, 1, train_pred_fold)
    
    test_pred += model.predict(test_dm, iteration_range=(0, model.best_iteration))
    
test_pred /= num_folds
test_rank = np.apply_along_axis(get_rank, 1, test_pred)

Fold 1/2
[0]	train-mlogloss:1.38773	val-mlogloss:1.38999
[10]	train-mlogloss:0.26589	val-mlogloss:0.27155
[20]	train-mlogloss:0.16255	val-mlogloss:0.17128
[30]	train-mlogloss:0.14288	val-mlogloss:0.15519
[40]	train-mlogloss:0.13489	val-mlogloss:0.15184
[50]	train-mlogloss:0.12996	val-mlogloss:0.15118
[60]	train-mlogloss:0.12665	val-mlogloss:0.15115
[70]	train-mlogloss:0.12324	val-mlogloss:0.15173
[80]	train-mlogloss:0.12063	val-mlogloss:0.15213
[82]	train-mlogloss:0.12004	val-mlogloss:0.15229
Fold 2/2
[0]	train-mlogloss:1.38844	val-mlogloss:1.38793
[10]	train-mlogloss:0.26705	val-mlogloss:0.27055
[20]	train-mlogloss:0.16354	val-mlogloss:0.17036
[30]	train-mlogloss:0.14389	val-mlogloss:0.15412
[40]	train-mlogloss:0.13617	val-mlogloss:0.15080
[50]	train-mlogloss:0.13171	val-mlogloss:0.15029
[60]	train-mlogloss:0.12835	val-mlogloss:0.15042
[70]	train-mlogloss:0.12500	val-mlogloss:0.15098
[80]	train-mlogloss:0.12188	val-mlogloss:0.15160
[83]	train-mlogloss:0.12105	val-mlogloss:0.15171


In [7]:
train_actual = (
    train_data["country_destination"]
    .map({k: i for i, k in enumerate(country_levels)})
    .values
)
train_top5_rank = np.argsort(train_rank, axis=1)[:, :5]

test_actual = (
    test_data["country_destination"]
    .map({k: i for i, k in enumerate(country_levels)})
    .values
)
test_top5_rank = np.argsort(test_rank, axis=1)[:, :5]


def masked_ndcgs(mask=[]):
    train_mask = ~(train_data["country_destination"].isin(mask))
    test_mask = ~(test_data["country_destination"].isin(mask))

    if not len(mask):
        mask = "nothing"

    train_masked_ndcg5 = ndcg(train_actual[train_mask], train_top5_rank[train_mask], 5)
    print(f"Train NDCG5 with {mask} masked: {train_masked_ndcg5}")
    test_masked_ndcg5 = ndcg(test_actual[test_mask], test_top5_rank[test_mask], 5)
    print(f"Test NDCG5 with {mask} masked: {test_masked_ndcg5}")

In [8]:
masked_ndcgs()

Train NDCG5 with nothing masked: 0.9825587285830708
Test NDCG5 with nothing masked: 0.9820234231496741


In [9]:
masked_ndcgs(["US", "NDF"])

Train NDCG5 with ['US', 'NDF'] masked: 0.8680827816435228
Test NDCG5 with ['US', 'NDF'] masked: 0.8638444082171223


In [10]:
masked_ndcgs(["US"])

Train NDCG5 with ['US'] masked: 0.9749833925945615
Test NDCG5 with ['US'] masked: 0.9754577413736434


In [11]:
masked_ndcgs(["NDF"])

Train NDCG5 with ['NDF'] masked: 0.9608780948359795
Test NDCG5 with ['NDF'] masked: 0.9587070014671824
