In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.model_selection import KFold

In [2]:
country_levels = [
    "NDF",
    "US",
    "other",
    "FR",
    "IT",
    "GB",
    "ES",
    "CA",
    "DE",
    "NL",
    "AU",
    "PT",
]


def split_vals(df: pd.DataFrame, n: int):
    return df[:n].copy(), df[n:].copy()


def ndcg(actual, predicted, k=5):
    ndcg_scores = []

    for i in range(len(actual)):
        rel = [1 if p == actual[i] else 0 for p in predicted[i][:k]]
        dcg = sum((2 ** rel[j] - 1) / np.log2(j + 2) for j in range(k))
        ndcg = dcg
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)


def get_rank(arr):
    ranks = np.zeros_like(arr)
    ranks[np.argsort(arr)[::-1]] = np.arange(1, len(arr) + 1)
    return ranks

In [3]:
all_data = pd.read_feather("cache/stacked_age")
for col in all_data.columns:
    if (
        col
        not in ["country_destination"]
        + all_data.select_dtypes(include="category").columns.tolist()
    ):
        all_data[col] = all_data[col].fillna(-99999)
test_idx = all_data["id"].isin(pd.read_csv("data/test_users.csv")["id"])
train_data = all_data[~test_idx]
test_data = all_data[test_idx]

In [4]:
train_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_yearmonth,date_account_created_week,date_account_created_yearmonthweek,date_account_created_yearmonthday,timestamp_first_active_year,timestamp_first_active_month,...,device_type_flg_sum_Blackberry,device_type_flg_sum_Chromebook,device_type_flg_sum_Linux Desktop,device_type_flg_sum_Mac Desktop,device_type_flg_sum_Tablet,device_type_flg_sum_Windows Desktop,device_type_flg_sum_Windows Phone,device_type_flg_sum_iPad Tablet,device_type_flg_sum_iPhone,device_type_flg_sum_iPodtouch
0,gxn3p5htnn,2010,6,28,201006,26,20100626,20100628,2009,3,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
1,820tgsjxq7,2011,5,25,201105,21,20110521,20110525,2009,5,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
2,4ft3gnwmtx,2010,9,28,201009,39,20100939,20100928,2009,6,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
3,bjjt8pjhuk,2011,12,5,201112,49,20111249,20111205,2009,10,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
4,87mebub9p4,2010,9,14,201009,37,20100937,20100914,2009,12,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,zxodksqpep,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,108.0,-99999.0,-99999.0,-99999.0,-99999.0,2.0,-99999.0
213447,mhewnxesx9,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,2.0,-99999.0,232.0,-99999.0,-99999.0,4.0,-99999.0
213448,6o3arsjbb4,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,18.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
213449,jh95kwisub,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,75.0,-99999.0


In [5]:
test_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_yearmonth,date_account_created_week,date_account_created_yearmonthweek,date_account_created_yearmonthday,timestamp_first_active_year,timestamp_first_active_month,...,device_type_flg_sum_Blackberry,device_type_flg_sum_Chromebook,device_type_flg_sum_Linux Desktop,device_type_flg_sum_Mac Desktop,device_type_flg_sum_Tablet,device_type_flg_sum_Windows Desktop,device_type_flg_sum_Windows Phone,device_type_flg_sum_iPad Tablet,device_type_flg_sum_iPhone,device_type_flg_sum_iPodtouch
213451,5uwns89zht,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,1.0,-99999.0
213452,jtl0dijy2j,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,9.0,-99999.0
213453,xx0ulgorjt,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,58.0,-99999.0,-99999.0,-99999.0,-99999.0
213454,6c6puo6ix0,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,11.0,-99999.0,-99999.0,-99999.0,-99999.0
213455,czqhjk3yfe,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,19.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,89.0,-99999.0,-99999.0,4.0,-99999.0
275543,zp8xfonng8,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
275544,fa6260ziny,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,78.0,-99999.0,-99999.0,-99999.0,-99999.0
275545,87k0fy4ugm,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,11.0,-99999.0,-99999.0,-99999.0,-99999.0,4.0,-99999.0


In [6]:
all_data.select_dtypes(include="object").columns.tolist()

['id']

In [7]:
num_folds = 2
num_classes = len(country_levels)

train_pred = np.zeros((len(train_data), num_classes))
train_rank = np.zeros((len(train_data), num_classes))
test_pred = np.zeros((len(test_data), num_classes))
test_rank = np.zeros((len(test_data), num_classes))

test_dm = xgb.DMatrix(
    data=test_data.drop(columns=["id", "country_destination"]),
    enable_categorical=True,
)

In [8]:
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
best_iters = []

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_data)):
    print(f"Fold {fold_idx + 1}/{num_folds}")

    train_subset = train_data.iloc[train_idx]
    val_subset = train_data.iloc[val_idx]

    train_dm = xgb.DMatrix(
        data=train_subset.drop(columns=["id", "country_destination"]),
        label=train_subset["country_destination"]
        .map({k: i for i, k in enumerate(country_levels)})
        .values,
        enable_categorical=True,
    )
    val_dm = xgb.DMatrix(
        data=val_subset.drop(columns=["id", "country_destination"]),
        label=val_subset["country_destination"]
        .map({k: i for i, k in enumerate(country_levels)})
        .values,
        enable_categorical=True,
    )

    params = {
        "tree_method": "hist",
        "device": "cuda",
        "max_depth": 7,
        "eta": 0.22,
        "booster": "gbtree",
        "subsample": 0.6,
        "colsample_bytree": 0.6,
        "objective": "multi:softprob",
        "num_class": num_classes,
    }

    model = xgb.train(
        params=params,
        dtrain=train_dm,
        num_boost_round=200,
        early_stopping_rounds=50,
        evals=[(train_dm, "train"), (val_dm, "val")],
        verbose_eval=10,
    )

    best_iters.append(model.best_iteration)

    train_pred_fold = model.predict(val_dm, iteration_range=(0, model.best_iteration))
    train_pred[val_idx] = train_pred_fold
    train_rank[val_idx] = np.apply_along_axis(get_rank, 1, train_pred_fold)

    test_pred += model.predict(test_dm, iteration_range=(0, model.best_iteration))

test_pred /= num_folds
test_rank = np.apply_along_axis(get_rank, 1, test_pred)

Fold 1/2
[0]	train-mlogloss:1.39105	val-mlogloss:1.38977
[10]	train-mlogloss:0.27679	val-mlogloss:0.27952
[20]	train-mlogloss:0.16441	val-mlogloss:0.17101
[30]	train-mlogloss:0.14289	val-mlogloss:0.15271
[40]	train-mlogloss:0.13366	val-mlogloss:0.14822
[50]	train-mlogloss:0.12775	val-mlogloss:0.14726
[60]	train-mlogloss:0.12324	val-mlogloss:0.14703
[70]	train-mlogloss:0.11979	val-mlogloss:0.14708
[80]	train-mlogloss:0.11627	val-mlogloss:0.14750
[90]	train-mlogloss:0.11290	val-mlogloss:0.14797
[100]	train-mlogloss:0.10981	val-mlogloss:0.14831
[108]	train-mlogloss:0.10743	val-mlogloss:0.14860
Fold 2/2
[0]	train-mlogloss:1.38790	val-mlogloss:1.39112
[10]	train-mlogloss:0.27270	val-mlogloss:0.28340
[20]	train-mlogloss:0.15983	val-mlogloss:0.17515
[30]	train-mlogloss:0.13860	val-mlogloss:0.15686
[40]	train-mlogloss:0.12998	val-mlogloss:0.15237
[50]	train-mlogloss:0.12517	val-mlogloss:0.15105
[60]	train-mlogloss:0.12167	val-mlogloss:0.15092
[70]	train-mlogloss:0.11713	val-mlogloss:0.15111
[8

In [9]:
train_actual = (
    train_data["country_destination"]
    .map({k: i for i, k in enumerate(country_levels)})
    .values
)
train_top5_rank = np.argsort(train_rank, axis=1)[:, :5]

test_actual = (
    test_data["country_destination"]
    .map({k: i for i, k in enumerate(country_levels)})
    .values
)
test_top5_rank = np.argsort(test_rank, axis=1)[:, :5]


def masked_ndcgs(mask=[]):
    train_mask = ~(train_data["country_destination"].isin(mask))
    test_mask = ~(test_data["country_destination"].isin(mask))

    if not len(mask):
        mask = "nothing"

    train_masked_ndcg5 = ndcg(train_actual[train_mask], train_top5_rank[train_mask], 5)
    print(f"Train NDCG5 with {mask} masked: {train_masked_ndcg5}")
    # test_masked_ndcg5 = ndcg(test_actual[test_mask], test_top5_rank[test_mask], 5)
    # print(f"Test NDCG5 with {mask} masked: {test_masked_ndcg5}")

In [10]:
masked_ndcgs()

Train NDCG5 with nothing masked: 0.982638477475444


In [11]:
test_top5 = np.argsort(-test_pred, axis=1)[:, :5]
test_predictions = pd.DataFrame(test_pred, columns=country_levels)
test_predictions["TestId"] = test_data["id"].values
# test_predictions.to_csv("cache/probability.csv", index=False)

In [12]:
predicted_countries = np.array([[country_levels[i] for i in row] for row in test_top5])

submission_ids = np.repeat(test_data["id"].values, 5)

submission_countries = predicted_countries.flatten()

submission = pd.DataFrame({"id": submission_ids, "country": submission_countries})

submission.to_csv("result.csv", index=False)