In [None]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.model_selection import KFold

In [None]:
# Destination labels. The position of each entry in this list is used as its
# integer class index when labels are encoded for the model, so the order
# must not be changed without retraining.
country_levels = [
    "NDF",
    "US",
    "other",
    "FR",
    "IT",
    "GB",
    "ES",
    "CA",
    "DE",
    "NL",
    "AU",
    "PT",
]


def split_vals(df: pd.DataFrame, n: int):
    """Split ``df`` at row slice boundary ``n``.

    Returns a ``(head, tail)`` tuple where ``head`` is ``df[:n]`` and ``tail``
    is ``df[n:]``. Both are independent copies, so modifying them does not
    affect ``df``.
    """
    head = df[:n].copy()
    tail = df[n:].copy()
    return head, tail


def ndcg(actual, predicted, k=5):
    """Mean NDCG@k for rows that each have exactly one correct label.

    Parameters
    ----------
    actual : indexable of labels, one true label per row.
    predicted : indexable of ranked label sequences, best guess first.
        A row may contain fewer than ``k`` entries; only the entries that
        are present are scored.
    k : int, number of leading predictions considered per row.

    Returns
    -------
    float
        Average of the per-row scores. A hit at 0-based position ``j``
        contributes ``1 / log2(j + 2)``. With a single relevant label the
        ideal DCG is 1, so the DCG is already the normalised score.
        Returns ``nan`` when ``actual`` is empty.
    """
    scores = []

    for i in range(len(actual)):
        top_k = predicted[i][:k]
        # Iterate over the predictions that actually exist rather than a
        # fixed range(k), so short rows do not raise IndexError.
        dcg = sum(
            1 / np.log2(pos + 2)
            for pos, guess in enumerate(top_k)
            if guess == actual[i]
        )
        scores.append(dcg)

    if not scores:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return float("nan")

    return np.mean(scores)


def get_rank(arr):
    """Return the descending rank of every element of ``arr``.

    The largest value receives rank 1, the next largest rank 2, and so on.
    The result has the same shape and dtype as ``arr``.
    """
    # Indices of arr ordered from largest value to smallest.
    largest_first = np.argsort(arr)[::-1]
    rank_values = np.arange(1, len(arr) + 1)

    out = np.zeros_like(arr)
    out[largest_first] = rank_values
    return out

In [None]:
# Load the cached feature table.
all_data = pd.read_feather("cache/stacked_age")

# Columns that must keep their missing values: the target column and every
# categorical column. All remaining columns get the -99999 sentinel.
keep_missing = {
    "country_destination",
    *all_data.select_dtypes(include="category").columns.tolist(),
}
for col in all_data.columns:
    if col in keep_missing:
        continue
    all_data[col] = all_data[col].fillna(-99999)

# Keep only rows whose date_account_created_yearmonth is at least 201304.
is_recent = all_data["date_account_created_yearmonth"] >= 201304
all_data = all_data[is_recent]

In [None]:
# A row belongs to the test set when its id is listed in data/test_users.csv;
# every other row is a training row.
test_user_ids = pd.read_csv("data/test_users.csv")["id"]
test_idx = all_data["id"].isin(test_user_ids)
test_data = all_data[test_idx]
train_data = all_data[~test_idx]

In [None]:
# Show the training split as the cell output for a visual sanity check.
train_data

In [None]:
# Show the test split as the cell output for a visual sanity check.
test_data

In [None]:
# List the names of the columns that have object dtype.
all_data.select_dtypes(include="object").columns.tolist()

In [None]:
# Cross-validation configuration.
num_folds = 2
num_classes = len(country_levels)

# One row per sample, one column per class. *_pred holds class probabilities,
# *_rank holds the per-row ranks derived from them.
train_shape = (len(train_data), num_classes)
test_shape = (len(test_data), num_classes)
train_pred = np.zeros(train_shape)
train_rank = np.zeros(train_shape)
test_pred = np.zeros(test_shape)
test_rank = np.zeros(test_shape)

# The test features are fixed across folds, so the DMatrix is built once.
test_features = test_data.drop(columns=["id", "country_destination"])
test_dm = xgb.DMatrix(data=test_features, enable_categorical=True)

In [None]:
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
best_iters = []

# Integer class index of each destination = its position in country_levels.
label_index = {k: i for i, k in enumerate(country_levels)}

# Booster hyper-parameters; identical for every fold, so defined once.
params = {
    "tree_method": "hist",
    "device": "cuda",
    "max_depth": 7,
    "eta": 0.22,
    "booster": "gbtree",
    "subsample": 0.6,
    "colsample_bytree": 0.6,
    "objective": "multi:softprob",
    "num_class": num_classes,
}

# Restart the fold accumulator here so that re-running this cell does not add
# new predictions on top of the ones from a previous execution.
test_pred = np.zeros((len(test_data), num_classes))


def _labelled_dmatrix(frame):
    """Build a DMatrix from ``frame``: features without id/target, encoded target as label."""
    return xgb.DMatrix(
        data=frame.drop(columns=["id", "country_destination"]),
        label=frame["country_destination"].map(label_index).values,
        enable_categorical=True,
    )


for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_data)):
    print(f"Fold {fold_idx + 1}/{num_folds}")

    train_subset = train_data.iloc[train_idx]
    val_subset = train_data.iloc[val_idx]

    train_dm = _labelled_dmatrix(train_subset)
    val_dm = _labelled_dmatrix(val_subset)

    # Early stopping monitors the last entry of `evals`, i.e. the "val" set.
    model = xgb.train(
        params=params,
        dtrain=train_dm,
        num_boost_round=200,
        early_stopping_rounds=50,
        evals=[(train_dm, "train"), (val_dm, "val")],
        verbose_eval=10,
    )

    best_iters.append(model.best_iteration)

    # iteration_range is half-open, and best_iteration is a 0-based round
    # index, so the end must be best_iteration + 1 to include the best round.
    # It also avoids (0, 0), which would mean "use every tree".
    best_range = (0, model.best_iteration + 1)

    # Out-of-fold predictions for the rows held out in this fold.
    train_pred_fold = model.predict(val_dm, iteration_range=best_range)
    train_pred[val_idx] = train_pred_fold
    train_rank[val_idx] = np.apply_along_axis(get_rank, 1, train_pred_fold)

    test_pred += model.predict(test_dm, iteration_range=best_range)

# Average the per-fold test probabilities, then rank classes per row.
test_pred /= num_folds
test_rank = np.apply_along_axis(get_rank, 1, test_pred)

In [None]:
# Destination name -> integer class index, following country_levels order.
country_to_index = {k: i for i, k in enumerate(country_levels)}

train_actual = train_data["country_destination"].map(country_to_index).values
test_actual = test_data["country_destination"].map(country_to_index).values

# Sorting each row of ranks in ascending order puts the rank-1 class first;
# the first five columns are therefore the five best-ranked class indices.
train_top5_rank = np.argsort(train_rank, axis=1)[:, :5]
test_top5_rank = np.argsort(test_rank, axis=1)[:, :5]


def masked_ndcgs(mask=None):
    """Print the training NDCG@5 after excluding rows with the given destinations.

    Parameters
    ----------
    mask : list-like of destination names, optional
        Rows whose ``country_destination`` is in ``mask`` are left out of the
        score. ``None`` or an empty collection excludes nothing.

    Only the training score (built from the out-of-fold ranks) is reported;
    nothing is returned.
    """
    # None instead of a mutable [] default; both mean "exclude nothing".
    excluded = [] if mask is None else mask

    # Plain boolean array so it indexes the numpy arrays positionally.
    keep_train = ~train_data["country_destination"].isin(excluded).to_numpy()

    label = excluded if len(excluded) else "nothing"

    train_masked_ndcg5 = ndcg(train_actual[keep_train], train_top5_rank[keep_train], 5)
    print(f"Train NDCG5 with {label} masked: {train_masked_ndcg5}")

In [None]:
# Report the training NDCG@5 with no destination excluded.
masked_ndcgs()

In [None]:
# Negating the scores turns the ascending argsort into "highest first";
# keep the five leading class indices of every test row.
descending_classes = np.argsort(-test_pred, axis=1)
test_top5 = descending_classes[:, :5]

# Per-class averaged scores, one column per destination, plus the user id.
test_predictions = pd.DataFrame(test_pred, columns=country_levels).assign(
    TestId=test_data["id"].values
)
# test_predictions.to_csv("cache/probability.csv", index=False)

In [None]:
# Translate the top-5 class indices back into destination names.
level_names = np.array(country_levels)
predicted_countries = level_names[test_top5]

# Long format: each id is repeated five times, paired with its five
# predicted destinations in ranked order.
submission_ids = np.repeat(test_data["id"].values, 5)
submission_countries = predicted_countries.flatten()

submission = pd.DataFrame(
    {
        "id": submission_ids,
        "country": submission_countries,
    }
)
submission.to_csv("result.csv", index=False)