In [None]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.model_selection import KFold

import pickle

# Rank Prediction

Now we proceed to the final step: using all the processed data and the outputs from the base models to train the final prediction model.

In [2]:
# Destination labels. The position of each label in this list is the integer
# class index used for the XGBoost labels (see the `.map({k: i ...})` calls in
# the training cell) and the column order of the prediction matrices, so the
# order must not be changed without retraining.
country_levels = [
    "NDF",
    "US",
    "other",
    "FR",
    "IT",
    "GB",
    "ES",
    "CA",
    "DE",
    "NL",
    "AU",
    "PT",
]


def split_vals(df: pd.DataFrame, n: int):
    """Cut `df` at position `n` and return independent copies of both parts.

    Returns:
        A tuple `(first, rest)` where `first` is a copy of `df[:n]` and
        `rest` is a copy of `df[n:]`.
    """
    first = df[:n]
    rest = df[n:]
    return first.copy(), rest.copy()


def ndcg(actual, predicted, k=5):
    """Mean NDCG@k when every sample has exactly one relevant label.

    With a single relevant item the ideal DCG is 1, so the per-sample NDCG is
    simply the DCG: ``1 / log2(position + 2)`` for each position (0-based) in
    the top-k where the prediction equals the true label, and 0 if the true
    label is not in the top-k.

    Args:
        actual: Sequence of true labels, one per sample.
        predicted: Sequence of ranked label sequences, best guess first.
            Rows shorter than `k` are scored on the entries they have.
        k: Number of leading predictions to score.

    Returns:
        The mean score over all samples, or NaN when `actual` is empty.
    """
    # Positional access: a pandas Series with a non-default index would
    # otherwise be looked up by label in `actual[i]`.
    actual = np.asarray(actual)
    if len(actual) == 0:
        return np.nan

    scores = []
    for i in range(len(actual)):
        truth = actual[i]
        # Enumerate the slice itself instead of range(k) so that a row with
        # fewer than k predictions does not raise IndexError.
        dcg = sum(
            1.0 / np.log2(pos + 2)
            for pos, candidate in enumerate(predicted[i][:k])
            if candidate == truth
        )
        scores.append(dcg)

    return np.mean(scores)


def get_rank(arr):
    """Return the descending rank of every element of a 1-D array.

    The largest value receives rank 1, the next largest rank 2, and so on.
    The result has the same shape and dtype as `arr`.
    """
    # Indices of `arr` ordered from largest to smallest value.
    largest_first = np.flip(np.argsort(arr), axis=0)
    positions = np.arange(1, len(arr) + 1)

    ranks = np.zeros_like(arr)
    ranks[largest_first] = positions
    return ranks

The accounts to be predicted were created between July and September 2014. Because user behaviour changes over time, more recent training data better reflects the conditions at prediction time. After testing, only accounts created in April 2013 or later (`date_account_created_yearmonth >= 201304`) were kept for training.

In [3]:
# Sentinel written into missing values of every non-categorical feature column.
MISSING_SENTINEL = -99999
# Earliest account-creation month (YYYYMM) that is kept.
MIN_ACCOUNT_YEARMONTH = 201304

all_data = pd.read_feather("cache/stacked_age")

# Columns that are not filled: the target column and every categorical column.
# Computed once up front; the set of categorical columns does not change while
# the other columns are being filled.
# NOTE(review): categorical columns are presumably skipped because the sentinel
# is not one of their categories — confirm.
skip_fill = {
    "country_destination",
    *all_data.select_dtypes(include="category").columns,
}
for col in all_data.columns:
    if col not in skip_fill:
        all_data[col] = all_data[col].fillna(MISSING_SENTINEL)

# Keep only recent accounts (applies to every row of all_data).
all_data = all_data[
    all_data["date_account_created_yearmonth"] >= MIN_ACCOUNT_YEARMONTH
]

In [4]:
# Rows whose id appears in the test-user file form the prediction set;
# every other row is used for training.
test_user_ids = pd.read_csv("data/test_users.csv")["id"]
test_idx = all_data["id"].isin(test_user_ids)

train_data = all_data.loc[~test_idx]
test_data = all_data.loc[test_idx]

In [5]:
# Inspect the training split (rows of all_data whose id is not in data/test_users.csv).
train_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_yearmonth,date_account_created_week,date_account_created_yearmonthweek,date_account_created_yearmonthday,timestamp_first_active_year,timestamp_first_active_month,...,device_type_flg_sum_Blackberry,device_type_flg_sum_Chromebook,device_type_flg_sum_Linux Desktop,device_type_flg_sum_Mac Desktop,device_type_flg_sum_Tablet,device_type_flg_sum_Windows Desktop,device_type_flg_sum_Windows Phone,device_type_flg_sum_iPad Tablet,device_type_flg_sum_iPhone,device_type_flg_sum_iPodtouch
422,oamfblpxdy,2014,1,15,201401,2,20140102,20140115,2010,4,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
710,5p0nmf3r6i,2014,5,23,201405,21,20140521,20140523,2010,5,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
1025,dlg3a9x97v,2014,4,3,201404,14,20140414,20140403,2010,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
2537,m1x72dgnnk,2013,8,22,201308,34,20130834,20130822,2010,12,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
2817,x1dufb5trn,2013,9,26,201309,39,20130939,20130926,2011,1,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,zxodksqpep,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,108.0,-99999.0,-99999.0,-99999.0,-99999.0,2.0,-99999.0
213447,mhewnxesx9,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,2.0,-99999.0,232.0,-99999.0,-99999.0,4.0,-99999.0
213448,6o3arsjbb4,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,18.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
213449,jh95kwisub,2014,6,30,201406,26,20140626,20140630,2014,6,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,75.0,-99999.0


In [6]:
# Inspect the prediction split (rows of all_data whose id is in data/test_users.csv).
test_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,date_account_created_yearmonth,date_account_created_week,date_account_created_yearmonthweek,date_account_created_yearmonthday,timestamp_first_active_year,timestamp_first_active_month,...,device_type_flg_sum_Blackberry,device_type_flg_sum_Chromebook,device_type_flg_sum_Linux Desktop,device_type_flg_sum_Mac Desktop,device_type_flg_sum_Tablet,device_type_flg_sum_Windows Desktop,device_type_flg_sum_Windows Phone,device_type_flg_sum_iPad Tablet,device_type_flg_sum_iPhone,device_type_flg_sum_iPodtouch
213451,5uwns89zht,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,1.0,-99999.0
213452,jtl0dijy2j,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,9.0,-99999.0
213453,xx0ulgorjt,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,58.0,-99999.0,-99999.0,-99999.0,-99999.0
213454,6c6puo6ix0,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,11.0,-99999.0,-99999.0,-99999.0,-99999.0
213455,czqhjk3yfe,2014,7,1,201407,26,20140726,20140701,2014,7,...,-99999.0,-99999.0,-99999.0,19.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,89.0,-99999.0,-99999.0,4.0,-99999.0
275543,zp8xfonng8,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
275544,fa6260ziny,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,78.0,-99999.0,-99999.0,-99999.0,-99999.0
275545,87k0fy4ugm,2014,9,30,201409,39,20140939,20140930,2014,9,...,-99999.0,-99999.0,-99999.0,11.0,-99999.0,-99999.0,-99999.0,-99999.0,4.0,-99999.0


In [7]:
# List the columns that still have object dtype; these are dropped or handled
# separately before building the DMatrix (only "id" is expected here).
all_data.select_dtypes(include="object").columns.tolist()

['id']

In [8]:
num_folds = 10
num_classes = len(country_levels)

# One row per user, one column per destination class (in country_levels order).
train_shape = (len(train_data), num_classes)
test_shape = (len(test_data), num_classes)

# Out-of-fold class probabilities / ranks for the training rows.
train_pred = np.zeros(train_shape)
train_rank = np.zeros(train_shape)
# Class probabilities / ranks for the test rows.
test_pred = np.zeros(test_shape)
test_rank = np.zeros(test_shape)

# The test matrix is identical for every fold, so it is built once here.
test_features = test_data.drop(columns=["id", "country_destination"])
test_dm = xgb.DMatrix(data=test_features, enable_categorical=True)

In [9]:
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
best_iters = []

# Reset the accumulator so that re-running this cell alone does not add the
# new fold predictions on top of those from a previous run.
test_pred = np.zeros((len(test_data), num_classes))

# Integer class index of each destination = its position in country_levels.
label_map = {country: idx for idx, country in enumerate(country_levels)}

# Booster settings are the same for every fold.
params = {
    "tree_method": "hist",
    "device": "cuda",
    "max_depth": 7,
    "eta": 0.22,
    "booster": "gbtree",
    "subsample": 0.6,
    "colsample_bytree": 0.6,
    "objective": "multi:softprob",
    "num_class": num_classes,
}


def build_dmatrix(frame):
    """Build a labelled DMatrix from rows of train_data.

    Drops the "id" and "country_destination" columns from the features and
    encodes "country_destination" as integer class indices via `label_map`.
    """
    return xgb.DMatrix(
        data=frame.drop(columns=["id", "country_destination"]),
        label=frame["country_destination"].map(label_map).values,
        enable_categorical=True,
    )


# NOTE(review): the logged val-mlogloss in this notebook falls to ~0.00007 and
# the out-of-fold NDCG@5 is 1.0, which is unusually good — check the input
# features for target leakage.
for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_data)):
    print(f"Fold {fold_idx + 1}/{num_folds}")

    train_subset = train_data.iloc[train_idx]
    val_subset = train_data.iloc[val_idx]

    train_dm = build_dmatrix(train_subset)
    val_dm = build_dmatrix(val_subset)

    # Early stopping monitors the last entry of `evals`, i.e. the "val" set.
    model = xgb.train(
        params=params,
        dtrain=train_dm,
        num_boost_round=200,
        early_stopping_rounds=50,
        evals=[(train_dm, "train"), (val_dm, "val")],
        verbose_eval=10,
    )

    best_iters.append(model.best_iteration)

    # iteration_range is half-open [begin, end) and best_iteration is a
    # 0-based index, so the end must be best_iteration + 1 to include the best
    # round. Without the +1 the best round is dropped, and best_iteration == 0
    # would yield (0, 0), which XGBoost interprets as "use every tree".
    best_range = (0, model.best_iteration + 1)

    # Out-of-fold predictions for the rows held out in this fold.
    train_pred_fold = model.predict(val_dm, iteration_range=best_range)
    train_pred[val_idx] = train_pred_fold
    train_rank[val_idx] = np.apply_along_axis(get_rank, 1, train_pred_fold)

    # Sum the test probabilities of every fold model; averaged after the loop.
    test_pred += model.predict(test_dm, iteration_range=best_range)

test_pred /= num_folds
test_rank = np.apply_along_axis(get_rank, 1, test_pred)

Fold 1/10
[0]	train-mlogloss:1.33531	val-mlogloss:1.33934
[10]	train-mlogloss:0.13074	val-mlogloss:0.13214
[20]	train-mlogloss:0.01636	val-mlogloss:0.01669
[30]	train-mlogloss:0.00221	val-mlogloss:0.00227
[40]	train-mlogloss:0.00047	val-mlogloss:0.00049
[50]	train-mlogloss:0.00015	val-mlogloss:0.00015
[60]	train-mlogloss:0.00009	val-mlogloss:0.00009
[70]	train-mlogloss:0.00007	val-mlogloss:0.00008
[80]	train-mlogloss:0.00007	val-mlogloss:0.00007
[90]	train-mlogloss:0.00007	val-mlogloss:0.00007
[100]	train-mlogloss:0.00007	val-mlogloss:0.00007
[110]	train-mlogloss:0.00007	val-mlogloss:0.00007
[120]	train-mlogloss:0.00007	val-mlogloss:0.00007
[130]	train-mlogloss:0.00007	val-mlogloss:0.00007
[140]	train-mlogloss:0.00007	val-mlogloss:0.00007
[150]	train-mlogloss:0.00007	val-mlogloss:0.00007
[160]	train-mlogloss:0.00007	val-mlogloss:0.00007
[170]	train-mlogloss:0.00006	val-mlogloss:0.00007
[180]	train-mlogloss:0.00006	val-mlogloss:0.00007
[190]	train-mlogloss:0.00006	val-mlogloss:0.00007
[

In [10]:
# Integer class index of each destination = its position in country_levels.
label_to_index = dict(zip(country_levels, range(len(country_levels))))

# True class indices and the five best-ranked classes for the training rows.
# Rank 1 is the most probable class, so an ascending argsort of the ranks
# lists the classes from most to least probable.
train_actual = train_data["country_destination"].map(label_to_index).values
train_top5_rank = np.argsort(train_rank, axis=1)[:, :5]

# Same quantities for the test rows.
test_actual = test_data["country_destination"].map(label_to_index).values
test_top5_rank = np.argsort(test_rank, axis=1)[:, :5]


def masked_ndcgs(mask=None):
    """Print the out-of-fold NDCG@5 on the training rows outside `mask`.

    Args:
        mask: Collection of `country_destination` values to exclude from the
            evaluation. `None` or an empty collection excludes nothing.

    Reads the notebook globals `train_data`, `train_actual` and
    `train_top5_rank`; prints the score and returns nothing.
    """
    # `None` replaces the former mutable default `[]`; both mean "mask nothing".
    if mask is None:
        mask = []

    # True for the rows that are kept (destination not in the mask).
    train_mask = (~train_data["country_destination"].isin(mask)).to_numpy()

    # Only affects the printed message.
    if not len(mask):
        mask = "nothing"

    train_masked_ndcg5 = ndcg(train_actual[train_mask], train_top5_rank[train_mask], 5)
    print(f"Train NDCG5 with {mask} masked: {train_masked_ndcg5}")

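`masked_ndcgs` was originally written to evaluate the model's ranking quality for different destinations: it drops every training row whose `country_destination` is in `mask` and reports NDCG@5 on the remaining rows. The same masking approach can also be applied to other attributes, such as gender or age.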

In [11]:
# Baseline: out-of-fold NDCG@5 over all training rows (no destination masked).
masked_ndcgs()

Train NDCG5 with nothing masked: 1.0


In [12]:
# Indices of the five most probable classes per test row, most probable first
# (argsort is ascending, hence the negation).
test_top5 = np.argsort(np.negative(test_pred), axis=1)[:, :5]

# Fold-averaged class probabilities, one column per destination, plus the
# user id in a "TestId" column.
test_predictions = pd.DataFrame(data=test_pred, columns=country_levels).assign(
    TestId=test_data["id"].values
)

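Generate the final submission: for each test user, write the five most probable destinations, in descending order of predicted probability, to `result.csv` (one row per user–destination pair).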

In [None]:
# Long format: every user id appears five times in a row, once per ranked guess.
submission_ids = np.repeat(test_data["id"].values, 5)

# Translate the class indices back to destination labels; flattening row by
# row keeps each user's five guesses together and in rank order.
predicted_countries = np.array(
    [
        [country_levels[class_idx] for class_idx in user_top5]
        for user_top5 in test_top5
    ]
)
submission_countries = predicted_countries.flatten()

submission = pd.DataFrame(
    {
        "id": submission_ids,
        "country": submission_countries,
    }
)

submission.to_csv("result.csv", index=False)