In [None]:
# Extract the competition archive with the shell `unzip` tool.
# NOTE(review): presumably this is what provides the `l2r/` directory read by
# the loading cells below — verify against the archive contents.
!unzip '/learning-to-rank-fall-2020.zip'

In [3]:
import numpy as np
from sklearn.datasets import load_svmlight_file
# Load the training set (svmlight format). With `query_id=True` the loader
# returns three row-aligned objects: the feature matrix `X`, the targets `y`
# and the query id of every row (`query_ids`).
X, y, query_ids = load_svmlight_file('l2r/train.txt', query_id=True)

In [10]:
def IDCG(y_true):
    """Return the ideal DCG of one query, used as an NDCG normaliser.

    The labels are put in descending order (the best possible ranking) and
    each position ``p`` (1-based) contributes ``(2**label - 1) / ln(p + 1)``.
    Note the discount uses the natural logarithm.

    Parameters
    ----------
    y_true : numpy.ndarray
        1-D array with the relevance labels of the documents of one query.

    Returns
    -------
    float or int
        The ideal DCG, or ``1`` when it is (numerically) zero so that callers
        can divide by the result without special-casing empty/irrelevant
        queries.
    """
    ideal_order = np.sort(y_true)[::-1]
    positions = np.arange(1, ideal_order.shape[0] + 1)
    per_position = (2 ** ideal_order - 1) * (1.0 / np.log(positions + 1))
    total = np.sum(per_position)
    # A query without any relevant document has IDCG == 0; fall back to 1.
    return 1 if np.isclose(total, 0.0) else total

In [11]:
from collections import defaultdict
# Map every query id to the row indices (positions in X / y) of its documents.
train_query_groups = defaultdict(list)
for row, qid in enumerate(query_ids):
    train_query_groups[qid].append(row)

# Per-query values that depend only on the labels, computed once up front so
# the objective does not have to rebuild them on every boosting round:
#   idcg_for_query[q]        -> ideal DCG of the query (NDCG normaliser)
#   query_order_indicator[q] -> n x n float matrix with +1 where label_i > label_j,
#                               -1 where label_i < label_j and 0 for ties
idcg_for_query = {}
query_order_indicator = {}
for qid, rows in train_query_groups.items():
    labels = y[rows]
    idcg_for_query[qid] = IDCG(labels)

    column = labels.reshape(-1, 1)
    query_order_indicator[qid] = (
        (column > labels).astype(np.float64) - (column < labels).astype(np.float64)
    )

In [12]:
# Number of times `objective` has been called (one call per boosting round);
# only used for the in-place progress indicator.
progress = 0


def objective(y_true, y_pred):
    """LambdaRank-style custom objective in the sklearn-API form ``(y_true, y_pred)``.

    For every query, each pair of documents with different labels contributes
    a RankNet-style logistic term weighted by |delta NDCG|, i.e. the change in
    NDCG obtained by swapping the two documents in the current ranking.

    Relies on module-level caches built beforehand:
    ``train_query_groups`` (query id -> row indices),
    ``query_order_indicator`` (query id -> +1/0/-1 label-order matrix) and
    ``idcg_for_query`` (query id -> ideal DCG).

    Parameters
    ----------
    y_true : numpy.ndarray
        Relevance labels, one per training row.
    y_pred : numpy.ndarray
        Current model scores, one per training row.

    Returns
    -------
    (grad, hess) : tuple of numpy.ndarray
        First and second order statistics per row. Hessian entries that are
        numerically zero are replaced by 1.0.

    Side effects
    ------------
    Increments the global ``progress`` counter and prints it in place.
    """
    global progress
    progress += 1
    print(f"\r{progress} ...", end='', flush=True)

    grad = np.zeros(y_true.shape[0])
    hess = np.zeros(y_true.shape[0])

    for query_id, doc_list in train_query_groups.items():
        docs_ids = np.asarray(doc_list)
        h_i = y_pred[docs_ids]
        y_i = y_true[docs_ids]
        n_docs = h_i.shape[0]

        # Pairwise score differences, clipped so that exp() cannot overflow,
        # then oriented so the entry is (better doc score - worse doc score).
        order_indicator = query_order_indicator[query_id]
        h_ij = np.clip(h_i.reshape(-1, 1) - h_i, -50, 50) * order_indicator
        sigm = 1.0 / (1 + np.exp(h_ij))

        gain_diff = 2 ** y_i.reshape(-1, 1) - 2 ** y_i

        # Rank (0 = top) of every document under the current scores.
        # `argsort(...)[::-1]` lists the documents in rank order; the rank of
        # each document is the inverse of that permutation.
        ranking = np.argsort(h_i)[::-1]
        ranks = np.empty(n_docs, dtype=np.intp)
        ranks[ranking] = np.arange(n_docs)

        # Same natural-log discount as IDCG so that delta_ndcg is a true
        # NDCG difference (DCG change divided by the ideal DCG).
        discount = 1.0 / np.log(ranks + 2)
        discount_diff = discount.reshape(-1, 1) - discount
        delta_ndcg = np.abs(gain_diff * discount_diff) / idcg_for_query[query_id]

        lambda_ij = delta_ndcg * sigm
        # Better-labelled documents are pushed up (negative gradient),
        # worse-labelled ones are pushed down.
        grad[docs_ids] = -np.sum(order_indicator * lambda_ij, axis=1)
        hess[docs_ids] = np.sum(delta_ndcg * sigm * (1 - sigm), axis=1)

    # Rows without any informative pair get a unit hessian to keep the
    # Newton step well defined.
    hess[np.isclose(hess, 0.0)] = 1.0
    return grad, hess

In [13]:
from xgboost import XGBRegressor

In [14]:
%%time
# Settings for the first training stage; `objective` is the custom
# LambdaRank-style callable defined above (sklearn-API signature).
params = dict(
    objective=objective,
    max_depth=7,
    n_estimators=2000,
    n_jobs=2,
)
# The tree method is passed separately and is therefore not part of `params`.
model = XGBRegressor(tree_method='gpu_hist', **params)
model.fit(X, y)

2000 ...CPU times: user 1h 4min 59s, sys: 1min 45s, total: 1h 6min 45s
Wall time: 1h 6min 42s


In [15]:
# Persist the fitted model; the same path is later passed as `xgb_model`
# to `train` in order to continue boosting from this checkpoint.
model.save_model('xgb_n2000_depth7')

In [16]:
from xgboost import train
# Settings for the continuation run. Build a *new* dict instead of aliasing
# `params`, so the record of how the first model was trained is not mutated.
# NOTE(review): `n_estimators` is an sklearn-wrapper name; the native
# `xgboost.train` takes the round count through `num_boost_round` — confirm
# the training call actually consumes this value.
new_params = {**params, 'n_estimators': 13000, 'tree_method': 'gpu_hist'}

In [17]:
import xgboost
# Test split, loaded the same way as the training data (features, targets and
# per-row query ids).
X_test, y_test, query_ids_test = load_svmlight_file('l2r/test.txt', query_id=True)

# Native-API data containers: the training matrix carries the labels, the
# test matrix holds features only.
xgbtrain = xgboost.DMatrix(data=X, label=y)
xb_X_test = xgboost.DMatrix(data=X_test)

In [18]:
def _continue_objective(preds, dtrain):
    """Adapt `objective` to the native-API callback signature ``obj(preds, dtrain)``."""
    return objective(dtrain.get_label(), preds)


# The native `xgboost.train` differs from the sklearn wrapper:
#   * the number of rounds is `num_boost_round` (default 10), not `n_estimators`;
#   * a Python objective must be passed via `obj=`; `params['objective']`
#     only accepts the name of a built-in objective.
# Strip those two sklearn-only entries and route them to the right arguments.
booster_params = {
    key: value
    for key, value in new_params.items()
    if key not in ('objective', 'n_estimators')
}
# When continuing from `xgb_model`, `num_boost_round` counts the rounds added
# on top of the trees already stored in the checkpoint.
# NOTE(review): if 13000 was meant as the *total* number of trees, subtract the
# rounds already trained in the saved model.
tuned_mdl = train(
    booster_params,
    dtrain=xgbtrain,
    num_boost_round=new_params['n_estimators'],
    obj=_continue_objective,
    xgb_model='xgb_n2000_depth7',
    verbose_eval=True,
)



In [19]:
# Score every test document with the tuned booster.
y_pred = tuned_mdl.predict(xb_X_test)

# Group test rows by query, mirroring the structure used for training.
test_query_groups = defaultdict(list)
for row, qid in enumerate(query_ids_test):
    test_query_groups[qid].append(row)

# Write one "query,document" line per document, best-scored first within each
# query. Document ids in the file are the 1-based row numbers of the test set.
with open("my_subm_final.csv", 'w') as outp:
    print("QueryId,DocumentId", file=outp)
    for query_id, rows in test_query_groups.items():
        rows = np.array(rows)
        best_first = np.argsort(y_pred[rows])[::-1]
        for doc_id in rows[best_first]:
            print(f"{query_id},{doc_id+1}", file=outp)