# Exploring Listwise ranking models

Steps:
1. Generate random query data
2. Convert to listwise data representation
3. Train ranker models

Todo:
1. Try different query sizes
2. Introduce ranking/position bias

In [None]:
import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names, so use the new name when this
# matplotlib provides it and fall back to the legacy name otherwise.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)

# Seeds NumPy's legacy global RNG only; generators created with
# np.random.default_rng() are independent of this call and need their own seed.
np.random.seed(0)

  plt.style.use("seaborn-whitegrid")


Generate data from queries

using a set number of documents for each query.

In [145]:
n_queries = 10
n_documents = 5
n_documents_per_query = 3

# Seed the generator explicitly: np.random.seed() has no effect on
# default_rng(), so an unseeded generator yields a different table on every run.
rnd = np.random.default_rng(0)

# document IDs: one row per query, sampled without replacement so a document
# appears at most once within a query
query_data = np.array(
    [
        rnd.choice(np.arange(n_documents), size=n_documents_per_query, replace=False)
        for _q in range(n_queries)
    ]
)

# labels are taken as the action on the document ID: for each query, pick one
# of its displayed positions at random and use the document ID found there
chosen_positions = rnd.choice(np.arange(n_documents_per_query), size=n_queries)
labels_data = query_data[np.arange(n_queries), chosen_positions]

# Wide layout: query_id | doc_0 .. doc_{k-1} | label
doc_columns = [f"doc_{_idx}" for _idx in range(n_documents_per_query)]
df = pd.DataFrame(
    data=np.concatenate(
        [np.arange(n_queries)[:, np.newaxis], query_data, labels_data[:, np.newaxis]],
        axis=1,
    ),
    columns=["query_id", *doc_columns, "label"],
)
df

Unnamed: 0,query_id,doc_0,doc_1,doc_2,label
0,0,4,0,1,1
1,1,0,4,1,4
2,2,0,1,4,0
3,3,4,0,2,4
4,4,0,1,4,4
5,5,3,1,2,1
6,6,1,2,3,3
7,7,1,0,3,3
8,8,1,0,2,2
9,9,4,3,1,1


Generate listwise data

In [147]:
# Step through the wide frame one query at a time. Nothing is collected, so
# the only lasting effect is that `row` ends up bound to the final query.
for _idx, row in df.iterrows():
    pass


# df_melt = df.melt(
#     id_vars="query_id",
#     value_vars=[f"doc_{_idx}" for _idx in range(n_documents_per_query)],
#     var_name="doc_id",
# ).sort_values(["query_id", "doc_id"])

# Display the row left over from the loop above.
row

query_id    9
doc_0       4
doc_1       3
doc_2       1
label       1
Name: 9, dtype: int64

Generate data from queries in listwise form

using a set number of documents for each query.

In [154]:
n_queries = 10
n_documents = 5
n_documents_per_query = 3

# Seed the generator explicitly: np.random.seed() has no effect on
# default_rng(), so an unseeded generator yields different data on every run.
rnd = np.random.default_rng(0)

# Long ("listwise") layout: one record per (query, document) pair.
document_rows = []
for _q_id in range(n_queries):
    # Documents shown for this query, sampled without replacement.
    doc_ids = rnd.choice(
        np.arange(n_documents), size=n_documents_per_query, replace=False
    )
    # Draw one uniform value per document and mark the largest draw with
    # label 1, all others with 0.
    draws = rnd.uniform(size=n_documents_per_query)
    labels = (draws == np.max(draws)) * 1
    document_rows.extend(
        {"query_id": _q_id, "doc_id": doc_id, "label": label}
        for doc_id, label in zip(doc_ids, labels)
    )

# Build the frame once from the collected records.
df = pd.DataFrame(document_rows)
df

Unnamed: 0,query_id,doc_id,label
0,0,1,1
1,0,0,0
2,0,3,0
3,1,2,0
4,1,3,1
5,1,1,0
6,2,1,0
7,2,4,0
8,2,0,1
9,3,0,1


Build rankers

Assume that doc_id is the only feature (ordinal encoding)

In [155]:
import xgboost as xgb

In [176]:
# LambdaRank model optimising NDCG. doc_id is the only feature, and rows are
# grouped into queries through query_id.
ranker = xgb.XGBRanker(
    objective="rank:ndcg",
    tree_method="hist",
    lambdarank_pair_method="mean",
    lambdarank_num_pair_per_sample=8,
)
ranker.fit(X=df[["doc_id"]], y=df["label"], qid=df["query_id"])

Predictions

In [177]:
# Score every row with the fitted ranker (same single-feature input used for
# training) and keep the result next to its query/document pair.
scores = ranker.predict(X=df.loc[:, ["doc_id"]])
df = df.assign(scores=scores)
df

Unnamed: 0,query_id,doc_id,label,scores
0,0,1,1,0.64601
1,0,0,0,-0.76193
2,0,3,0,-0.155311
3,1,2,0,0.61309
4,1,3,1,-0.155311
5,1,1,0,0.64601
6,2,1,0,0.64601
7,2,4,0,-2.386494
8,2,0,1,-0.76193
9,3,0,1,-0.76193


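Predictions for a given doc_id are always the same...? This is expected here: doc_id is the only feature the ranker sees, so the predicted score can only vary with doc_id.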

In [178]:
# Spread of predicted scores per document; max == min means every occurrence
# of that doc_id received the same score.
(
    df.groupby(["doc_id"])["scores"]
    .agg(["max", "min", "count"])
)

Unnamed: 0_level_0,max,min,count
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.76193,-0.76193,7
1,0.64601,0.64601,8
2,0.61309,0.61309,4
3,-0.155311,-0.155311,6
4,-2.386494,-2.386494,5


Counterfactual predictions

Estimate the relevance of doc_id 1 in queries of different lengths.

Relevance scores for query_id/doc_id pairs are the same...?

In [179]:
# Re-score a random half of the rows on their own, so that queries end up with
# fewer documents. random_state pins the sample so the comparison table is the
# same on every run.
df_rand = df.sample(frac=0.5, replace=False, random_state=0).sort_values(
    ["query_id", "doc_id"]
)
scores = ranker.predict(X=df_rand[["doc_id"]])
df_rand = df_rand.assign(scores=scores)

# Outer-join back onto the full frame: scores_x is the prediction made on the
# subset (NaN for rows that were not sampled), scores_y is the prediction
# already stored on df, and diff is their difference.
df_rand.merge(df, on=["query_id", "doc_id", "label"], how="outer").sort_values(
    ["query_id", "doc_id"]
).assign(diff=lambda x: x["scores_y"] - x["scores_x"])

Unnamed: 0,query_id,doc_id,label,scores_x,scores_y,diff
16,0,0,0,,-0.76193,
15,0,1,1,,0.64601,
17,0,3,0,,-0.155311,
20,1,1,0,,0.64601,
18,1,2,0,,0.61309,
19,1,3,1,,-0.155311,
21,2,0,1,,-0.76193,
0,2,1,0,0.64601,0.64601,0.0
1,2,4,0,-2.386494,-2.386494,0.0
2,3,0,1,-0.76193,-0.76193,0.0


Counterfactual predictions

Change the doc_ids for half of the queries

In [None]:
# NOTE(review): the heading for this cell says the doc_ids are changed for half
# of the queries, but the code only re-scores a random half of the existing
# rows and never modifies doc_id -- confirm the intended experiment.
# random_state pins the sample so the comparison table is the same on every run.
df_rand = df.sample(frac=0.5, replace=False, random_state=0).sort_values(
    ["query_id", "doc_id"]
)
scores = ranker.predict(X=df_rand[["doc_id"]])
df_rand = df_rand.assign(scores=scores)

# Outer-join back onto the full frame: scores_x is the prediction made on the
# subset (NaN for rows that were not sampled), scores_y is the prediction
# already stored on df, and diff is their difference.
df_rand.merge(df, on=["query_id", "doc_id", "label"], how="outer").sort_values(
    ["query_id", "doc_id"]
).assign(diff=lambda x: x["scores_y"] - x["scores_x"])

Unnamed: 0,query_id,doc_id,label,scores_x,scores_y,diff
0,0,0,0,-0.636212,-0.636212,0.0
1,0,1,1,0.465795,0.465795,0.0
2,0,3,0,-0.336154,-0.336154,0.0
3,1,1,0,0.465795,0.465795,0.0
15,1,2,0,,0.472285,
4,1,3,1,-0.336154,-0.336154,0.0
5,2,0,1,-0.636212,-0.636212,0.0
6,2,1,0,0.465795,0.465795,0.0
16,2,4,0,,-1.019274,
17,3,0,1,,-0.636212,


In [156]:
from sklearn.datasets import make_classification
import numpy as np

import xgboost as xgb

# Make a synthetic ranking dataset for demonstration
seed = 1994
X, y = make_classification(random_state=seed)
rng = np.random.default_rng(seed)
n_query_groups = 3
# Assign every sample to one of the query groups (IDs 0 .. n_query_groups - 1).
# The upper bound uses the constant rather than a repeated literal, so changing
# n_query_groups actually changes the number of groups.
qid = rng.integers(0, n_query_groups, size=X.shape[0])

# Sort the inputs based on query index so rows of the same query are
# contiguous; features, labels and query IDs are reordered together.
sorted_idx = np.argsort(qid)
X = X[sorted_idx, :]
y = y[sorted_idx]
qid = qid[sorted_idx]
