In [1]:
from sklearn.datasets import load_svmlight_file
from pathlib import Path

def get_data(file):
  return load_svmlight_file(str(file), query_id=True)

# could also use the smaller LETOR 4.0 MQ2008 dataset
x_train, y_train, qid_train = get_data(Path.home().joinpath('Downloads/MSLR-WEB10K/Fold1/train.txt'))
x_test, y_test, qid_test = get_data(Path.home().joinpath('Downloads/MSLR-WEB10K/Fold1/test.txt'))

In [2]:
from sklearn.metrics import ndcg_score
import numpy as np
import pandas as pd

def precision(x, at):
    num_relevant = len(x['relevance'][lambda y: y > 1])
    if num_relevant == 0:
        return 1
    predicted_relevant = len(x['relevance'][:at][lambda y: y > 1])
    return predicted_relevant / min(at, num_relevant)

# mean ndcg appears to be slightly different to the xgboost/lightgbm metrics
# different input format but same results as https://github.com/lucky7323/nDCG/blob/master/ndcg.py
def ndcg(x, at):
    if len(x) == 1:
        return 1
    return ndcg_score(np.asarray([x['relevance']]), np.asarray([x['pred']]), k=at)

def evaluate(x, metric='ndcg', at=10):
    metrics = {'ndcg': ndcg, 'precision': precision}
    if metric not in metrics:
        raise ValueError('unsupported metric')
    if at < 1:
        raise ValueError('k must be >= 1')
    return metrics[metric](x, at)

df = pd.DataFrame({'qid': qid_test, 'relevance': y_test})
(_, uqid_train) = np.unique(qid_train, return_counts=True)
(_, uqid_test) = np.unique(qid_test, return_counts=True)

In [3]:
# imbalanced dataset, considered all documents with rating > 1 as relevant
df.groupby('relevance').size()

relevance
0.0    124784
1.0     77896
2.0     32459
3.0      4450
4.0      1932
dtype: int64

In [4]:
# Train a regression model (pointwise)
# normalization improves results with linear models, not needed for tree based models
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# https://scikit-learn.org/stable/modules/sgd.html linear svm using stochastic gradient descent
reg = make_pipeline(StandardScaler(with_mean=False), SGDRegressor(loss='epsilon_insensitive', max_iter=1000, tol=1e-3, random_state=0))
reg.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('sgdregressor',
                 SGDRegressor(loss='epsilon_insensitive', random_state=0))])

In [5]:
# Train a binary logistic regression classification model (pointwise)
# Multi-class performs better than single class, but still worse than regression
from sklearn.linear_model import SGDClassifier
clf = make_pipeline(StandardScaler(with_mean=False), SGDClassifier(loss='log', class_weight='balanced', max_iter=1000, random_state=0))
clf.fit(x_train, y_train > 1)

Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('sgdclassifier',
                 SGDClassifier(class_weight='balanced', loss='log',
                               random_state=0))])

In [6]:
# Train xgboost model (LambdaMART pairwise)
# could use a grid search to find better parameters
from xgboost import XGBRanker

xranker = XGBRanker(
    tree_method='gpu_hist',
    booster='gbtree',
    objective='rank:pairwise',
    eval_metric=['ndcg@5', 'ndcg@10'],
    random_state=0,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=200,
    colsample_bytree=0.9,
    subsample=0.8,
)

xranker.fit(
    x_train, y_train, group=uqid_train,
    eval_set=[(x_test, y_test)],
    eval_group=[uqid_test]
)

XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, early_stopping_rounds=None,
          enable_categorical=False, eval_metric=['ndcg@5', 'ndcg@10'], gamma=0,
          gpu_id=0, grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.1, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=8, max_leaves=0,
          min_child_weight=1, missing=nan, monotone_constraints='()',
          n_estimators=200, n_jobs=0, num_parallel_tree=1, predictor='auto',
          random_state=0, reg_alpha=0, reg_lambda=1, ...)

In [7]:
# Train lightgbm model (LambdaRank listwise)
from lightgbm import LGBMRanker

lranker = LGBMRanker(
    objective='lambdarank',
    random_state=0,
    num_leaves=127,
    colsample_bytree=0.9,
    subsample=0.8,
)

lranker.fit(
    X=x_train,
    y=y_train,
    group=uqid_train,
    eval_set=[(x_test, y_test)],
    eval_group=[uqid_test],
    eval_at=[5, 10],
    eval_metric='ndcg',
    verbose=-1
)



LGBMRanker(colsample_bytree=0.9, num_leaves=127, objective='lambdarank',
           random_state=0, subsample=0.8)

In [8]:
print('xgboost', max(xranker.evals_result()['validation_0']['ndcg@10']))
print('lgbm', lranker.best_score_['valid_0']['ndcg@10'])

xgboost 0.48763223098262015
lgbm 0.5052920219250704


In [9]:
# Evaluate different models
from timeit import default_timer as timer

def evaluate_model(model):
    name = model._final_estimator.__class__ if hasattr(model, '_final_estimator') else model.__class__
    print(name)
    a = timer()
    # use probability for the binary classifier model
    df['pred'] = model.predict_proba(x_test) if hasattr(model, 'classes_') else model.predict(x_test)
    b = timer()
    print('Took', (b-a) * 1000, 'ms')
    df_sorted = df.sort_values(['qid', 'pred'], ascending=[True, False])
    print('Mean precision@5', df_sorted.groupby('qid').apply(evaluate, metric='precision', at=5).mean())
    print('Mean precision@10', df_sorted.groupby('qid').apply(evaluate, metric='precision', at=10).mean())
    print('Mean ndcg@5', df_sorted.groupby('qid').apply(evaluate, metric='ndcg', at=5).mean())
    print('Mean ndcg@10', df_sorted.groupby('qid').apply(evaluate, metric='ndcg', at=10).mean())
    print('--------')

models = [reg, clf, xranker, lranker]
for model in models:
    evaluate_model(model)


<class 'sklearn.linear_model._stochastic_gradient.SGDRegressor'>
Took 542.1740999999827 ms
Mean precision@5 0.42862500000000103
Mean precision@10 0.4350412698412699
Mean ndcg@5 0.4127612717261256
Mean ndcg@10 0.42873475322744653
--------
<class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
Took 498.61269999999536 ms
Mean precision@5 0.14944166666666597
Mean precision@10 0.16473253968253984
Mean ndcg@5 0.11674199175018259
Mean ndcg@10 0.1413818798710147
--------
<class 'xgboost.sklearn.XGBRanker'>
Took 843.3518999999876 ms
Mean precision@5 0.5598833333333332
Mean precision@10 0.5373363095238101
Mean ndcg@5 0.5310369613409331
Mean ndcg@10 0.5370612857860422
--------
<class 'lightgbm.sklearn.LGBMRanker'>
Took 548.4922999999924 ms
Mean precision@5 0.5662333333333327
Mean precision@10 0.5446097222222225
Mean ndcg@5 0.538811934938625
Mean ndcg@10 0.5423399822600861
--------


In [10]:
# All models in Scala get the same values
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False).fit(x_train)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Linear regression
print('->', safe_sparse_dot(scaler.transform(x_test), reg._final_estimator.coef_, dense_output=True) + reg._final_estimator.intercept_)
print('->', reg.predict(x_test))

# Logistic classifier
print('->', sigmoid(scaler.transform(x_test) * clf._final_estimator.coef_.T + clf._final_estimator.intercept_))
print('->', clf.predict_proba(x_test))

# XGBoost ranker
print('->', xranker.predict(x_test))

-> [ 0.82529851  0.0111614  -0.02176651 ...  0.80119461  0.24418305
  0.18839102]
-> [ 0.82529851  0.0111614  -0.02176651 ...  0.80119461  0.24418305
  0.18839102]
-> [[0.55150005]
 [0.3620873 ]
 [0.34056926]
 ...
 [0.56613636]
 [0.36450151]
 [0.18566116]]
-> [[0.44849995 0.55150005]
 [0.6379127  0.3620873 ]
 [0.65943074 0.34056926]
 ...
 [0.43386364 0.56613636]
 [0.63549849 0.36450151]
 [0.81433884 0.18566116]]
-> [ 0.5361098   0.0264558  -0.33231786 ...  0.95962435 -0.19364479
 -0.55807936]


In [11]:
# could use some model tracking/artifact solution like MLFlow
np.savetxt('src/main/resources/scale.csv', [scaler.scale_], delimiter=',')
np.savetxt('src/main/resources/reg-coef.csv', [reg._final_estimator.coef_], delimiter=',')
np.savetxt('src/main/resources/reg-intercept.csv', reg._final_estimator.intercept_)
np.savetxt('src/main/resources/clf-coef.csv', clf._final_estimator.coef_, delimiter=',')
np.savetxt('src/main/resources/clf-intercept.csv', clf._final_estimator.intercept_)
xranker.save_model('src/main/resources/xgboost.json')

In [12]:
# References
# https://www.microsoft.com/en-us/research/project/mslr/
# https://github.com/treygrainger/ai-powered-search
# https://tech.olx.com/ranking-ads-with-machine-learning-ee03d7734bf4
# https://opensourceconnections.com/blog/2017/04/03/test-drive-elasticsearch-learn-to-rank-linear-model/
# https://everdark.github.io/k9/notebooks/ml/learning_to_rank/learning_to_rank.html
# https://www.elastic.co/blog/introducing-approximate-nearest-neighbor-search-in-elasticsearch-8-0
# https://haystackconf.com/us2021/talk-6/
# https://arxiv.org/pdf/1803.05127.pdf
# https://www.microsoft.com/en-us/research/wp-content/uploads/2016/08/letor3.pdf
# https://towardsdatascience.com/learning-to-rank-a-complete-guide-to-ranking-using-machine-learning-4c9688d370d4

# Building good training data sets for ranking is hard. Non ML techniques are frequently used in information retrieval.
# For search engines, we can rank by summing the tf-idf of each query term or apply the more advanced Okapi BM25 ranking function.
# To compare documents, it is common to use the cosine similarity (~= dot product of normalized TF-IDF vectors or document embeddings).
# Document embeddings are computed with doc2vec or by averaging word/sentence embeddings (Transformers like Universal Sentence Encoder are popular).
# Word embeddings can be computed using a PMI matrix + SVD or sophisticated ML based algorithms like GloVe and word2vec.