## Install Packages

In [1]:
!pip install python-terrier
!pip install fastrank
!pip install lightgbm

!pip install googletrans==4.0.0rc1
!sudo apt-get install swig
!sudo pip install jamspell
!wget https://github.com/bakwc/JamSpell-models/raw/master/en.tar.gz
!tar -xvf en.tar.gz

from pyterrier.measures import *
import numpy as np
import pandas as pd
import pyterrier as pt
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import fastrank
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import re
from googletrans import Translator
import jamspell

import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
swig is already the newest version (3.0.12-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2022-12-17 01:12:53--  https://github.com/bakwc/JamSpell-models/raw/master/en.tar.gz
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.

## Load data

In [2]:
df = pd.read_csv('data/df.csv')
docs_df = pd.read_csv('data/docs_df.csv') # artwork__intro__desc', 'Department', 'AccessionYear', 'Object Name', 'Title', 'Culture', 'Period', 'Portfolio', 'Artist Display Name', 'Artist Nationality', 'Object Date', 'Medium', 'City', 'Country', 'Region', 'River', 'Classification', 'Tags'
docs_df_desc = pd.read_csv('data/docs_df_desc.csv') # 'artwork__intro__desc'
topics = pd.read_csv("data/Annotation - Query.csv")
qrels = pd.read_csv("data/Annotation - Evaluation.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


## Models

### Setup and index creation

In [3]:
if not pt.started():
    pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [4]:
tr_va_topics, test_topics = train_test_split(topics, test_size=0.3, random_state=42)
train_topics, valid_topics =  train_test_split(tr_va_topics, test_size=0.1, random_state=42)

In [5]:
index_dir_desc = './metdocs_index_desc'
indexer_desc = pt.DFIndexer(index_dir_desc, overwrite=True)
index_ref_desc = indexer_desc.index(docs_df_desc["text"], docs_df_desc["docno"])
# index_ref_desc.toString()
index_desc = pt.IndexFactory.of(index_ref_desc)

In [6]:
index_dir = './metdocs_index'
indexer = pt.DFIndexer(index_dir, overwrite=True)
index_ref = indexer.index(docs_df["text"], docs_df["docno"])
# index_ref.toString()
index = pt.IndexFactory.of(index_ref)

### Query reformation functions

#### Cross language, spell correction, stop word removal




In [7]:
nltk.download('stopwords')
stops = set(stopwords.words('english'))
translator = Translator()
jsp = jamspell.TSpellCorrector()
assert jsp.LoadLangModel('en.bin')

def _query_rewrite(q):
    query = q["query"]
    # Translation
    if not query.isascii():
      query = translator.translate(query).text

    # Spell corrector
    query = jsp.FixFragment(query)
        
    # Remove stop word
    terms = query.split(" ")
    terms = [t for t in terms if not t in stops ]
    terms = [term.lower() for term in terms]
    query = " ".join(terms)
    return query

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Cross language translation, spell correction, query expansion, stop word removal


In [8]:
# Reference
# Swaroop, Shyam. “A Simple Query Expansion.” Medium, Medium, 8 Jan. 2020, https://medium.com/@swaroopshyam0/a-simple-query-expansion-49aef3442416. 
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

pos_tag_map = {
  'NN': [ wn.NOUN ],
  'JJ': [ wn.ADJ, wn.ADJ_SAT ],
  'RB': [ wn.ADV ],
  'VB': [ wn.VERB ]
}

def tokenizer(sentence):
    return word_tokenize(sentence)
def pos_tagger(tokens):
    return nltk.pos_tag(tokens)
def stopword_treatment(tokens):
    stopword = stopwords.words('english')
    result = []
    for token in tokens:
        if token[0].lower() not in stopword:
            result.append(tuple([token[0].lower(), token[1]]))
    return result
def pos_tag_converter(nltk_pos_tag):
    root_tag = nltk_pos_tag[0:2]
    try:
        pos_tag_map[root_tag]
        return pos_tag_map[root_tag]
    except KeyError:
        return ''
def get_synsets(tokens):
    synsets = []
    for token in tokens:
        wn_pos_tag = pos_tag_converter(token[1])
        if wn_pos_tag == '':
            continue
        else:
            synsets.append(wn.synsets(token[0], wn_pos_tag))
    return synsets
def get_tokens_from_synsets(synsets):
    tokens = {}
    for synset in synsets:
        for s in synset:
            if s.name() in tokens:
                tokens[s.name().split('.')[0]] += 1
            else:
                tokens[s.name().split('.')[0]] = 1
    return tokens
def get_hypernyms(synsets):
    hypernyms = []
    for synset in synsets:
        for s in synset:
            hypernyms.append(s.hypernyms())
    return hypernyms
def get_tokens_from_hypernyms(synsets):
    tokens = {}
    for synset in synsets:
        for s in synsets:
            for ss in s:
                if ss.name().split('.')[0] in tokens:
                    tokens[(ss.name().split('.')[0])] += 1
                else:
                    tokens[(ss.name().split('.')[0])] = 1
    return tokens
def underscore_replacer(tokens):
    new_tokens = {}
    for key in tokens.keys():
        mod_key = re.sub(r'_', ' ', key)
        new_tokens[mod_key] = tokens[key]
    return new_tokens
def generate_tokens(sentence):
    tokens = tokenizer(sentence)
    tokens = pos_tagger(tokens)
    tokens = stopword_treatment(tokens)
    synsets = get_synsets(tokens)
    synonyms = get_tokens_from_synsets(synsets)
    synonyms = underscore_replacer(synonyms)
    hypernyms = get_hypernyms(synsets)
    hypernyms = get_tokens_from_hypernyms(hypernyms)
    hypernyms = underscore_replacer(hypernyms)
    tokens = {**synonyms, **hypernyms}
    return tokens
def query_expansion(query):
  d = generate_tokens('Portrait')
  query = query + ' ' + ' '.join(d.keys())
  return query

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
def _query_rewrite_expansion(q):
    query = q["query"]
    # Translation
    if not query.isascii():
      query = translator.translate(query).text

    # Spell corrector
    query = jsp.FixFragment(query)

    # Query expansion
    if (len(query.split()) == 1 ):
      query = query_expansion(query) 

    # Lowercase convertion and remove stop word
    terms = query.split(" ")
    terms = [term.lower() for term in terms] 
    terms = [t for t in terms if not t in stops ]
    query = " ".join(terms)
    return query

### Baseline models

In [10]:
tf_desc = pt.BatchRetrieve(index_desc, wmodel="Tf")        
tfidf_desc = pt.BatchRetrieve(index_desc, wmodel="TF_IDF") 
bm25_desc = pt.BatchRetrieve(index_desc, wmodel="BM25")  
pl2_desc = pt.BatchRetrieve(index_desc, wmodel="PL2")      

In [11]:
pt.Experiment(
    [tf_desc, tfidf_desc, bm25_desc, pl2_desc],
    topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_10","P_10", "recall_10", "mrt"],
)

Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10,recall_10,mrt
0,BR(Tf),0.513812,0.681429,0.502862,0.576471,0.136866,26.584015
1,BR(TF_IDF),0.842302,0.843766,0.791584,0.886275,0.169494,18.511859
2,BR(BM25),0.844051,0.84396,0.791426,0.886275,0.169494,17.111502
3,BR(PL2),0.841602,0.843863,0.792883,0.886275,0.169494,16.397402


In [12]:
tf = pt.BatchRetrieve(index, wmodel="Tf")        
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF") 
bm25 = pt.BatchRetrieve(index, wmodel="BM25")  
pl2 = pt.BatchRetrieve(index, wmodel="PL2")      

In [13]:
pt.Experiment(
    [tf, tfidf,bm25, pl2],
    topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_10","P_10", "recall_10", "mrt"],
)

Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10,recall_10,mrt
0,BR(Tf),0.503799,0.723229,0.5044,0.55098,0.12505,15.687858
1,BR(TF_IDF),0.691574,0.830612,0.763626,0.815686,0.149123,15.444945
2,BR(BM25),0.684189,0.814138,0.747705,0.803922,0.147947,12.541712
3,BR(PL2),0.689512,0.827865,0.74302,0.798039,0.147358,11.692737


### Models with 'artwork__intro__desc'

In [14]:
tf_desc = pt.BatchRetrieve(index_desc, wmodel="Tf")
bm25_desc = pt.BatchRetrieve(index_desc, wmodel="BM25")      
tfidf_desc = pt.BatchRetrieve(index_desc, wmodel="TF_IDF")

ltr_feats_desc = pt.apply.query(_query_rewrite) >> pt.BatchRetrieve(index_desc) >> (
    bm25_desc
    **
    tfidf_desc
    **  # abstract coordinate match
    pt.BatchRetrieve(index_desc, wmodel="CoordinateMatch")
)

In [15]:
# Random Forest
rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=42, n_jobs=2)
rf_pipe_desc = ltr_feats_desc >> pt.ltr.apply_learned_model(rf)
%time rf_pipe_desc.fit(train_topics, qrels)

# Fastrank
train_request = fastrank.TrainRequest.coordinate_ascent()

params = train_request.params
params.init_random = True
params.normalize = True
params.seed = 42

ca_pipe_desc = ltr_feats_desc >> pt.ltr.apply_learned_model(train_request, form='fastrank')
%time ca_pipe_desc.fit(train_topics, qrels)

# LightGBM LambdaMART
lmart_l_desc = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=100,
    max_bin=255,
    num_leaves=30,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5)

lmart_l_pipe_desc = ltr_feats_desc >> pt.ltr.apply_learned_model(lmart_l_desc, form="ltr", fit_kwargs={'eval_at':[10]})
%time lmart_l_pipe_desc.fit(train_topics, qrels, valid_topics, qrels)

# XGBoost LambdaMART
lmart_x_desc = xgb.sklearn.XGBRanker(objective='rank:ndcg',
      learning_rate=0.1,
      gamma=1.0,
      min_child_weight=0.1,
      max_depth=6,
      verbose=2,
      random_state=42)

lmart_x_pipe_desc = ltr_feats_desc >> pt.ltr.apply_learned_model(lmart_x_desc, form="ltr")
%time lmart_x_pipe_desc.fit(train_topics, qrels, valid_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    2.3s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    4.7s finished


CPU times: user 14.7 s, sys: 315 ms, total: 15 s
Wall time: 9.03 s
CPU times: user 9.89 s, sys: 161 ms, total: 10.1 s
Wall time: 7.32 s




[1]	valid_0's ndcg@10: 0.635771
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's ndcg@10: 0.608455
[3]	valid_0's ndcg@10: 0.607222
[4]	valid_0's ndcg@10: 0.593059
[5]	valid_0's ndcg@10: 0.584836
[6]	valid_0's ndcg@10: 0.58639
Early stopping, best iteration is:
[1]	valid_0's ndcg@10: 0.635771
CPU times: user 4.55 s, sys: 49.2 ms, total: 4.6 s
Wall time: 4.33 s
CPU times: user 4.82 s, sys: 46.3 ms, total: 4.87 s
Wall time: 4.13 s


In [16]:
pt.Experiment(
    [tf_desc, tfidf_desc, bm25_desc, ca_pipe_desc, rf_pipe_desc, lmart_x_pipe_desc, lmart_l_pipe_desc],
    topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_10","P_10", "recall_10", "mrt"],
    names=["TF", "TF-IDF", "BM25", "Fastrank", "Random Forest", "LambdaMART (xgBoost)", "LambdaMART (LightGBM)"],
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.5s finished


Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10,recall_10,mrt
0,TF,0.513812,0.681429,0.502862,0.576471,0.136866,11.12458
1,TF-IDF,0.842302,0.843766,0.791584,0.886275,0.169494,13.907863
2,BM25,0.844051,0.84396,0.791426,0.886275,0.169494,10.123341
3,Fastrank,0.885932,0.898894,0.843992,0.931373,0.174004,90.712107
4,Random Forest,0.856247,0.902891,0.894235,0.935294,0.176437,105.528254
5,LambdaMART (xgBoost),0.847945,0.877724,0.794209,0.905882,0.171172,92.025598
6,LambdaMART (LightGBM),0.796068,0.854904,0.71222,0.815686,0.163659,94.485807


### Multifeature models

#### Data Preprocessing

In [17]:
# def concat(row):
#   text = ''
#   cols = 'artwork__intro__desc', 'Department', 'AccessionYear', 'Object Name', 'Title', 'Culture', 'Period', 'Portfolio', 'Artist Display Name', 'Artist Nationality', 'Object Date', 'Medium', 'City', 'Country', 'Region', 'River', 'Classification', 'Tags'
  
#   for col in cols:
#     if not pd.isnull(row[col]) and row[col] != ' ':
#       # text = text + col + ': ' + str(row[col]) + '. '
#       text = text + str(row[col]) + '.'
#   return text

# df['text'] = df.apply(concat, axis=1)
# docs_df = df.loc[:,['docno', 'text']]
# docs_df.reset_index(drop=True, inplace = True)
# docs_df.head(5)
# docs_df.to_csv("docs_df.csv", index = False)

In [18]:
docs_df.head(5)

Unnamed: 0,docno,text
0,d21628.0,Tiffany Studios (1902-1932) became well-known ...
1,d21629.0,Tiffany Studios (1902-1932) became well-known ...
2,d21630.0,Tiffany Studios (1902-1932) became well-known ...
3,d76994.0,This photograph presents a narrow view of West...
4,d76995.0,In this triptych featuring dried lotus stalks ...


#### Models

In [19]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25")    
tf = pt.BatchRetrieve(index, wmodel="Tf")         
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")  

ltr_feats = pt.apply.query(_query_rewrite) >> pt.BatchRetrieve(index) >> (
    bm25
    **
    tfidf
    **  # abstract coordinate match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
)

# Random Forest
rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=42, n_jobs=2)
rf_pipe = ltr_feats >> pt.ltr.apply_learned_model(rf)
%time rf_pipe.fit(train_topics, qrels)

# Fastrank
train_request = fastrank.TrainRequest.coordinate_ascent()

params = train_request.params
params.init_random = True
params.normalize = True
params.seed = 42

ca_pipe = ltr_feats >> pt.ltr.apply_learned_model(train_request, form='fastrank')
%time ca_pipe.fit(train_topics, qrels)

# LightGBM LambdaMART
lmart_l = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=100,
    max_bin=255,
    num_leaves=30,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5)

lmart_l_pipe = ltr_feats >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})
%time lmart_l_pipe.fit(train_topics, qrels, valid_topics, qrels)

# XGBoost LambdaMART
lmart_x = xgb.sklearn.XGBRanker(objective='rank:ndcg',
      learning_rate=0.1,
      gamma=1.0,
      min_child_weight=0.1,
      max_depth=6,
      verbose=2,
      random_state=42)

lmart_x_pipe = ltr_feats >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
%time lmart_x_pipe.fit(train_topics, qrels, valid_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    4.3s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    7.8s finished


CPU times: user 17.4 s, sys: 204 ms, total: 17.6 s
Wall time: 12.1 s
CPU times: user 8.63 s, sys: 55.8 ms, total: 8.68 s
Wall time: 6.38 s




[1]	valid_0's ndcg@10: 0.54606
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's ndcg@10: 0.548505
[3]	valid_0's ndcg@10: 0.548505
[4]	valid_0's ndcg@10: 0.542684
[5]	valid_0's ndcg@10: 0.547443
[6]	valid_0's ndcg@10: 0.530685
[7]	valid_0's ndcg@10: 0.530685
Early stopping, best iteration is:
[2]	valid_0's ndcg@10: 0.548505
CPU times: user 4.67 s, sys: 76.9 ms, total: 4.75 s
Wall time: 4.46 s
CPU times: user 5.97 s, sys: 62.4 ms, total: 6.04 s
Wall time: 4.92 s


In [20]:
pt.Experiment(
    [tf, tfidf,bm25, pl2, ca_pipe, rf_pipe, lmart_x_pipe, lmart_l_pipe],
    topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_10", "P_10", "recall_10", "mrt"],
    names=["TF", "TF-IDF","BM25", "PL2", "Fastrank", "Random Forest", "LambdaMART (xgBoost)", "LambdaMART (LightGBM)"],
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.7s finished


Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10,recall_10,mrt
0,TF,0.503799,0.723229,0.5044,0.55098,0.12505,12.420902
1,TF-IDF,0.691574,0.830612,0.763626,0.815686,0.149123,11.413205
2,BM25,0.684189,0.814138,0.747705,0.803922,0.147947,11.401484
3,PL2,0.689512,0.827865,0.74302,0.798039,0.147358,15.125121
4,Fastrank,0.727084,0.87467,0.803964,0.852941,0.152848,103.549236
5,Random Forest,0.821029,0.915186,0.868662,0.907843,0.166834,122.875602
6,LambdaMART (xgBoost),0.666596,0.839034,0.674356,0.752941,0.146537,106.508116
7,LambdaMART (LightGBM),0.685304,0.857394,0.71403,0.772549,0.145541,111.036562


#### Models with Query Expansion

In [21]:
ltr_feats = pt.apply.query(_query_rewrite_expansion) >> pt.BatchRetrieve(index) >> (
    bm25
    **
    tfidf
    **  # abstract coordinate match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
)

# Random Forest
rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=42, n_jobs=2)
rf_pipe = ltr_feats >> pt.ltr.apply_learned_model(rf)
%time rf_pipe.fit(train_topics, qrels)

# Fastrank
train_request = fastrank.TrainRequest.coordinate_ascent()

params = train_request.params
params.init_random = True
params.normalize = True
params.seed = 42

ca_pipe = ltr_feats >> pt.ltr.apply_learned_model(train_request, form='fastrank')
%time ca_pipe.fit(train_topics, qrels)

# LightGBM LambdaMART
lmart_l = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=100,
    max_bin=255,
    num_leaves=30,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[10],
    learning_rate= .1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5)

lmart_l_pipe = ltr_feats >> pt.ltr.apply_learned_model(lmart_l, form="ltr", fit_kwargs={'eval_at':[10]})
%time lmart_l_pipe.fit(train_topics, qrels, valid_topics, qrels)

# XGBoost LambdaMART
lmart_x = xgb.sklearn.XGBRanker(objective='rank:ndcg',
      learning_rate=0.1,
      gamma=1.0,
      min_child_weight=0.1,
      max_depth=6,
      verbose=2,
      random_state=42)

lmart_x_pipe = ltr_feats >> pt.ltr.apply_learned_model(lmart_x, form="ltr")
%time lmart_x_pipe.fit(train_topics, qrels, valid_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    5.5s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:   11.1s finished


CPU times: user 27.8 s, sys: 349 ms, total: 28.1 s
Wall time: 17.7 s
CPU times: user 16.2 s, sys: 183 ms, total: 16.4 s
Wall time: 11.7 s




[1]	valid_0's ndcg@10: 0.281144
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's ndcg@10: 0.310783
[3]	valid_0's ndcg@10: 0.427372
[4]	valid_0's ndcg@10: 0.427372
[5]	valid_0's ndcg@10: 0.427372
[6]	valid_0's ndcg@10: 0.521925
[7]	valid_0's ndcg@10: 0.521925
[8]	valid_0's ndcg@10: 0.503247
[9]	valid_0's ndcg@10: 0.527917
[10]	valid_0's ndcg@10: 0.513063
[11]	valid_0's ndcg@10: 0.513063
[12]	valid_0's ndcg@10: 0.513063
[13]	valid_0's ndcg@10: 0.531742
[14]	valid_0's ndcg@10: 0.527917
[15]	valid_0's ndcg@10: 0.531742
[16]	valid_0's ndcg@10: 0.531742
[17]	valid_0's ndcg@10: 0.531742
[18]	valid_0's ndcg@10: 0.550674
[19]	valid_0's ndcg@10: 0.531742
[20]	valid_0's ndcg@10: 0.550674
[21]	valid_0's ndcg@10: 0.531742
[22]	valid_0's ndcg@10: 0.554856
[23]	valid_0's ndcg@10: 0.580813
[24]	valid_0's ndcg@10: 0.580813
[25]	valid_0's ndcg@10: 0.596719
[26]	valid_0's ndcg@10: 0.580813
[27]	valid_0's ndcg@10: 0.599746
[28]	valid_0's ndcg@10: 0.580813
[29]	valid_0's ndcg@10: 

In [22]:
pt.Experiment(
    [tf, tfidf, bm25, pl2, ca_pipe, rf_pipe, lmart_x_pipe, lmart_l_pipe],
    topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_10", "P_10", "recall_10", "mrt"],
    names=["TF", "TF-IDF", "BM25", "PL2", "Fastrank", "Random Forest", "LambdaMART (xgBoost)", "LambdaMART (LightGBM)"],
)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    1.2s finished


Unnamed: 0,name,map,ndcg,ndcg_cut_10,P_10,recall_10,mrt
0,TF,0.503799,0.723229,0.5044,0.55098,0.12505,14.308777
1,TF-IDF,0.691574,0.830612,0.763626,0.815686,0.149123,13.615104
2,BM25,0.684189,0.814138,0.747705,0.803922,0.147947,13.516538
3,PL2,0.689512,0.827865,0.74302,0.798039,0.147358,25.457123
4,Fastrank,0.474499,0.70578,0.499173,0.55098,0.081754,199.392122
5,Random Forest,0.729584,0.835938,0.824937,0.856863,0.15807,216.87563
6,LambdaMART (xgBoost),0.440087,0.68181,0.431133,0.494118,0.075043,193.605347
7,LambdaMART (LightGBM),0.547396,0.759433,0.70246,0.754902,0.115596,183.80491


### Final model

In [23]:
index_dir = './metdocs_index'
indexer = pt.DFIndexer(index_dir, overwrite=True)
index_ref = indexer.index(docs_df["text"], docs_df["docno"])
index = pt.IndexFactory.of(index_ref)

bm25 = pt.BatchRetrieve(index, wmodel="BM25")          
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")  
ltr_feats = pt.apply.query(_query_rewrite) >> pt.BatchRetrieve(index) >> (
    bm25
    **
    tfidf
    **  # abstract coordinate match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
)

# Random Forest
rf = RandomForestRegressor(n_estimators=400, verbose=1, random_state=42, n_jobs=2)
rf_pipe = ltr_feats >> pt.ltr.apply_learned_model(rf)
%time rf_pipe.fit(train_topics, qrels)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    3.4s


CPU times: user 17.1 s, sys: 151 ms, total: 17.2 s
Wall time: 10.6 s


[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    7.0s finished


In [24]:
q = "Who was influenced by Claude Monet"
rf_pipe.search(q).head(20)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 400 out of 400 | elapsed:    0.1s finished


Unnamed: 0,qid,docid,docno,score,query_0,query,features,rank
3,1,19922,d459107.0,5.0,Who was influenced by Claude Monet,who influenced claude monet,"[24.839023338244893, 13.584735005426134, 2.0]",0
4,1,19623,d438551.0,5.0,Who was influenced by Claude Monet,who influenced claude monet,"[27.80355587158899, 15.250432474359641, 3.0]",1
5,1,19071,d437112.0,5.0,Who was influenced by Claude Monet,who influenced claude monet,"[25.61942879945677, 14.011800200020897, 2.0]",2
0,1,17950,d11936.0,4.995,Who was influenced by Claude Monet,who influenced claude monet,"[28.49534054708856, 15.804262351324908, 4.0]",3
17,1,19081,d437124.0,4.995,Who was influenced by Claude Monet,who influenced claude monet,"[22.689211550525705, 12.408665343103234, 2.0]",4
24,1,19092,d437137.0,4.995,Who was influenced by Claude Monet,who influenced claude monet,"[22.704521956258564, 12.417173979273027, 2.0]",5
25,1,19075,d437118.0,4.995,Who was influenced by Claude Monet,who influenced claude monet,"[22.554066501370333, 12.334881887103013, 2.0]",6
26,1,19082,d437125.0,4.995,Who was influenced by Claude Monet,who influenced claude monet,"[22.554066501370333, 12.334881887103013, 2.0]",7
27,1,19069,d437110.0,4.995,Who was influenced by Claude Monet,who influenced claude monet,"[22.479685178232515, 12.294198822673051, 2.0]",8
28,1,19076,d437119.0,4.995,Who was influenced by Claude Monet,who influenced claude monet,"[22.405858788806537, 12.253819305699007, 2.0]",9
