# Learning to Rank

## Feature extraction

### Training data

In [1]:
import pandas as pd
from pyserini.index import IndexReader

# Load collection
passages = pd.read_csv('collections/msmarco-passage/collectionandqueries/collection.tsv', sep = '\t', names=['pid', 'passage'])

# Load training data
queries_train = pd.read_csv('collections/msmarco-passage/collectionandqueries/queries.train.tsv', sep = '\t', names=['qid', 'query'])
qrels_train = pd.read_csv('collections/msmarco-passage/collectionandqueries/qrels.train.tsv', sep = '\t', names=['qid', 'Q0', 'docid', 'rating'])

print(passages)
print(queries_train)
print(qrels_train)

             pid                                            passage
0              0  The presence of communication amid scientific ...
1              1  The Manhattan Project and its atomic bomb help...
2              2  Essay on The Manhattan Project - The Manhattan...
3              3  The Manhattan Project was the name for a proje...
4              4  versions of each volume as well as complementa...
...          ...                                                ...
8841818  8841818  When metal salts emit short wavelengths of vis...
8841819  8841819  Thousands of people across the United States w...
8841820  8841820  The recipe that creates blue, for example, inc...
8841821  8841821  On Independence Days of yore, old-timey crowds...
8841822  8841822  View full size image. Behind the scenes of the...

[8841823 rows x 2 columns]
            qid                                           query
0        121352                                  define extreme
1        634306        what 

In [2]:
import numpy as np
from tqdm import tqdm

# Create index reader to analyze terms and compute metrics
index_reader = IndexReader('indexes/collection_jsonl')

# List to store features calculated from the training data
training_data = []

# Going through all query-document pairs
for index in tqdm(range(len(qrels_train))):
    row = qrels_train.loc[index]
    
    # Identifying relevant query and passage/document
    query = queries_train[queries_train['qid'] == row['qid']]['query'].values[0]
    passage = passages[passages['pid'] == row['docid']]['passage'].values[0]
    
    rating = row['rating']
    
    # Computing BM25
    bm25_score = index_reader.compute_query_document_score(str(row['docid']), query)
    
    # Computing passage length
    passage_length = len(passage)

    # Initializing c, df, cf, idf, and c*idf
    c = 0
    df = 0
    cf = 0
    idf = 0
    c_idf = 0
    C = len(passages)
    
    # Get analyzed passage
    passage_analyzed = index_reader.analyze(passage)
    
    # Go through analyzed terms in the query
    for term in index_reader.analyze(query):
        # If there are no analyzed terms, skip the term
        if index_reader.analyze(term) == []:
            continue
        else:
            # Compute each of the metrics as specified by LETOR
            c += passage_analyzed.count(term)
            df_temp, cf_temp = index_reader.get_term_counts(term) 
            df += df_temp
            cf += cf_temp
            idf += np.log((C - df + 0.5) / (df + 0.5))

            c_idf += c * idf
    
    # Add relevant data to list
    temp = [row['qid'], row['docid'], row['rating'], bm25_score, passage_length, c, df, cf, idf, c_idf]
    training_data.append(temp)

Attempting to initialize pre-built index msmarco-passage.
/Users/jessie/.cache/pyserini/indexes/index-msmarco-passage-20201117-f87c94.1efad4f1ae6a77e235042eff4be1612d already exists, skipping download.
Initializing msmarco-passage...


  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += n

  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
100%|██████████| 532761/532761 [2:12:38<00:00, 66.94it/s]


In [19]:
# Transform data into dataframe
training_data = pd.DataFrame(training_data, columns=['qid', 'docid', 'rating', 'bm25', \
                                                     'passage_length', 'c', 'df', 'cf', \
                                                     'idf', 'c_idf']).dropna()

# Save to csv in case the kernel is stopped during experiments
training_data.to_csv('training_data.csv')

print(training_data)

            qid    docid  rating       bm25  passage_length   c       df  \
0       1185869        0       1  11.560829             325   5   797858   
1       1185868       16       1  20.567997             306   9   917868   
2        597651       49       1  11.347543             305   6   718082   
3        403613       60       1  13.854973             521   7   200763   
4       1183785      389       1  10.219151             319   3   390237   
...         ...      ...     ...        ...             ...  ..      ...   
532756    19285  8841362       1   8.897675             304   2   209748   
532757   558837  4989159       1   3.825954             297   3   612669   
532758   559149  8841547       1  13.113671             214   4  1068484   
532759   706678  8841643       1   7.690524             149   1   543276   
532760   405466  8841735       1  10.627259             798  13   155130   

             cf        idf        c_idf  
0        983404  15.112933   173.856118  
1  

In [4]:
import random

# List to store features calculated from nonrelevant pairs in the training data
training_data_nonrelevant = []

# Go through a sample of 10,000 query-document pairs from the training data
for index in tqdm(random.sample(range(len(qrels_train)), 10000)):
    row = qrels_train.loc[index]
    
    # Identifying relevant query
    query = queries_train[queries_train['qid'] == row['qid']]['query'].values[0]
    
    # Sample a docid that is not the current docid
    docid = passages[passages['pid'] != row['docid']]['pid'].sample().values[0]
    
    # Identify 'irrelevant' passage
    passage = passages[passages['pid'] == docid]['passage'].values[0]
    
    rating = row['rating']
    
    # Same code as computing metrics for relevant pairs
    bm25_score = index_reader.compute_query_document_score(str(docid), query)
    
    passage_length = len(passage)

    c = 0
    df = 0
    cf = 0
    idf = 0
    c_idf = 0
    C = len(passages)
    passage_analyzed = index_reader.analyze(passage)
    for term in index_reader.analyze(query):
        if index_reader.analyze(term) == []:
            continue
        else:
            c += passage_analyzed.count(term)
            df_temp, cf_temp = index_reader.get_term_counts(term) 
            df += df_temp
            cf += cf_temp
            idf += np.log((C - df + 0.5) / (df + 0.5))

            c_idf += c * idf
    
    temp = [row['qid'], docid, 0, bm25_score, passage_length, c, df, cf, idf, c_idf]
    training_data_nonrelevant.append(temp)

100%|██████████| 10000/10000 [5:02:04<00:00,  1.81s/it]  


In [20]:
# Transform data into dataframe
training_data_nonrelevant = pd.DataFrame(training_data_nonrelevant, columns=['qid', 'docid', 'rating', 'bm25', \
                                                                             'passage_length', 'c', 'df', 'cf', \
                                                                             'idf', 'c_idf']).dropna()
# Save to csv in case the kernel is stopped during experiments
training_data_nonrelevant.to_csv('training_data_nonrelevant.csv')
print(training_data_nonrelevant)

          qid    docid  rating      bm25  passage_length  c       df       cf  \
0      539478  2693464       0  0.000000             258  0    91835   113542   
1      400705  7629915       0  0.000000             205  0   150809   186927   
2        2797  3365674       0  0.000000             282  0   622866   810154   
3      734390  6235369       0  0.000000             505  0   780361   969702   
4      345809   971793       0  0.000000             296  0   756389   955734   
...       ...      ...     ...       ...             ... ..      ...      ...   
9995    92552  5884425       0  0.000000             268  0   221142   264779   
9996   435696  2716350       0  2.086764             304  1   221593   267426   
9997  1181703  8602111       0  0.000000             321  0   486729   732203   
9998  1173262  3786944       0  0.000000             737  0   122353   181461   
9999   650309  7430049       0  0.000000             334  0  1645114  2377775   

            idf      c_idf 

In [43]:
# Write data to a file in the LETOR format
with open('training_data.txt', 'w') as file:
    # Sample part of the relevant query document pair features
    sample = training_data.sample(10000)
    # Go through sample and write each record to a text file
    for index in tqdm(range(len(sample))):
        row = sample.iloc[index]
        file.write('{} qid:{} 1:{} 2:{} 3:{} 4:{} 5:{} 6:{} 7:{} # docid = {} \n'.format(int(row['rating']), int(row['qid']), \
                                                                         row['bm25'], row['passage_length'], \
                                                                        row['c'], row['df'], row['cf'], \
                                                                        row['idf'], row['c_idf'], int(row['docid'])))
    # Go through nonrelevant query document pair features and write each record to a text file
    for index in tqdm(range(len(training_data_nonrelevant))):
        row = training_data_nonrelevant.iloc[index]
        file.write('{} qid:{} 1:{} 2:{} 3:{} 4:{} 5:{} 6:{} 7:{} # docid = {} \n'.format(int(row['rating']), int(row['qid']), \
                                                                        row['bm25'], row['passage_length'], \
                                                                        row['c'], row['df'], row['cf'], \
                                                                        row['idf'], row['c_idf'], int(row['docid'])))

100%|██████████| 10000/10000 [00:04<00:00, 2318.29it/s]
100%|██████████| 10000/10000 [00:03<00:00, 2747.59it/s]


### Validation data

In [8]:
# Load validation data
queries_val = pd.read_csv('collections/msmarco-passage/collectionandqueries/queries.dev.small.tsv', sep = '\t', names=['qid', 'query'])
qrels_val = pd.read_csv('collections/msmarco-passage/collectionandqueries/qrels.dev.small.tsv', sep = '\t', names=['qid', 'Q0', 'docid', 'rating'])

print(queries_val)
print(qrels_val)

          qid                                          query
0     1048585                   what is paula deen's brother
1           2                       Androgen receptor define
2      524332  treating tension headaches without medication
3     1048642                            what is paranoid sc
4      524447            treatment of varicose veins in legs
...       ...                                            ...
6975   734979                           what is coronary cta
6976   524166              transportation to us bank stadium
6977   968921                 where did last names originate
6978   786375                 what is preoperative clearance
6979  1048565                  who plays sebastian michaelis

[6980 rows x 2 columns]
         qid  Q0    docid  rating
0     300674   0  7067032       1
1     125705   0  7067056       1
2      94798   0  7067181       1
3       9083   0  7067274       1
4     174249   0  7067348       1
...      ...  ..      ...     ...
7432 

In [9]:
# Same code as for the relevant training query-document pairs
validation_data = []

for index in tqdm(range(len(qrels_val))):
    row = qrels_val.loc[index]
    query = queries_val[queries_val['qid'] == row['qid']]['query'].values[0]
    passage = passages[passages['pid'] == row['docid']]['passage'].values[0]
    rating = row['rating']
    
    bm25_score = index_reader.compute_query_document_score(str(row['docid']), query)
    
    passage_length = len(passage)

    c = 0
    df = 0
    cf = 0
    idf = 0
    c_idf = 0
    C = len(passages)
    passage_analyzed = index_reader.analyze(passage)
    for term in index_reader.analyze(query):
        if index_reader.analyze(term) == []:
            continue
        else:
            c += passage_analyzed.count(term)
            df_temp, cf_temp = index_reader.get_term_counts(term) 
            df += df_temp
            cf += cf_temp
            idf += np.log((C - df + 0.5) / (df + 0.5))

            c_idf += c * idf
    
    temp = [row['qid'], row['docid'], row['rating'], bm25_score, passage_length, c, df, cf, idf, c_idf]
    validation_data.append(temp)

  idf += np.log((C - df + 0.5) / (df + 0.5))
  idf += np.log((C - df + 0.5) / (df + 0.5))
100%|██████████| 7437/7437 [01:46<00:00, 69.57it/s]


In [23]:
# Transform data into dataframe
validation_data = pd.DataFrame(validation_data, columns=['qid', 'docid', 'rating', 'bm25', \
                                                         'passage_length', 'c', 'df', 'cf', \
                                                         'idf', 'c_idf']).dropna()

# Save to csv in case the kernel is stopped during experiments
validation_data.to_csv('validation_data.csv')
print(validation_data)

         qid    docid  rating       bm25  passage_length   c       df  \
0     300674  7067032       1  24.918810             324   9  2081477   
1     125705  7067056       1   3.744147             186   4   211104   
2      94798  7067181       1  13.672122             542   9   135438   
3       9083  7067274       1  11.293467             169   4   329167   
4     174249  7067348       1   8.314486             630  16  2118995   
...      ...      ...     ...        ...             ...  ..      ...   
7432  147073  8008770       1  11.880719             257   8  1126133   
7433  243761  8008787       1  12.142422             497   7  1083447   
7434  162662  8008977       1  17.052584             220   4  2425838   
7435  247194  8009319       1   8.106571             262   3  3267541   
7436  195199  8009377       1   5.757708             233   1   390524   

           cf        idf       c_idf  
0     2715181  14.770192  395.896149  
1      248706   8.136258   32.545033  
2     

In [56]:
# Same code as for the non-relevant training query-document pairs
validation_data_nonrelevant = []

for index in tqdm(random.sample(range(len(qrels_val)), 1000)):
    row = qrels_val.loc[index]
    query = queries_val[queries_val['qid'] == row['qid']]['query'].values[0]
    docid = passages[passages['pid'] != row['docid']]['pid'].sample().values[0]
    passage = passages[passages['pid'] == docid]['passage'].values[0]
    rating = row['rating']
    
    bm25_score = index_reader.compute_query_document_score(str(docid), query)
    
    passage_length = len(passage)

    c = 0
    df = 0
    cf = 0
    idf = 0
    c_idf = 0
    C = len(passages)
    passage_analyzed = index_reader.analyze(passage)
    for term in index_reader.analyze(query):
        if index_reader.analyze(term) == []:
            continue
        else:
            c += passage_analyzed.count(term)
            df_temp, cf_temp = index_reader.get_term_counts(term) 
            df += df_temp
            cf += cf_temp
            idf += np.log((C - df + 0.5) / (df + 0.5))

            c_idf += c * idf
    
    temp = [row['qid'], docid, 0, bm25_score, passage_length, c, df, cf, idf, c_idf]
    validation_data_nonrelevant.append(temp)

  idf += np.log((C - df + 0.5) / (df + 0.5))
100%|██████████| 1000/1000 [42:45<00:00,  2.57s/it]


In [54]:
# Transform data into dataframe
validation_data_nonrelevant = pd.DataFrame(validation_data_nonrelevant, columns=['qid', 'docid', 'rating', \
                                                                                 'bm25', 'passage_length', \
                                                                                 'c', 'df', 'cf', 'idf', \
                                                                                 'c_idf']).dropna()
# Save to csv in case the kernel is stopped during experiments
validation_data_nonrelevant.to_csv('validation_data_nonrelevant.csv')
print(validation_data_nonrelevant)

         qid    docid  rating  bm25  passage_length  c       df       cf  \
0      27743  4682722       0   0.0             600  0   529555   788281   
1     518675  1030433       0   0.0             242  0   350389   501527   
2     727765  3245041       0   0.0             290  0   861528  1071565   
3    1096619  7465244       0   0.0             293  0  1075196  1341418   
4    1095928  7857386       0   0.0             235  0   953099  1231183   
..       ...      ...     ...   ...             ... ..      ...      ...   
994   988954  3187724       0   0.0             261  0   547616   674387   
995   402427  6425929       0   0.0             265  0   224504   288905   
996  1046475   169071       0   0.0             102  0   554641   679486   
997   789997  6375925       0   0.0             259  0  1320154  1852520   
998  1080968  6683466       0   0.0             259  0   528991   777616   

           idf  c_idf  
0    19.183001    0.0  
1    20.578482    0.0  
2    10.212658 

In [55]:
# Writing data to LETOR format, same code as for training data
with open('validation_data.txt', 'w') as file:
    sample = validation_data.sample(1000)
    for index in tqdm(range(len(sample))):
        row = sample.iloc[index]
        file.write('{} qid:{} 1:{} 2:{} 3:{} 4:{} 5:{} 6:{} 7:{} # docid = {} \n'.format(int(row['rating']), int(row['qid']), \
                                                                         row['bm25'], row['passage_length'], \
                                                                        row['c'], row['df'], row['cf'], \
                                                                        row['idf'], row['c_idf'], int(row['docid'])))
    for index in tqdm(range(len(validation_data_nonrelevant))):
        row = validation_data_nonrelevant.iloc[index]
        file.write('{} qid:{} 1:{} 2:{} 3:{} 4:{} 5:{} 6:{} 7:{} # docid = {} \n'.format(int(row['rating']), int(row['qid']), \
                                                                        row['bm25'], row['passage_length'], \
                                                                        row['c'], row['df'], row['cf'], \
                                                                        row['idf'], row['c_idf'], int(row['docid'])))

100%|██████████| 1000/1000 [00:00<00:00, 1064.83it/s]
100%|██████████| 999/999 [00:00<00:00, 1213.30it/s]


### Testing data

In [15]:
# Load testing data
queries_test = pd.read_csv('collections/msmarco-passage/msmarco-test2019-queries.tsv', sep = '\t', names=['qid', 'query'])
qrels_test = pd.read_csv('collections/msmarco-passage/2019qrels-pass.txt', sep = ' ', names=['qid', 'Q0', 'docid', 'rating'])

print(queries_test)
print(qrels_test)

         qid                                              query
0    1108939                  what slows down the flow of blood
1    1112389            what is the county for grand rapids, mn
2     792752                                     what is ruclip
3    1119729  what do you do when you have a nosebleed from ...
4    1105095                  where is sugar lake lodge located
..       ...                                                ...
195   146187  difference between a mcdouble and a double che...
196   634428                           what does chs stand for?
197  1121986     what are the effects of having low blood sugar
198   321441                 how much is a us postal stamp cost
199   532603                   university of dubuque enrollment

[200 rows x 2 columns]
          qid  Q0    docid  rating
0       19335  Q0  1017759       0
1       19335  Q0  1082489       0
2       19335  Q0   109063       0
3       19335  Q0  1160863       0
4       19335  Q0  1160871       

In [16]:
# Same code as for the relevant training query-document pairs
testing_data = []

for index in tqdm(range(len(qrels_test))):
    row = qrels_test.loc[index]
    query = queries_test[queries_test['qid'] == row['qid']]['query'].values[0]
    passage = passages[passages['pid'] == row['docid']]['passage'].values[0]
    rating = row['rating']
    
    bm25_score = index_reader.compute_query_document_score(str(row['docid']), query)
    
    passage_length = len(passage)

    c = 0
    df = 0
    cf = 0
    idf = 0
    c_idf = 0
    C = len(passages)
    passage_analyzed = index_reader.analyze(passage)
    for term in index_reader.analyze(query):
        if index_reader.analyze(term) == []:
            continue
        else:
            c += passage_analyzed.count(term)
            df_temp, cf_temp = index_reader.get_term_counts(term) 
            df += df_temp
            cf += cf_temp
            idf += np.log((C - df + 0.5) / (df + 0.5))

            c_idf += c * idf
    
    temp = [row['qid'], row['docid'], row['rating'], bm25_score, passage_length, c, df, cf, idf, c_idf]
    testing_data.append(temp)

100%|██████████| 9260/9260 [02:25<00:00, 63.50it/s]


In [27]:
# Transform data into dataframe
testing_data = pd.DataFrame(testing_data, columns=['qid', 'docid', 'rating', 'bm25', 'passage_length', \
                                                   'c', 'df', 'cf', 'idf', 'c_idf'])
# Save to csv in case the kernel is stopped during experiments
testing_data.to_csv('testing_data.csv')
print(testing_data)

          qid    docid  rating      bm25  passage_length  c      df      cf  \
0       19335  1017759       0  3.901871             309  3  270291  322485   
1       19335  1082489       0  6.753509             494  2  270291  322485   
2       19335   109063       0  6.547346             351  3  270291  322485   
3       19335  1160863       0  4.193603             279  4  270291  322485   
4       19335  1160871       0  4.209503             273  4  270291  322485   
...       ...      ...     ...       ...             ... ..     ...     ...   
9255  1133167  8839920       2  9.100336             252  5  649104  869667   
9256  1133167  8839922       2  7.480691             250  3  649104  869667   
9257  1133167   944810       0  2.127326             586  1  649104  869667   
9258  1133167   949411       0  2.163620             577  1  649104  869667   
9259  1133167   977421       0  6.752583             271  6  649104  869667   

            idf       c_idf  
0     15.600364   46.

In [28]:
# Writing data to LETOR format
with open('testing_data.txt', 'w') as file:
    for index in tqdm(range(len(testing_data))):
        row = testing_data.iloc[index]
        file.write('{} qid:{} 1:{} 2:{} 3:{} 4:{} 5:{} 6:{} 7:{} # docid = {} \n'.format(int(row['rating']), int(row['qid']), \
                                                                         row['bm25'], row['passage_length'], \
                                                                        row['c'], row['df'], row['cf'], \
                                                                        row['idf'], row['c_idf'], int(row['docid'])))

100%|██████████| 9260/9260 [00:02<00:00, 3601.77it/s]


## Model creation

Creating the model:

```
java -jar RankLib-2.15.jar -train training_data.txt -ranker 8 -gmax 1 -validate validation_data.txt -test testing_data.txt -metric2T MAP -save RandomForests.txt
```

Result:
```
Discard orig. features
Training data:	training_data.txt
Test data:	testing_data.txt
Validation data:	validation_data.txt
Feature vector representation: Dense.
Ranking method:	Random Forests
Feature description file:	Unspecified. All features will be used.
Train metric:	ERR@10
Test metric:	MAP
Highest relevance label (to compute ERR): 1
Feature normalization: No
Model file: RandomForests.txt

[+] Random Forests's Parameters:
No. of bags: 300
Sub-sampling: 1.0
Feature-sampling: 0.3
No. of trees: 1
No. of leaves: 100
No. of threshold candidates: 256
Learning rate: 0.1

Reading feature file [training_data.txt]... [Done.]            
(20000 ranked lists, 20000 entries read)
Reading feature file [validation_data.txt]... [Done.]            
(1998 ranked lists, 1999 entries read)
Reading feature file [testing_data.txt]... [Done.]            
(43 ranked lists, 9260 entries read)
Initializing... [Done]
------------------------------------
Training starts...
------------------------------------
bag       | ERR@10-B  | ERR@10-OOB  | 
------------------------------------
b[1]      | 0.2488    | 
b[2]      | 0.2457    | 
b[3]      | 0.2491    | 
b[4]      | 0.2486    | 
b[5]      | 0.2471    | 
b[6]      | 0.2515    | 
b[7]      | 0.2514    | 
b[8]      | 0.2506    | 
b[9]      | 0.2516    | 
b[10]     | 0.2473    | 
b[11]     | 0.2494    | 
b[12]     | 0.2504    | 
b[13]     | 0.2466    | 
b[14]     | 0.2505    | 
b[15]     | 0.2508    | 
b[16]     | 0.2515    | 
b[17]     | 0.2484    | 
b[18]     | 0.2501    | 
b[19]     | 0.2497    | 
b[20]     | 0.2468    | 
b[21]     | 0.2517    | 
b[22]     | 0.2518    | 
b[23]     | 0.2483    | 
b[24]     | 0.2525    | 
b[25]     | 0.2494    | 
b[26]     | 0.2527    | 
b[27]     | 0.2488    | 
b[28]     | 0.2498    | 
b[29]     | 0.252     | 
b[30]     | 0.2502    | 
b[31]     | 0.2516    | 
b[32]     | 0.2501    | 
b[33]     | 0.2485    | 
b[34]     | 0.2493    | 
b[35]     | 0.2473    | 
b[36]     | 0.2499    | 
b[37]     | 0.2471    | 
b[38]     | 0.2483    | 
b[39]     | 0.2528    | 
b[40]     | 0.2516    | 
b[41]     | 0.2488    | 
b[42]     | 0.2486    | 
b[43]     | 0.2517    | 
b[44]     | 0.2522    | 
b[45]     | 0.2503    | 
b[46]     | 0.2443    | 
b[47]     | 0.251     | 
b[48]     | 0.2494    | 
b[49]     | 0.2481    | 
b[50]     | 0.2503    | 
b[51]     | 0.2508    | 
b[52]     | 0.2511    | 
b[53]     | 0.2449    | 
b[54]     | 0.2464    | 
b[55]     | 0.2488    | 
b[56]     | 0.2491    | 
b[57]     | 0.2503    | 
b[58]     | 0.2484    | 
b[59]     | 0.2482    | 
b[60]     | 0.2485    | 
b[61]     | 0.2503    | 
b[62]     | 0.2505    | 
b[63]     | 0.2488    | 
b[64]     | 0.2522    | 
b[65]     | 0.2474    | 
b[66]     | 0.2473    | 
b[67]     | 0.2477    | 
b[68]     | 0.2477    | 
b[69]     | 0.2504    | 
b[70]     | 0.2482    | 
b[71]     | 0.2527    | 
b[72]     | 0.2515    | 
b[73]     | 0.2518    | 
b[74]     | 0.2507    | 
b[75]     | 0.2465    | 
b[76]     | 0.2484    | 
b[77]     | 0.2475    | 
b[78]     | 0.2507    | 
b[79]     | 0.2517    | 
b[80]     | 0.249     | 
b[81]     | 0.2493    | 
b[82]     | 0.2486    | 
b[83]     | 0.2521    | 
b[84]     | 0.252     | 
b[85]     | 0.2492    | 
b[86]     | 0.2518    | 
b[87]     | 0.2495    | 
b[88]     | 0.2483    | 
b[89]     | 0.2513    | 
b[90]     | 0.2507    | 
b[91]     | 0.2508    | 
b[92]     | 0.2509    | 
b[93]     | 0.2489    | 
b[94]     | 0.2505    | 
b[95]     | 0.249     | 
b[96]     | 0.2485    | 
b[97]     | 0.2512    | 
b[98]     | 0.2489    | 
b[99]     | 0.2484    | 
b[100]    | 0.2472    | 
b[101]    | 0.2514    | 
b[102]    | 0.2518    | 
b[103]    | 0.2501    | 
b[104]    | 0.2492    | 
b[105]    | 0.2524    | 
b[106]    | 0.2513    | 
b[107]    | 0.2495    | 
b[108]    | 0.2481    | 
b[109]    | 0.2512    | 
b[110]    | 0.2482    | 
b[111]    | 0.248     | 
b[112]    | 0.2479    | 
b[113]    | 0.2488    | 
b[114]    | 0.2492    | 
b[115]    | 0.2496    | 
b[116]    | 0.2519    | 
b[117]    | 0.2531    | 
b[118]    | 0.2516    | 
b[119]    | 0.2493    | 
b[120]    | 0.2503    | 
b[121]    | 0.2485    | 
b[122]    | 0.2482    | 
b[123]    | 0.2516    | 
b[124]    | 0.2474    | 
b[125]    | 0.2481    | 
b[126]    | 0.2508    | 
b[127]    | 0.2502    | 
b[128]    | 0.2491    | 
b[129]    | 0.248     | 
b[130]    | 0.2467    | 
b[131]    | 0.2522    | 
b[132]    | 0.2498    | 
b[133]    | 0.2496    | 
b[134]    | 0.2503    | 
b[135]    | 0.248     | 
b[136]    | 0.2504    | 
b[137]    | 0.2467    | 
b[138]    | 0.2536    | 
b[139]    | 0.2486    | 
b[140]    | 0.2494    | 
b[141]    | 0.249     | 
b[142]    | 0.249     | 
b[143]    | 0.2514    | 
b[144]    | 0.2507    | 
b[145]    | 0.2502    | 
b[146]    | 0.2496    | 
b[147]    | 0.2521    | 
b[148]    | 0.2502    | 
b[149]    | 0.2526    | 
b[150]    | 0.2535    | 
b[151]    | 0.2507    | 
b[152]    | 0.2483    | 
b[153]    | 0.2506    | 
b[154]    | 0.248     | 
b[155]    | 0.2534    | 
b[156]    | 0.2487    | 
b[157]    | 0.2539    | 
b[158]    | 0.2511    | 
b[159]    | 0.2528    | 
b[160]    | 0.2491    | 
b[161]    | 0.2485    | 
b[162]    | 0.2524    | 
b[163]    | 0.2471    | 
b[164]    | 0.2501    | 
b[165]    | 0.2509    | 
b[166]    | 0.2488    | 
b[167]    | 0.2511    | 
b[168]    | 0.2505    | 
b[169]    | 0.2502    | 
b[170]    | 0.2471    | 
b[171]    | 0.2468    | 
b[172]    | 0.2504    | 
b[173]    | 0.2509    | 
b[174]    | 0.2512    | 
b[175]    | 0.2486    | 
b[176]    | 0.25      | 
b[177]    | 0.2522    | 
b[178]    | 0.2512    | 
b[179]    | 0.2494    | 
b[180]    | 0.2536    | 
b[181]    | 0.2469    | 
b[182]    | 0.2531    | 
b[183]    | 0.2502    | 
b[184]    | 0.2523    | 
b[185]    | 0.2464    | 
b[186]    | 0.2497    | 
b[187]    | 0.2498    | 
b[188]    | 0.2502    | 
b[189]    | 0.2488    | 
b[190]    | 0.2505    | 
b[191]    | 0.2505    | 
b[192]    | 0.2477    | 
b[193]    | 0.2509    | 
b[194]    | 0.2512    | 
b[195]    | 0.2492    | 
b[196]    | 0.2493    | 
b[197]    | 0.2484    | 
b[198]    | 0.2495    | 
b[199]    | 0.249     | 
b[200]    | 0.25      | 
b[201]    | 0.2517    | 
b[202]    | 0.25      | 
b[203]    | 0.2493    | 
b[204]    | 0.2519    | 
b[205]    | 0.2505    | 
b[206]    | 0.2492    | 
b[207]    | 0.249     | 
b[208]    | 0.2475    | 
b[209]    | 0.2478    | 
b[210]    | 0.2511    | 
b[211]    | 0.2496    | 
b[212]    | 0.2501    | 
b[213]    | 0.2454    | 
b[214]    | 0.251     | 
b[215]    | 0.2492    | 
b[216]    | 0.2495    | 
b[217]    | 0.2517    | 
b[218]    | 0.2486    | 
b[219]    | 0.2531    | 
b[220]    | 0.2449    | 
b[221]    | 0.2465    | 
b[222]    | 0.2482    | 
b[223]    | 0.2514    | 
b[224]    | 0.2502    | 
b[225]    | 0.2477    | 
b[226]    | 0.2489    | 
b[227]    | 0.2511    | 
b[228]    | 0.2492    | 
b[229]    | 0.2499    | 
b[230]    | 0.2516    | 
b[231]    | 0.2523    | 
b[232]    | 0.2526    | 
b[233]    | 0.2465    | 
b[234]    | 0.2504    | 
b[235]    | 0.2502    | 
b[236]    | 0.2497    | 
b[237]    | 0.2472    | 
b[238]    | 0.2487    | 
b[239]    | 0.2479    | 
b[240]    | 0.2519    | 
b[241]    | 0.2476    | 
b[242]    | 0.2471    | 
b[243]    | 0.248     | 
b[244]    | 0.2478    | 
b[245]    | 0.2491    | 
b[246]    | 0.2506    | 
b[247]    | 0.2527    | 
b[248]    | 0.2494    | 
b[249]    | 0.2494    | 
b[250]    | 0.2485    | 
b[251]    | 0.2522    | 
b[252]    | 0.2494    | 
b[253]    | 0.2481    | 
b[254]    | 0.2486    | 
b[255]    | 0.2489    | 
b[256]    | 0.2482    | 
b[257]    | 0.254     | 
b[258]    | 0.2517    | 
b[259]    | 0.251     | 
b[260]    | 0.2486    | 
b[261]    | 0.253     | 
b[262]    | 0.2479    | 
b[263]    | 0.2476    | 
b[264]    | 0.2463    | 
b[265]    | 0.2493    | 
b[266]    | 0.2483    | 
b[267]    | 0.2519    | 
b[268]    | 0.2483    | 
b[269]    | 0.2489    | 
b[270]    | 0.2534    | 
b[271]    | 0.2489    | 
b[272]    | 0.2471    | 
b[273]    | 0.2495    | 
b[274]    | 0.2471    | 
b[275]    | 0.2498    | 
b[276]    | 0.2488    | 
b[277]    | 0.2477    | 
b[278]    | 0.2502    | 
b[279]    | 0.2501    | 
b[280]    | 0.2486    | 
b[281]    | 0.2499    | 
b[282]    | 0.2499    | 
b[283]    | 0.2498    | 
b[284]    | 0.2498    | 
b[285]    | 0.2484    | 
b[286]    | 0.2497    | 
b[287]    | 0.249     | 
b[288]    | 0.2486    | 
b[289]    | 0.2523    | 
b[290]    | 0.2486    | 
b[291]    | 0.2503    | 
b[292]    | 0.2537    | 
b[293]    | 0.2517    | 
b[294]    | 0.2509    | 
b[295]    | 0.2505    | 
b[296]    | 0.2504    | 
b[297]    | 0.2518    | 
b[298]    | 0.2519    | 
b[299]    | 0.249     | 
b[300]    | 0.2502    | 
------------------------------------
Finished sucessfully.
ERR@10 on training data: 0.25
ERR@10 on validation data: 0.2503
------------------------------------
MAP on test data: 0.5371

Model saved to: RandomForests.txt

```

Applying model to test data:

```
java -jar RankLib-2.15.jar -rank testing_data.txt -load RandomForests.txt -indri testoutputRF.trec
```

Result:

```
Discard orig. features
Model file:	RandomForests.txt
Feature normalization: No
Model:		Random Forests
Reading feature file [testing_data.txt]... [Done.]            
(43 ranked lists, 9260 entries read)
```

Evaluation results using trec_eval:

```
tools/eval/trec_eval.9.0.4/trec_eval -c -mrecall.1000 -mmap -mndcg -m ndcg_cut.5,10,100 -mP.10 -mgm_map -mrecip_rank.10  collections/msmarco-passage/2019qrels-pass.trec runs/testoutputRF.trec

map                       all    0.5376
gm_map                    all    0.4786
recip_rank                all    0.7036
P_10                      all    0.5163
recall_1000               all    1.0000
ndcg                      all    0.7392
ndcg_cut_5                all    0.3840
ndcg_cut_10               all    0.3830
ndcg_cut_100              all    0.5820
```