# LTR Case Study: DIY

Here DIY is taken to mean an index that does not offer any native/plugin based support for LTR. Specifically, the index has no built-in support for generating feature values (although this can certainly be faked for some features).

Advantages are that you are no longer constrained by the types of features your plugin can give you. Disadvantages are that you have to work harder to generate features.

In our case, we will use the same Solr index we ran our Solr+LTR case study against, except that we will not depend on the index for any LTR support.

Start the Solr server with the following command:

    bin/solr start -Dsolr.ltr.enabled=true

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import datetime
import gensim
import json
import numpy as np
import operator
import os
import random
import requests
import spacy
import urllib

In [2]:
DATA_DIR = "../../data"
MODEL_DIR = "../../models"

SOLR_URL = "http://localhost:8983/solr/tmdbindex"
FEATURE_FILE_TEMPLATE = os.path.join(DATA_DIR, "diy_features_{:s}.txt")
SCORE_FILE = os.path.join(DATA_DIR, "diy_lambdamart_scores.txt")

FEATURE_LIST = [
    "origScore", "titleSimTFIDF", "titleSimBM25", "descSimTFIDF", "descSimBM25",
    "docRecency", "isGoHands", "isAniplex", "isThriller", "isForeign",
    "isDrama", "isWar", "isAction", "isComedy", "isMusic", 
    "isRomance", "isAdventure", "isFamily", "isFantasy", "isCrime",
    "isHorror", "isHistory", "isMystery", "isAnimation", "isDocumentary",
    "isWestern"
]
QUERY_LIST = [
    "murder", "musical", "biography", "police", "world war ii",
    "comedy", "superhero", "nazis", "romance", "martial arts",
    "extramarital", "spy", "vampire", "magic", "wedding",
    "sport", "prison", "teacher", "alien", "dystopia"
]
TOP_N = 10

## Setup Plugin

Not required

## Load Data

We already have data in this index, so we will just reuse that.

## Define LTR features

Since we are not using index support, we do not need to define our features to the index, so we can skip this step as well.

## Generate LTR features

We will use the same features as our Solr case study, but we will generate the feature values using our own code outside the index.

In [3]:
nlp = spacy.load("en")

In [4]:
def get_search_results(query, num_docs):
    payload = {
        "q": query,
        "defType": "edismax",
        "qf": "title_t description_t",
        "pf": "title_t description_t",
        "mm": 2,
        "fl": "*,score",            
        "rows": num_docs
    }
    params = urllib.parse.urlencode(payload, quote_via=urllib.parse.quote_plus)
    search_url = SOLR_URL + "/select?" + params
    resp = requests.get(search_url)
    resp_json = json.loads(resp.text)
    docs = resp_json["response"]["docs"]
    return docs


docs = get_search_results("martial arts", 100)
assert(len(docs) <= 100)

### TF-IDF Similarity

In [5]:
def get_tfidf_similarities(query, docs, field_name):
    fields = []
    for doc in docs:
        try:
            fields.append(doc[field_name])
        except KeyError:
            fields.append(" ")
    tfidf = TfidfVectorizer()
    field_vecs = tfidf.fit_transform(fields)
    query_vec = np.sum(tfidf.transform(query.split(" ")), axis=0)
    sims = linear_kernel(query_vec, field_vecs).flatten()
    tfidf = None
    return sims


desc_sims_tfidf = get_tfidf_similarities("martial arts", docs, "description_t")
assert(len(desc_sims_tfidf) == len(docs))

### BM25 Similarity

In [6]:
def get_bm25_similarities(query, docs, field_name):
    """ Code adapted from:
        https://stackoverflow.com/questions/40966014/how-to-use-gensim-bm25-ranking-in-python
    """
    fields = []
    for doc in docs:
        try:
            fields.append(nlp(doc[field_name].lower()))
        except KeyError:
            fields.append(nlp(" "))
    field_tokens = []
    for field in fields:
        field_tokens.append([token.text for token in field])
    dictionary = gensim.corpora.Dictionary(field_tokens)
    corpus = [dictionary.doc2bow(token) for token in field_tokens]
    bm25 = gensim.summarization.bm25.BM25(corpus)
    avg_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    query_tokens = [token.text for token in nlp(query.lower())]
    query_vec = dictionary.doc2bow(query_tokens)
    sims = bm25.get_scores(query_vec, avg_idf)
    dictionary, corpus, bm25 = None, None, None
    return sims


desc_sims_bm25 = get_bm25_similarities("martial arts", docs, "description_t")
assert(len(desc_sims_bm25) == len(docs))

### Document Recency

We will get this as the number of seconds since epoch divided by 365\*24\*60\*60 (so decimal value in years).

In [7]:
def get_doc_recencies(docs, field_name):
    epoch = datetime.datetime.utcfromtimestamp(0)
    recencies = []
    for doc in docs:
        try:
            field = doc[field_name]
            field_dttm = datetime.datetime.strptime(doc[field_name], "%Y-%m-%dT%H:%M:%SZ")
            total_years = (field_dttm - epoch).total_seconds() / (365 * 24 * 60 * 60)
        except KeyError:
            total_years = 0
        recencies.append(total_years)
    return recencies

doc_recencies = get_doc_recencies(docs, "released_dt")
assert(len(doc_recencies) == len(docs))

### Document Categories

In [8]:
def get_doc_categories(docs):
    categories = []
    for doc in docs:
        category_dict = {}
        try:
            genres = set(doc["genres_ss"])
            for feature in FEATURE_LIST[6:]:
                feat_val = feature[2:]
                if feat_val in genres:
                    category_dict[feature] = 1
                else:
                    category_dict[feature] = 0
        except KeyError:
            category_dict = {feature: 0 for feature in FEATURE_LIST[6:]}
        categories.append(category_dict)
    return categories

categories = get_doc_categories(docs)
assert(len(categories) == len(docs))

### Feature Generation

In [9]:
def rating2label(rating):
    """ convert 0-10 continuous rating to 1-5 categorical labels """
    return int(rating // 2) + 1

assert(rating2label(6.4) == 4)
assert(rating2label(9.8) == 5)

In [10]:
feature_name2id = {name: idx + 1 for idx, name in enumerate(FEATURE_LIST)}

assert(feature_name2id["isRomance"] == 16)

In [11]:
def format_letor(doc_id, rating, qid, query, orig_score, title_sim_tfidf, desc_sim_tfidf,
                 title_sim_bm25, desc_sim_bm25, doc_recency, doc_categories):
    label = rating2label(rating)
    features = {
        "origScore": "{:.5f}".format(orig_score),
        "titleSimTFIDF": "{:.5f}".format(title_sim_tfidf),
        "titleSimBM25": "{:.5f}".format(title_sim_bm25),
        "descSimTFIDF": "{:.5f}".format(desc_sim_tfidf),
        "descSimBM25": "{:.5f}".format(desc_sim_bm25),
        "docRecency": "{:.5f}".format(doc_recency),
        "isGoHands": "{:.3f}".format(doc_categories["isGoHands"]),
        "isAniplex": "{:.3f}".format(doc_categories["isAniplex"]),
        "isThriller": "{:.3f}".format(doc_categories["isThriller"]),
        "isForeign": "{:.3f}".format(doc_categories["isForeign"]),
        "isDrama": "{:.3f}".format(doc_categories["isDrama"]),
        "isWar": "{:.3f}".format(doc_categories["isWar"]),  
        "isAction": "{:.3f}".format(doc_categories["isAction"]),
        "isComedy": "{:.3f}".format(doc_categories["isComedy"]),
        "isMusic": "{:.3f}".format(doc_categories["isMusic"]),
        "isRomance": "{:.3f}".format(doc_categories["isRomance"]),
        "isAdventure": "{:.3f}".format(doc_categories["isAdventure"]),
        "isFamily": "{:.3f}".format(doc_categories["isFamily"]),
        "isFantasy": "{:.3f}".format(doc_categories["isFantasy"]),
        "isCrime": "{:.3f}".format(doc_categories["isCrime"]),
        "isHorror": "{:.3f}".format(doc_categories["isHorror"]),
        "isHistory": "{:.3f}".format(doc_categories["isHistory"]),
        "isMystery": "{:.3f}".format(doc_categories["isMystery"]),
        "isAnimation": "{:.3f}".format(doc_categories["isAnimation"]),
        "isDocumentary": "{:.3f}".format(doc_categories["isDocumentary"]),
        "isWestern": "{:.3f}".format(doc_categories["isWestern"])
    }
    feat_pairs = []
    for feat_name in FEATURE_LIST:
        feat_id = str(feature_name2id[feat_name])
        feat_val = features[feat_name]
        feat_pairs.append(":".join([feat_id, feat_val]))
    return "{:d} qid:{:d} {:s} # docid:{:d} query:{:s}".format(
        label, qid, " ".join(feat_pairs), doc_id, query)

In [12]:
random.shuffle(QUERY_LIST)
train_queries = QUERY_LIST[0:12]
val_queries = QUERY_LIST[12:15]
test_queries = QUERY_LIST[15:]
feat_suffixes = ["train", "val", "test"]
test_qid2query = {}
qid = 1
for qt_idx, queries in enumerate([train_queries, val_queries, test_queries]):
    fletor = open(FEATURE_FILE_TEMPLATE.format(feat_suffixes[qt_idx]), "w")
    for query in queries:
        print("generating feature for {:s} ({:s})".format(query, feat_suffixes[qt_idx]))
        if feat_suffixes[qt_idx] == "test":
            test_qid2query[qid] = query
        docs = get_search_results(query, 100)
        # features from search result
        orig_scores = [doc["score"] for doc in docs]
        title_sims_tfidf = get_tfidf_similarities(query, docs, "title_t")
        desc_sims_tfidf = get_tfidf_similarities(query, docs, "description_t")
        title_sims_bm25 = get_bm25_similarities(query, docs, "title_t")
        desc_sims_bm25 = get_bm25_similarities(query, docs, "description_t")
        doc_recencies = get_doc_recencies(docs, "released_dt")
        doc_categories = get_doc_categories(docs)
        for i in range(len(docs)):
            doc = docs[i]
            # get additional fields
            doc_id = int(doc["id"])
            rating = doc["rating_f"]
            # write record
            fletor.write("{:s}\n".format(format_letor(doc_id, rating, qid, query, orig_scores[i],
                                                      title_sims_tfidf[i], desc_sims_tfidf[i],
                                                      title_sims_bm25[i], desc_sims_bm25[i],
                                                      doc_recencies[i], doc_categories[i])))
        qid += 1
    fletor.close()
print("number of queries, train {:d}, test {:d}, validation {:d}".format(
    len(train_queries), len(test_queries), len(val_queries)))

generating feature for sport (train)
generating feature for nazis (train)
generating feature for teacher (train)
generating feature for world war ii (train)
generating feature for spy (train)
generating feature for vampire (train)
generating feature for wedding (train)
generating feature for police (train)
generating feature for murder (train)
generating feature for martial arts (train)
generating feature for biography (train)
generating feature for dystopia (train)
generating feature for comedy (val)
generating feature for musical (val)
generating feature for romance (val)
generating feature for alien (test)
generating feature for prison (test)
generating feature for superhero (test)
generating feature for extramarital (test)
generating feature for magic (test)
number of queries, train 12, test 5, validation 3


## Train Model

Command to train LambdaMART model using the LETOR files generated in previous step is as follows:

    java -jar RankLib-2.10.jar \
        -train ../data/diy_features_train.txt \
        -test ../data/diy_features_test.txt \
        -validate ../data/diy_features_val.txt \
        -ranker 6 \
        -metric2t NDCG@10 \
        -metric2T NDCG@10 \
        -norm zscore \
        -save ../models/diy_lambdamart_model.txt

And the console output for this command is as follows:

    Discard orig. features
    Training data:	../data/diy_features_train.txt
    Test data:	../data/diy_features_test.txt
    Validation data:	../data/diy_features_val.txt
    Feature vector representation: Dense.
    Ranking method:	LambdaMART
    Feature description file:	Unspecified. All features will be used.
    Train metric:	NDCG@10
    Test metric:	NDCG@10
    Feature normalization: zscore
    Model file: ../models/diy_lambdamart_model.txt
    
    [+] LambdaMART's Parameters:
    No. of trees: 1000
    No. of leaves: 10
    No. of threshold candidates: 256
    Min leaf support: 1
    Learning rate: 0.1
    Stop early: 100 rounds without performance gain on validation data
    
    Reading feature file [../data/diy_features_train.txt]... [Done.]            
    (12 ranked lists, 997 entries read)
    Reading feature file [../data/diy_features_val.txt]... [Done.]            
    (3 ranked lists, 295 entries read)
    Reading feature file [../data/diy_features_test.txt]... [Done.]            
    (5 ranked lists, 410 entries read)
    Initializing... [Done]
    ---------------------------------
    Training starts...
    ---------------------------------
    #iter   | NDCG@10-T | NDCG@10-V | 
    ---------------------------------
    1       | 0.5433    | 0.5499    | 
    2       | 0.6531    | 0.4939    | 
    3       | 0.6574    | 0.4581    | 
    4       | 0.6798    | 0.4575    | 
    5       | 0.6711    | 0.4583    | 
    6       | 0.6874    | 0.4859    | 
    7       | 0.6925    | 0.4855    | 
    8       | 0.6906    | 0.5006    | 
    9       | 0.698     | 0.5006    | 
    10      | 0.6852    | 0.5006    | 
    11      | 0.6864    | 0.5006    | 
    12      | 0.6908    | 0.5006    | 
    13      | 0.7036    | 0.5006    | 
    14      | 0.6948    | 0.5006    | 
    15      | 0.7087    | 0.5064    | 
    16      | 0.7099    | 0.4914    | 
    17      | 0.716     | 0.4914    | 
    18      | 0.708     | 0.4914    | 
    19      | 0.7167    | 0.4903    | 
    20      | 0.7271    | 0.4903    | 
    21      | 0.7255    | 0.4885    | 
    22      | 0.7305    | 0.4793    | 
    23      | 0.733     | 0.4793    | 
    24      | 0.7314    | 0.4793    | 
    25      | 0.7366    | 0.4821    | 
    26      | 0.7363    | 0.4884    | 
    27      | 0.7398    | 0.4766    | 
    28      | 0.741     | 0.4926    | 
    29      | 0.7389    | 0.492     | 
    30      | 0.7357    | 0.4861    | 
    31      | 0.7424    | 0.4789    | 
    32      | 0.7455    | 0.4802    | 
    33      | 0.7456    | 0.4904    | 
    34      | 0.7505    | 0.4876    | 
    35      | 0.7496    | 0.4951    | 
    36      | 0.7588    | 0.4954    | 
    37      | 0.7686    | 0.4964    | 
    38      | 0.768     | 0.4988    | 
    39      | 0.7698    | 0.4988    | 
    40      | 0.7725    | 0.5068    | 
    41      | 0.7754    | 0.5068    | 
    42      | 0.7778    | 0.5158    | 
    43      | 0.778     | 0.5177    | 
    44      | 0.7848    | 0.5189    | 
    45      | 0.7863    | 0.5278    | 
    46      | 0.7858    | 0.5269    | 
    47      | 0.7821    | 0.5298    | 
    48      | 0.7872    | 0.5408    | 
    49      | 0.7879    | 0.5316    | 
    50      | 0.7884    | 0.5641    | 
    51      | 0.7902    | 0.5627    | 
    52      | 0.7951    | 0.5639    | 
    53      | 0.7967    | 0.5639    | 
    54      | 0.7934    | 0.5606    | 
    55      | 0.7928    | 0.5644    | 
    56      | 0.7997    | 0.5734    | 
    57      | 0.7992    | 0.5684    | 
    58      | 0.8001    | 0.547     | 
    59      | 0.8034    | 0.5525    | 
    60      | 0.8065    | 0.5514    | 
    61      | 0.8064    | 0.5514    | 
    62      | 0.8062    | 0.5497    | 
    63      | 0.8122    | 0.529     | 
    64      | 0.8123    | 0.5381    | 
    65      | 0.8134    | 0.5529    | 
    66      | 0.8198    | 0.5532    | 
    67      | 0.82      | 0.5673    | 
    68      | 0.8206    | 0.5652    | 
    69      | 0.8219    | 0.566     | 
    70      | 0.8228    | 0.5682    | 
    71      | 0.8226    | 0.5675    | 
    72      | 0.8243    | 0.5695    | 
    73      | 0.8243    | 0.5755    | 
    74      | 0.8301    | 0.5783    | 
    75      | 0.827     | 0.5728    | 
    76      | 0.828     | 0.5728    | 
    77      | 0.8303    | 0.5787    | 
    78      | 0.8339    | 0.5799    | 
    79      | 0.8326    | 0.5774    | 
    80      | 0.8339    | 0.577     | 
    81      | 0.8342    | 0.5797    | 
    82      | 0.8291    | 0.5826    | 
    83      | 0.8379    | 0.5794    | 
    84      | 0.8382    | 0.5627    | 
    85      | 0.8417    | 0.5613    | 
    86      | 0.8447    | 0.5615    | 
    87      | 0.8468    | 0.5742    | 
    88      | 0.8449    | 0.5853    | 
    89      | 0.8429    | 0.5828    | 
    90      | 0.8477    | 0.5825    | 
    91      | 0.8497    | 0.586     | 
    92      | 0.8502    | 0.588     | 
    93      | 0.8495    | 0.5844    | 
    94      | 0.8537    | 0.5914    | 
    95      | 0.8532    | 0.5906    | 
    96      | 0.8522    | 0.5912    | 
    97      | 0.8531    | 0.5912    | 
    98      | 0.85      | 0.5912    | 
    99      | 0.8531    | 0.5912    | 
    100     | 0.8582    | 0.5912    | 
    101     | 0.8608    | 0.5697    | 
    102     | 0.8586    | 0.5806    | 
    103     | 0.8683    | 0.5849    | 
    104     | 0.8703    | 0.5802    | 
    105     | 0.8674    | 0.5907    | 
    106     | 0.8823    | 0.5743    | 
    107     | 0.8868    | 0.572     | 
    108     | 0.8855    | 0.5741    | 
    109     | 0.8907    | 0.5725    | 
    110     | 0.8923    | 0.5729    | 
    111     | 0.894     | 0.5717    | 
    112     | 0.8937    | 0.5725    | 
    113     | 0.8978    | 0.5775    | 
    114     | 0.8998    | 0.5784    | 
    115     | 0.8968    | 0.5787    | 
    116     | 0.9013    | 0.5787    | 
    117     | 0.9013    | 0.5754    | 
    118     | 0.8996    | 0.5745    | 
    119     | 0.9004    | 0.5659    | 
    120     | 0.9015    | 0.5659    | 
    121     | 0.9079    | 0.5729    | 
    122     | 0.9094    | 0.5537    | 
    123     | 0.9152    | 0.5525    | 
    124     | 0.9127    | 0.5576    | 
    125     | 0.9128    | 0.5588    | 
    126     | 0.9124    | 0.5584    | 
    127     | 0.9177    | 0.5584    | 
    128     | 0.9135    | 0.5584    | 
    129     | 0.9197    | 0.5584    | 
    130     | 0.9181    | 0.5584    | 
    131     | 0.9199    | 0.5574    | 
    132     | 0.9248    | 0.558     | 
    133     | 0.9235    | 0.5568    | 
    134     | 0.9272    | 0.5568    | 
    135     | 0.9263    | 0.5558    | 
    136     | 0.9266    | 0.5599    | 
    137     | 0.9289    | 0.5558    | 
    138     | 0.9304    | 0.5559    | 
    139     | 0.9306    | 0.5561    | 
    140     | 0.9308    | 0.5558    | 
    141     | 0.9372    | 0.5544    | 
    142     | 0.9373    | 0.5544    | 
    143     | 0.9349    | 0.5498    | 
    144     | 0.9413    | 0.5498    | 
    145     | 0.9448    | 0.5498    | 
    146     | 0.945     | 0.5602    | 
    147     | 0.9451    | 0.5588    | 
    148     | 0.9453    | 0.5585    | 
    149     | 0.9455    | 0.5585    | 
    150     | 0.9444    | 0.5608    | 
    151     | 0.9495    | 0.565     | 
    152     | 0.957     | 0.5695    | 
    153     | 0.9605    | 0.5691    | 
    154     | 0.9608    | 0.5798    | 
    155     | 0.9647    | 0.5711    | 
    156     | 0.9684    | 0.5715    | 
    157     | 0.9691    | 0.5715    | 
    158     | 0.9716    | 0.5702    | 
    159     | 0.9698    | 0.5702    | 
    160     | 0.9705    | 0.5702    | 
    161     | 0.9716    | 0.5543    | 
    162     | 0.9715    | 0.5543    | 
    163     | 0.9742    | 0.5477    | 
    164     | 0.9742    | 0.5553    | 
    165     | 0.976     | 0.5553    | 
    166     | 0.976     | 0.5386    | 
    167     | 0.9768    | 0.5386    | 
    168     | 0.9767    | 0.5478    | 
    169     | 0.9765    | 0.5386    | 
    170     | 0.9767    | 0.5482    | 
    171     | 0.9768    | 0.5478    | 
    172     | 0.9749    | 0.5478    | 
    173     | 0.9773    | 0.5402    | 
    174     | 0.9773    | 0.5406    | 
    175     | 0.9771    | 0.5406    | 
    176     | 0.9776    | 0.5482    | 
    177     | 0.9776    | 0.5482    | 
    178     | 0.9765    | 0.5482    | 
    179     | 0.9791    | 0.5483    | 
    180     | 0.9795    | 0.5492    | 
    181     | 0.9798    | 0.5492    | 
    182     | 0.9798    | 0.5486    | 
    183     | 0.9798    | 0.5486    | 
    184     | 0.9798    | 0.5527    | 
    185     | 0.9798    | 0.5518    | 
    186     | 0.9798    | 0.5535    | 
    187     | 0.9798    | 0.5564    | 
    188     | 0.9781    | 0.5509    | 
    189     | 0.9781    | 0.5368    | 
    190     | 0.9801    | 0.5326    | 
    191     | 0.9801    | 0.5418    | 
    192     | 0.9789    | 0.5429    | 
    193     | 0.9807    | 0.5429    | 
    194     | 0.9812    | 0.5418    | 
    195     | 0.9812    | 0.5339    | 
    ---------------------------------
    Finished sucessfully.
    NDCG@10 on training data: 0.8537
    NDCG@10 on validation data: 0.5914
    ---------------------------------
    NDCG@10 on test data: 0.4409
    
    Model saved to: ../models/diy_lambdamart_model.txt
    

## Upload Trained Model

Since the index does not provide any LTR support, we don't have to upload the trained model.

## Run rerank Query

On the other hand, we cannot use the index to run a rerank query, so we need to run inference on the trained model directly.

In our case study, we have chosen to run it from the command line as shown below, but for a production system, this can be done by hooking into RankLib's inference mechanism, since it is written in Java and open source.

    cd <scripts_dir>
    java -jar RankLib-2.10.jar \
        -load ../data/diy_lambdamart_model.txt \
        -rank ../data/diy_features_test.txt \
        -norm zscore \
        -score ../data/diy_lambdamart_scores.txt

which returns the following console output:

    Discard orig. features
    Model file:	../models/diy_lambdamart_model.txt
    Feature normalization: zscore
    Model:		LambdaMART
    Reading feature file [../data/diy_features_test.txt]... [Done.]            
    (5 ranked lists, 410 entries read)

and writes out the score file as tab separated (qid, doc_id, score) triples.

In [13]:
def rating2label(rating):
    """ convert 0-10 continuous rating to 1-5 categorical labels """
    return int(rating // 2) + 1


def get_rating_string(rating):
    rating_string = []
    for i in range(rating):
        rating_string.append(u"\u2605")
    for i in range(5 - rating):
        rating_string.append(u"\u2606")
    return "".join(rating_string)


print(get_rating_string(3))
print(get_rating_string(rating2label(6.4)))

★★★☆☆
★★★★☆


In [14]:
def render_results(docs, query, top_n):
    print("top {:d} results for {:s}".format(top_n * 2, query))
    print("---")
    for doc in docs[0:top_n * 2]:
        doc_id = int(doc["id"])
        stars = get_rating_string(rating2label(float(doc["rating_f"])))
        score = float(doc["score"])
        title = doc["title_t"]
        print("{:s} {:06d} {:.3f} {:s}".format(stars, doc_id, score, title))

In [15]:
test_qids = list(test_qid2query.keys())
qid = random.randint(min(test_qids), max(test_qids))
query = test_qid2query[qid]
print(query)

magic


### Top 20 without LTR

In [16]:
payload = {
    "q": query,
    "defType": "edismax",
    "qf": "title_t description_t",
    "pf": "title_t description_t",
    "mm": 2,
    "fl": "id,title_t,rating_f,score",            
    "rows": TOP_N * 10
}
params = urllib.parse.urlencode(payload, quote_via=urllib.parse.quote_plus)
search_url = SOLR_URL + "/select?" + params
resp = requests.get(search_url)
resp_json = json.loads(resp.text)
docs = resp_json["response"]["docs"]
render_results(docs, query, TOP_N)

top 20 results for magic
---
★★★☆☆ 139519 9.781 Magic Magic
★★★★☆ 034193 8.863 Magic
★★★★☆ 032916 7.557 Scooby-Doo! Abracadabra-Doo
★★★★☆ 070313 7.519 Bowery to Bagdad
★★★★☆ 181533 7.514 Night at the Museum: Secret of the Tomb
★★★★☆ 045671 7.468 Rough Magic
★★★★☆ 006435 7.468 Practical Magic
★★★★☆ 068894 7.468 Shadow Magic
★★★★☆ 037204 7.468 Summer Magic
★★★★★ 038464 7.468 Black Magic
★★☆☆☆ 024797 7.468 Magic Man
★★★★☆ 057211 7.468 Magic Trip
★★★☆☆ 215405 7.468 Magic Kid
★★★★☆ 203179 7.468 Magic Camp
★★★★☆ 077930 7.468 Magic Mike
★★★★☆ 090966 7.468 Magic Town
★★★★☆ 092796 7.468 Black Magic
★★★★☆ 302429 7.468 Strange Magic
★★☆☆☆ 029419 7.468 Carnival Magic
★★★★☆ 081420 7.468 Christmas Magic


### Top 20 reranked with LTR

In [17]:
fscores = open(SCORE_FILE, "r")
rows = []
doc_idx = 0
for line in fscores:
    line = line.strip()
    rqid, doc_id, score = line.split("\t")
    if int(rqid) != qid:
        continue
    rows.append((doc_idx, float(score)))
    doc_idx += 1
fscores.close()
reranked_rows = sorted(rows, key=operator.itemgetter(1), reverse=True)[0:TOP_N]

In [18]:
reranked_docs = []
# LTR layer
for doc_id, score in reranked_rows:
    doc = docs[doc_id]
    doc["score"] = score
    reranked_docs.append(doc)
# rest of the results
doc_ids_to_remove = set([x[0] for x in reranked_rows])
for doc in docs:
    doc_id = int(doc["id"])
    if doc_id in doc_ids_to_remove:
        continue
    reranked_docs.append(doc)
    
render_results(reranked_docs, query, TOP_N)

top 20 results for magic
---
★★★★☆ 388192 3.724 Siccîn 2
★★★★☆ 038191 2.130 The Sunchaser
★★★☆☆ 160220 1.637 Bronies: The Extremely Unexpected Adult Fans of My Little Pony
★★★☆☆ 206145 1.540 Tarzan's Magic Fountain
★☆☆☆☆ 243565 1.457 Mother Carey's Chickens
★★★★☆ 000673 1.441 Harry Potter and the Prisoner of Azkaban
★★★★☆ 181533 1.422 Night at the Museum: Secret of the Tomb
★★★☆☆ 224141 1.365 Into the Woods
★★☆☆☆ 181426 1.326 Chandu on the Magic Island
★★★★☆ 015400 1.306 Mickey's Once Upon a Christmas
★★★☆☆ 139519 9.781 Magic Magic
★★★★☆ 034193 8.863 Magic
★★★★☆ 032916 7.557 Scooby-Doo! Abracadabra-Doo
★★★★☆ 070313 7.519 Bowery to Bagdad
★★★★☆ 181533 1.422 Night at the Museum: Secret of the Tomb
★★★★☆ 045671 7.468 Rough Magic
★★★★☆ 006435 7.468 Practical Magic
★★★★☆ 068894 7.468 Shadow Magic
★★★★☆ 037204 7.468 Summer Magic
★★★★★ 038464 7.468 Black Magic
