# LTR Case Study: Elasticsearch

We need Elasticsearch 6.3 installed, it can be [downloaded from here](https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.4.2.tar.gz). To start Elasticsearch, run the command:

    cd <elasticsearch_home>
    bin/elasticsearch
    
The case study follows the steps outlined in the [Elasticsearch LTR Documentation](https://elasticsearch-learning-to-rank.readthedocs.io/en/latest/core-concepts.html) and the tutorials from [Pere Urbon-Bayes](https://medium.com/@purbon/learning-to-rank-101-5755f2797a3a) and [Doug Turnbull](https://github.com/o19s/elasticsearch-learning-to-rank/tree/master/demo).

In [1]:
import csv
import json
import os
import random
import requests
import sqlite3

In [2]:
DATA_DIR = "../../data"
MODEL_DIR = "../../models"

MOVIES_DATA = os.path.join(DATA_DIR, "movies_metadata.csv")
LOOKUPS_DB = os.path.join(DATA_DIR, "lookups.db")
FEATURE_FILE_TEMPLATE = os.path.join(DATA_DIR, "es_features_{:s}.txt")
MODEL_FILE = os.path.join(MODEL_DIR, "es_lambdamart_model.txt")

FEATURE_LIST = [
    "origScore", "titleSimTFIDF", "titleSimBM25", "descSimTFIDF", "descSimBM25",
    "docRecency", "isGoHands", "isAniplex", "isThriller", "isForeign",
    "isDrama", "isWar", "isAction", "isComedy", "isMusic", 
    "isRomance", "isAdventure", "isFamily", "isFantasy", "isCrime",
    "isHorror", "isHistory", "isMystery", "isAnimation", "isDocumentary",
    "isWestern"
]
QUERY_LIST = [
    "murder", "musical", "biography", "police", "world war ii",
    "comedy", "superhero", "nazis", "romance", "martial arts",
    "extramarital", "spy", "vampire", "magic", "wedding",
    "sport", "prison", "teacher", "alien", "dystopia"
]

ES_URL = "http://localhost:9200"
TOP_N = 10

## Setup Plugin

We are using ES-LTR plugin for ES 6.3. LTR plugin is installed using the following command:

    cd <elasticsearch_home>
    bin/elasticsearch-plugin install http://es-learn-to-rank.labs.o19s.com/ltr-1.1.0-es6.3.1.zip
    
We are using [elasticsearch-head](https://github.com/mobz/elasticsearch-head) as our browser client. In order to make it work, you need to disable some security measures using the following directives in `config/elasticsearch.yml`.

    http.cors.enabled: true
    http.cors.allow-origin: "*"


## Load Data

### Create Index and Schema

In [3]:
headers = {
    "Content-Type": "application/json"
}
data = {
    "settings" : {
        "index" : {
            "number_of_shards" : 1, 
            "number_of_replicas" : 1 
        }
    },
    "mappings" : {
        "doc" : {
            "properties" : {
                "doc_id": { "type": "keyword", "store": "true" },
                "title": { "type": "text", "store": "true", "copy_to": "title_tfidf" },
                "description": { "type": "text", "store": "true", "copy_to": "description_tfidf" },
                "title_tfidf": { "type": "text", "store": "true", "similarity": "classic" },
                "description_tfidf": { "type": "text", "store": "true", "similarity": "classic" },
                "popularity": { "type": "double", "store": "true" },
                "release_dt": { "type": "date", "store": "true" },
                "revenue": { "type": "double", "store": "true" },
                "runtime": { "type": "double", "store": "true" },
                "rating": { "type": "double", "store": "true" },
                "keywords": { "type": "keyword", "store": "true" },
                "genres": { "type": "keyword", "store": "true" }
            }
        }
    }
}
resp = requests.put(ES_URL + "/tmdbindex", headers=headers, data=json.dumps(data))
print(resp.text)

{"acknowledged":true,"shards_acknowledged":true,"index":"tmdbindex"}


### Insert Records

In [4]:
def get_keywords(conn, movie_id):
    cur = conn.cursor()
    cur.execute("select keywords from keywords where mid = ?", [movie_id])
    rows = cur.fetchall()
    keywords = []
    if len(rows) > 0:
        for row in rows:
            keywords = row[0].split("|")
            break
    cur.close()
    return keywords


def filter_genres(conn, genres):
    filtered_genres = []
    cur = conn.cursor()
    for genre in genres:
        cur.execute("select gname from genres where gname = ?", [genre])
        rows = cur.fetchall()
        if len(rows) == 0:
            continue
        filtered_genres.append(genre)
    cur.close()
    return filtered_genres


def get_float(orig_value, default_value):
    if orig_value is None:
        return default_value
    elif len(orig_value.strip()) == 0:
        return default_value
    else:
        return float(orig_value)


def parse_genres(genre_json):
    if len(genre_json.strip()) == 0:
        return []
    names = []
    idname_pairs = json.loads(genre_json.replace("'", "\""))
    for idname_pair in idname_pairs:
        names.append(idname_pair["name"])
    return names

In [5]:
def add_record_to_es(es_url, doc_id, title, description, popularity, 
                     release_date, revenue, runtime, rating, keywords, genres,
                     should_commit=False):
    if doc_id is not None:
        doc = {
            "doc_id": str(doc_id),
            "title": title,
            "description": description,
            "popularity": popularity,
            "release_dt": release_date,
            "revenue": revenue,
            "runtime": runtime,
            "rating": rating,
            "keywords": keywords,
            "genres": genres
        }
        resp = requests.put(es_url + "/tmdbindex/doc/{:d}".format(doc_id), 
                            headers=headers, data=json.dumps(doc))
    if should_commit:
        requests.post(es_url + "/tmdbindex/_flush")


In [6]:
conn = sqlite3.connect(LOOKUPS_DB)
i = 0
should_commit = False
with open(MOVIES_DATA, "r") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if i % 1000 == 0:
            print("{:d} records ingested into Elasticsearch".format(i))
            should_commit = True
        if row["original_language"] != "en":
            # only stick to english
            i += 1
            continue
        doc_id = int(row["id"])
        title = row["original_title"]
        description = row["overview"]
        popularity = get_float(row["popularity"], 0.0)
        release_date = row["release_date"]
        revenue = get_float(row["revenue"], 0.0)
        runtime = get_float(row["runtime"], 0.0)
        rating = get_float(row["vote_average"], 0.0)
        # look up keywords
        keywords = get_keywords(conn, doc_id)
        # parse out genres
        genres = filter_genres(conn, parse_genres(row["genres"]))
        # add record to solr
        add_record_to_es(ES_URL, doc_id, title, description, popularity, 
                         release_date, revenue, runtime, rating, keywords, genres,
                         should_commit=should_commit)
        should_commit = False
        i += 1

add_record_to_es(ES_URL, None, None, None, None, None, None, None, None, None, None, True)
print("{:d} records ingested into Elasticsearch, COMPLETE".format(i))
conn.close()

0 records ingested into Elasticsearch
1000 records ingested into Elasticsearch
2000 records ingested into Elasticsearch
3000 records ingested into Elasticsearch
4000 records ingested into Elasticsearch
5000 records ingested into Elasticsearch
6000 records ingested into Elasticsearch
7000 records ingested into Elasticsearch
8000 records ingested into Elasticsearch
9000 records ingested into Elasticsearch
10000 records ingested into Elasticsearch
11000 records ingested into Elasticsearch
12000 records ingested into Elasticsearch
13000 records ingested into Elasticsearch
14000 records ingested into Elasticsearch
15000 records ingested into Elasticsearch
16000 records ingested into Elasticsearch
17000 records ingested into Elasticsearch
18000 records ingested into Elasticsearch
19000 records ingested into Elasticsearch
20000 records ingested into Elasticsearch
21000 records ingested into Elasticsearch
22000 records ingested into Elasticsearch
23000 records ingested into Elasticsearch
24000

## Define LTR features

### Initialize Feature Store

In [7]:
requests.delete(ES_URL + "/_ltr")
resp = requests.put(ES_URL + "/_ltr")
print(resp.text)

{"acknowledged":true,"shards_acknowledged":true,"index":".ltrstore"}


### Feature definition

We then construct our features and POST them into a named feature store. A validation query is used to make sure that our queries are well-formed. See [the ES LTR docs](https://elasticsearch-learning-to-rank.readthedocs.io/en/latest/building-features.html) for more information. One thing to note is that all the templates are really queries (the stuff that goes under the "query" key in the normal search JSON requests), so you can build most of them by referring to the ES online query docs.

In [8]:
headers = {
    "Content-Type": "application/json"
}
data = {
    "validation": {
        "params": {
            "query": "martial arts"
        },
        "index": "tmdbindex"
    },
    "featureset": {
        "features": [
            {
                "name": "origScore",
                "params": [
                    "query"
                ],
                "template_language": "mustache",
                "template": {
                    "dis_max": {
                        "queries": [
                            { 
                                "match": { 
                                    "title": "{{query}}"
                                }
                            },
                            {
                                "match": {
                                    "description":  "{{query}}"
                                }
                            }
                        ]
                    }
                }
            },
            {
                "name": "titleSimTFIDF",
                "params": [
                    "query"
                ],
                "template_language": "mustache",
                "template": {
                    "match": {
                        "title_tfidf": "{{query}}"
                    }
                }
            },
            {
                "name": "titleSimBM25",
                "params": [
                    "query"
                ],
                "template_language": "mustache",
                "template": {
                    "match": {
                        "title": "{{query}}"
                    }
                }
            },
            {
                "name": "descSimTFIDF",
                "params": [
                    "query"
                ],
                "template_language": "mustache",
                "template": {
                    "match": {
                        "description_tfidf": "{{query}}"
                    }
                }
            },
            {
                "name": "descSimBM25",
                "params": [
                    "query"
                ],
                "template_language": "mustache",
                "template": {
                    "match": {
                        "description": "{{query}}"
                    }
                }
            },
            {
                "name": "docRecency",
                "params": [],
                "template": {
                    "function_score": {
                        "field_value_factor": {
                            "field": "release_dt",
                            "factor": 3.16e-11,
                            "modifier": "reciprocal",
                            "missing": 1
                        }
                    }
                }
            },
            {
                "name": "isGoHands",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "GoHands" 
                    }
                }
            },
            {
                "name": "isAniplex",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Aniplex" 
                    }
                }
            },
            {
                "name": "isThriller",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Thriller" 
                    }
                }
            },
            {
                "name": "isForeign",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Foreign" 
                    }
                }
            },
            {
                "name": "isDrama",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Drama" 
                    }
                }
            },
            {
                "name": "isWar",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "War" 
                    }
                }
            },
            {
                "name": "isAction",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Action" 
                    }
                }
            },
            {
                "name": "isComedy",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Comedy" 
                    }
                }
            },
            {
                "name": "isMusic",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Music" 
                    }
                }
            },
            {
                "name": "isRomance",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Romance" 
                    }
                }
            },
            {
                "name": "isAdventure",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Adventure" 
                    }
                }
            },
            {
                "name": "isFamily",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Family" 
                    }
                }
            },
            {
                "name": "isFantasy",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Fantasy" 
                    }
                }
            },
            {
                "name": "isCrime",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Crime" 
                    }
                }
            },
            {
                "name": "isHorror",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Horror" 
                    }
                }
            },
            {
                "name": "isHistory",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "History" 
                    }
                }
            },
            {
                "name": "isMystery",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Mystery" 
                    }
                }
            },
            {
                "name": "isAnimation",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Animation" 
                    }
                }
            },
            {
                "name": "isDocumentary",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Documentary" 
                    }
                }
            },
            {
                "name": "isWestern",
                "params": [],
                "template": {
                    "term" : { 
                        "genres" : "Western" 
                    }
                }
            }
        ]
    }
}
resp = requests.post(ES_URL + "/_ltr/_featureset/myFeatures", headers=headers, 
                     data=json.dumps(data))
print(resp.text)

{"_index":".ltrstore","_type":"store","_id":"featureset-myFeatures","_version":1,"result":"created","forced_refresh":true,"_shards":{"total":1,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}


In [9]:
# list all featuresets
resp = requests.get(ES_URL + "/_ltr/_featureset", headers=headers)
resp_json = json.loads(resp.text)
# print(resp_json)
for doc in resp_json["hits"]["hits"]:
    print(doc["_id"])

featureset-myFeatures


### Generate LTR Features

We will generate resultsets for 20 queries, and split them up into training, validation and test sets. Labels are generated by applying a transformation on the rating (continuous values 0-10) to transform them into a 5-level categorical label. Judgement lists of query, features and labels are written out in LETOR format.

In [10]:
def collect_docids_for_query(query):
    data = {
        "query": {
            "dis_max": {
                "queries": [
                    {
                        "match": {
                            "title": "%s"
                        }
                    },
                    {
                        "match": {
                            "description": "%s"
                        }
                    }
                ]
            }
        },
        "from": 0,
        "size": 100
    }
    resp = requests.post(ES_URL + "/tmdbindex/_search", headers=headers, data=json.dumps(data))
    resp_json = json.loads(resp.text)
    doc_ids = []
    for doc in resp_json["hits"]["hits"]:
        doc_id = doc["_source"]["doc_id"]
        doc_ids.append(doc_id)
    return doc_ids


doc_ids = collect_docids_for_query("martial arts")
assert(len(doc_ids) <= 100)

In [11]:
def rating2label(rating):
    """ convert 0-10 continuous rating to 1-5 categorical labels """
    return int(rating // 2) + 1

assert(rating2label(6.4) == 4)
assert(rating2label(9.8) == 5)

In [12]:
feature_name2id = {name: idx + 1 for idx, name in enumerate(FEATURE_LIST)}

assert(feature_name2id["isRomance"] == 16)

In [13]:
def collect_features_for_docids(query, doc_ids, feature_name2id):
    data = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "terms": {
                            "_id": doc_ids
                        }
                    },
                    {
                        "sltr": {
                            "_name": "logged_featureset",
                            "featureset": "myFeatures",
                            "params": {
                                "query": query
                            }
                        }
                    }
                ]
            }
        },
        "ext": {
            "ltr_log": {
                "log_specs": {
                    "name": "main",
                    "named_query": "logged_featureset",
                    "missing_as_zero": True
                }
            }
        },
        "from": 0,
        "size": 100
    }
    resp = requests.post(ES_URL + "/tmdbindex/_search", headers=headers, data=json.dumps(data))
    resp_json = json.loads(resp.text)
    features = {}
    for doc in resp_json["hits"]["hits"]:
        doc_src = doc["_source"]
        doc_id = doc_src["doc_id"]
        rating = doc_src["rating"]
        label = rating2label(rating)
        letor_feats = []
        doc_feats = doc["fields"]["_ltrlog"][0]["main"]
        for feat_nv in doc_feats:
            feat_name = feat_nv["name"]
            feat_id = feature_name2id[feat_name]
            feat_value = feat_nv["value"]
            letor_feats.append("{:d}:{:.3f}".format(feat_id, feat_value))
        features[doc_id] = (label, " ".join(letor_feats))
    return features


feats = collect_features_for_docids("martial arts", ['35405', '34068', '13492'], feature_name2id)
assert(len(feats) == 3)

In [14]:
def print_letor(fout, doc_ids, features, qid, query):
    for doc_id in doc_ids:
        label, feat_str = features[doc_id]
        fout.write("{:d} qid:{:d} {:s} # docid:{:s} query:{:s}\n".format(
            label, qid, feat_str, doc_id, query))
    return 

In [15]:
random.shuffle(QUERY_LIST)
train_queries = QUERY_LIST[0:12]
val_queries = QUERY_LIST[12:15]
test_queries = QUERY_LIST[15:]
feat_suffixes = ["train", "val", "test"]
qid = 1
for qt_idx, queries in enumerate([train_queries, val_queries, test_queries]):
    fletor = open(FEATURE_FILE_TEMPLATE.format(feat_suffixes[qt_idx]), "w")
    for query in queries:
        print("generating feature for {:s} ({:s})".format(query, feat_suffixes[qt_idx]))
        # collect doc_ids for query
        doc_ids = collect_docids_for_query(query)
        # return features for each doc_id
        features = collect_features_for_docids(query, doc_ids, feature_name2id)
        print_letor(fletor, doc_ids, features, qid, query)
        qid += 1
        
print("number of queries, train {:d}, test {:d}, validation {:d}".format(
    len(train_queries), len(test_queries), len(val_queries)))

generating feature for vampire (train)
generating feature for prison (train)
generating feature for musical (train)
generating feature for extramarital (train)
generating feature for spy (train)
generating feature for world war ii (train)
generating feature for biography (train)
generating feature for martial arts (train)
generating feature for murder (train)
generating feature for police (train)
generating feature for sport (train)
generating feature for alien (train)
generating feature for dystopia (val)
generating feature for magic (val)
generating feature for comedy (val)
generating feature for nazis (test)
generating feature for superhero (test)
generating feature for wedding (test)
generating feature for romance (test)
generating feature for teacher (test)
number of queries, train 12, test 5, validation 3


## Train LTR Model using RankLib

Command to train RankLib on command line with LETOR files generated in previous step.

    java -jar RankLib-2.10.jar \
        -train ../data/es_features_train.txt \
        -test ../data/es_features_test.txt \
        -validate ../data/es_features_val.txt \
        -ranker 6 \
        -metric2t NDCG@10 \
        -metric2T NDCG@10 \
        -save ../models/es_lambdamart_model.txt

Command output is as follows:

    Discard orig. features
    Training data:	../data/es_features_train.txt
    Test data:	../data/es_features_test.txt
    Validation data:	../data/es_features_val.txt
    Feature vector representation: Dense.
    Ranking method:	LambdaMART
    Feature description file:	Unspecified. All features will be used.
    Train metric:	NDCG@10
    Test metric:	NDCG@10
    Feature normalization: No
    Model file: ../models/es_lambdamart_model.txt
    
    [+] LambdaMART's Parameters:
    No. of trees: 1000
    No. of leaves: 10
    No. of threshold candidates: 256
    Min leaf support: 1
    Learning rate: 0.1
    Stop early: 100 rounds without performance gain on validation data
    
    Reading feature file [../data/es_features_train.txt]... [Done.]            
    (12 ranked lists, 1200 entries read)
    Reading feature file [../data/es_features_val.txt]... [Done.]            
    (3 ranked lists, 300 entries read)
    Reading feature file [../data/es_features_test.txt]... [Done.]            
    (5 ranked lists, 480 entries read)
    Initializing... [Done]
    ---------------------------------
    Training starts...
    ---------------------------------
    #iter   | NDCG@10-T | NDCG@10-V | 
    ---------------------------------
    1       | 0.844     | 0.844     | 
    2       | 0.8652    | 0.8652    | 
    3       | 0.8652    | 0.8652    | 
    4       | 0.8652    | 0.8652    | 
    5       | 0.8652    | 0.8652    | 
    6       | 0.8652    | 0.8652    | 
    7       | 0.8652    | 0.8652    | 
    8       | 0.8652    | 0.8652    | 
    9       | 0.8652    | 0.8652    | 
    10      | 0.8652    | 0.8652    | 
    11      | 0.8652    | 0.8652    | 
    12      | 0.8652    | 0.8652    | 
    13      | 0.8997    | 0.8997    | 
    14      | 0.8997    | 0.8997    | 
    15      | 0.9011    | 0.9011    | 
    16      | 0.9011    | 0.9011    | 
    17      | 0.9028    | 0.9028    | 
    18      | 0.9028    | 0.9028    | 
    19      | 0.9373    | 0.9373    | 
    20      | 0.9373    | 0.9373    | 
    21      | 0.9373    | 0.9373    | 
    22      | 0.9435    | 0.9435    | 
    23      | 0.9607    | 0.9607    | 
    24      | 0.9607    | 0.9607    | 
    25      | 0.978     | 0.978     | 
    26      | 0.9801    | 0.9801    | 
    27      | 0.9865    | 0.9865    | 
    28      | 0.9917    | 0.9917    | 
    29      | 0.9917    | 0.9917    | 
    30      | 0.9917    | 0.9917    | 
    31      | 0.9917    | 0.9917    | 
    32      | 0.9917    | 0.9917    | 
    33      | 1.0       | 1.0       | 
    34      | 1.0       | 1.0       | 
    35      | 1.0       | 1.0       | 
    36      | 1.0       | 1.0       | 
    37      | 1.0       | 1.0       | 
    38      | 1.0       | 1.0       | 
    39      | 1.0       | 1.0       | 
    40      | 1.0       | 1.0       | 
    41      | 1.0       | 1.0       | 
    42      | 1.0       | 1.0       | 
    43      | 1.0       | 1.0       | 
    44      | 1.0       | 1.0       | 
    45      | 1.0       | 1.0       | 
    46      | 1.0       | 1.0       | 
    47      | 1.0       | 1.0       | 
    48      | 1.0       | 1.0       | 
    49      | 1.0       | 1.0       | 
    50      | 1.0       | 1.0       | 
    51      | 1.0       | 1.0       | 
    52      | 1.0       | 1.0       | 
    53      | 1.0       | 1.0       | 
    54      | 1.0       | 1.0       | 
    55      | 1.0       | 1.0       | 
    56      | 1.0       | 1.0       | 
    57      | 1.0       | 1.0       | 
    58      | 1.0       | 1.0       | 
    59      | 1.0       | 1.0       | 
    60      | 1.0       | 1.0       | 
    61      | 1.0       | 1.0       | 
    62      | 1.0       | 1.0       | 
    63      | 1.0       | 1.0       | 
    64      | 1.0       | 1.0       | 
    65      | 1.0       | 1.0       | 
    66      | 1.0       | 1.0       | 
    67      | 1.0       | 1.0       | 
    68      | 1.0       | 1.0       | 
    69      | 1.0       | 1.0       | 
    70      | 1.0       | 1.0       | 
    71      | 1.0       | 1.0       | 
    72      | 1.0       | 1.0       | 
    73      | 1.0       | 1.0       | 
    74      | 1.0       | 1.0       | 
    75      | 1.0       | 1.0       | 
    76      | 1.0       | 1.0       | 
    77      | 1.0       | 1.0       | 
    78      | 1.0       | 1.0       | 
    79      | 1.0       | 1.0       | 
    80      | 1.0       | 1.0       | 
    81      | 1.0       | 1.0       | 
    82      | 1.0       | 1.0       | 
    83      | 1.0       | 1.0       | 
    84      | 1.0       | 1.0       | 
    85      | 1.0       | 1.0       | 
    86      | 1.0       | 1.0       | 
    87      | 1.0       | 1.0       | 
    88      | 1.0       | 1.0       | 
    89      | 1.0       | 1.0       | 
    90      | 1.0       | 1.0       | 
    91      | 1.0       | 1.0       | 
    92      | 1.0       | 1.0       | 
    93      | 1.0       | 1.0       | 
    94      | 1.0       | 1.0       | 
    95      | 1.0       | 1.0       | 
    96      | 1.0       | 1.0       | 
    97      | 1.0       | 1.0       | 
    98      | 1.0       | 1.0       | 
    99      | 1.0       | 1.0       | 
    100     | 1.0       | 1.0       | 
    101     | 1.0       | 1.0       | 
    102     | 1.0       | 1.0       | 
    103     | 1.0       | 1.0       | 
    104     | 1.0       | 1.0       | 
    105     | 1.0       | 1.0       | 
    106     | 1.0       | 1.0       | 
    107     | 1.0       | 1.0       | 
    108     | 1.0       | 1.0       | 
    109     | 1.0       | 1.0       | 
    110     | 1.0       | 1.0       | 
    111     | 1.0       | 1.0       | 
    112     | 1.0       | 1.0       | 
    113     | 1.0       | 1.0       | 
    114     | 1.0       | 1.0       | 
    115     | 1.0       | 1.0       | 
    116     | 1.0       | 1.0       | 
    117     | 1.0       | 1.0       | 
    118     | 1.0       | 1.0       | 
    119     | 1.0       | 1.0       | 
    120     | 1.0       | 1.0       | 
    121     | 1.0       | 1.0       | 
    122     | 1.0       | 1.0       | 
    123     | 1.0       | 1.0       | 
    124     | 1.0       | 1.0       | 
    125     | 1.0       | 1.0       | 
    126     | 1.0       | 1.0       | 
    127     | 1.0       | 1.0       | 
    128     | 1.0       | 1.0       | 
    129     | 1.0       | 1.0       | 
    130     | 1.0       | 1.0       | 
    131     | 1.0       | 1.0       | 
    132     | 1.0       | 1.0       | 
    133     | 1.0       | 1.0       | 
    134     | 1.0       | 1.0       | 
    ---------------------------------
    Finished sucessfully.
    NDCG@10 on training data: 1.0
    NDCG@10 on validation data: 1.0
    ---------------------------------
    NDCG@10 on test data: 0.9962
    
    Model saved to: ../models/es_lambdamart_model.txt
    

## Upload Trained Model

In [16]:
model_def = None
with open(MODEL_FILE, "r") as model_file:
    model_def = model_file.read()

data = {
    "model": {
        "name": "es_lambdamart_model",
        "model": {
            "type": "model/ranklib",
            "definition": model_def
        }
    }
}
headers = {
    "Content-Type": "application/json"
}
resp = requests.post(ES_URL + "/_ltr/_featureset/myFeatures/_createmodel", 
                     headers=headers, data=json.dumps(data))
print(resp.text)

{"_index":".ltrstore","_type":"store","_id":"model-es_lambdamart_model","_version":1,"result":"created","forced_refresh":true,"_shards":{"total":1,"successful":1,"failed":0},"_seq_no":1,"_primary_term":1}


## Run rerank Query

In [17]:
def rating2label(rating):
    """ convert 0-10 continuous rating to 1-5 categorical labels """
    if rating == 10.0:
        rating -= 0.01
    return int(rating // 2) + 1


def get_rating_string(rating):
    rating_string = []
    for i in range(rating):
        rating_string.append(u"\u2605")
    for i in range(5 - rating):
        rating_string.append(u"\u2606")
    return "".join(rating_string)


print(get_rating_string(3))
print(get_rating_string(rating2label(6.4)))

★★★☆☆
★★★★☆


In [27]:
query = QUERY_LIST[random.randint(0, len(QUERY_LIST))]

In [28]:
def render_results(docs, query, top_n):
    print("top {:d} results for {:s}".format(TOP_N * 2, query))
    print("---")
    for doc in docs:
        doc_id, title, rating, score = doc
        stars = get_rating_string(rating2label(rating))
        print("{:s} {:06d} {:.3f} {:s}".format(stars, int(doc_id), score, title))

### Top 20 results without re-ranking

In [29]:
headers = {
    "Content-Type": "application/json"
}
data = {
    "query": {
        "dis_max": {
            "queries": [
                { "match": { "title": query }},
                { "match": { "body":  query }}
            ]
        }
    },
    "from": 0,
    "size": TOP_N * 2
}
resp = requests.post(ES_URL + "/tmdbindex/_search", headers=headers, data=json.dumps(data))
resp_json = json.loads(resp.text)
result_docs = []
for doc in resp_json["hits"]["hits"]:
    doc_src = doc["_source"]
    doc_id = doc_src["doc_id"]
    rating = doc_src["rating"]
    title = doc_src["title"]
    score = doc["_score"]
    result_docs.append((doc_id, title, rating, score))
render_results(result_docs, query, TOP_N)

top 20 results for murder
---
★★★★★ 407992 8.166 MURDER and murder
★★★★☆ 031930 8.162 Murder!
★★★★☆ 176841 6.878 Moonlight Murder
★★★☆☆ 031043 6.878 Mike's Murder
★★★☆☆ 013561 6.878 Murder Party
★★★★☆ 000758 6.878 Murder Ahoy
★★★★☆ 015375 6.878 Murder, Inc.
★★☆☆☆ 089345 6.878 Bloody Murder
★★★★☆ 360626 6.878 Prescription: Murder
★★★★☆ 047401 6.878 Sky Murder
★★★☆☆ 009415 5.942 Murder at 1600
★★★★☆ 010440 5.942 Manhattan Murder Mystery
★★★★☆ 001965 5.942 A Perfect Murder
★★★★☆ 001834 5.942 Murder, My Sweet
★★★★☆ 006037 5.942 Murder by Death
★★★★☆ 011892 5.942 Murder by Numbers
★★★★☆ 034374 5.942 Murder by Decree
★★★★☆ 000750 5.942 Murder She Said
★★★★☆ 000757 5.942 Murder Most Foul
★★★★☆ 018930 5.942 Murder by Contract


### Top 20 results with LTR reranking

In [30]:
data = {
    "query": {
        "dis_max": {
            "queries": [
                { "match": { "title": query }},
                { "match": { "body":  query }}
            ]
        }
    },
    "rescore": {
        "window_size": 100,
        "query": {
            "rescore_query": {
                "sltr": {
                    "params": {
                        "query": query
                    },
                    "model": "es_lambdamart_model",
                }
            }
        }
    },
    "from": 0,
    "size": TOP_N * 2
}
resp = requests.post(ES_URL + "/tmdbindex/_search", headers=headers, data=json.dumps(data))
resp_json = json.loads(resp.text)
result_docs = []
for doc in resp_json["hits"]["hits"]:
    doc_src = doc["_source"]
    doc_id = doc_src["doc_id"]
    rating = doc_src["rating"]
    title = doc_src["title"]
    score = doc["_score"]
    result_docs.append((doc_id, title, rating, score))
render_results(result_docs, query, TOP_N)

top 20 results for murder
---
★★★★★ 407992 4.722 MURDER and murder
★★★★☆ 031930 4.664 Murder!
★★★★☆ 053947 4.637 The Murder of Fred Hampton
★★★★☆ 368835 4.551 Murder Rap: Inside the Biggie and Tupac Murders
★★☆☆☆ 089345 3.898 Bloody Murder
★★★☆☆ 031043 3.789 Mike's Murder
★★★☆☆ 013561 3.606 Murder Party
★★★★☆ 015375 3.435 Murder, Inc.
★★★★☆ 006037 3.414 Murder by Death
★★★★☆ 000758 3.392 Murder Ahoy
★★★★☆ 360626 3.312 Prescription: Murder
★★★☆☆ 009415 3.232 Murder at 1600
★★★★☆ 176841 3.208 Moonlight Murder
★★★★☆ 047401 3.208 Sky Murder
★★★★☆ 010440 2.973 Manhattan Murder Mystery
★★★★☆ 001965 2.757 A Perfect Murder
★★★★☆ 011892 2.757 Murder by Numbers
★★★☆☆ 108282 2.755 Murder on Flight 502
★★★★☆ 038962 2.684 Murder by Proxy
★★★★☆ 034374 2.636 Murder by Decree
