# Feature Generation

In [1]:
import json
import random
import requests
import os

In [2]:
# Where the generated LETOR files are written, and where Elasticsearch listens.
DATA_DIR = "../../data"
ES_URL = "http://localhost:9200/"

# Feature names in feature-set order: six scoring features followed by one
# "is<Tag>" indicator per tag matched against the `genres` field.
FEATURE_LIST = [
    "origScore",
    "titleSimTFIDF",
    "titleSimBM25",
    "descSimTFIDF",
    "descSimBM25",
    "docRecency",
] + ["is" + tag for tag in (
    "GoHands", "Aniplex", "Thriller", "Foreign",
    "Drama", "War", "Action", "Comedy", "Music",
    "Romance", "Adventure", "Family", "Fantasy", "Crime",
    "Horror", "History", "Mystery", "Animation", "Documentary",
    "Western",
)]

# Queries for which result sets and features are generated.
QUERY_LIST = [
    "murder", "musical", "biography", "police",
    "world war ii", "comedy", "superhero", "nazis",
    "romance", "martial arts", "extramarital", "spy",
    "vampire", "magic", "wedding", "sport",
    "prison", "teacher", "alien", "dystopia",
]

## Initialize Feature Store

From [the LTR docs](https://elasticsearch-learning-to-rank.readthedocs.io/en/latest/building-features.html), we can initialize the feature store using the following HTTP call.

    PUT _ltr
    
In case you want to remove the store, the command is:

    DELETE _ltr

In [3]:
# Start from a clean slate: drop any existing LTR store, then create it again
# and show the server's reply.
ltr_store_url = ES_URL + "_ltr"
requests.delete(ltr_store_url)
resp = requests.put(ltr_store_url)
print(resp.text)

{"acknowledged":true,"shards_acknowledged":true,"index":".ltrstore"}


## Feature definition

We then construct our features and POST them to the feature store as a named feature set (`myFeatures`). A validation query is included to make sure that our feature templates are well-formed. See [the ES LTR docs](https://elasticsearch-learning-to-rank.readthedocs.io/en/latest/building-features.html) for more information. One thing to note is that all the templates are really queries (the stuff that goes under the "query" key in the normal search JSON requests), so you can build most of them by referring to the ES online query docs.

In [4]:
headers = {
    "Content-Type": "application/json"
}


def _match_feature(name, field):
    """Build a feature that scores `field` against the `query` parameter with a `match` query."""
    return {
        "name": name,
        "params": [
            "query"
        ],
        "template_language": "mustache",
        "template": {
            "match": {
                field: "{{query}}"
            }
        }
    }


def _genre_feature(tag):
    """Build the parameter-less `is<tag>` feature: a `term` query for `tag` on the `genres` field."""
    return {
        "name": "is" + tag,
        "params": [],
        "template": {
            "term": {
                "genres": tag
            }
        }
    }


# Tags that each get an "is<Tag>" indicator feature, in feature-set order.
_GENRE_TAGS = [
    "GoHands", "Aniplex", "Thriller", "Foreign",
    "Drama", "War", "Action", "Comedy", "Music",
    "Romance", "Adventure", "Family", "Fantasy", "Crime",
    "Horror", "History", "Mystery", "Animation", "Documentary",
    "Western",
]

data = {
    # The plugin runs each template against this index with these params
    # to validate it before storing the feature set.
    "validation": {
        "params": {
            "query": "martial arts"
        },
        "index": "tmdbindex"
    },
    "featureset": {
        "features": [
            {
                # Same dis_max query that is used to retrieve the candidate documents.
                "name": "origScore",
                "params": [
                    "query"
                ],
                "template_language": "mustache",
                "template": {
                    "dis_max": {
                        "queries": [
                            {
                                "match": {
                                    "title": "{{query}}"
                                }
                            },
                            {
                                "match": {
                                    "description": "{{query}}"
                                }
                            }
                        ]
                    }
                }
            },
            _match_feature("titleSimTFIDF", "title_tfidf"),
            _match_feature("titleSimBM25", "title"),
            _match_feature("descSimTFIDF", "description_tfidf"),
            _match_feature("descSimBM25", "description"),
            {
                "name": "docRecency",
                "params": [],
                "template": {
                    "function_score": {
                        "field_value_factor": {
                            "field": "release_dt",
                            # NOTE(review): 3.16e-11 is roughly 1 / (milliseconds per year),
                            # which suggests release_dt is epoch millis - confirm against the mapping.
                            "factor": 3.16e-11,
                            "modifier": "reciprocal",
                            "missing": 1
                        }
                    }
                }
            },
        ] + [_genre_feature(tag) for tag in _GENRE_TAGS]
    }
}

# Feature ids are derived from positions in FEATURE_LIST, so the feature set
# must define exactly those names in exactly that order.
assert [feat["name"] for feat in data["featureset"]["features"]] == FEATURE_LIST, \
    "feature set definition is out of sync with FEATURE_LIST"

resp = requests.post(ES_URL + "_ltr/_featureset/myFeatures", headers=headers, data=json.dumps(data))
print(resp.text)
# Stop here if the feature set was rejected; otherwise later cells fail with
# a far less obvious error.
resp.raise_for_status()

{"_index":".ltrstore","_type":"store","_id":"featureset-myFeatures","_version":1,"result":"created","forced_refresh":true,"_shards":{"total":1,"successful":1,"failed":0},"_seq_no":0,"_primary_term":1}


In [5]:
# Fetch every feature set known to the LTR store and print each one's id.
resp = requests.get(ES_URL + "_ltr/_featureset", headers=headers)
resp_json = json.loads(resp.text)
featureset_hits = resp_json["hits"]["hits"]
for doc in featureset_hits:
    print(doc["_id"])

featureset-myFeatures


## Feature Extraction

We will generate result sets for 20 queries and split them, by query, into training (12 queries), validation (3) and test (5) sets. Labels (aka judgment lists) are generated by applying a transformation to each document's rating (a continuous value in the range 0-10) that turns it into a 5-level categorical label.

In [6]:
def collect_docids_for_query(query, size=100):
    """ Return the doc_ids of the top hits for `query` in the tmdbindex index.

    The search is a dis_max over `match` queries on the `title` and
    `description` fields.

    :param query: free-text query string to search for.
    :param size: maximum number of hits to request (default 100).
    :return: list with the `doc_id` source field of every hit, in the order
        Elasticsearch returned them.
    :raises requests.HTTPError: if Elasticsearch answers with an error status.
    """
    data = {
        "query": {
            "dis_max": {
                "queries": [
                    {
                        "match": {
                            # the query text itself; previously a literal "%s"
                            # placeholder was sent and `query` was ignored
                            "title": query
                        }
                    },
                    {
                        "match": {
                            "description": query
                        }
                    }
                ]
            }
        },
        "from": 0,
        "size": size
    }
    resp = requests.post(ES_URL + "tmdbindex/_search", headers=headers, data=json.dumps(data))
    # surface ES errors directly instead of a KeyError on "hits" below
    resp.raise_for_status()
    resp_json = json.loads(resp.text)
    return [doc["_source"]["doc_id"] for doc in resp_json["hits"]["hits"]]


# Smoke test against the live index: the search must never return more hits
# than were requested.
doc_ids = collect_docids_for_query("martial arts")
assert len(doc_ids) <= 100

In [7]:
def rating2label(rating):
    """ convert 0-10 continuous rating to 1-5 categorical labels

    Each label covers a 2-point band: [0, 2) -> 1, [2, 4) -> 2, ..., [8, 10] -> 5.
    The result is clamped to 1..5 so that a rating of exactly 10 maps to 5
    rather than 6, and out-of-range ratings map to the nearest label.
    """
    return min(max(int(rating // 2) + 1, 1), 5)

assert(rating2label(6.4) == 4)
assert(rating2label(9.8) == 5)
assert(rating2label(10.0) == 5)
assert(rating2label(0.0) == 1)

In [8]:
# Feature ids are the 1-based positions of the names in FEATURE_LIST.
feature_name2id = dict(zip(FEATURE_LIST, range(1, len(FEATURE_LIST) + 1)))

assert feature_name2id["isRomance"] == 16

In [9]:
def collect_features_for_docids(query, doc_ids, feature_name2id):
    """ Log the `myFeatures` feature values for the given documents and query.

    Runs a filtered search on tmdbindex restricted to `doc_ids` with an `sltr`
    query, and reads the logged feature values from each hit.

    :param query: query string passed as the `query` param of the feature set.
    :param doc_ids: ids of the documents to extract features for; matched
        against `_id`.
    :param feature_name2id: dict mapping feature name to its integer id.
    :return: dict mapping each hit's `doc_id` source field to a tuple
        `(label, feature_string)`, where `label` is `rating2label` of the
        hit's `rating` and `feature_string` is a space-separated list of
        `id:value` pairs with values formatted to 3 decimal places.
    :raises requests.HTTPError: if Elasticsearch answers with an error status.
    """
    doc_ids = list(doc_ids)
    data = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "terms": {
                            "_id": doc_ids
                        }
                    },
                    {
                        "sltr": {
                            "_name": "logged_featureset",
                            "featureset": "myFeatures",
                            "params": {
                                "query": query
                            }
                        }
                    }
                ]
            }
        },
        "ext": {
            "ltr_log": {
                "log_specs": {
                    "name": "main",
                    "named_query": "logged_featureset",
                    # features whose query does not match are logged as 0
                    "missing_as_zero": True
                }
            }
        },
        "from": 0,
        # ask for as many hits as ids were passed, so that no document is
        # silently dropped when more than 100 ids are given
        "size": len(doc_ids)
    }
    resp = requests.post(ES_URL + "tmdbindex/_search", headers=headers, data=json.dumps(data))
    # surface ES errors directly instead of a KeyError on "hits" below
    resp.raise_for_status()
    resp_json = json.loads(resp.text)
    features = {}
    for doc in resp_json["hits"]["hits"]:
        doc_src = doc["_source"]
        label = rating2label(doc_src["rating"])
        # logged values live under fields._ltrlog[0][<log_specs name>]
        doc_feats = doc["fields"]["_ltrlog"][0]["main"]
        letor_feats = [
            "{:d}:{:.3f}".format(feature_name2id[feat_nv["name"]], feat_nv["value"])
            for feat_nv in doc_feats
        ]
        features[doc_src["doc_id"]] = (label, " ".join(letor_feats))
    return features


# Smoke test against the live index: three known document ids should yield
# three feature rows.
sample_doc_ids = ["35405", "34068", "13492"]
feats = collect_features_for_docids("martial arts", sample_doc_ids, feature_name2id)
assert len(feats) == 3

In [10]:
def print_letor(fout, doc_ids, features, qid, query):
    """ Write one LETOR-formatted row per document id to `fout`.

    Each row is `<label> qid:<qid> <features> # docid:<doc_id> query:<query>`,
    where the label and feature string are looked up in `features` by doc id.
    Rows are written in the order of `doc_ids`. Returns None.
    """
    row_template = "{:d} qid:{:d} {:s} # docid:{:s} query:{:s}\n"
    for doc_id in doc_ids:
        relevance, feature_text = features[doc_id]
        row = row_template.format(relevance, qid, feature_text, doc_id, query)
        fout.write(row)

In [11]:
# Shuffle a copy of the query list with a fixed seed: the split is then
# reproducible across runs, and re-running this cell does not keep reshuffling
# the QUERY_LIST constant in place.
SPLIT_SEED = 42
shuffled_queries = list(QUERY_LIST)
random.Random(SPLIT_SEED).shuffle(shuffled_queries)

train_queries = shuffled_queries[0:12]
val_queries = shuffled_queries[12:15]
test_queries = shuffled_queries[15:]
feat_suffixes = ["train", "val", "test"]

# qid keeps counting across the three files, so every query gets a unique id.
qid = 1
for qt_idx, queries in enumerate([train_queries, val_queries, test_queries]):
    out_path = os.path.join(DATA_DIR, "es_features_{:s}.txt".format(feat_suffixes[qt_idx]))
    # the context manager guarantees each file is flushed and closed
    with open(out_path, "w") as fletor:
        for query in queries:
            print("generating feature for {:s} ({:s})".format(query, feat_suffixes[qt_idx]))
            # collect doc_ids for query
            doc_ids = collect_docids_for_query(query)
            # return features for each doc_id
            features = collect_features_for_docids(query, doc_ids, feature_name2id)
            print_letor(fletor, doc_ids, features, qid, query)
            qid += 1

print("number of queries, train {:d}, test {:d}, validation {:d}".format(
    len(train_queries), len(test_queries), len(val_queries)))

generating feature for musical (train)
generating feature for comedy (train)
generating feature for martial arts (train)
generating feature for dystopia (train)
generating feature for alien (train)
generating feature for superhero (train)
generating feature for prison (train)
generating feature for spy (train)
generating feature for sport (train)
generating feature for world war ii (train)
generating feature for extramarital (train)
generating feature for romance (train)
generating feature for nazis (val)
generating feature for teacher (val)
generating feature for biography (val)
generating feature for police (test)
generating feature for magic (test)
generating feature for vampire (test)
generating feature for murder (test)
generating feature for wedding (test)
number of queries, train 12, test 5, validation 3
