# Feature Generation (no Index support)

Both Solr and Elasticsearch (and possibly other indexes that support LTR) provide support for generating query-document features, such as similarities. Here we will assume that we have a search index (we will use Solr) that already hosts the data, so there is no need to load it.

We will use third-party similarity functions to generate our similarity features. The output will be a file in LETOR format, similar to the other two cases.

NOTE: one nice thing is that we are now free to come up with more novel similarity features, such as the cosine similarity of query and document vectors generated from word embeddings, or to include features that may be difficult to store in the index because of their volatility (for example, user preferences).

Solr needs to be running and listening on port 8983. If it is not up, start it with:

    bin/solr start -Dsolr.ltr.enabled=true


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import datetime
import gensim
import json
import numpy as np
import os
import random
import requests
import spacy
import urllib

In [2]:
# Directory (relative to the notebook) that receives the generated LETOR files.
DATA_DIR = "../../data"

# Base URL of the Solr core that is queried; "select?..." is appended to it.
SOLR_URL = "http://localhost:8983/solr/tmdbindex/"
# Output path pattern; the placeholder is filled with "train", "val" or "test".
LETOR_FILE_TEMPLATE = os.path.join(DATA_DIR, "diy_features_{:s}.txt")

# Feature names in output order; a feature's LETOR id is its 1-based position
# in this list. Entries from index 6 onwards are 0/1 category flags: the name
# minus its "is" prefix is looked up in a document's genres_ss values.
FEATURE_LIST = [
    "origScore", "titleSimTFIDF", "titleSimBM25", "descSimTFIDF", "descSimBM25",
    "docRecency", "isGoHands", "isAniplex", "isThriller", "isForeign",
    "isDrama", "isWar", "isAction", "isComedy", "isMusic", 
    "isRomance", "isAdventure", "isFamily", "isFantasy", "isCrime",
    "isHorror", "isHistory", "isMystery", "isAnimation", "isDocumentary",
    "isWestern"
]
# Queries for which features are generated; they are shuffled and split into
# train/val/test sets by the feature-generation loop further down.
QUERY_LIST = [
    "murder", "musical", "biography", "police", "world war ii",
    "comedy", "superhero", "nazis", "romance", "martial arts",
    "extramarital", "spy", "vampire", "magic", "wedding",
    "sport", "prison", "teacher", "alien", "dystopia"
]

In [3]:
# spaCy pipeline used below to tokenize field and query text for BM25 scoring.
# NOTE(review): the "en" shortcut name presumably needs an older spaCy release — confirm.
nlp = spacy.load("en")

In [4]:
def get_search_results(query, num_docs):
    """Run an edismax query against the Solr index and return the matching docs.

    The query is matched (and phrase-boosted) against ``title_t`` and
    ``description_t``; every stored field plus the relevance ``score`` is
    requested for each hit.

    Args:
        query: free-text query string.
        num_docs: maximum number of documents to request (Solr ``rows``).

    Returns:
        The list of document dicts found under ``response.docs`` in Solr's
        JSON reply.

    Raises:
        requests.HTTPError: if Solr answers with a 4xx/5xx status.
    """
    payload = {
        "q": query,
        "defType": "edismax",
        "qf": "title_t description_t",
        "pf": "title_t description_t",
        "mm": 2,
        "fl": "*,score",
        "rows": num_docs,
    }
    # Let requests URL-encode the parameters: this avoids depending on
    # ``urllib.parse`` having been imported as a side effect of another module
    # (the file only does ``import urllib``).
    resp = requests.get(SOLR_URL + "select", params=payload)
    # Fail with a clear HTTP error instead of a KeyError on "response" below.
    resp.raise_for_status()
    return resp.json()["response"]["docs"]


# Smoke test (needs the Solr index to be reachable): never more hits than requested.
docs = get_search_results("martial arts", 100)
assert(len(docs) <= 100)

## TF-IDF Similarity

In [5]:
def get_tfidf_similarities(query, docs, field_name):
    """Score each document's field against the query using TF-IDF vectors.

    A TF-IDF model is fitted on the ``field_name`` values of ``docs`` only
    (documents lacking the field contribute a blank string). Each
    whitespace-separated query term is transformed separately and the term
    vectors are summed into one query vector, which is then compared with
    every document vector through a linear kernel (dot product).

    Args:
        query: query string; split on single spaces into terms.
        docs: list of document dicts, e.g. as returned by Solr.
        field_name: key of the text field to compare against.

    Returns:
        1-D numpy array with one similarity value per document, in the order
        of ``docs`` (empty when ``docs`` is empty).
    """
    if not docs:
        # The vectorizer cannot be fitted on an empty corpus.
        return np.zeros(0)
    fields = [doc.get(field_name, " ") for doc in docs]
    tfidf = TfidfVectorizer()
    try:
        field_vecs = tfidf.fit_transform(fields)
    except ValueError:
        # Raised when no field yields a single token (empty vocabulary), e.g.
        # when every document lacks the field; nothing can match the query.
        return np.zeros(len(docs))
    # Summing a sparse matrix yields an np.matrix, which recent scikit-learn
    # releases reject; hand linear_kernel a plain (1, n_features) ndarray.
    query_vec = np.asarray(tfidf.transform(query.split(" ")).sum(axis=0)).reshape(1, -1)
    return linear_kernel(query_vec, field_vecs).flatten()


# Smoke test: one TF-IDF similarity value per retrieved document.
desc_sims_tfidf = get_tfidf_similarities("martial arts", docs, "description_t")
assert(len(desc_sims_tfidf) == len(docs))

## BM25 Similarity

In [6]:
def get_bm25_similarities(query, docs, field_name):
    """Score each document's field against the query with gensim's BM25.

    Code adapted from:
    https://stackoverflow.com/questions/40966014/how-to-use-gensim-bm25-ranking-in-python

    Field values and the query are lower-cased and tokenized with the
    module-level spaCy pipeline ``nlp``; documents lacking the field are
    treated as a single blank string.

    Args:
        query: query string.
        docs: list of document dicts, e.g. as returned by Solr.
        field_name: key of the text field to score.

    Returns:
        List with one BM25 score per document, in the order of ``docs``
        (empty when ``docs`` is empty).
    """
    if not docs:
        # BM25 divides by the corpus size when computing the average length.
        return []
    field_tokens = [
        [token.text for token in nlp(doc.get(field_name, " ").lower())]
        for doc in docs
    ]
    # BM25 counts term frequencies itself, so it must be given token lists.
    # Feeding it doc2bow output makes every "word" an (id, count) tuple: the
    # query term (id, 1) then never matches a document in which the term
    # occurs more than once, and the term frequency is always seen as 1.
    bm25 = gensim.summarization.bm25.BM25(field_tokens)
    idf_values = bm25.idf.values()
    avg_idf = sum(float(val) for val in idf_values) / len(bm25.idf) if bm25.idf else 0.0
    query_tokens = [token.text for token in nlp(query.lower())]
    return bm25.get_scores(query_tokens, avg_idf)


# Smoke test: one BM25 score per retrieved document.
desc_sims_bm25 = get_bm25_similarities("martial arts", docs, "description_t")
assert(len(desc_sims_bm25) == len(docs))

## Document Recency

We will get this as the number of seconds since epoch divided by `365*24*60*60` (so decimal value in years).

In [7]:
def get_doc_recencies(docs, field_name):
    """Express each document's date field as years since 1970-01-01.

    Args:
        docs: list of document dicts.
        field_name: key of a date string formatted as ``%Y-%m-%dT%H:%M:%SZ``.

    Returns:
        List with one value per document: seconds between the epoch and the
        date divided by ``365*24*60*60``, or 0 when the document lacks the
        field.

    Raises:
        ValueError: if a present date string does not match the format.
    """
    # Same naive datetime that utcfromtimestamp(0) produced, without relying
    # on that deprecated constructor.
    epoch = datetime.datetime(1970, 1, 1)
    seconds_per_year = 365 * 24 * 60 * 60
    recencies = []
    for doc in docs:
        # Only the lookup is guarded, so just a missing field maps to 0.
        try:
            raw_date = doc[field_name]
        except KeyError:
            recencies.append(0)
            continue
        field_dttm = datetime.datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%SZ")
        recencies.append((field_dttm - epoch).total_seconds() / seconds_per_year)
    return recencies

# Smoke test: one recency value per retrieved document.
doc_recencies = get_doc_recencies(docs, "released_dt")
assert(len(doc_recencies) == len(docs))

## Document Categories

In [8]:
def get_doc_categories(docs):
    """Build the 0/1 category flags for every document.

    The flag names are the entries of FEATURE_LIST from index 6 onwards. A
    flag is 1 when its name without the leading "is" occurs in the document's
    ``genres_ss`` values and 0 otherwise; a document without ``genres_ss``
    gets 0 for every flag.

    Returns:
        List of dicts (flag name -> 0 or 1), one per document.
    """
    flag_names = FEATURE_LIST[6:]
    categories = []
    for doc in docs:
        if "genres_ss" in doc:
            genres = set(doc["genres_ss"])
            flags = {name: int(name[2:] in genres) for name in flag_names}
        else:
            flags = {name: 0 for name in flag_names}
        categories.append(flags)
    return categories

# Smoke test: one category-flag dict per retrieved document.
categories = get_doc_categories(docs)
assert(len(categories) == len(docs))

## Generate Features

In [9]:
def rating2label(rating):
    """ convert 0-10 continuous rating to 1-5 categorical labels

    Each label covers a two-point band of the rating scale. The result is
    clamped to 1..5 so that a perfect 10.0 (which would otherwise land in a
    sixth band) still maps to the top label.
    """
    label = int(rating // 2) + 1
    return max(1, min(label, 5))

# Spot checks: ratings in [6, 8) map to label 4, ratings in [8, 10) to label 5.
assert(rating2label(6.4) == 4)
assert(rating2label(9.8) == 5)

In [10]:
# Feature name -> 1-based feature id, following the order of FEATURE_LIST.
feature_name2id = {name: feat_id for feat_id, name in enumerate(FEATURE_LIST, start=1)}

assert feature_name2id["isRomance"] == 16

In [11]:
def format_letor(doc_id, rating, qid, query, orig_score, title_sim_tfidf, desc_sim_tfidf,
                 title_sim_bm25, desc_sim_bm25, doc_recency, doc_categories):
    """Render one query/document pair as a line in LETOR format.

    The line has the form
    ``<label> qid:<qid> <id>:<value> ... # docid:<doc_id> query:<query>``.
    Features are written in FEATURE_LIST order under the ids from
    ``feature_name2id``; the label comes from ``rating2label(rating)``.

    Args:
        doc_id: integer document id (written into the trailing comment).
        rating: document rating, converted to the relevance label.
        qid: integer query id.
        query: query text (written into the trailing comment).
        orig_score, title_sim_tfidf, desc_sim_tfidf, title_sim_bm25,
        desc_sim_bm25, doc_recency: numeric feature values, written with
            five decimals.
        doc_categories: dict of category flag name -> 0/1, written with
            three decimals.

    Returns:
        The formatted line, without a trailing newline.

    Raises:
        KeyError: if ``doc_categories`` lacks a flag listed in FEATURE_LIST.
    """
    label = rating2label(rating)
    # Features passed in as separate arguments; every other FEATURE_LIST entry
    # is a category flag read from doc_categories. Deriving the flags from
    # FEATURE_LIST avoids a second hand-maintained list of category names.
    scalar_features = {
        "origScore": orig_score,
        "titleSimTFIDF": title_sim_tfidf,
        "titleSimBM25": title_sim_bm25,
        "descSimTFIDF": desc_sim_tfidf,
        "descSimBM25": desc_sim_bm25,
        "docRecency": doc_recency,
    }
    feat_pairs = []
    for feat_name in FEATURE_LIST:
        if feat_name in scalar_features:
            feat_val = "{:.5f}".format(scalar_features[feat_name])
        else:
            feat_val = "{:.3f}".format(doc_categories[feat_name])
        feat_pairs.append(":".join([str(feature_name2id[feat_name]), feat_val]))
    return "{:d} qid:{:d} {:s} # docid:{:d} query:{:s}".format(
        label, qid, " ".join(feat_pairs), doc_id, query)

In [12]:
# Randomly split the 20 queries into 12 train / 3 validation / 5 test queries
# and write one LETOR file per split. Query ids keep counting across the
# three files, so every query gets a distinct qid.
random.shuffle(QUERY_LIST)
train_queries = QUERY_LIST[0:12]
val_queries = QUERY_LIST[12:15]
test_queries = QUERY_LIST[15:]
feat_suffixes = ["train", "val", "test"]
qid = 1
for qt_idx, queries in enumerate([train_queries, val_queries, test_queries]):
    # The with-block closes the file even if feature generation fails midway.
    with open(LETOR_FILE_TEMPLATE.format(feat_suffixes[qt_idx]), "w") as fletor:
        for query in queries:
            print("generating feature for {:s} ({:s})".format(query, feat_suffixes[qt_idx]))
            docs = get_search_results(query, 100)
            if not docs:
                # A query without hits contributes no records; skip the
                # feature computation but still consume its query id.
                qid += 1
                continue
            # features from search result
            orig_scores = [doc["score"] for doc in docs]
            title_sims_tfidf = get_tfidf_similarities(query, docs, "title_t")
            desc_sims_tfidf = get_tfidf_similarities(query, docs, "description_t")
            title_sims_bm25 = get_bm25_similarities(query, docs, "title_t")
            desc_sims_bm25 = get_bm25_similarities(query, docs, "description_t")
            doc_recencies = get_doc_recencies(docs, "released_dt")
            doc_categories = get_doc_categories(docs)
            for i, doc in enumerate(docs):
                # get additional fields
                doc_id = int(doc["id"])
                rating = doc["rating_f"]
                # write record
                fletor.write("{:s}\n".format(format_letor(doc_id, rating, qid, query, orig_scores[i],
                                                          title_sims_tfidf[i], desc_sims_tfidf[i],
                                                          title_sims_bm25[i], desc_sims_bm25[i],
                                                          doc_recencies[i], doc_categories[i])))
            qid += 1
print("number of queries, train {:d}, test {:d}, validation {:d}".format(
    len(train_queries), len(test_queries), len(val_queries)))

generating feature for dystopia (train)
generating feature for superhero (train)
generating feature for nazis (train)
generating feature for biography (train)
generating feature for wedding (train)
generating feature for murder (train)
generating feature for magic (train)
generating feature for vampire (train)
generating feature for comedy (train)
generating feature for prison (train)
generating feature for martial arts (train)
generating feature for teacher (train)
generating feature for romance (val)
generating feature for sport (val)
generating feature for police (val)
generating feature for spy (test)
generating feature for extramarital (test)
generating feature for world war ii (test)
generating feature for musical (test)
generating feature for alien (test)
number of queries, train 12, test 5, validation 3
