## Query Parsing using Py-AhoCorasick

We want to pick out known keywords from the query text and weight them higher in our search query. Since the number of keywords is fairly small, we will use an in-memory Aho-Corasick automaton (a trie with failure links) supplied by the pyahocorasick library to scan each query as it comes in, and construct a query that contains the matched keywords as multi-word phrases.

In [1]:
import ahocorasick
import os

In [2]:
# Directory that holds the keyword resource files.
DATA_DIR = "../data"

# Resolve the three keyword resources against DATA_DIR in one step:
#   - the curated keyword list,
#   - the near-duplicate keyword mappings (TSV),
#   - the de-duplication keyword mappings (TSV).
CURATED_KEYWORDS, KEYWORD_NEARDUP_MAPPINGS, KEYWORD_DEDUPE_MAPPINGS = (
    os.path.join(DATA_DIR, filename)
    for filename in (
        "raw_keywords.txt",
        "keyword_neardup_mappings.tsv",
        "keyword_dedupe_mappings.tsv",
    )
)

In [3]:
# Collect every known keyword (lower-cased) from the three resource files.
# Blank lines are skipped so that the empty string never becomes a keyword
# and so that the tab-separated files do not fail on a stray empty line.
keywords = set()
# load from curated list: one keyword per line
with open(CURATED_KEYWORDS, "r", encoding="utf-8") as fcurated:
    for line in fcurated:
        line = line.strip().lower()
        if line:
            keywords.add(line)
# load from near dup mappings: two tab-separated keywords per line
with open(KEYWORD_NEARDUP_MAPPINGS, "r", encoding="utf-8") as fneardup:
    for line in fneardup:
        line = line.strip().lower()
        if not line:
            continue
        kleft, kright = line.split("\t")
        keywords.add(kleft)
        keywords.add(kright)
# load from dedupe mappings: two keywords plus a third column that is ignored
with open(KEYWORD_DEDUPE_MAPPINGS, "r", encoding="utf-8") as fdedupe:
    for line in fdedupe:
        line = line.strip().lower()
        if not line:
            continue
        kleft, kright, _ = line.split("\t")
        keywords.add(kleft)
        keywords.add(kright)

# Sort so that the list order (and any index derived from it) is the same on
# every run; iterating a set of strings directly is not order-stable.
keywords_list = sorted(keywords)
print("{:d} keywords loaded".format(len(keywords_list)))

2282 keywords loaded


In [4]:
# Build the keyword matcher: register every keyword, then finalise the
# structure once all words have been added.
A = ahocorasick.Automaton()
for position, term in enumerate(keywords_list):
    # The value stored with each word is the (index, keyword) pair; it is
    # what a search hands back when that word is found.
    payload = (position, term)
    A.add_word(term, payload)
A.make_automaton()

In [5]:
query = "neural networks with attention mechanism"
# The keywords were lower-cased when they were loaded, so scan a lower-cased
# copy of the query; otherwise a query typed with capitals would match
# nothing. The original `query` is left untouched for later use.
# Each hit is (end_index, value), where value is the (index, keyword) pair
# stored with the word; only the keyword text is kept.
phrases = [keyword for _, (_, keyword) in A.iter(query.lower())]
print(phrases)

['neural net', 'neural network', 'neural networks', 'attention mechanism']


In [6]:
def _escape_phrase(text):
    """Escape backslashes and double quotes for use inside a quoted phrase.

    Without this, a query or keyword containing `"` or `\\` would close the
    quoted phrase early and produce a malformed query string.
    """
    return text.replace("\\", "\\\\").replace("\"", "\\\"")


# Build one parenthesised clause group per field. Inside a group the full
# query is boosted by 5 and each matched keyword phrase by 2; the group as a
# whole is then boosted by the per-field weight.
clauses = []
query_fields = ["title", "abstract", "text"]
query_field_boosts = [10, 5, 1]
for query_field, boost in zip(query_fields, query_field_boosts):
    query_field_clause = []
    # entire input query, highest boost
    query_field_clause.append(
        "{:s}:\"{:s}\"^5".format(query_field, _escape_phrase(query)))
    # each phrase is boosted to an intermediate boost
    for phrase in phrases:
        query_field_clause.append(
            "{:s}:\"{:s}\"^2".format(query_field, _escape_phrase(phrase)))
#     # each word of query (optional)
#     for word in query.split(" "):
#         query_field_clause.append("{:s}:{:s}".format(query_field, word))
    # join the field and boost it
    clauses.append("({:s})^{:d}".format(" ".join(query_field_clause), boost))
print(" ".join(clauses))

(title:"neural networks with attention mechanism"^5 title:"neural net"^2 title:"neural network"^2 title:"neural networks"^2 title:"attention mechanism"^2)^10 (abstract:"neural networks with attention mechanism"^5 abstract:"neural net"^2 abstract:"neural network"^2 abstract:"neural networks"^2 abstract:"attention mechanism"^2)^5 (text:"neural networks with attention mechanism"^5 text:"neural net"^2 text:"neural network"^2 text:"neural networks"^2 text:"attention mechanism"^2)^1
