

# Recipe #3:  Synonym Candidates



To generate synonym candidates, we need to index data with the following shape:  

(**document**, **query[]**)

In [None]:
import sys
!conda install --yes --prefix {sys.prefix} elasticsearch jsonlines jsonpath-ng python-slugify

In [None]:
import pandas
from extract import extract, parse_json
from synonym.transform import reshape, preload
from index import create_index, populate_index, query_index, index_mapping, query_template, peek


## Source data 


In [None]:
# Load the source file through the project's extract() helper.
source = extract("./data/real_estate_queries.tsv")

# Preview the head of the extracted data (5 entries requested).
source.head(5)


##  Transform to fit the recipe


In [None]:
# Reshape the extracted source data into the form this recipe indexes.
reshaped = reshape(source)

# Verify the data shape by inspecting the head of the result (50 entries requested)
reshaped.head(50)

For example, 126.realnetsystems.com received search clicks for multiple "grand junction" queries.

In [None]:
# Convert the reshaped data into load-ready records for indexing.
load_ready = preload(reshaped)

# Verify the elasticsearch bulk index format by printing the first ten records.
# A plain loop is used instead of a list comprehension so the cell does not
# also display a throwaway list of `None` values as its output.
for record in load_ready[0:10]:
    print(record)


## Load the index


In [None]:
# Create the "synonym" index from the mapping file kept with this recipe.
create_index(
    index="synonym",
    mapping_file="synonym/index_definition.json",
)

# Display the mapping of the "synonym" index.
index_mapping(index="synonym")

In [None]:
# Load the prepared records into the index and keep the helper's response.
# NOTE(review): no index name is passed here — presumably the target index is
# carried inside the bulk-format records built by preload(); confirm.
response = populate_index(body=load_ready)


## Query for synonym candidates


In [None]:
# Display the query template file that the next cell runs.
peek(filename="synonym/query_template.mustache")

In [None]:
# Run the templated query against the 'synonym' index for the text 'license'
result = query_template(
    index='synonym',
    template_file='synonym/query_template.mustache',
    params={'text': 'license', 'size': 15},
)

# Parse the synonym candidate queries out of the three response buckets into a single list
synonym_candidates = parse_json(result, jsonpath="aggregations.*.buckets[*]")

# Re-sort the combined list by score, highest first. An empty result has no
# 'score' column, so sorting it would raise a KeyError; in that case the empty
# frame is kept as-is.
df = pandas.DataFrame(synonym_candidates)
if not df.empty:
    df = df.sort_values(by='score', ascending=False)

# Peek at the top-scoring candidates
df.head(25)