

# Recipe #2:  Related Queries



For related queries, we need to index data in the following shape:

(**document**, **query[]**)

In [None]:
# Install the third-party packages this notebook relies on, using conda.
# IPython expands `{sys.prefix}` before handing the line to the shell, so the
# packages go into the prefix of the interpreter running this kernel;
# `--yes` answers conda's confirmation prompt automatically.
# NOTE(review): package versions are not pinned — consider pinning for reproducibility.
import sys
!conda install --yes --prefix {sys.prefix} elasticsearch jsonlines jsonpath-ng python-slugify

In [None]:
import pandas
from extract import extract, parse_json
from related.transform import reshape, preload
from index import create_index, populate_index, query_index, index_mapping, query_template, peek


## Source data 


In [None]:
# Load the source data through the `extract` helper; the input is a .tsv file
# stored under ./data (path is relative to the notebook's working directory).
# NOTE(review): presumably returns a pandas DataFrame, since `.head` is called below — confirm.
source = extract('./data/real_estate_queries.tsv')

# View the source data
# (as the cell's last expression, the result is rendered as the cell output)
source.head(5)


##  Transform to fit the recipe


In [None]:
# Reshape the source data for this recipe using related.transform.reshape.
# NOTE(review): presumably this produces the (document, query[]) shape
# described at the top of the notebook — confirm in related/transform.
reshaped = reshape(source)

# Verify the data shape
# NOTE(review): 50 entries are shown here versus 5 for the source preview —
# presumably so the example discussed after this cell is visible; confirm.
reshaped.head(50)

For example, 126.realnetsystems.com received search clicks for multiple "grand junction" queries.

In [None]:
# Convert the reshaped data into the payload that is later handed to
# populate_index(body=...).
load_ready = preload(reshaped)

# Verify the elasticsearch bulk index format by printing the first 10 entries.
# A plain loop is used rather than a list comprehension: the comprehension
# built a throwaway list of `None` values which, being the cell's last
# expression, was echoed as the cell output after the printed lines.
for entry in load_ready[0:10]:
    print(entry)


## Load the index


In [None]:
# Build the 'related' index, taking its mapping from the recipe's
# index definition file
create_index(
    index='related',
    mapping_file='related/index_definition.json',
)

# Display the mapping of the 'related' index
# (last expression of the cell, so its value becomes the cell output)
index_mapping(index='related')

In [None]:
# Load the index
# `load_ready` is the payload produced by preload() in an earlier cell.
# NOTE(review): no index name is passed here — presumably the target index is
# carried inside the payload or defaulted by populate_index; confirm.
# NOTE(review): `response` is not inspected in the visible cells — consider
# checking it for indexing errors.
response = populate_index(body=load_ready)


## Query for related searches


In [None]:
# Show the query for "related queries"
# This is the same template file that the next cell passes to query_template.
# NOTE(review): `peek` presumably displays the file's contents — confirm in index module.
peek(filename='related/query_template.mustache')

In [None]:
# Documents supplied to the query template as its 'documents' parameter
seed_documents = [
    'http://www.dllr.state.md.us',
    'http://www.online-education.net',
    'http://www.dos.state.ny.us',
    'http://www.usarealestatelicense.com',
    'http://www.myflorida.com',
    'http://www.mortgagenewsdaily.com',
    'http://www.licensetutor.com',
    'http://www.state.nj.us',
    'http://www.re.state.az.us',
    'http://www.dre.cahwnet.gov',
    'http://www.dos.state.pa.us',
    'http://www.dol.wa.gov',
    'http://www.cbprimus.com',
    'http://real-estate-careers.longandfoster.com',
    'http://www.uic.edu',
    'http://www.thelearningsource.net',
    'http://www.rolandschoolofre.com',
    'http://www.realestatelicense.com',
    'http://www.parealtor.org',
    'http://www.onerealtorplace.com',
]

# Parameters handed to the mustache template
template_params = {'documents': seed_documents, 'size': 10}

# Execute the templated query against the 'related' index
result = query_template(
    index='related',
    template_file='related/query_template.mustache',
    params=template_params,
)

# Pull the bucket keys of the `related_queries` aggregation out of the response
related_queries = parse_json(
    result,
    jsonpath="aggregations.related_queries.buckets[*].key",
)

# Put them in sorted order
sorted_related_queries = sorted(related_queries)

# Print one related query per line
print(*sorted_related_queries, sep='\n')