

# Recipe #4:  Taste Profiles



For taste profiles, we need to index data of the following shape:  

(**user**, **query[]**, **document[]**)

In [None]:
# Install the packages this recipe needs, non-interactively (`--yes`), into the
# conda environment located at `sys.prefix`, i.e. the prefix of the Python
# interpreter that is running this kernel. `{sys.prefix}` is interpolated by
# IPython before the shell command runs.
import sys
!conda install --yes --prefix {sys.prefix} elasticsearch jsonlines jsonpath-ng python-slugify

In [None]:
import pandas
import json
from extract import extract, parse_json
from taste.transform import reshape, preload
from index import create_index, populate_index, query_index, index_mapping, query_template, peek, get


## Source data 


In [None]:
# Read the raw query log from the TSV file using the project's `extract` helper.
# NOTE(review): presumably returns a pandas DataFrame (it is used with .head()) - confirm in extract.py
source = extract('./data/real_estate_queries.tsv')

# View the source data: only the first 5 entries, to keep the cell output short
source.head(5)


##  Transform to fit the recipe


In [None]:
# Transform the source data with the taste recipe's `reshape` helper.
# NOTE(review): presumably yields one entry per user in the (user, query[], document[])
# shape described at the top of this notebook - confirm in taste/transform.py
reshaped = reshape(source)

# Verify the data shape by displaying the first 50 entries
reshaped.head(50)

E.g., user 756 searched for "chesapeake real estate assesser" and "virginia beach" and, at some point, clicked through from the search results to http://www.chesapeake.va.us and http://www.vbgov...

In [None]:
# Convert the reshaped data into the payload that is later handed to populate_index
load_ready = preload(reshaped)

# Verify the elasticsearch bulk index format by printing the first ten entries.
# A plain loop is used instead of a list comprehension: a comprehension run only
# for its print() side effect builds a throwaway list of None values, which the
# notebook would then echo as the cell's output underneath the printed entries.
for entry in load_ready[:10]:
    print(entry)


## Load the index


In [None]:
# Create the 'taste' index, passing the JSON index definition file as its mapping.
# NOTE(review): behavior when the index already exists (e.g. on a notebook re-run)
# depends on create_index, which is not visible here - confirm
create_index(index='taste', mapping_file='taste/index_definition.json')

# Show the mapping of the 'taste' index so it can be checked against the definition file
index_mapping(index='taste')

In [None]:
# Load the index: send the bulk-format payload built by preload() to Elasticsearch.
# No index name is passed here.
# NOTE(review): presumably the target index is carried inside the bulk payload - confirm.
# The returned response is kept for inspection; no later visible cell reads it.
response = populate_index(body=load_ready)


## Taste profile for one user


In [None]:
# Fetch a user doc: the arguments are '10008' and 'taste'.
# NOTE(review): presumably (document id, index name) - confirm against index.get.
# Later cells read profile['_source']['query'] and profile['_source']['document'].
profile = get('10008', 'taste')

# Pretty-print the fetched document as indented JSON
print(json.dumps(profile, indent=2))

## What other users share these interests? 

In [None]:
# Show the recommendation query: the mustache template that the next cell executes.
# NOTE(review): peek presumably displays the file contents - confirm in index.py
peek(filename='taste/query_template.mustache')

##  And what queries do **they** use?

In [None]:
# Execute the mustache template against the 'taste' index, supplying the user id
# and the size as template parameters
result = query_template(
    index='taste',
    template_file='taste/query_template.mustache',
    params={'user': '10008', 'size': 50},
)

# Collect the bucket keys of the query_recommendations aggregation from the response
query_suggestion_candidates = parse_json(
    result,
    jsonpath="aggregations.query_recommendations.buckets[*].key",
)

# Keep only the candidates that are absent from the user's own queries
query_suggestion = [
    candidate
    for candidate in query_suggestion_candidates
    if candidate not in profile['_source']['query']
]

# Display the first 25 suggestions (at most) as a one-column table
pandas.DataFrame({'Suggested Queries': query_suggestion}).head(25)

In [None]:
# Collect the bucket keys of the document_recommendations aggregation from the
# response obtained when the recommendation query was run
document_suggestion_candidates = parse_json(
    result,
    jsonpath="aggregations.document_recommendations.buckets[*].key",
)

# Keep only the candidates that are absent from the user's own visited documents
document_suggestion = [
    candidate
    for candidate in document_suggestion_candidates
    if candidate not in profile['_source']['document']
]

# Display the first 25 suggestions (at most) as a one-column table
pandas.DataFrame({'Suggested Documents': document_suggestion}).head(25)