

# Recipe #4:  Taste Profiles



For taste profiles, we need to index one record per user with the following data shape:  

(**user**, **query[]**, **document[]**)

In [1]:
# Install the third-party packages this notebook needs.  `{sys.prefix}` is
# interpolated by IPython before the shell command runs, so conda installs into
# the environment of the interpreter that is executing this kernel.
# NOTE(review): package versions are not pinned — consider pinning them so the
# notebook stays reproducible.
import sys
!conda install --yes --prefix {sys.prefix} elasticsearch jsonlines jsonpath-ng python-slugify

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [1]:
import pandas
import json
from extract import extract, parse_json
from taste.transform import reshape, preload
from index import create_index, populate_index, query_index, index_mapping, query_template, peek, get


## Source data 


In [2]:
# Load the raw real-estate query log from the tab-separated file using the
# project's extract helper.
source = extract('./data/real_estate_queries.tsv')

# peek at the first five rows (judging by the output, one row per
# user/query/clicked-document event)
source.head(5)

Unnamed: 0,user,query,timestamp,position,document,state
0,1337,national real estate settlement services,2006-03-14 15:59:13,1.0,http://www.realtms.com,
1,1337,national real estate settlement services,2006-03-14 15:59:13,7.0,http://dmoz.org,
2,1337,sunbury pennsylvania real estate settlement se...,2006-03-14 16:06:28,14.0,http://pa.optimuslaw.com,pennsylvania
3,1337,integrated real estate,2006-03-27 14:52:29,1.0,http://www.integratedreal.com,
4,1337,integrated real estate,2006-03-27 14:52:29,2.0,http://www.irisnet.net,



##  Transform to fit the recipe


In [3]:
# Reshape the event-level log to fit the recipe.  Judging by the preview, the
# result has one row per user, with that user's queries and clicked documents
# collected into lists.
reshaped = reshape(source)

# peek
reshaped.head(50)

Unnamed: 0,user,query,document
0,475,"[real estates lookup in charlotte, real estate...","[http://meckcama.co.mecklenburg.nc.us, http://..."
1,756,"[chesapeake real estate assesser, virginia bea...","[http://www.chesapeake.va.us, http://www.vbgov..."
2,1133,"[huntsville real estate, huntsville real estate]","[http://www.valleymls.com, http://www.valleyml..."
3,1337,"[national real estate settlement services, nat...","[http://www.realtms.com, http://dmoz.org, http..."
4,1338,"[real estate chesapeake appreciation 2005, rea...","[http://realtytimes.com, http://realtytimes.co..."
5,2470,[harbor country real estate],[http://www.harborcountry.org]
6,2761,"[norco real estate, norco real estate, corona ...","[http://www.neighborhoodscout.com, http://www...."
7,3769,[little rock real estate],[http://cityguide.aol.com]
8,4781,[tucker group real estate and florida],[http://www.bizjournals.com]
9,4945,"[real estate courses, real estate courses, fis...","[http://www.online-education.net, http://www.u..."


For example, user 756 searched for "chesapeake real estate assesser" and "virginia beach" and, from the search results, clicked through to http://www.chesapeake.va.us and http://www.vbgov...

In [4]:
# Convert the reshaped rows into the sequence of entries that will be sent to
# the index.
load_ready = preload(reshaped)

# Peek at the first ten entries.  Judging by the output they alternate between
# an action line ({'index': {...}}) and the user document itself.
# A plain loop is used instead of a list comprehension: print() returns None,
# so a comprehension would build a throwaway list and the notebook would echo
# "[None, None, ...]" as the cell's result.
for entry in load_ready[:10]:
    print(entry)


{'index': {'_index': 'taste', '_id': 475}}
{'user': 475, 'query': ['real estates lookup in charlotte', 'real estates lookup in lexington', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in winston-salem', 'real estates lookup in winston-salem', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in rural-hall', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup in charlotte', 'real estates lookup and phone numbers', 'real estates lookup and phone numbers', 'real estates lookup in charlotte', 'real estates lookup in winston-salem', 'real estates lookup in winston-salem', 'real estates lookup in charlotte', 'real 

[None, None, None, None, None, None, None, None, None, None]


## Load the index


In [5]:
# Create the 'taste' index from the JSON definition that ships with this recipe.
create_index(
    index='taste',
    mapping_file='taste/index_definition.json',
)

# Echo the resulting mapping so the field types can be checked by eye.
index_mapping(index='taste')

Mapping for  taste
{
  "taste": {
    "mappings": {
      "properties": {
        "document": {
          "type": "keyword"
        },
        "query": {
          "type": "text",
          "fields": {
            "raw": {
              "type": "keyword"
            },
            "sayt": {
              "type": "search_as_you_type",
              "analyzer": "query_log_analyzer",
              "max_shingle_size": 3
            }
          },
          "analyzer": "query_log_analyzer"
        },
        "user": {
          "type": "keyword"
        }
      }
    }
  }
}


In [6]:
# Send the prepared entries in `load_ready` to the index through the project's
# populate_index helper.
# NOTE(review): `response` is not inspected in any of the visible cells —
# presumably it reports per-entry results; confirm and check it for failures.
response = populate_index(body=load_ready)


## Taste profile for one user


In [16]:
# Fetch the stored taste document with id '10008' from the 'taste' index.
# Later cells read profile['_source'] to filter out queries and documents this
# user has already seen, so this cell must run before them.
profile = get('10008', 'taste')

# Pretty-print the whole response (metadata plus _source) as indented JSON.
print(json.dumps(profile, indent=2))

{
  "_index": "taste",
  "_type": "_doc",
  "_id": "10008",
  "_version": 1,
  "_seq_no": 15,
  "_primary_term": 1,
  "found": true,
  "_source": {
    "user": 10008,
    "query": [
      "cape cod real estate.com",
      "new hampshire me real estate",
      "new hampshire me real estate",
      "new hampshire me real estate",
      "nancy clayton real estate falmouth ma",
      "nancy clayton real estate falmouth ma",
      "nancy clayton real estate falmouth ma",
      "nancy clayton real estate falmouth ma",
      "new hampshire maine real estate",
      "new hampshire maine real estate",
      "new hampshire maine real estate",
      "cape cod real estate associates",
      "marcy broden era key real estate framingham ma",
      "hudson nh real estate",
      "salem nh real estate",
      "salem nh real estate"
    ],
    "document": [
      "http://www.capecodrealestate.com",
      "http://www.seacoastrealestate.com",
      "http://www.nneren.com",
      "http://www.nneren.com",


## What other users share these interests? 

In [8]:
# Show the Mustache search template that the next section executes.  Its
# {{user}} and {{size}} placeholders are filled from the `params` passed to
# query_template.
peek(filename='taste/query_template.mustache')

{
  "size": 0,
  "query": {
    "more_like_this" : {
      "fields" : ["query", "document"],
      "like" : [
      {
        "_index" : "taste",
        "_id" : "{{user}}"
      }
      ],
      "min_term_freq" : 1, 
      "max_query_terms" : 8
    }
  },
  "aggs": {
    "query_recommendations": {
      "terms": {
        "field": "query.raw",
        "size": {{^size}}50{{/size}}{{size}}
      }
    },
    "document_recommendations": {
      "terms": {
        "field": "document",
        "size": {{^size}}50{{/size}}{{size}}
      }
    }
  }
}



##  And what queries do **they** use?

In [36]:
# Run the more-like-this template for the user whose profile was fetched
# earlier.  The id is read from `profile` instead of being repeated as a
# literal, so the recommendations and the "already run" filter below cannot
# drift apart and end up describing two different users.
result = query_template(
    index='taste',
    template_file='taste/query_template.mustache',
    params={'user': profile['_id'], 'size': 50},
)

# Parse the candidate queries (the terms-aggregation bucket keys) out of the response
query_suggestion_candidates = parse_json(
    result,
    jsonpath="aggregations.query_recommendations.buckets[*].key",
)

# Remove the candidate queries already run by the user.  The user's own
# queries are put in a set once so each membership test is a hash lookup
# rather than a scan of the whole (duplicate-heavy) query list.
already_run = set(profile['_source']['query'])
query_suggestion = [s for s in query_suggestion_candidates if s not in already_run]

# Show the top suggestions, in the order the aggregation returned them
pandas.DataFrame({'Suggested Queries': query_suggestion}).head(25)

Unnamed: 0,Suggested Queries
0,new hampshire real estate
1,maine real estate
2,bath maine real estate
3,brunswick maine real estate
4,north carolina real estate
5,oregon real estate
6,prudential real estate
7,south carolina real estate
8,united country real estate
9,98903 real estate


In [37]:
# Parse the candidate documents (the terms-aggregation bucket keys) out of the
# response.  This reuses `result` from the query cell and `profile` from the
# profile cell, so both must have been run first.
document_suggestion_candidates = parse_json(
    result,
    jsonpath="aggregations.document_recommendations.buckets[*].key",
)

# Remove the candidate documents already visited by the user.  The visited
# URLs are put in a set once so each membership test is a hash lookup rather
# than a scan of the whole (duplicate-heavy) document list.
already_visited = set(profile['_source']['document'])
document_suggestion = [s for s in document_suggestion_candidates if s not in already_visited]

# Show the top suggestions, in the order the aggregation returned them
pandas.DataFrame({'Suggested Documents': document_suggestion}).head(25)

Unnamed: 0,Suggested Documents
0,http://www.realtor.com
1,http://www.realestate.com
2,http://www.homegain.com
3,http://realestate.yahoo.com
4,http://www.househunt.com
5,http://www.city-data.com
6,http://www.harmonhomes.com
7,http://homes.point2.com
8,http://www.homesandland.com
9,http://www.hometownlocator.com
