# Recipe #1:  Autosuggest

For simple autosuggest, we need to index documents with the following data shape:  

(**last_query_date**, **query**, **count**)

In [1]:
import sys
# Install the notebook's third-party dependencies into the environment of the
# interpreter running this kernel (`sys.prefix`), rather than whichever conda
# environment happens to be active in the shell.
# NOTE(review): package versions are not pinned — consider pinning them so the
# notebook stays reproducible.
!conda install --yes --prefix {sys.prefix} elasticsearch jsonlines jsonpath-ng python-slugify

# NOTE(review): `pandas` is not referenced by name in the cells shown here;
# presumably it is needed by the helper modules below — confirm.
import pandas
# Helper modules used by the cells below (presumably project-local — they are
# not among the packages installed above).
from extract import extract, parse_json
from autosuggest.transform import reshape, preload
from index import create_index, populate_index, query_index, index_mapping, query_template, peek


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.




## Source data 


In [2]:
# Load the raw query log from the TSV file via the `extract` helper.
# The peek below shows one row per (user, query, clicked document) with the
# columns user, query, timestamp, position, document and state.
source = extract('./data/real_estate_queries.tsv')

# peek: show the first 5 rows
source.head(5)

Unnamed: 0,user,query,timestamp,position,document,state
0,1337,national real estate settlement services,2006-03-14 15:59:13,1.0,http://www.realtms.com,
1,1337,national real estate settlement services,2006-03-14 15:59:13,7.0,http://dmoz.org,
2,1337,sunbury pennsylvania real estate settlement se...,2006-03-14 16:06:28,14.0,http://pa.optimuslaw.com,pennsylvania
3,1337,integrated real estate,2006-03-27 14:52:29,1.0,http://www.integratedreal.com,
4,1337,integrated real estate,2006-03-27 14:52:29,2.0,http://www.irisnet.net,



##  Transform to fit the recipe


In [3]:
# Reshape the raw log into the recipe's shape; the peek below shows one row
# per query with a `count` and a `timestamp`.
# NOTE(review): presumably `timestamp` is the most recent time the query was
# issued — confirm in autosuggest.transform.reshape.
reshaped = reshape(source)

# peek: show the first 5 rows
reshaped.head(5)

Unnamed: 0,query,count,timestamp
0,real estate,306,2006-05-31 21:55:34
1,prudential real estate,189,2006-05-31 20:24:33
2,century 21 real estate,101,2006-05-30 22:59:36
3,florida real estate,79,2006-05-30 18:39:25
4,remax real estate,77,2006-05-29 10:10:16


E.g., a `count` of 189 means that 189 distinct users searched for "prudential real estate".

In [4]:
# Create the 'autosuggest' index from the definition stored in the JSON file.
# NOTE(review): what happens when the index already exists is decided inside
# `create_index` — confirm before re-running against a populated index.
create_index(index='autosuggest', mapping_file='autosuggest/index_definition.json')

# Print the resulting mapping so the field types can be checked.
index_mapping(index='autosuggest')

Mapping for  autosuggest
{
  "autosuggest": {
    "mappings": {
      "properties": {
        "count": {
          "type": "integer"
        },
        "last_query_date": {
          "type": "date",
          "format": "yyyy-MM-dd HH:mm:ss"
        },
        "query": {
          "type": "search_as_you_type",
          "max_shingle_size": 3
        }
      }
    }
  }
}


In [5]:
# Convert the reshaped query table into the entries that will be sent to the
# index. Judging by the peek output, entries alternate between an action line
# ({'index': {'_index': ..., '_id': ...}}) and the document it applies to.
load_ready = preload(reshaped)

# NOTE(review): the index mapping declares a `last_query_date` field, but the
# documents printed here carry the date under `timestamp` — confirm which field
# name is intended, otherwise `last_query_date` is never populated.

# peek: first 10 entries, one per line.
# Printed with a single `print` call rather than a list comprehension: a
# comprehension used only for its side effects makes the cell echo a useless
# `[None, None, ...]` result and leaves the reader wondering what it means.
print(*load_ready[0:10], sep='\n')


{'index': {'_index': 'autosuggest', '_id': 'real-estate'}}
{'query': 'real estate', 'count': 306, 'timestamp': '2006-05-31 21:55:34'}
{'index': {'_index': 'autosuggest', '_id': 'prudential-real-estate'}}
{'query': 'prudential real estate', 'count': 189, 'timestamp': '2006-05-31 20:24:33'}
{'index': {'_index': 'autosuggest', '_id': 'century-21-real-estate'}}
{'query': 'century 21 real estate', 'count': 101, 'timestamp': '2006-05-30 22:59:36'}
{'index': {'_index': 'autosuggest', '_id': 'florida-real-estate'}}
{'query': 'florida real estate', 'count': 79, 'timestamp': '2006-05-30 18:39:25'}
{'index': {'_index': 'autosuggest', '_id': 'remax-real-estate'}}
{'query': 'remax real estate', 'count': 77, 'timestamp': '2006-05-29 10:10:16'}


[None, None, None, None, None, None, None, None, None, None]


## Load the index


In [6]:
# Send the prepared entries to the index through the `populate_index` helper.
# No index name is passed here: each action line in `load_ready` already
# carries `_index: 'autosuggest'` (see the peek of `load_ready`).
# NOTE(review): `response` is stored but not inspected in the cells shown —
# consider checking it for per-item errors.
response = populate_index(body=load_ready)


## Query for suggestions


In [7]:
# Display the mustache query template that the search cell below passes to
# `query_template`.
peek(filename='autosuggest/query_template.mustache')

{
    "size": "{{^size}}5{{/size}}{{size}}",
    "query": {
        "function_score": {
            "query": {
              "multi_match": {
                "query": "{{text}}",
                "type": "bool_prefix",
                "fields": [
                  "query",
                  "query._2gram",
                  "query._3gram"
                ]
              }
            },
            "functions": [
                {
                    "field_value_factor": {
                        "field": "count",
                        "modifier": "ln1p"
                    }
                }
            ],
            "boost_mode": "multiply"
        }
    }
}



In [15]:
# Run the mustache template against the 'autosuggest' index for the partial
# input 'presc', asking for up to 10 hits. `text` and `size` are the variables
# the template substitutes.
# NOTE(review): what the `field` argument controls is defined inside
# `query_template` — confirm.
result = query_template(index='autosuggest', 
               template_file='autosuggest/query_template.mustache',
               field='query',
               params={ 'text': 'presc', 'size': 10 }
              )

# Extract the `query` value from every hit's `_source` and print one
# suggestion per line.
suggestions = parse_json(result, jsonpath="hits.hits[*]._source.query")
print(*suggestions, sep='\n')

prescott real estate
prescott arizona real estate
prescott arizona real estate forclosures
prescott arizona real estate for sale
windmere commercial real estate prescott
commercial real estate prescott
