

# Recipe #3:  Synonym Candidates



In [24]:
import sys
# Install the third-party packages this notebook depends on. `--prefix {sys.prefix}`
# interpolates the prefix of the Python interpreter running this kernel, so conda
# installs into that environment rather than whichever one is first on PATH.
# NOTE(review): no versions are pinned, so a fresh install may resolve to different releases.
!conda install --yes --prefix {sys.prefix} elasticsearch jsonlines jsonpath-ng python-slugify

import pandas
from extract import extract, parse_json
from synonym.transform import reshape, preload
from index import create_index, populate_index, query_index, index_mapping, query_template, peek


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.




## Source data 


In [25]:
# Load the real-estate query log; the path is relative to the notebook's working directory.
source = extract(
    './data/real_estate_queries.tsv'
)

# Show the first five rows to check the columns that came back.
source.head(5)

Unnamed: 0,user,query,timestamp,position,document,state
0,1337,national real estate settlement services,2006-03-14 15:59:13,1.0,http://www.realtms.com,
1,1337,national real estate settlement services,2006-03-14 15:59:13,7.0,http://dmoz.org,
2,1337,sunbury pennsylvania real estate settlement se...,2006-03-14 16:06:28,14.0,http://pa.optimuslaw.com,pennsylvania
3,1337,integrated real estate,2006-03-27 14:52:29,1.0,http://www.integratedreal.com,
4,1337,integrated real estate,2006-03-27 14:52:29,2.0,http://www.irisnet.net,



##  Transform to fit the recipe


In [26]:
# Reshape the raw log for this recipe. Judging by the preview, the result has
# one row per clicked document with the list of queries that led to it.
reshaped = reshape(
    source
)

# Preview up to the first 50 rows.
reshaped.head(50)

Unnamed: 0,document,query
0,http://0-realestate.yahoo.com.portia.nesl.edu,[yahoo real estate]
1,http://0132.deltagroup.com,[real estate for sale johnsonville tn]
2,http://1046391.wkg0iqbv.info,[pureto rico real estate]
3,http://1133.palletmarket.cz,[bakersfied real estate]
4,http://123relocation.com,[las cruces real estate]
5,http://126.realnetsystems.com,"[real estate grand junction co, grand junction..."
6,http://1603.n0jp7p.info,[practice real estate test]
7,http://170.142.31.248,"[real estate assessment data, real estate asse..."
8,http://206.173.89.38,[tuson az real estate]
9,http://206.173.89.41,[panama real estate carribean]


For example, the document `http://126.realnetsystems.com` (row 5) received search clicks for multiple "grand junction" queries.

In [27]:
# Convert the reshaped frame into the payload that is later handed to populate_index.
load_ready = preload(reshaped)

# peek: print the first ten entries. In the printed output an 'index' action
# line is followed by the matching document line, so ten entries cover five
# documents.
# A plain loop is used instead of a list comprehension: print() returns None,
# and a comprehension would make the cell echo a useless list of None values
# as its result.
for j in load_ready[0:10]:
    print(j)


{'index': {'_index': 'synonym', '_id': 'http://0-realestate.yahoo.com.portia.nesl.edu'}}
{'document': 'http://0-realestate.yahoo.com.portia.nesl.edu', 'query': ['yahoo real estate']}
{'index': {'_index': 'synonym', '_id': 'http://0132.deltagroup.com'}}
{'document': 'http://0132.deltagroup.com', 'query': ['real estate for sale johnsonville tn']}
{'index': {'_index': 'synonym', '_id': 'http://1046391.wkg0iqbv.info'}}
{'document': 'http://1046391.wkg0iqbv.info', 'query': ['pureto rico real estate']}
{'index': {'_index': 'synonym', '_id': 'http://1133.palletmarket.cz'}}
{'document': 'http://1133.palletmarket.cz', 'query': ['bakersfied real estate']}
{'index': {'_index': 'synonym', '_id': 'http://123relocation.com'}}
{'document': 'http://123relocation.com', 'query': ['las cruces real estate']}


[None, None, None, None, None, None, None, None, None, None]


## Load the index


In [29]:
# Set up the 'synonym' index using the mapping file kept alongside the recipe.
create_index(
    index='synonym',
    mapping_file='synonym/index_definition.json',
)

# Print the index mapping back so the field definitions can be checked.
index_mapping(
    index='synonym',
)

Mapping for  synonym
{
  "synonym": {
    "mappings": {
      "properties": {
        "document": {
          "type": "keyword"
        },
        "query": {
          "type": "search_as_you_type",
          "analyzer": "query_log_analyzer",
          "max_shingle_size": 3
        },
        "state": {
          "type": "keyword"
        }
      }
    }
  }
}


In [30]:
# Send the prepared payload to the index. The result is bound to a name so the
# cell does not echo it.
# NOTE(review): `response` is not inspected in the visible cells — confirm
# whether it should be checked for indexing errors.
response = populate_index(
    body=load_ready,
)


## Query for related searches


In [31]:
# Print the search template that the next query cell fills in with `text` and `size`.
peek(
    filename='synonym/query_template.mustache',
)

{
    "size": 0,
    "query": {
      "multi_match": {
        "query": "{{text}}",
        "type": "phrase",
        "fields": [
           "query",
           "query._2gram",
           "query._3gram"
         ],
         "minimum_should_match": "100%"
       }
    },
    "aggs": {
      "significant_queries_1": {
        "significant_text": {
          "field": "query",
          "size": "{{^size}}10{{/size}}{{size}}"
        }
      },
      "significant_queries_2": {
        "significant_text": {
          "field": "query._2gram",
          "size": "{{^size}}10{{/size}}{{size}}"
        }
      },
      "significant_queries_3": {
        "significant_text": {
          "field": "query._3gram",
          "size": "{{^size}}10{{/size}}{{size}}"
        }
      }
    }
}



In [35]:
# Run the search template for the seed term 'license', asking for up to 15
# significant terms from each of the three aggregations in the template.
result = query_template(
    index='synonym',
    template_file='synonym/query_template.mustache',
    params={'text': 'license', 'size': 15},
)

# Collect the buckets of every aggregation in the response into one list
# (presumably one record per bucket — the table below has key / doc_count /
# score / bg_count columns).
synonym_candidates = parse_json(
    result,
    jsonpath="aggregations.*.buckets[*]",
)

# Rank the candidates so the highest-scoring terms come first.
df = (
    pandas.DataFrame(synonym_candidates)
    .sort_values(by='score', ascending=False)
)

df.head(25)


Unnamed: 0,key,doc_count,score,bg_count
15,license,121,12255.0,0
16,conditional,11,101.198347,0
17,requirements,19,100.574164,3
0,real_estate license,100,83.710129,100
18,licenses,8,53.508367,1
19,renewal,9,33.828222,2
20,department,18,24.507678,11
21,licensing,21,22.899051,16
1,license in,25,20.927532,25
22,licence,10,20.844888,4
