In [15]:
from datetime import datetime
from time import time
from elasticsearch import Elasticsearch
# Client object used by the cells below. No connection arguments are passed,
# so the library's defaults apply (presumably a local node -- TODO confirm).
es = Elasticsearch()

In [None]:


# Smoke test: index one sample document, read it back by id, then list
# everything in the index with a match_all search.
doc = {
    'author': 'kimchy',
    'text': 'Elasticsearch: cool. bonsai cool.',
    'timestamp': datetime.now(),
}
res = es.index(index="test-index", doc_type='tweet', id=1, body=doc)
print(res['result'])

res = es.get(index="test-index", doc_type='tweet', id=1)
print(res['_source'])

# Refresh before searching (presumably so the document indexed above is
# visible to the search -- confirm against the Elasticsearch docs).
es.indices.refresh(index="test-index")

res = es.search(index="test-index", body={"query": {"match_all": {}}})
# hits.total can be a plain int or an object such as
# {'relation': 'eq', 'value': 0} (the shape recorded in outputs later in this
# notebook). "%d" only accepts a number, so unwrap the object form first.
total_hits = res['hits']['total']
if isinstance(total_hits, dict):
    total_hits = total_hits['value']
print("Got %d Hits:" % total_hits)
for hit in res['hits']['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])

In [1]:
# start simple

import random

# Seed the module-level RNG so the generated people are identical on every
# Restart & Run All; change RANDOM_SEED to get a different data set.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Value pools that get_person() draws from.
# NOTE(review): 'Lucy' is listed twice, so it is drawn twice as often as the
# other names -- confirm this is intended.
names = ['John', 'Doe', 'Gabriel', 'Alex', 'Lucy', 'Gabe', 'Vito', 'Lucy']
professions = ['clerk', 'police officer', 'software developer', 'window washer', 'banker', 'unemployed', 'wordpress developer', 'gangster', 'politician']
# NOTE(review): 'bicyle' looks like a misspelling of 'bicycle'; left as-is
# because it is data that also appears in the recorded outputs.
interests = ['paragliding', 'rock climbing', 'killing', 'murdering', 'cycling', 'bicyle', 'outdoors', 'computers', 'programming', 'ladies']
age = range(15, 65)  # 15 through 64 inclusive
city = ['LA', 'Chicago', 'Utah']

In [2]:
def get_person(n):
    """Generate random person records, one per iteration.

    Every record is a dict with the keys 'name' (two picks from ``names``
    joined by a single space), 'age', 'city', 'profession' and 'interests'
    (a list of 0 to 3 entries sampled from ``interests``). All values are
    drawn with the ``random`` module from the module-level pools.

    Records are produced while the number already produced is below ``n``,
    so nothing is yielded when ``n`` is zero or negative.
    """
    remaining = n
    while remaining > 0:
        remaining -= 1
        # Draw in a fixed order: both name parts, age, city, profession,
        # then the number of interests followed by the sample itself.
        first_part = random.choice(names)
        second_part = random.choice(names)
        picked_age = random.choice(age)
        picked_city = random.choice(city)
        picked_profession = random.choice(professions)
        interest_count = random.randint(0, 3)
        yield {
            'name': "{} {}".format(first_part, second_part),
            'age': picked_age,
            'city': picked_city,
            'profession': picked_profession,
            'interests': random.sample(interests, k=interest_count),
        }

In [3]:
# Materialise 500 generated person records into a list for indexing.
data = list(get_person(500))

## Task 1
- index the data
- supply a simple search function

In [None]:
from elasticsearch import Elasticsearch
# Re-creates the client with the same no-argument call used in the first cell
# of the notebook, so this section can be run on its own.
es = Elasticsearch()

In [None]:
# Index every generated record into 'people', using its position in `data`
# as the document id.
for doc_id, person in enumerate(data):
    es.index(index='people', doc_type='person', id=doc_id, body=person)

In [None]:
def simple_search(term, no_res):
    """Run a ``match`` query for ``term`` on the ``name`` field of 'people'.

    Args:
        term: text to match against the ``name`` field.
        no_res: maximum number of hits to return (sent as ``size``).

    Returns:
        The response returned by ``es.search``, unchanged.

    Side effects:
        Prints the total number of matching documents.
    """
    query = {"query": {"match": {'name': term}}}
    res = es.search(index="people", size=no_res, body=query)

    # hits.total is either a plain int or an object such as
    # {'relation': 'eq', 'value': 0} (the shape shown in this notebook's
    # recorded outputs); report the number in both cases.
    total_hits = res['hits']['total']
    if isinstance(total_hits, dict):
        total_hits = total_hits['value']
    print("found {} results".format(total_hits))
    return res

## Task 2
- index with mapping, standard analyzers
- query for matching all fields

In [None]:
# Delete the 'people' index so the next cell can recreate it with an explicit
# mapping. ignore=[400, 404] is passed so a missing index does not stop the
# cell (the same call later in this notebook shows a 404 "no such index" body
# as its output rather than a traceback).
es.indices.delete(index='people', ignore=[400, 404])

In [None]:
# Create 'people' on a single shard with an explicit mapping for the 'person'
# type. Every field except 'age' is copied into the catch-all 'all_fields'
# text field.
index_body = {
    "settings": dict(number_of_shards=1),
    "mappings": {
        "person": {
            "properties": {
                "name": dict(type="text", copy_to="all_fields"),
                "city": dict(type="text", copy_to="all_fields"),
                "age": dict(type="integer"),
                "profession": dict(type="text", copy_to="all_fields"),
                "interests": dict(type="keyword", copy_to="all_fields"),
                "all_fields": dict(type="text"),
            },
        },
    },
}
es.indices.create('people', body=index_body)

In [None]:
import time

In [None]:
# Time the per-document indexing of all generated people.
# perf_counter is imported locally so this cell does not depend on whether the
# name `time` currently refers to the module or to the time.time function
# (this notebook contains both `from time import time` and `import time`).
from time import perf_counter

t0 = perf_counter()
for idx, ppl in enumerate(data):
    es.index(index='people', doc_type='person', id=idx, body=ppl)
print("indexing took {} ms".format((perf_counter() - t0) * 1000))

In [None]:
def simple_all_search(term, no_res):
    """Run a ``match`` query for ``term`` on the ``all_fields`` field of 'people'.

    Args:
        term: text to match against the ``all_fields`` field.
        no_res: maximum number of hits to return (sent as ``size``).

    Returns:
        The response returned by ``es.search``, unchanged.

    Side effects:
        Prints the total number of matching documents.
    """
    query = {"query": {"match": {'all_fields': term}}}
    res = es.search(index="people", size=no_res, body=query)

    # hits.total is either a plain int or an object such as
    # {'relation': 'eq', 'value': 0} (the shape shown in this notebook's
    # recorded outputs); report the number in both cases.
    total_hits = res['hits']['total']
    if isinstance(total_hits, dict):
        total_hits = total_hits['value']
    print("found {} results".format(total_hits))
    return res

## Task 3
- create index with analyzers good for autocomplete
- make autocomplete search function

In [145]:
# Delete the 'people' index before recreating it with the autocomplete
# mapping. With ignore=[400, 404] a missing index yields the 404 error body
# shown in the output below instead of stopping the cell.
es.indices.delete(index='people', ignore=[400, 404])

{'error': {'index': 'people',
  'index_uuid': '_na_',
  'reason': 'no such index [people]',
  'resource.id': 'people',
  'resource.type': 'index_or_alias',
  'root_cause': [{'index': 'people',
    'index_uuid': '_na_',
    'reason': 'no such index [people]',
    'resource.id': 'people',
    'resource.type': 'index_or_alias',
    'type': 'index_not_found_exception'}],
  'type': 'index_not_found_exception'},
 'status': 404}

In [30]:
# Create 'people' on a single shard; every field except 'age' is copied into
# the 'suggest' completion field.

# Analyzer / token-filter definitions.
# NOTE(review): this dict is built but never placed into index_body, so the
# index is created without these analyzers -- confirm whether it was meant to
# go under index_body['settings'].
analysis = dict(
    analyzer=dict(
        edge_ngram=dict(
            type="custom",
            tokenizer="keyword",
            filter=["lowercase", "edge_ngram_filter"],
        ),
        stopword_analyzer=dict(
            type="standard",
            stopwords=["and", "the"],
        ),
    ),
    filter=dict(
        edge_ngram_filter=dict(
            type="edgeNGram",
            min_gram="1",
            max_gram="15",
            side="front",
        ),
    ),
)

index_body = {
    'settings': {'number_of_shards': 1},
    'mappings': {
        "properties": {
            "name": dict(type="text", copy_to="suggest"),
            "city": dict(type="text", copy_to="suggest"),
            "age": dict(type="integer"),
            "profession": dict(type="text", copy_to="suggest"),
            "interests": dict(type="keyword", copy_to="suggest"),
            "suggest": dict(type="completion"),
        },
    },
}

es.indices.create('people', body=index_body)

{'acknowledged': True, 'index': 'people', 'shards_acknowledged': True}

In [31]:
# Time the per-document indexing of all generated people.
# The bare call time() only works while `time` is the function from
# `from time import time`; the `import time` cell earlier in this notebook
# rebinds the name to the module, which makes time() raise on a top-to-bottom
# run. Importing perf_counter locally removes that dependency.
from time import perf_counter

t0 = perf_counter()
for idx, ppl in enumerate(data):
    es.index(index='people', doc_type='_doc', id=idx, body=ppl)
print("indexing took {} ms".format((perf_counter() - t0) * 1000))

indexing took 3751.1346340179443 ms


In [32]:
def simple_autocomplete(term, no_res=3):
    """Request completion suggestions for the prefix ``term`` from 'people'.

    Args:
        term: prefix text to complete against the ``suggest`` field.
        no_res: number of suggestions requested (sent as the suggester's
            ``size``); defaults to 3.

    Returns:
        The response returned by ``es.search``, unchanged. The suggestions
        are found under ``res['suggest']['person-suggest']``.
    """
    completion_spec = {"field": "suggest", "size": no_res}
    suggester = {"prefix": term, "completion": completion_spec}
    request_body = {"suggest": {"person-suggest": suggester}}
    return es.search(index="people", body=request_body)

In [33]:
# Example call: request up to 5 completions for the prefix 'Doe L'.
simple_autocomplete('Doe L', 5)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'suggest': {'person-suggest': [{'length': 5,
    'offset': 0,
    'options': [{'_id': '21',
      '_index': 'people',
      '_score': 1.0,
      '_source': {'age': 27,
       'city': 'Utah',
       'interests': [],
       'name': 'Doe Lucy',
       'profession': 'banker'},
      '_type': '_doc',
      'text': 'Doe Lucy'},
     {'_id': '104',
      '_index': 'people',
      '_score': 1.0,
      '_source': {'age': 23,
       'city': 'Chicago',
       'interests': [],
       'name': 'Doe Lucy',
       'profession': 'politician'},
      '_type': '_doc',
      'text': 'Doe Lucy'},
     {'_id': '116',
      '_index': 'people',
      '_score': 1.0,
      '_source': {'age': 19,
       'city': 'Utah',
       'interests': [],
       'name': 'Doe Lucy',
       'profession': 'unemployed'},
      '_type': '_doc',
      'text': 'Doe Lucy'},
   

## Task 4
- make query
- make filter
- make get

In [41]:
# Raw completion-suggester request: up to 5 suggestions from the 'suggest'
# field that start with 'rock'.
query = {
    "suggest": {
        "person-suggest": dict(
            prefix='rock',
            completion=dict(field="suggest", size=5),
        ),
    },
}
es.search(index="people", body=query)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [],
  'max_score': None,
  'total': {'relation': 'eq', 'value': 0}},
 'suggest': {'person-suggest': [{'length': 4,
    'offset': 0,
    'options': [{'_id': '41',
      '_index': 'people',
      '_score': 1.0,
      '_source': {'age': 64,
       'city': 'Chicago',
       'interests': ['paragliding', 'cycling', 'rock climbing'],
       'name': 'Lucy Lucy',
       'profession': 'politician'},
      '_type': '_doc',
      'text': 'rock climbing'},
     {'_id': '55',
      '_index': 'people',
      '_score': 1.0,
      '_source': {'age': 50,
       'city': 'Utah',
       'interests': ['computers', 'rock climbing'],
       'name': 'Lucy Lucy',
       'profession': 'banker'},
      '_type': '_doc',
      'text': 'rock climbing'},
     {'_id': '72',
      '_index': 'people',
      '_score': 1.0,
      '_source': {'age': 38,
       'city': 'Chicago',
       'interests': ['programming', 'rock climbing'],
    

In [87]:
# Bool query combining two 'must' clauses (a fuzzy match on name and a match
# on city) with two 'filter' clauses (exact profession term and an inclusive
# age range of 40 to 50).
query = {
    "query": {
        'bool': dict(
            must=[
                {'match': {'name': dict(query='jnh', fuzziness=2)}},
                {'match': {'city': 'chicago'}},
            ],
            filter=[
                {'term': {'profession': 'politician'}},
                {'range': {'age': dict(gte=40, lte=50)}},
            ],
        ),
    },
}
es.search(index="people", body=query)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '340',
    '_index': 'people',
    '_score': 1.5388111,
    '_source': {'age': 49,
     'city': 'Chicago',
     'interests': [],
     'name': 'Gabriel John',
     'profession': 'politician'},
    '_type': '_doc'},
   {'_id': '447',
    '_index': 'people',
    '_score': 1.5388111,
    '_source': {'age': 44,
     'city': 'Chicago',
     'interests': ['programming'],
     'name': 'John Alex',
     'profession': 'politician'},
    '_type': '_doc'}],
  'max_score': 1.5388111,
  'total': {'relation': 'eq', 'value': 2}},
 'timed_out': False,
 'took': 11}

In [95]:
# Filter-only bool query: profession term 'politician' and age between 40 and
# 50 inclusive. The response is kept in `r` for the cells that follow.
query = {
    "query": {
        'bool': dict(
            filter=[
                {'term': {'profession': 'politician'}},
                {'range': {'age': dict(gte=40, lte=50)}},
            ],
        ),
    },
}
r = es.search(index="people", body=query)

In [110]:
# Show the document body ('_source') of the third hit in the response `r`.
r['hits']['hits'][2]['_source']

{'age': 40,
 'city': 'LA',
 'interests': ['murdering'],
 'name': 'Vito Gabriel',
 'profession': 'politician'}

In [132]:
def get_gen(res):
    """Yield the ``'_source'`` value of each hit in a search response.

    Args:
        res: mapping with a list of hits at ``res['hits']['hits']``, where
            each hit has a ``'_source'`` entry.

    Yields:
        The ``'_source'`` value of every hit, in response order.
    """
    for hit in res['hits']['hits']:
        yield hit['_source']

In [133]:
# Build a generator over the '_source' documents of the response `r`.
# Being a generator, it can only be iterated once.
gr = get_gen(r)

In [135]:
# Drain the generator, printing one matching document per line.
for i in gr:
    print(i)

{'city': 'LA', 'name': 'Lucy John', 'interests': ['paragliding', 'computers', 'murdering'], 'profession': 'politician', 'age': 49}
{'city': 'Chicago', 'name': 'Doe Lucy', 'interests': ['bicyle', 'killing', 'murdering'], 'profession': 'politician', 'age': 48}
{'city': 'LA', 'name': 'Vito Gabriel', 'interests': ['murdering'], 'profession': 'politician', 'age': 40}
{'city': 'LA', 'name': 'Gabriel Gabriel', 'interests': [], 'profession': 'politician', 'age': 48}
{'city': 'LA', 'name': 'Doe Lucy', 'interests': ['cycling', 'outdoors'], 'profession': 'politician', 'age': 49}
{'city': 'Utah', 'name': 'Gabe Gabe', 'interests': ['bicyle', 'murdering'], 'profession': 'politician', 'age': 45}
{'city': 'Chicago', 'name': 'Gabriel John', 'interests': [], 'profession': 'politician', 'age': 49}
{'city': 'Utah', 'name': 'Lucy Lucy', 'interests': ['computers'], 'profession': 'politician', 'age': 46}
{'city': 'LA', 'name': 'Lucy Gabe', 'interests': ['outdoors'], 'profession': 'politician', 'age': 50}
{'c

In [3]:
# Scratch check: dict.get() on a key that is absent ('x') returns None
# instead of raising, so the cell displays nothing.
d = dict(v=10, t=8)
d.get('x')