# Elastic Search Tutoral

### Download and unzip elastic search
    https://www.elastic.co/downloads/elasticsearch

### Run elasticsearch instance:
	cd ~/Downloads/elasticsearch-x.x.x
	bin/elasticsearch
	

### Check it is running:
	Python3
		import requests
		res = requests.get('http://localhost:9200')
		print(res.content)
	
	or simply navigate to http://localhost:9200


### Import elasticsearch-py to talk to es with python:
	pip install elasticsearch (OR) pip3 install elasticsearch


### Fill with data/delete:
	Python3 -- see https://elasticsearch-py.readthedocs.io/en/master/api.html
	you dont need to have the index and doc_type created before the index. using index will create them for you.

		#connect to the correct cluster
		from elasticsearch import Elasticsearch
		es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

		#add test data to the test-index database
		es.index(index='test-index', doc_type='test', id=1, body={'test': 'test'})

		#delete by specifying the index, doc_type, and id
		#doc_type is like a specific table in the database
		es.delete(index='test-index', doc_type='test', id=1)

		#retrieve data by id number (assuming it is there)
		es.get(index='test-index', doc_type='test', id=1)

		#search an index. there are LOTS of optional parameters you can send with the search query 
		#searching returns results with associated scores (how much they match the seeach query)
		es.search(index="test-index", body={"query": {"prefix" : {"test" : "test"}}})


### Have an autocomplete/misspelling functionality:
	If the user types "lu" we can send this query to es and get some values returned, with which we can display them as options to the user. 
	es.search(index="test-index", body={"query": {"prefix" : { "test" : "te" }}})

	If we want to catch typos in the search term we can do match... match is a simple query operator that does fuzzy matching, so “go” would also match “going”, “goes”, ...
	es.search(index="indexname", body={"query": {"match" : {"test" : "te"}}})


### For more advanced searching parameters:
	https://www.bitquabit.com/post/having-fun-python-and-elasticsearch-part-3/
    
    
### In this Notebook: Some code to show the functionality of elasticsearch. Includes

>creating tables by indexing documents

>searching the tables for different queries

>a function that does some searching/returning


### Make sure to have elasticsearch running in the background before executing these cells

### (cd into your elasticsearch installation and run bin/elasticsearch from terminal)

In [1]:
%load_ext autoreload
%autoreload 2

In [33]:
import json
import elasticsearch
from elasticsearch import Elasticsearch
es = Elasticsearch()
from datetime import datetime
import time

1. #Print all indices

k=[k for k in es.indices.get_alias('*').keys()]
print("Keys:", k)

Keys: ['suggest_index']


In [32]:
### --> Delete Indexes temporary indices we will populate them aagain

if ('internal' in es.indices.get_alias('*').keys()):
    es.indices.delete(index='internal')
if ('external' in es.indices.get_alias('*').keys()):
    es.indices.delete(index='external')
if ('test' in es.indices.get_alias('*').keys()):
    es.indices.delete(index='test')
if ('internal1' in es.indices.get_alias('*').keys()):
    es.indices.delete(index='internal1')


In [24]:
es = Elasticsearch("http://localhost:9200")  # use default of localhost, port 9200
es = Elasticsearch()  # use default of localhost, port 9200
host, port = "localhost", 9200
Elasticsearch([{'host': host, 'port': port}])
#fill with some data similar to what our data will look like

#making an internal database
#elasticsearch 6.x only supports a single type per index so we can specify 
#what the document is with a type field in the body
es.index(index='internal', id=1, body={ 'content': 'test'} )

{'_index': 'internal',
 '_id': '1',
 '_version': 4,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 3,
 '_primary_term': 1}

In [25]:
ls = es.search(index='internal',  filter_path=[''], q="test", size=5)
ls

{'took': 470,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 0.2876821,
  'hits': [{'_index': 'internal',
    '_id': '1',
    '_score': 0.2876821,
    '_source': {'content': 'test'}}]}}

In [28]:
index,id,query = "internal1", "test", "testQ"
es.index(index=index, id=id, 
         body={
            'title':query, 
            'count': 0,
            'id': id,      
            'users': {},
            'timeUTC': datetime.utcnow(), 
            'timeLst': datetime.utcnow(), 
        })
        

{'_index': 'internal1',
 '_id': 'test',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [22]:
#PREPARE SUGGEST INDEX >>>>

es.index(index='suggest_index', doc_type = 'internal', id=1, body={
    'rev':1.1,
    'type': 'report',
    'author': 'Boeing Martin',
    'title': 'LM airplane better than boeing airplane',
    'topics': ['airplane', 'engine', 'lockheed', 'boeing', 'pilot'],
    'link': 'linktoactualreport',
    'text': 'we can put actual text from the body of the document here',
    'priority': 1
})

def testSuggestedIndex(IDS=[], q=None,):
    #lets make sure we got one entry in the system
    matchAll={"query": {"match_all": {}} }
    searchIDS = {"query": {"terms": { "_id": [IDS] }} }
                 
    # -> Search for all docs and print it
    if (len(IDS) > 0):
        ls = es.search(index='suggest_index', doc_type = 'text', body= searchIDS)
    elif( q is not None):
        ls = es.search(index='suggest_index', doc_type = 'text', q=q);
    else:
        ls = es.search(index='suggest_index', doc_type = 'text', body= matchAll)
        
    for i,h in enumerate(ls['hits']['hits']):
        s = Map(h['_source'])
        ui= s.userid if s.userid else ""
        ta= ":".join ([str(k) for k in s.keys()])
        co= s.count if 'count' in s else "NA"
        print(h['_id'], f"{i}/{ls['hits']['total']} {co:3} {ui:3} {s.utc} {s.title[0:10]} Tags: {ta}    ")
        
    return ls;

# Insert something so that es-search wont fail
def initIndex( index='suggest_index', docType="text"):
    ls = testSuggestedIndex('_id')
    #if ls['hits']['total'] <= 0:
    #    es.index(index=index, doc_type = docType, id="_id", body={'title':"S", 'count': 0, 'utc': time.time()})
    print(ls)

initIndex()
    
def addToSuggestedInex( title= 'stuff', user='user' ):
    userid = hash(user)
    id = hash(title)
    timet = int(time.time())
    es.index(index='suggest_index', doc_type = 'text', id=id, body={
        'title':title, 'count': 0,
        'id': id,      'userid': userid,
        'utc' : timet, 'timeUTC': datetime.utcnow(), 'timeLocal': datetime.now(),
        })

    body={
     "script": {
        #"source": "ctx._source.val = 1.2",     #NOTE: It wont set a value if it is already set
        "source": "ctx._source.count = 0",
        "source": "ctx._source.count = ctx._source.count + 1",
         "lang": "painless"
      },
      "query": { "terms": { "_id": [id] } }
    }
    es.update_by_query(index='suggest_index', doc_type='text', body=body)
    return id;

# We can also delete somethings from suggest_index

def delFromSuggestedInex( title= 'stuff', id=None ):
    id = id if id else hash(title)
    timet = int(time.time())
    es.delete(index='suggest_index', doc_type='text', id=id)

def incrementCount(IDS=[], index='suggest_index', doc_type='text'):
    body={
        "script": {
            "source": "ctx._source.count = ctx._source.count + 1" ,
            "source": f"ctx._source.utc  = {int(time.time())}",
             "lang": "painless"
        },
        "query": {"terms": { "_id": IDS }}
     }
    es.update_by_query(index=index, doc_type=doc_type, body=body)
    
def updateAllDocs(index='suggest_index', doc_type='text'):
    body={
        "script": {
            #"source": "ctx",
            "source": f"ctx._source.utc  = {int(time.time())}",
             "lang": "painless"
        },
        "query": {"match_all": {}}
    }
    try:
        ls=es.update_by_query(index=index, doc_type=doc_type, body=body)
        return ls
    except Exception as e:
        print(e)
        return None                  
        
#id = addToSuggestedInex("SADA", "SAdananda Narayanappa")
time.sleep(1/1000)
#initIndex()
time.sleep(1/1000)
ls = testSuggestedIndex('_id')
time.sleep(1/1000)
#ls = testSuggestedIndex()
time.sleep(1/1000)
#updateAllDocs()
time.sleep(1)
#incrementCount(['_id'])

{'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}}


In [5]:
es = elasticsearch.Elasticsearch()  # use default of localhost, port 9200

#fill with some data similar to what our data will look like

#making an internal database
#elasticsearch 6.x only supports a single type per index so we can specify 
#what the document is with a type field in the body
es.index(index='internal', doc_type = 'internal', id=1, body={
    'rev':1.1,
    'type': 'report',
    'author': 'Boeing Martin',
    'title': 'LM airplane better than boeing airplane',
    'topics': ['airplane', 'engine', 'lockheed', 'boeing', 'pilot'],
    'link': 'linktoactualreport',
    'content': 'we can put actual text from the body of the document here',
    'priority': 1
})
es.index(index='internal', doc_type = 'internal', id=2, body={
    'rev':1.2,
    'type': 'comp_intel',
    'author': 'Lockheed Martin',
    'title': 'The Competitive Edge',
    'topics': ['strategy', 'solution', 'commercial', 'aircraft', 'space'],
    'link': 'linktoactualdoc',
    'content': 'we can put actual text from the body of the document here',
    'priority': 1
})
es.index(index='internal', doc_type = 'internal', id=3, body={
    'rev':1.3,
    'type': 'comp_intel',
    'author': 'Lockheed Martin',
    'title': 'The Competitive Edge',
    'topics': ['strategy', 'solution', 'commercial', 'aircraft', 'space'],
    'link': 'linktoactualdoc',
    'content': 'we can put actual text from the body of the document here',
    'priority': 1
})

for i in range(4,1000):
    es.index(index='internal', doc_type = 'internal', id=i, body={
    'rev': float(f'1.{i}'),
    'type': 'comp_intel',
    'author': 'Lockheed Martin ' + str(i),
    'title': 'The Competitive Edge',
    'topics': ['strategy', 'solution', 'commercial', 'aircraft', 'space'],
    'link': 'linktoactualdoc',
    'content': 'we can put actual text from the body of the document here',
    'priority': 1
})

#making an external database
#patents, blogs, government reports...
es.index(index='external', doc_type = 'external', id=1, body={
    'rev':1.0,
    'type': 'patent',
    'author': 'Patent Author',
    'title': 'The best patent ever',
    'topics': ['patent', 'new', 'unique', 'better'],
    'content': 'we can put actual text from the body of the document here',
    'priority': 0.2
})
es.index(index='external', doc_type = 'external', id=2, body={
    'rev':1.1,
    'type': 'blog',
    'author': 'Defense Daily Author',
    'blog': 'Defense Daily',
    'title': 'CIA investigates startup',
    'topics': ['CIA', 'investigation', 'illegal', 'startup', 'crime'],
    'content': 'we can put actual text from the body of the document here',
    'priority': 0.1
})
es.index(index='external', doc_type = 'external', id=3, body={
    'rev':1.0,
    'type': 'gov report',
    'author': 'Government Author',
    'blog': 'Government Blog',
    'title': 'Report on defense spending',
    'topics': ['money', 'government', 'defense', 'military'],
    'content': 'we can put actual text from the body of the document here Here airplane',
    'priority': 0.9
})

RequestError: RequestError(400, 'no handler found for uri [/internal/internal/1] and method [PUT]', 'no handler found for uri [/internal/internal/1] and method [PUT]')

In [32]:
#search a specific field, like author
def FormatSearch(h):
        if (not h): return
        h = Map(h)
        s = Map(h._source)
        print(f"{h._type:10}, id:{h._id:3}, Rev: {s.rev:4}, Type: {s.type}, Author:[{s.author}] =>Title: {s.title}, Val: {s.val} New:s.new")

def ESGetID(es, id, index='internal', doc_type='internal'):
    h=None
    try:
        h = es.get(index=index, doc_type=doc_type, id=10)
        FormatSearch(h)
    except:
        pass
    return h
    
def ESSearch(es, index, q = 'rev: >=1.1', *args, **kwargs):
    ls=es.search(index, q=q,)
    print( "Found total Hits:" , ls['hits']['total']," Total Returned hits: ", len(ls['hits']['hits'] ))
    for h in ls['hits']['hits']:
        FormatSearch(h)
        
    return h

q='CIA'                                  # Search acrross all the terms
q="rev: 1.2"                             # Search for fields
q="author:'mahd martyr' airplane"    # Search specific fields and across all fields
q="rev: <=1.2"                             # Search for fields
q=None
#h=ESSearch(es, index='internal', q=q)
es.search("", q=q)

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1004, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'external',
    '_type': 'external',
    '_id': '1',
    '_score': 1.0,
    '_source': {'rev': 1.0,
     'type': 'patent',
     'author': 'Patent Author',
     'title': 'The best patent ever',
     'topics': ['patent', 'new', 'unique', 'better'],
     'text': 'we can put actual text from the body of the document here',
     'priority': 0.2}},
   {'_index': 'external',
    '_type': 'external',
    '_id': '2',
    '_score': 1.0,
    '_source': {'rev': 1.1,
     'type': 'blog',
     'author': 'Defense Daily Author',
     'blog': 'Defense Daily',
     'title': 'CIA investigates startup',
     'topics': ['CIA', 'investigation', 'illegal', 'startup', 'crime'],
     'text': 'we can put actual text from the body of the document here',
     'priority': 0.1}},
   {'_index': 'external',
    '_type': '

In [None]:
#Retrieve specific items based on tags
q="rev: 1.2"                             # Search for fields
h=ESSearch(es, index='internal', doc_type='internal', q=q)

In [23]:
#Update all documents with a Tag value

## -- MATCH_ALL
IDS=1
MATCHES = {
    'ALL': { "match_all": {}},
    'ID1': { "terms": { "_id": [IDS] } }
}
## -- Search BY ID

body={
 "script": {
    #"source": "ctx._source.val = 1.2",     #NOTE: It wont set a value if it is already set
    "source": "ctx._source.likes = 0",
    "source": "ctx._source.likes = ctx._source.likes + 1",

     "lang": "painless"
  },
  "query": MATCHES['ALL']
}
es.update_by_query(index='internal', doc_type='_doc', body=body)

ls=es.search(index='', body={'query': body['query'] } )
for h in ls['hits']['hits']:
    print(h['_id'], h['_source'])
print("=========")
ls=es.search(index='', body={'query': body['query'] })
for h in ls['hits']['hits']:
    print(h['_id'], h['_source'])


1 {'rev': 1.0, 'type': 'patent', 'author': 'Patent Author', 'title': 'The best patent ever', 'topics': ['patent', 'new', 'unique', 'better'], 'text': 'we can put actual text from the body of the document here', 'priority': 0.2}
2 {'rev': 1.1, 'type': 'blog', 'author': 'Defense Daily Author', 'blog': 'Defense Daily', 'title': 'CIA investigates startup', 'topics': ['CIA', 'investigation', 'illegal', 'startup', 'crime'], 'text': 'we can put actual text from the body of the document here', 'priority': 0.1}
3 {'rev': 1.0, 'type': 'gov report', 'author': 'Government Author', 'blog': 'Government Blog', 'title': 'Report on defense spending', 'topics': ['money', 'government', 'defense', 'military'], 'text': 'we can put actual text from the body of the document here', 'priority': 0.9}
1 {'rev': 1.1, 'type': 'report', 'author': 'Boeing Martin', 'title': 'LM airplane better than boeing airplane', 'topics': ['airplane', 'engine', 'lockheed', 'boeing', 'pilot'], 'link': 'linktoactualreport', 'text':

In [25]:
ls=es.search(index='', q="sada" )
for h in ls['hits']['hits']:
    print(h['_id'], h['_source'])


In [35]:
ls=es.search(index='my_cija', body={'query': 
    {'match_all': {'term': 'GAO*'}}
     }  )
for h in ls['hits']['hits']:
    print(h['_id'], h['_source'])


RequestError: RequestError(400, 'parsing_exception', '[1:24] [match_all] unknown field [term]')

In [None]:
ls=es.search(index='my_cija', body={'query': 
    {'match': {
        'content': 'GAO*'
        }
     } } )
for h in ls['hits']['hits']:
    print(h['_id'], h['_source'])


In [None]:
'''
An example function that takes in the database (internal or external), a query,
and a bool for whether or not to return the JSON raw or formatted
'''
import json

def search(database, query, raw_result):
    es = elasticsearch.Elasticsearch()
    
    matches = es.search(database, q=query)
    hits = matches['hits']['hits']
    
    if not hits:
        click.echo('No matches found')
   
    else:
        if raw_result:
            return json.dumps(matches, indent=4)
        for hit in hits:
            return 'Title of Document:{}\nLink to Document: {}\n\n'.format(hit['_source']['title'], hit['_source']['link'])

In [None]:
'''
the end of https://www.bitquabit.com/post/having-fun-python-and-elasticsearch-part-2/
has a good script on running through emails and putting them + their metadata into 
an elasticsearch table. we could use the same logic but with output from a web crawler,
file upload, etc system.
'''

In [None]:
import pkgutil
import os
import encodings

def all_encodings():
    modnames = set(
        [modname for importer, modname, ispkg in pkgutil.walk_packages(
            path=[os.path.dirname(encodings.__file__)], prefix='')])
    aliases = set(encodings.aliases.aliases.values())
    return modnames.union(aliases)
def readfile( filename = '/tmp/filename.html'):
    encodings = all_encodings()
    for enc in encodings:
        try:
            with open(filename, encoding=enc) as f:
                # print the encoding and the first 500 characters
                c = f.read()
                print(enc, c[0:32])
                sendEmail(ad, ad, "Subject: {enc}", c, attach = [emlfile]);
                break;
        except Exception:
            print("FAILED " , enc)
            pass