In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Elasticsearch API python
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Env. Vars
from dotenv import load_dotenv

# System
import os

In [2]:
# Create the client instance
load_dotenv('.env')

host=os.getenv('ELASTICSEARCH_HOST')
passwd = os.getenv('ELASTICSEARCH_PASSWORD')

es = Elasticsearch([f'{host}:9200'],
basic_auth=('elastic', passwd)
)

### Criação e deleção de indíces

In [8]:
# Criar index
es.indices.create(index='teste-ramon', ignore=400)

  es.indices.create(index='teste-ramon', ignore=400)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'teste-ramon'})

In [None]:
# Criando index especificando o mapping e settings.
es.indices.create(index='teste-ramon', mappings=mappings, settings=settings, ignore=400)

In [7]:
# Deletando index
es.indices.delete(index='teste-ramon', ignore=[400, 404])

  es.indices.delete(index='teste-ramon', ignore=[400, 404])


ObjectApiResponse({'acknowledged': True})

### Informações sobre os index

In [3]:
# Informações gerais da conexão do ES
es.info()

ObjectApiResponse({'name': 'elasticsearch', 'cluster_name': 'docker-cluster', 'cluster_uuid': '26HxHJtJTAKP5Sj1PFsSnA', 'version': {'number': '8.6.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'f67ef2df40237445caa70e2fef79471cc608d70d', 'build_date': '2023-01-04T09:35:21.782467981Z', 'build_snapshot': False, 'lucene_version': '9.4.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
# Ver todos os índices da instância
es.indices.get_alias(index="*")

  es.indices.get_alias(index="*")


ObjectApiResponse({'.kibana_task_manager_8.6.0_001': {'aliases': {'.kibana_task_manager': {'is_hidden': True}, '.kibana_task_manager_8.6.0': {'is_hidden': True}}}, '.kibana_security_session_1': {'aliases': {'.kibana_security_session': {'is_hidden': True}}}, '.items-default-000001': {'aliases': {'.items-default': {'is_write_index': True}}}, 'produtos_cayena_testes_outer': {'aliases': {}}, 'produtos_cayena_train': {'aliases': {}}, 'felipe_teste': {'aliases': {}}, '.apm-custom-link': {'aliases': {}}, '.security-profile-8': {'aliases': {'.security-profile': {'is_hidden': True}}}, 'search_app_cayena': {'aliases': {}}, '.transform-notifications-000002': {'aliases': {'.transform-notifications-read': {'is_hidden': True}}}, 'produtos_cayena': {'aliases': {}}, '.transform-internal-007': {'aliases': {}}, '.fleet-enrollment-api-keys-7': {'aliases': {'.fleet-enrollment-api-keys': {'is_write_index': True, 'is_hidden': True}}}, '.lists-default-000001': {'aliases': {'.lists-default': {'is_write_index'

In [6]:
# Contar o número de documentos de uma outra forma
es.count(index='teste-ramon')

ObjectApiResponse({'count': 8, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [7]:
# Contar o número de documentos em um dado index
es.cat.count(index='teste-ramon')

TextApiResponse('1679166551 19:09:11 8\n')

In [14]:
# Refresh index
es.indices.refresh(index='teste-ramon')

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [15]:
# Informações sobre um index em específico
es.indices.get(index="teste-ramon")

ObjectApiResponse({'teste-ramon': {'aliases': {}, 'mappings': {}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'teste-ramon', 'creation_date': '1679079154570', 'number_of_replicas': '1', 'uuid': 'z6QiqzCrQI6PqLCbpHP4ng', 'version': {'created': '8060099'}}}}})

### Criando index manualmente (um a um)

In [8]:
doc = {
    'genre': ['IMAX', 'sci-fi'],
    'title': 'Insterstellar',
    'year': 2014
}

In [9]:
es.index(index='teste-ramon', id=1, document=doc)

ObjectApiResponse({'_index': 'teste-ramon', '_id': '1', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 8, '_primary_term': 1})

In [18]:
# Removendo um filme com id=1
es.delete(index='teste-ramon', id=1)

ObjectApiResponse({'_index': 'teste-ramon', '_id': '1', '_version': 2, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

In [10]:
# Consulta do conteúdo no id=1 e no index='teste-ramon'
res2 = es.get(index='teste-ramon', id=4)
res2['_source']

{'genre': 'kids', 'title': 'The Lion King', 'year': 1998}

### Adicionando vários filmes

In [11]:
d = [
    ['action', 'The Davinci Code', 2005],
    ['sci-fi', 'Star Wars', 2005],
    [['sci-fi', 'IMAX'], 'Insterstellar', 2014],
    ['kids', 'Toy Story 1', 1995],
    ['kids', 'The Lion King', 1998],
    ['terror', 'Terrifier 2', 2022],
    ['terror', 'Terrifier', 2019],
    ['sci-fi', 'Star Trek', 2011]
]

In [12]:
filmes = pd.DataFrame(data=d, columns=['genre', 'title', 'year'])

In [13]:
filmes

Unnamed: 0,genre,title,year
0,action,The Davinci Code,2005
1,sci-fi,Star Wars,2005
2,"[sci-fi, IMAX]",Insterstellar,2014
3,kids,Toy Story 1,1995
4,kids,The Lion King,1998
5,terror,Terrifier 2,2022
6,terror,Terrifier,2019
7,sci-fi,Star Trek,2011


In [14]:
def generate_docs():
    for idx, info in filmes.iterrows():
        doc = {
            '_index': 'teste-ramon',
            '_id': idx,
            '_source': {
                'genre': info['genre'],
                'title': info['title'],
                'year': info['year']
            }
        }

        yield doc

helpers.bulk(es, generate_docs())

(8, [])

### Buscas no elasticsearch

In [15]:
# Busca simples por um termo
search = 'Star Trek'

resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'match': {
        'title': f"{search}"
    }
})

In [16]:
resp['hits']['hits'][0]['_source']

{'genre': 'sci-fi', 'title': 'Star Trek', 'year': 2011}

In [17]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '7',
  '_score': 3.2834144,
  '_source': {'genre': 'sci-fi', 'title': 'Star Trek', 'year': 2011}},
 {'_index': 'teste-ramon',
  '_id': '1',
  '_score': 1.3862942,
  '_source': {'genre': 'sci-fi', 'title': 'Star Wars', 'year': 2005}}]

## Busca simples, mas com analyzers

In [18]:
es.indices.delete(index='teste-ramon', ignore=[400, 404])

  es.indices.delete(index='teste-ramon', ignore=[400, 404])


ObjectApiResponse({'acknowledged': True})

In [19]:
mappings = {
    'properties':{
        'genre': {'type': 'keyword', 'analyzer': 'english'},
        'title': {'type': 'text'},
        'date': {'type': 'date'}
    }
}

In [20]:
es.indices.create(index='teste-ramon', mappings=mappings, ignore=400)

  es.indices.create(index='teste-ramon', mappings=mappings, ignore=400)


ObjectApiResponse({'error': {'root_cause': [{'type': 'mapper_parsing_exception', 'reason': 'unknown parameter [analyzer] on mapper [genre] of type [keyword]'}], 'type': 'mapper_parsing_exception', 'reason': 'Failed to parse mapping: unknown parameter [analyzer] on mapper [genre] of type [keyword]', 'caused_by': {'type': 'mapper_parsing_exception', 'reason': 'unknown parameter [analyzer] on mapper [genre] of type [keyword]'}}, 'status': 400})

In [21]:
helpers.bulk(es, generate_docs())

(8, [])

In [22]:
# Busca simples por um termo
search = 'Sci-fi'

resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'match': {
        'genre': f"{search}"
    }
})

In [23]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '1',
  '_score': 1.6622524,
  '_source': {'genre': 'sci-fi', 'title': 'Star Wars', 'year': 2005}},
 {'_index': 'teste-ramon',
  '_id': '7',
  '_score': 1.6622524,
  '_source': {'genre': 'sci-fi', 'title': 'Star Trek', 'year': 2011}},
 {'_index': 'teste-ramon',
  '_id': '2',
  '_score': 1.3405261,
  '_source': {'genre': ['sci-fi', 'IMAX'],
   'title': 'Insterstellar',
   'year': 2014}}]

## Filters, must == and

In [26]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'bool': {
        'must': {"term":{'title': 'star'}},
        'filter': {'range':{'year': {'gte': 2000}}}
    }
})

In [27]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '1',
  '_score': 1.3125186,
  '_source': {'genre': 'sci-fi', 'title': 'Star Wars', 'year': 2005}},
 {'_index': 'teste-ramon',
  '_id': '7',
  '_score': 1.3125186,
  '_source': {'genre': 'sci-fi', 'title': 'Star Trek', 'year': 2011}}]

## Filters, should == OR

In [40]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'bool': {
        'should': {"term":{'title': 'toy'}},
        'filter': {'term': {'genre': 'sci'}},
        'filter': {'range': {'year': {'gte': 2006}}}
    }
})

In [41]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '2',
  '_score': 0.0,
  '_source': {'genre': ['sci-fi', 'IMAX'],
   'title': 'Insterstellar',
   'year': 2014}},
 {'_index': 'teste-ramon',
  '_id': '5',
  '_score': 0.0,
  '_source': {'genre': 'terror', 'title': 'Terrifier 2', 'year': 2022}},
 {'_index': 'teste-ramon',
  '_id': '6',
  '_score': 0.0,
  '_source': {'genre': 'terror', 'title': 'Terrifier', 'year': 2019}}]

## Filters, must_not == NOT

In [42]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'bool': {
        'must_not': {"term":{'title': 'toy'}},
        'filter': {'term': {'genre': 'sci'}}
    }
})

In [43]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '1',
  '_score': 0.0,
  '_source': {'genre': 'sci-fi', 'title': 'Star Wars', 'year': 2005}},
 {'_index': 'teste-ramon',
  '_id': '2',
  '_score': 0.0,
  '_source': {'genre': ['sci-fi', 'IMAX'],
   'title': 'Insterstellar',
   'year': 2014}},
 {'_index': 'teste-ramon',
  '_id': '7',
  '_score': 0.0,
  '_source': {'genre': 'sci-fi', 'title': 'Star Trek', 'year': 2011}}]

## Match Phrase

In [51]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'match_phrase': {
        'title': 'Star wars',
    }
})

In [52]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '1',
  '_score': 3.1484585,
  '_source': {'genre': 'sci-fi', 'title': 'Star Wars', 'year': 2005}}]

## Match Phrase com slop

In [61]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'match_phrase': {
        'title': {'query': 'Star wars', 'slop':2}
    }
})

In [62]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '1',
  '_score': 3.1484585,
  '_source': {'genre': 'sci-fi', 'title': 'Star Wars', 'year': 2005}}]

## Fuzziness

In [83]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'fuzzy': {
        'title': {'value': 'Instersteller', 'fuzziness': "AUTO"}
    }
})

In [84]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '2',
  '_score': 1.9352322,
  '_source': {'genre': ['sci-fi', 'IMAX'],
   'title': 'Insterstellar',
   'year': 2014}}]

## Partial Matches

### Prefix

In [99]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'prefix': {
        'genre': 'ter'
    }
})

In [100]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '5',
  '_score': 1.0,
  '_source': {'genre': 'terror', 'title': 'Terrifier 2', 'year': 2022}},
 {'_index': 'teste-ramon',
  '_id': '6',
  '_score': 1.0,
  '_source': {'genre': 'terror', 'title': 'Terrifier', 'year': 2019}}]

### Wildcards

In [105]:
resp = es.search(index="teste-ramon", from_=0, size=3,  query={
    'wildcard': {
        'genre': 'i*'
    }
})

In [106]:
resp['hits']['hits']

[{'_index': 'teste-ramon',
  '_id': '2',
  '_score': 1.0,
  '_source': {'genre': ['sci-fi', 'IMAX'],
   'title': 'Insterstellar',
   'year': 2014}}]