## Connect

In [1]:
import os

from opensearchpy import NotFoundError, OpenSearch

# Connection settings. The defaults target a local demo cluster; override them
# through environment variables rather than editing credentials into the notebook.
host = os.environ.get('OPENSEARCH_HOST', 'localhost')
port = int(os.environ.get('OPENSEARCH_PORT', '9200'))
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),  # demo default, for testing only
)

# Create the client with SSL/TLS enabled, but hostname and certificate verification disabled.
# Only do this against a local test cluster.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Do a few checks before we start indexing:
print(client.cat.health())
print(client.cat.indices())

1695832708 16:38:28 docker-cluster yellow 1 1 true 18 18 0 0 13 0 - 58.1%

yellow open search_fun_bulk                           tnCiHcNhQ1GTeeQ6HeNYUw 1 1     4   0   8.7kb   8.7kb
yellow open test-index-to-delete-1                    8WkNqdj_Ra-kOzeVgBNSRw 1 1  6000 500   2.2mb   2.2mb
green  open .kibana_92668751_admin_1                  bruHV_IvRymvc8IduVDd_Q 1 0     1   0   5.1kb   5.1kb
yellow open security-auditlog-2023.09.24              BxumpqiRREiQ_i137LxLWg 1 1   236   0 424.4kb 424.4kb
yellow open search_fun_revisited_custom_mappings      brgsBN-CTMOGO21mF476pA 1 1     4   0  21.1kb  21.1kb
green  open .kibana_1                                 fVVE40BkSEqQvTlA5A1Tng 1 0    61   0    37kb    37kb
green  open .opendistro_security                      88369CGsQ3CShUM32lIe4w 1 0    10   0  75.6kb  75.6kb
yellow open movies                                    GDknLevWRUmTtoQu-LzgKg 1 1  5000   0  11.2mb  11.2mb
yellow open .plugins-ml-config                        gpaVzMC9Rm-ixr8

Here is a list of the opensearch-py methods available on `client.cat`:

| Method                              | Description                                      |
|-------------------------------------|--------------------------------------------------|
| `client.cat.aliases()`              | List aliases.                                    |
| `client.cat.allocation()`           | Show shard allocation on nodes.                 |
| `client.cat.count()`                | Display the count of documents in an index.    |
| `client.cat.fielddata()`            | List field data statistics.                     |
| `client.cat.health()`               | Display cluster health.                         |
| `client.cat.indices()`              | List indices.                                   |
| `client.cat.master()`               | Show the master node.                           |
| `client.cat.nodes()`                | List nodes in the cluster.                     |
| `client.cat.pending_tasks()`        | List cluster pending tasks.                    |
| `client.cat.plugins()`              | List installed plugins.                         |
| `client.cat.recovery()`             | Display shard recovery information.             |
| `client.cat.repositories()`          | List registered snapshot repositories.          |
| `client.cat.segments()`             | List segments in indices.                      |
| `client.cat.shards()`               | List shard information.                         |
| `client.cat.snapshots()`            | List snapshots.                                 |
| `client.cat.templates()`            | List index templates.                           |
| `client.cat.thread_pool()`          | Display thread pool statistics.                |

In [2]:
# If you still have your documents from the Dev Tools test, we should be able to check them here.
# Only a missing index is expected and tolerated; any other failure (connection,
# authentication, ...) is left to surface instead of being reported as "doesn't exist".
try:
    print(client.cat.count("search_fun_test", params={"v": "true"}))
except NotFoundError:
    print("search_fun_test doesn't exist, that's OK")

epoch      timestamp count
1695831914 16:25:14  4



## Create Index

In [3]:
# %pip install pyyaml
import yaml
# Build the first index. Its only non-default setting is
# index.query.default_field, which is pointed at the "body" field.
# To start over, drop the old index first:
# client.indices.delete("search_fun_revisited")
index_name = 'search_fun_revisited'
index_body = yaml.safe_load(
    """
settings:
  index:
    query:
      default_field: body
"""
)
client.indices.create(index=index_name, body=index_body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'search_fun_revisited'}

In [4]:
import pandas as pd
# Define the sample documents and preview them as a table.
# Nothing is sent to the cluster in this cell; the documents are indexed further down.
# Note that "price" is written as a string here, not as a number.
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisted",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"}
]
# Last expression of the cell: render the documents as a DataFrame for a quick look.
pd.DataFrame(docs)

Unnamed: 0,id,title,body,price,in_stock,category
0,doc_a,Fox and Hounds,The quick red fox jumped over the lazy brown d...,5.99,True,childrens
1,doc_b,Fox wins championship,"Wearing all red, the Fox jumped out to a lead ...",15.13,True,sports
2,doc_c,Lead Paint Removal,All lead must be removed from the brown and re...,150.21,False,instructional
3,doc_d,The Three Little Pigs Revisted,"The big, bad wolf huffed and puffed and blew t...",3.51,True,childrens


## Add Documents

In [None]:
def index_docs(docs: list, index_name: str, id_col: str = 'id'):
    """Index every document in ``docs`` into ``index_name``, one request each.

    Uses the notebook-level ``client`` for the connection and prints a
    progress line plus the raw response for each document.

    Args:
        docs: Documents to index; each must support ``doc[id_col]``.
        index_name: Name of the target index.
        id_col: Key whose value is used as the document id.
    """
    for entry in docs:
        doc_id = entry[id_col]
        print(f"Indexing {doc_id}")
        # refresh=True requests a refresh along with each write.
        outcome = client.index(index=index_name, body=entry, id=doc_id, refresh=True)
        print(outcome)

In [5]:
# Send the sample documents to the index named by index_name.
index_docs(docs, index_name)

# Confirm they arrived: print the document count, with column headers (v=true).
print(client.cat.count(index=index_name, params={"v": "true"}))

Indexing doc_a
Indexing doc_b
Indexing doc_c
Indexing doc_d
epoch      timestamp count
1695832740 16:39:00  4



In [7]:
# Show the mappings the index currently has for each field.
client.indices.get_mapping(index=index_name)

{'search_fun_revisited': {'mappings': {'properties': {'body': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'category': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'id': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'in_stock': {'type': 'boolean'},
    'price': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'title': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}}

## Index > Mappings

In [10]:
# Create a new index, this time with explicit mappings for each field.
# ignore_above must load as a plain integer, so the YAML value carries no
# stray punctuation.
index_name = 'search_fun_revisited_custom_mappings'
# client.indices.delete(index_name)
index_body = yaml.safe_load("""
settings:
  index:
    query:
      default_field: body
mappings:
  properties:
    title:
      type: text
      analyzer: english
    body:
      type: text
      analyzer: english
    in_stock:
      type: boolean
    category:
      type: keyword
      ignore_above: 256
    price:
      type: float
""")
client.indices.create(index=index_name, body=index_body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'search_fun_revisited_custom_mappings'}

In [11]:
# Index the same sample documents again; index_name now refers to the custom-mappings index.
index_docs(docs, index_name)

Indexing doc_a

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_a', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Indexing doc_b

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_b', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Indexing doc_c

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_c', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Indexing doc_d

	Response:
{'_index': 'search_fun_revisited_custom_mappings', '_id': 'doc_d', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}


## Search

In [12]:
# Do some searches
q = 'dogs'
query = yaml.safe_load("""
size: 5
query:
  multi_match:
    fields: ['title^2', 'body']
""")
# Attach the search text from Python rather than interpolating it into the YAML.
# Interpolated text is re-parsed by YAML, so input such as "yes", "123" or
# "fox: dog" would turn into a boolean, a number, or a parse error.
query['query']['multi_match']['query'] = q

client.search(
    body=query,
    index=index_name
)

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 0.71833557,
  'hits': [{'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_a',
    '_score': 0.71833557,
    '_source': {'id': 'doc_a',
     'title': 'Fox and Hounds',
     'body': 'The quick red fox jumped over the lazy brown dogs.',
     'price': '5.99',
     'in_stock': True,
     'category': 'childrens'}},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_b',
    '_score': 0.6548753,
    '_source': {'id': 'doc_b',
     'title': 'Fox wins championship',
     'body': 'Wearing all red, the Fox jumped out to a lead in the race over the Dog.',
     'price': '15.13',
     'in_stock': True,
     'category': 'sports'}}]}}

In [16]:
# try a phrase query
q = 'fox dog'
query = yaml.safe_load("""
size: 5
query:
  match_phrase:
    body: {}
""")
# Attach the phrase from Python rather than interpolating it into the YAML.
# Interpolated text is re-parsed by YAML, so input such as "yes", "123" or
# "fox: dog" would turn into a boolean, a number, or a parse error.
query['query']['match_phrase']['body']['query'] = q

client.search(
    body=query,
    index=index_name
)

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [17]:
# try a phrase query with slop
q = 'fox dog'
query = yaml.safe_load("""
size: 5
query:
  match_phrase:
    body:
      slop: 10
""")
# Attach the phrase from Python rather than interpolating it into the YAML.
# Interpolated text is re-parsed by YAML, so input such as "yes", "123" or
# "fox: dog" would turn into a boolean, a number, or a parse error.
query['query']['match_phrase']['body']['query'] = q

client.search(
    body=query,
    index=index_name
)

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 0.39418244,
  'hits': [{'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_a',
    '_score': 0.39418244,
    '_source': {'id': 'doc_a',
     'title': 'Fox and Hounds',
     'body': 'The quick red fox jumped over the lazy brown dogs.',
     'price': '5.99',
     'in_stock': True,
     'category': 'childrens'}},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_b',
    '_score': 0.19532394,
    '_source': {'id': 'doc_b',
     'title': 'Fox wins championship',
     'body': 'Wearing all red, the Fox jumped out to a lead in the race over the Dog.',
     'price': '15.13',
     'in_stock': True,
     'category': 'sports'}}]}}

In [16]:
# try a match all query with a filter and a price factor
# match_all is written as an explicit empty object: a bare "match_all:" loads
# as None (JSON null), unlike the {"match_all": {}} used in the dict-based cells.
query = yaml.safe_load("""
size: 5
query:
  function_score:
    query:
      bool:
        must:
          match_all: {}
        filter:
          term:
            category: childrens
    field_value_factor:
      field: price
      missing: 1
""")

client.search(
    body=query,
    index=index_name
)

{'took': 16,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 2, 'relation': 'eq'},
  'max_score': 5.99,
  'hits': [{'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_a',
    '_score': 5.99,
    '_source': {'id': 'doc_a',
     'title': 'Fox and Hounds',
     'body': 'The quick red fox jumped over the lazy brown dogs.',
     'price': '5.99',
     'in_stock': True,
     'category': 'childrens'}},
   {'_index': 'search_fun_revisited_custom_mappings',
    '_id': 'doc_d',
    '_score': 3.51,
    '_source': {'id': 'doc_d',
     'title': 'The Three Little Pigs Revisted',
     'price': '3.51',
     'in_stock': True,
     'body': 'The big, bad wolf huffed and puffed and blew the house down. The end.',
     'category': 'childrens'}}]}}

## Aggregations

### Terms

In [17]:
# Terms aggregation on category; size 0 asks for no hits, only the buckets.
query = dict(
    size=0,
    query={"match_all": {}},
    aggs={
        "category": {
            "terms": {
                "field": "category",
                "size": 10,
                "missing": "N/A",
                "min_doc_count": 0,
            },
        },
    },
)

client.search(index=index_name, body=query)

{'took': 29,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'category': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': 'childrens', 'doc_count': 2},
    {'key': 'instructional', 'doc_count': 1},
    {'key': 'sports', 'doc_count': 1},
    {'key': 'N/A', 'doc_count': 0}]}}}

In [18]:
# Same terms aggregation, this time bucketing on the price field.
query = dict(
    size=0,
    query={"match_all": {}},
    aggs={
        "price": {
            "terms": {"field": "price", "size": 10, "min_doc_count": 0},
        },
    },
)

client.search(index=index_name, body=query)

{'took': 6,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'price': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': 3.509999990463257, 'doc_count': 1},
    {'key': 5.989999771118164, 'doc_count': 1},
    {'key': 15.130000114440918, 'doc_count': 1},
    {'key': 150.2100067138672, 'doc_count': 1}]}}}

### Range

In [36]:
# Range aggregation
# match_all is written as an explicit empty object: a bare "match_all:" loads
# as None (JSON null), unlike the {"match_all": {}} used in the dict-based cells.
query = yaml.safe_load("""
size: 0
query:
  match_all: {}
aggs:
  category:
    terms:
      field: category
      size: 10
      missing: "N/A"
      min_doc_count: 0
  price:
    range:
      field: price
      ranges:
        - to: 5
        - from: 5
          to: 20
        - from: 20
""")

client.search(
    body=query,
    index=index_name
)

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 4, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'price': {'buckets': [{'key': '*-5.0',
     'to': 5.0,
     'doc_count': 1},
    {'key': '5.0-20.0', 'from': 5.0, 'to': 20.0, 'doc_count': 2},
    {'key': '20.0-*', 'from': 20.0, 'doc_count': 1}]}}}

## Delete

In [None]:
# If you want to delete the documents, but keep the index, run the following.
# Each document is deleted inside the loop, one request per document.
for doc in docs:
    doc_id = doc["id"]
    print("Deleting {}".format(doc_id))
    response = client.delete(
        index=index_name,
        id=doc_id,
    )
    print('\n\tResponse:')
    print(response)

# If at any time you want to start over, run this command to delete the index and then you can start from the top
# Delete the index.
response = client.indices.delete(index=index_name)

print('\nDeleting index:')
print(response)