# OpenSearch Revisited

## Setup

In [1]:
from opensearchpy import OpenSearch
host = 'localhost'
port = 9200
auth = ('admin', 'admin')  # For testing only. Don't store credentials in code.

# Create the client with SSL/TLS enabled, but hostname and certificate verification disabled.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
# Do a few checks before we start indexing:
print(client.cat.health())
print(client.cat.indices())

1695621067 05:51:07 docker-cluster yellow 1 1 true 12 12 0 0 7 0 - 63.2%

yellow open .plugins-ml-config                        gpaVzMC9Rm-ixr8sC2AMOA 1 1     1 0   3.9kb   3.9kb
green  open .opensearch-observability                 Bfrp1mOqQYanRKHFWrbByA 1 0     0 0    208b    208b
yellow open search_fun_bulk                           4m4mPzJ4QXy4-qaWNEZ34A 1 1     4 0   8.8kb   8.8kb
green  open .kibana_92668751_admin_1                  bruHV_IvRymvc8IduVDd_Q 1 0     1 0   5.1kb   5.1kb
yellow open security-auditlog-2023.09.24              BxumpqiRREiQ_i137LxLWg 1 1   236 0 424.4kb 424.4kb
green  open opensearch_dashboards_sample_data_flights 3ipTpd0SSQ-xhjnIQiaIdQ 1 0 13059 0   5.8mb   5.8mb
yellow open search_fun_test                           aPRLpy-eRzG185DHcTQLhg 1 1     4 0   6.9kb   6.9kb
green  open .opendistro_security                      88369CGsQ3CShUM32lIe4w 1 0    10 0  75.6kb  75.6kb
green  open .kibana_1                                 fVVE40BkSEqQvTlA5A1Tng 1 0    61

In [2]:
# If you still have your documents from the Dev Tools test, we should be able to check them here:
try:
    print(client.cat.count("search_fun_test", params={"v": "true"}))
except:
    print("search_fun_test doesn't exist, that's OK")

epoch      timestamp count
1695621095 05:51:35  4



In [3]:
import json
def print_json(x):
    print(json.dumps(x, indent=2))

### Create Index

In [37]:
client.indices.delete("search_fun_revisited")

{'acknowledged': True}

In [38]:
index_name = 'search_fun_revisited'
index_body = {
  'settings': {
    'index': {
      'query':{
          'default_field': "body"
      }
    }
  }
}

try:
  response = client.indices.create(index_name, body=index_body)
  print(f'\nCreating index: {index_name}')
  print_json(response)
except:
  print(f"Index {index_name} already exists!")



Creating index: search_fun_revisited
{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "search_fun_revisited"
}


### Add Documents > Sequential

In [39]:
docs = [
    {
        "id": "doc_a",
        "title": "Fox and Hounds",
        "body": "The quick red fox jumped over the lazy brown dogs.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens"},
    {
        "id": "doc_b",
        "title": "Fox wins championship",
        "body": "Wearing all red, the Fox jumped out to a lead in the race over the Dog.",
        "price": "15.13",
        "in_stock": True,
        "category": "sports"},
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional"},
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisted",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.",
        "category": "childrens"}
]
import pandas as pd
pd.DataFrame(docs)

Unnamed: 0,id,title,body,price,in_stock,category
0,doc_a,Fox and Hounds,The quick red fox jumped over the lazy brown d...,5.99,True,childrens
1,doc_b,Fox wins championship,"Wearing all red, the Fox jumped out to a lead ...",15.13,True,sports
2,doc_c,Lead Paint Removal,All lead must be removed from the brown and re...,150.21,False,instructional
3,doc_d,The Three Little Pigs Revisted,"The big, bad wolf huffed and puffed and blew t...",3.51,True,childrens


In [40]:
for doc in docs:
    doc_id = doc["id"]
    print("Indexing {}".format(doc_id))
    response = client.index(
        index=index_name,
        body=doc,
        id=doc_id,
        refresh=True
    )
    print('\n\tResponse:')
    print_json(response)

Indexing doc_a

	Response:
{
  "_index": "search_fun_revisited",
  "_id": "doc_a",
  "_version": 1,
  "result": "created",
  "forced_refresh": true,
  "_shards": {
    "total": 2,
    "successful": 1,
    "failed": 0
  },
  "_seq_no": 0,
  "_primary_term": 1
}
Indexing doc_b

	Response:
{
  "_index": "search_fun_revisited",
  "_id": "doc_b",
  "_version": 1,
  "result": "created",
  "forced_refresh": true,
  "_shards": {
    "total": 2,
    "successful": 1,
    "failed": 0
  },
  "_seq_no": 1,
  "_primary_term": 1
}
Indexing doc_c

	Response:
{
  "_index": "search_fun_revisited",
  "_id": "doc_c",
  "_version": 1,
  "result": "created",
  "forced_refresh": true,
  "_shards": {
    "total": 2,
    "successful": 1,
    "failed": 0
  },
  "_seq_no": 2,
  "_primary_term": 1
}
Indexing doc_d

	Response:
{
  "_index": "search_fun_revisited",
  "_id": "doc_d",
  "_version": 1,
  "result": "created",
  "forced_refresh": true,
  "_shards": {
    "total": 2,
    "successful": 1,
    "failed": 0


### Add Documents > Bulk

In [98]:
index_name = 'search_fun_bulk'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    }
}

try:
    client.indices.create(index_name, body=index_body)
    print(f"Created index {index_name}")
except:
    print(f"Index {index_name} already exists!")

Created index search_fun_bulk


In [99]:
docs_bulk = docs
for doc in docs_bulk:
    doc["_id"] = doc["id"]
    doc["_index"] = index_name
pd.DataFrame(docs_bulk)

Unnamed: 0,id,title,body,price,in_stock,category,_index,_id
0,doc_a,Fox and Hounds,The quick red fox jumped over the lazy brown d...,5.99,True,childrens,search_fun_bulk,doc_a
1,doc_b,Fox wins championship,"Wearing all red, the Fox jumped out to a lead ...",15.13,True,sports,search_fun_bulk,doc_b
2,doc_c,Lead Paint Removal,All lead must be removed from the brown and re...,150.21,False,instructional,search_fun_bulk,doc_c
3,doc_d,The Three Little Pigs Revisted,"The big, bad wolf huffed and puffed and blew t...",3.51,True,childrens,search_fun_bulk,doc_d


In [100]:
from opensearchpy.helpers import bulk
bulk(client, docs)

print(client.cat.count(index_name, params={"v": "true"}))

epoch      timestamp count
1695623716 06:35:16  0



In [73]:
print_json(client.indices.get_mapping(index_name))

{
  "search_fun_bulk": {
    "mappings": {
      "properties": {
        "body": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "category": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "id": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "in_stock": {
          "type": "boolean"
        },
        "price": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "title": {
          "type": "text",
          "fields": {
            "keyword": {

In [93]:
client.indices.delete("search_fun_revisited_custom_mappings")

{'acknowledged': True}

In [94]:
index_name = 'search_fun_revisited_custom_mappings'
index_body = {
    'settings': {
        'index': {
            'query': {
                'default_field': "body"
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "body": {"type": "text", "analyzer": "english"},
            "in_stock": {"type": "boolean"},
            "category": {"type": "keyword", "ignore_above": "256"},
            "price": {"type": "float"}
        }
    }
}

try:
    client.indices.create(index_name, body=index_body)
    print(f"Created index {index_name}")
except:
    print(f"Index {index_name} already exists!")

Created index search_fun_revisited_custom_mappings


In [95]:
for doc in docs:
    doc["_index"] = index_name
bulk(client, docs)
print(client.cat.count(index_name, params={"v": "true"}))

epoch      timestamp count
1695623523 06:32:03  0



In [103]:
s = requests.Session()
s.auth = auth
s.verify = False


In [113]:
def get_all_documents(index_name):
    response = s.get(f"https://localhost:9200/{index_name}/_search?q=body:*")
    return pd.DataFrame([x['_source'] for x in response.json()['hits']['hits']])

get_all_documents('search_fun_revisited_custom_mappings')

Unnamed: 0,id,title,body,price,in_stock,category
0,doc_a,Fox and Hounds,The quick red fox jumped over the lazy brown d...,5.99,True,childrens
1,doc_b,Fox wins championship,"Wearing all red, the Fox jumped out to a lead ...",15.13,True,sports
2,doc_c,Lead Paint Removal,All lead must be removed from the brown and re...,150.21,False,instructional
3,doc_d,The Three Little Pigs Revisted,"The big, bad wolf huffed and puffed and blew t...",3.51,True,childrens


In [123]:
index_name = 'search_fun_revisited_custom_mappings'
response = s.get(f"https://localhost:9200/{index_name}")
print_json(response.json()[index_name]['mappings'])

{
  "properties": {
    "body": {
      "type": "text",
      "analyzer": "english"
    },
    "category": {
      "type": "keyword",
      "ignore_above": 256
    },
    "id": {
      "type": "text",
      "fields": {
        "keyword": {
          "type": "keyword",
          "ignore_above": 256
        }
      }
    },
    "in_stock": {
      "type": "boolean"
    },
    "price": {
      "type": "float"
    },
    "title": {
      "type": "text",
      "analyzer": "english"
    }
  }
}


In [96]:
!curl -s -XGET "https://localhost:9200/search_fun_revisited" -u admin:admin --insecure | jq

[1;39m{
  [0m[34;1m"search_fun_revisited"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"aliases"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"mappings"[0m[1;39m: [0m[1;39m{
      [0m[34;1m"properties"[0m[1;39m: [0m[1;39m{
        [0m[34;1m"body"[0m[1;39m: [0m[1;39m{
          [0m[34;1m"type"[0m[1;39m: [0m[0;32m"text"[0m[1;39m,
          [0m[34;1m"fields"[0m[1;39m: [0m[1;39m{
            [0m[34;1m"keyword"[0m[1;39m: [0m[1;39m{
              [0m[34;1m"type"[0m[1;39m: [0m[0;32m"keyword"[0m[1;39m,
              [0m[34;1m"ignore_above"[0m[1;39m: [0m[0;39m256[0m[1;39m
            [1;39m}[0m[1;39m
          [1;39m}[0m[1;39m
        [1;39m}[0m[1;39m,
        [0m[34;1m"category"[0m[1;39m: [0m[1;39m{
          [0m[34;1m"type"[0m[1;39m: [0m[0;32m"text"[0m[1;39m,
          [0m[34;1m"fields"[0m[1;39m: [0m[1;39m{
            [0m[34;1m"keyword"[0m[1;39m: [0m[1;39m{
              [0m[34;1m"type"[0m[1;39m: