In [1]:

import json
import os

import nltk
from opensearchpy import OpenSearch

# Do some setup work: fetch the NLTK resources used by the tokenizer, POS tagger
# and named-entity chunker further down. Order matches the original calls.
for nltk_package in (
    'words',
    'maxent_ne_chunker',
    'punkt',
    'averaged_perceptron_tagger',
    'popular',
):
    nltk.download(nltk_package)

# Connection settings. The defaults are the local test cluster values; override
# them through environment variables rather than editing credentials into the
# notebook.
host = os.environ.get('OPENSEARCH_HOST', 'localhost')
port = int(os.environ.get('OPENSEARCH_PORT', '9200'))
auth = (
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD', 'admin'),  # Default is for testing only.
)

# Create the client with SSL/TLS enabled, but hostname and certificate verification disabled.
# NOTE: disabling verification is only acceptable against a local test cluster.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

[nltk_data] Downloading package words to /Users/sengopal/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/sengopal/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sengopal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sengopal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/sengopal/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/sengopal/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading packa

In [2]:
# Create an index with non-default settings and mappings.
#
# The analysis section defines two analyzers for the part-of-speech field:
#   * body_pos        - index time: splits each "word#__TAG__" token on '#' via
#                       the pattern_capture filter while keeping the original.
#   * body_pos_search - search time: only lowercases, so a query such as
#                       "apple#__NNP__" is matched as a single token.
index_name = 'searchml_week2'
index_body = {
    'settings': {
        'analysis': {
            'filter': {
                'pos_filter': {
                    'type': 'pattern_capture',
                    'preserve_original': True,
                    'patterns': ['(.*)#(.*)']
                }
            },
            'analyzer': {
                'body_pos': {
                    'tokenizer': 'whitespace',  # we can't use standard b/c it strips our delimiters
                    'filter': ['pos_filter', 'lowercase']  # put whatever else here
                },
                'body_pos_search': {
                    'tokenizer': 'whitespace',  # we can't use standard b/c it strips our delimiters
                    'filter': ['lowercase']  # put whatever else here
                }
            }
        },
        'index': {
            'query': {
                'default_field': 'body'
            }
        }
    },
    'mappings': {
        'properties': {
            'title': {'type': 'text', 'analyzer': 'english'},
            'body': {'type': 'text', 'analyzer': 'english'},
            'body_sentences': {'type': 'text', 'analyzer': 'english'},
            # Notice the different search analyzer
            'body_pos': {'type': 'text', 'analyzer': 'body_pos', 'search_analyzer': 'body_pos_search'},
            'body_ne': {'type': 'text', 'analyzer': 'standard'},
            'in_stock': {'type': 'boolean'},
            # ignore_above is an integer mapping parameter, so pass an int
            # rather than relying on the server coercing the string "256".
            'category': {'type': 'keyword', 'ignore_above': 256},
            'price': {'type': 'float'}
        }
    }
}

In [3]:
# Start from a clean slate: drop the index only when it already exists.
# An explicit existence check replaces the former bare `except: pass`, which
# also hid real failures (connection refused, bad credentials, ...).
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)
client.indices.create(index=index_name, body=index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'searchml_week2'}

In [4]:
# Sample documents that get enriched and indexed further down.
# Key order inside each document is kept exactly as authored because it
# determines the field order of the JSON that is printed and stored.
docs = [
    {
        "id": "doc_b",
        "title": "Wayne Gretzky",
        "body": "The greatest hockey player of all time is Wayne Gretzky. He holds a record for holding the most records!  Who else even comes close?",
        "price": "15.13",
        "in_stock": True,
        "category": "sports",
    },
    {
        "id": "doc_a",
        "title": "Apple iPhone 13",
        "body": "The all new Apple iPhone 13 has 3 cameras and the fastest chip on the market.  The phone retails for $699 for 64GB of storage.",
        "price": "5.99",
        "in_stock": True,
        "category": "childrens",
    },
    {
        "id": "doc_c",
        "title": "Lead Paint Removal",
        "body": "All lead must be removed from the brown and red paint.  Use the Glidden Lead Paint Killer solvent to keep the paint, but remove the lead!",
        "price": "150.21",
        "in_stock": False,
        "category": "instructional",
    },
    {
        "id": "doc_d",
        "title": "The Three Little Pigs Revisted",
        "price": "3.51",
        "in_stock": True,
        "body": "The big, bad wolf huffed and puffed and blew the house down. The end.  Well, not quite.  It seems the pigs filed an injunction against the wolf and now the wolf has to pay restitution.",
        "category": "childrens",
    },
    {
        "id": "doc_e",
        "title": "Green apples and Spam",
        "price": "2.99",
        "in_stock": True,
        "body": "The little green apple fell from the tree.  It was not a bad apple, so no one could understand why it fell.",
        "category": "childrens",
    },
    {
        "id": "doc_f",
        "title": "Fun with Spans",
        "price": "4.99",
        "in_stock": True,
        "body": "Dan is the President. The United States Government has arrested him.",
        "category": "childrens",
    },
]

In [5]:
def get_entities(named_entities, entity_types):
    """Collect the entities of the requested types into one space-separated string.

    Args:
        named_entities: Iterable mixing two kinds of items: plain
            ``(word, tag)`` tuples and ``nltk.Tree`` chunks whose leaves are
            such tuples.
        entity_types: Container of accepted labels; it is checked against the
            tag of a tuple and against the label of a chunk.

    Returns:
        The accepted entities joined by single spaces. A tuple contributes its
        word; a chunk contributes the words of its leaves joined by ``_``.
        Returns an empty string when nothing matches. Items that are neither
        tuples nor trees are ignored.
    """
    found = []
    for ent in named_entities:  # two cases: we have a tagged word or we have a tree
        if isinstance(ent, tuple):
            if ent[1] in entity_types:
                found.append(ent[0])
        elif isinstance(ent, nltk.Tree):
            if ent.label() in entity_types:
                # leaves are (word, tag) tuples; keep every word, drop the tags
                found.append("_".join(leaf[0] for leaf in ent.leaves()))

    # Joining once guarantees exactly one separator between entities: chunk
    # entities used to be appended without any separator (gluing them to the
    # next entity) and tuple entities left a trailing space.
    return " ".join(found)

In [7]:
# Enrich every document with sentence-, POS- and named-entity variants of its
# text fields, print the enriched document, then index it under its own id.
for doc in docs:
    doc_id = doc["id"]
    for item in ["body"]:  # Just do body for now
        value = doc[item]
        sentences = nltk.sent_tokenize(value)
        pos = nltk.pos_tag(nltk.word_tokenize(value))
        named_entities = nltk.ne_chunk(pos)

        # Wrap each sentence in begin/end markers.
        marked_sentences = ["__SB__ %s __SE__" % sentence for sentence in sentences]
        # Encode each token as word#__TAG__.
        tagged_tokens = ["#".join([word, "__%s__" % tag]) for word, tag in pos]

        doc.update({
            "%s_sentences" % item: " ".join(marked_sentences),
            "%s_pos" % item: " ".join(tagged_tokens),
            "%s_ne" % item: get_entities(named_entities, {"ORGANIZATION", "PERSON", "NNP"}),
        })

    pretty_doc = json.dumps(doc, indent=4)
    print("Indexing {} as: {}".format(doc_id, pretty_doc))

    # refresh=True is passed with every index request.
    client.index(index=index_name, body=doc, id=doc_id, refresh=True)

Indexing doc_b as: {
    "id": "doc_b",
    "title": "Wayne Gretzky",
    "body": "The greatest hockey player of all time is Wayne Gretzky. He holds a record for holding the most records!  Who else even comes close?",
    "price": "15.13",
    "in_stock": true,
    "category": "sports",
    "body_sentences": "__SB__ The greatest hockey player of all time is Wayne Gretzky. __SE__ __SB__ He holds a record for holding the most records! __SE__ __SB__ Who else even comes close? __SE__",
    "body_pos": "The#__DT__ greatest#__JJS__ hockey#__NN__ player#__NN__ of#__IN__ all#__DT__ time#__NN__ is#__VBZ__ Wayne#__NNP__ Gretzky#__NNP__ .#__.__ He#__PRP__ holds#__VBZ__ a#__DT__ record#__NN__ for#__IN__ holding#__VBG__ the#__DT__ most#__RBS__ records#__NNS__ !#__.__ Who#__WP__ else#__RB__ even#__RB__ comes#__VBZ__ close#__RB__ ?#__.__",
    "body_ne": "Wayne_Gretzky"
}
Indexing doc_a as: {
    "id": "doc_a",
    "title": "Apple iPhone 13",
    "body": "The all new Apple iPhone 13 has 3 cameras and

In [8]:
# Verify the documents are in: ask the cat API for the index count,
# with the header row enabled via v=true.
count_table = client.cat.count(index_name, params={"v": "true"})
print(count_table)

epoch      timestamp count
1656696124 17:22:04  6



In [9]:
print("Proper Noun Apple")
# Find "Apple" tagged as a proper noun (NNP) in the POS-annotated field.
q = 'apple#__NNP__'
query = {
    'size': 5,
    'query': {
        'query_string': {'query': q, 'fields': ['body_pos']},
    },
}

rsp = client.search(body=query, index=index_name)
pretty_rsp = json.dumps(rsp, indent=2)
print(pretty_rsp)

Proper Noun Apple
{
  "took": 7,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 2.1059992,
    "hits": [
      {
        "_index": "searchml_week2",
        "_type": "_doc",
        "_id": "doc_a",
        "_score": 2.1059992,
        "_source": {
          "id": "doc_a",
          "title": "Apple iPhone 13",
          "body": "The all new Apple iPhone 13 has 3 cameras and the fastest chip on the market.  The phone retails for $699 for 64GB of storage.",
          "price": "5.99",
          "in_stock": true,
          "category": "childrens",
          "body_sentences": "__SB__ The all new Apple iPhone 13 has 3 cameras and the fastest chip on the market. __SE__ __SB__ The phone retails for $699 for 64GB of storage. __SE__",
          "body_pos": "The#__DT__ all#__DT__ new#__JJ__ Apple#__NNP__ iPhone#__NN__ 13#__CD__ has#__VBZ__ 3#

In [11]:
print("Plain ol Apple")
# Find "apple" tagged as a common noun (NN) in the POS-annotated field.
q = 'apple#__NN__'
query = {
    'size': 5,
    'query': {
        'query_string': {'query': q, 'fields': ['body_pos']},
    },
}

rsp = client.search(body=query, index=index_name)
pretty_rsp = json.dumps(rsp, indent=2)
print(pretty_rsp)

Plain ol Apple
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 2.6311197,
    "hits": [
      {
        "_index": "searchml_week2",
        "_type": "_doc",
        "_id": "doc_e",
        "_score": 2.6311197,
        "_source": {
          "id": "doc_e",
          "title": "Green apples and Spam",
          "price": "2.99",
          "in_stock": true,
          "body": "The little green apple fell from the tree.  It was not a bad apple, so no one could understand why it fell.",
          "category": "childrens",
          "body_sentences": "__SB__ The little green apple fell from the tree. __SE__ __SB__ It was not a bad apple, so no one could understand why it fell. __SE__",
          "body_pos": "The#__DT__ little#__JJ__ green#__JJ__ apple#__NN__ fell#__VBD__ from#__IN__ the#__DT__ tree#__NN__ .#__.__ It#__PRP__ wa

In [12]:
# Sentence query. We are going to use a new type of query called a SpanQuery
# (see the span_within query in the next cell); this cell still issues a plain
# query_string search against body_pos.
q = 'President United States'  # should return one match
query = {
    'size': 5,
    'query': {
        'query_string': {'query': q, 'fields': ['body_pos']},
    },
}

rsp = client.search(body=query, index=index_name)
pretty_rsp = json.dumps(rsp, indent=2)
print(pretty_rsp)

{
  "took": 22,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 7.04297,
    "hits": [
      {
        "_index": "searchml_week2",
        "_type": "_doc",
        "_id": "doc_f",
        "_score": 7.04297,
        "_source": {
          "id": "doc_f",
          "title": "Fun with Spans",
          "price": "4.99",
          "in_stock": true,
          "body": "Dan is the President. The United States Government has arrested him.",
          "category": "childrens",
          "body_sentences": "__SB__ Dan is the President. __SE__ __SB__ The United States Government has arrested him. __SE__",
          "body_pos": "Dan#__NNP__ is#__VBZ__ the#__DT__ President#__NNP__ .#__.__ The#__DT__ United#__NNP__ States#__NNPS__ Government#__NNP__ has#__VBZ__ arrested#__VBN__ him#__PRP__ .#__.__",
          "body_ne": "President Government "
     

In [13]:
# Should return no match: the "little" span (the three words) has to lie
# within the "big" span (sentence-begin marker followed by sentence-end marker).
# NOTE(review): span_term is a term-level query, while body_sentences is mapped
# with the english analyzer, and neither span_near sets a slop. The empty result
# may therefore come from the terms/slop rather than from the sentence
# boundary - confirm against the analyzed tokens.
query = {
    'size': 5,
    'query': {
        "span_within": {
            "little": {
                "span_near": {
                    "clauses": [
                        {"span_term": {"body_sentences": word}}
                        for word in ("President", "United", "States")
                    ]
                }
            },
            "big": {
                "span_near": {
                    "clauses": [
                        {"span_term": {"body_sentences": marker}}
                        for marker in ("__SB__", "__SE__")
                    ]
                }
            },
        }
    },
}

rsp = client.search(body=query, index=index_name)
pretty_rsp = json.dumps(rsp, indent=2)
print(pretty_rsp)

{
  "took": 56,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 0,
      "relation": "eq"
    },
    "max_score": null,
    "hits": []
  }
}
