In [None]:
!pip install elasticsearch

In [None]:
pip -q install eland elasticsearch sentence_transformers transformers torch==1.11

In [None]:
from elasticsearch import Elasticsearch, helpers
from urllib.request import urlopen
import json
from pathlib import Path
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel
from elasticsearch.client import MlClient
import getpass

In [None]:
# Connection details are prompted for with getpass, so they are not echoed
# while typing and are not written into the notebook.

# Cloud ID of the deployment (shown on its 'Manage Deployment' page)
CLOUD_ID = getpass.getpass('Enter Elastic Cloud ID:  ')

# Password that Elasticsearch generated for the 'elastic' user
ELASTIC_PASSWORD = getpass.getpass('Enter Elastic password:  ')

# Client shared by every following cell; request_timeout is set to 3600
# for all requests issued through it.
client = Elasticsearch(
    request_timeout=3600,
    basic_auth=("elastic", ELASTIC_PASSWORD),
    cloud_id=CLOUD_ID,
)

In [None]:
# Hugging Face model (a sentence-transformers model) and the NLP task type
# it is imported for.
hf_model_id = 'sentence-transformers/all-mpnet-base-v2'
tm = TransformerModel(hf_model_id, "text_embedding")

# Model ID as the model is named in Elasticsearch
es_model_id = tm.elasticsearch_model_id()

# Download the model from Hugging Face and save it (plus config and
# vocabulary) into a local working directory.
tmp_path = "models"
Path(tmp_path).mkdir(parents=True, exist_ok=True)
model_path, config, vocab_path = tm.save(tmp_path)

# Load the model into Elasticsearch
ptm = PyTorchModel(client, es_model_id)
ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)

# Start the model deployment through the client's own ML namespace
# (`client.ml`) instead of calling the MlClient method unbound with the
# top-level client passed in as `self`.
s = client.ml.start_trained_model_deployment(model_id=es_model_id)
s.body

In [None]:
# All three document fields share one mapping: a `text` field with a
# `keyword` sub-field that ignores values longer than 256 characters.
text_with_keyword = {
    "type": "text",
    "fields": {
        "keyword": {
            "type": "keyword",
            "ignore_above": 256,
        }
    },
}

# Create the raw `ecommerce` index that receives the bulk-loaded documents.
client.indices.create(
    index="ecommerce",
    mappings={
        "properties": {
            field_name: text_with_keyword
            for field_name in ("product", "description", "category")
        }
    },
)

In [None]:
# Ingest pipeline with two inference processors that run on every ingested
# document: ELSER (sparse) and all-mpnet-base-v2 (dense).
# NOTE(review): the cells in this notebook do not deploy `.elser_model_1`;
# it is presumably expected to be deployed already - confirm.

# Sparse: ELSER text expansion of `description`, stored under `ml.tokens`.
elser_processor = {
    "inference": {
        "model_id": ".elser_model_1",
        "target_field": "ml",
        "field_map": {
            "description": "text_field"
        },
        "inference_config": {
            # text_expansion inference type (ELSER)
            "text_expansion": {
                "results_field": "tokens"
            }
        }
    }
}

# Dense: text embedding of `description`.
dense_processor = {
    "inference": {
        "model_id": "sentence-transformers__all-mpnet-base-v2",
        # Target field for the inference results
        "target_field": "description_vector",
        "field_map": {
            # Field matching our configured trained model input.
            # Typically for NLP models, the field name is text_field.
            "description": "text_field"
        }
    }
}

client.ingest.put_pipeline(
    id="ecommerce-pipeline",
    processors=[elser_processor, dense_processor],
)

In [None]:
INDEX = 'ecommerce-search'

# Mapping shared by the three original document fields: a `text` field with
# a `keyword` sub-field that ignores values longer than 256 characters.
keyword_backed_text = {
    "type": "text",
    "fields": {
        "keyword": {
            "type": "keyword",
            "ignore_above": 256,
        }
    },
}

search_mappings = {
    # Saving disk space by excluding the ELSER tokens and the dense_vector
    # field from document source.
    # Note: That should only be applied if you are certain that reindexing
    # will not be required in the future.
    "_source": {
        "excludes": ["ml.tokens", "description_vector.predicted_value"]
    },
    "properties": {
        "product": keyword_backed_text,
        "description": keyword_backed_text,
        "category": keyword_backed_text,
        # The name of the field to contain the generated tokens.
        # ELSER output must be ingested into a field with the rank_features
        # field type.
        "ml.tokens": {
            "type": "rank_features"
        },
        # Inference results field, target_field.predicted_value
        "description_vector.predicted_value": {
            "type": "dense_vector",
            # The all-mpnet-base-v2 model has embedding_size of 768, so dims
            # is set to 768.
            "dims": 768,
            "index": "true",
            # When indexing vectors for approximate kNN search, you need to
            # specify the similarity function for comparing the vectors.
            "similarity": "dot_product"
        },
    },
}

# Create the enriched index that the ingest pipeline writes into.
client.indices.create(
    index=INDEX,
    settings={
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 1
        }
    },
    mappings=search_mappings,
)

In [None]:
# Ecommerce dataset
url = "https://raw.githubusercontent.com/priscillaparodi/dataset/main/products-ecommerce.json"

# Download the dataset and parse it into a JSON object. The context manager
# closes the HTTP response as soon as the payload has been read, so the
# connection is not left open for the lifetime of the kernel.
with urlopen(url) as response:
    data_json = json.load(response)

def create_index_body(doc, index="ecommerce"):
    """Generate the bulk action for one Elasticsearch document.

    Args:
        doc: The document to index; it is used as the action's `_source`
            as-is (not copied).
        index: Name of the index the document is written to. Defaults to
            "ecommerce", the index this function previously hard-coded.

    Returns:
        A dict with the keys `_index` and `_source`.
    """
    return {
        "_index": index,
        "_source": doc,
    }

# Prepare the documents to be indexed
documents = [create_index_body(doc) for doc in data_json]

# Use helpers.bulk to index
helpers.bulk(client, documents)

# Refresh the index so the documents just written are visible to search
# immediately. Without this, a request that reads the index right after the
# bulk call (e.g. when the notebook is run top to bottom) can miss documents
# that have not been refreshed yet.
client.indices.refresh(index="ecommerce")

print("Done indexing documents into `ecommerce` index")

In [None]:
# Reindex data from one index 'source' to another 'dest' with the 'ecommerce-pipeline' pipeline.
# wait_for_completion=True makes the call return only after the reindex has finished.
# refresh=True refreshes the destination once the reindex is done, so the
# enriched documents are searchable as soon as this cell returns.

client.reindex(
    wait_for_completion=True,
    refresh=True,
    source={
        "index": "ecommerce"
    },
    dest={
        "index": "ecommerce-search",
        "pipeline": "ecommerce-pipeline"
    },
)

In [None]:
# List of results
#
# Runs the same query text through six retrieval strategies (BM25, kNN,
# ELSER and three hybrid combinations) and prints the top hits of each.

# Text every search below looks for. Replace the placeholder once here
# instead of in each individual query.
QUERY_TEXT = "<text>"

SEARCH_INDEX = "ecommerce-search"
DENSE_MODEL_ID = "sentence-transformers__all-mpnet-base-v2"
ELSER_MODEL_ID = ".elser_model_1"

TOP_HITS = 3                 # number of hits printed per strategy
KNN_K = 49                   # nearest neighbours returned by the kNN search
KNN_NUM_CANDIDATES = 2495    # candidates considered by the kNN search
RRF_WINDOW_SIZE = 2495       # result window used for reciprocal rank fusion
RRF_RANK_CONSTANT = 10
# NOTE(review): 49 / 2495 are kept from the original queries; they are
# presumably sized to this dataset - confirm before reusing elsewhere.


def build_match_query(boost=None):
    """Return a BM25 `match` query on `description` for QUERY_TEXT.

    Args:
        boost: Optional boost added to the clause; omitted when None.
    """
    clause = {"query": QUERY_TEXT}
    if boost is not None:
        clause["boost"] = boost
    return {"match": {"description": clause}}


def build_knn_query(boost=None):
    """Return the kNN clause that embeds QUERY_TEXT with the dense model.

    Args:
        boost: Optional boost added to the clause; omitted when None.
    """
    clause = {
        "field": "description_vector.predicted_value",
        "k": KNN_K,
        "num_candidates": KNN_NUM_CANDIDATES,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": DENSE_MODEL_ID,
                "model_text": QUERY_TEXT,
            }
        },
    }
    if boost is not None:
        clause["boost"] = boost
    return clause


def build_elser_query(boost=None):
    """Return an ELSER `text_expansion` query on `ml.tokens` for QUERY_TEXT.

    Args:
        boost: Optional boost added to the clause; omitted when None.
    """
    clause = {
        "model_id": ELSER_MODEL_ID,
        "model_text": QUERY_TEXT,
    }
    if boost is not None:
        clause["boost"] = boost
    return {"text_expansion": {"ml.tokens": clause}}


def print_hits(response, label, metric_key="_score", metric_label="Score"):
    """Print product, category and description for every hit of a search.

    Args:
        response: Search response; `response['hits']['hits']` is read.
        label: Name of the strategy, used in the "no matches" message.
        metric_key: Hit key holding the ranking metric (`_score`, or `_rank`
            for the RRF search).
        metric_label: Label printed in front of the metric value.
    """
    hits = response['hits']['hits']
    if not hits:
        # Report an empty result explicitly instead of printing nothing.
        print(f"{label} Result: No matches found")
        return
    for hit in hits:
        source = hit['_source']
        print(
            f"\n{metric_label}: {hit[metric_key]}\n"
            f"Product: {source['product']}\n"
            f"Category: {source['category']}\n"
            f"Description: {source['description']}\n"
        )


# BM25

print("\nBM25:\n")

response1 = client.search(
    index=SEARCH_INDEX,
    size=TOP_HITS,
    query=build_match_query(),
)
print_hits(response1, "BM25")

# KNN

print("\nKNN:\n")

response2 = client.search(
    index=SEARCH_INDEX,
    size=TOP_HITS,
    knn=build_knn_query(),
)
print_hits(response2, "KNN")

# ELSER

print("\nELSER:\n")

response3 = client.search(
    index=SEARCH_INDEX,
    size=TOP_HITS,
    query=build_elser_query(),
)
print_hits(response3, "ELSER")

print("\nHybrid Search\n")

# BM25 + KNN

print("\nBM25 + KNN:\n")

response4 = client.search(
    index=SEARCH_INDEX,
    size=TOP_HITS,
    query=build_match_query(boost=1),
    knn=build_knn_query(boost=1),
)
print_hits(response4, "BM25 + KNN")

# BM25 + KNN (RRF)

print("\nBM25 + KNN (RRF):\n")

response5 = client.search(
    index=SEARCH_INDEX,
    size=TOP_HITS,
    query=build_match_query(),
    knn=build_knn_query(),
    rank={
        "rrf": {
            "window_size": RRF_WINDOW_SIZE,
            "rank_constant": RRF_RANK_CONSTANT,
        }
    },
)
# This search prints each hit's `_rank` where the other searches print `_score`.
print_hits(response5, "BM25 + KNN (RRF)", metric_key="_rank", metric_label="Rank")

# BM25 + ELSER
# Note: Client is not compatible with Elastic 8.9 to use sub_searches parameter. Not adding RRF, for now.
# Using compound query - 'should' appear in the matching document.

print("\nBM25 + ELSER:\n")

response6 = client.search(
    index=SEARCH_INDEX,
    size=TOP_HITS,
    query={
        "bool": {
            "should": [
                build_match_query(boost=1),
                build_elser_query(boost=1),
            ]
        }
    },
)
print_hits(response6, "BM25 + ELSER")