In [1]:
from elasticsearch import Elasticsearch, helpers
import json
import pandas as pd

In [2]:
# Load the connection credentials table from the local CSV file.
df = pd.read_csv(
    filepath_or_buffer='./elastic credential/watsonx_discovery_credential_tzwx24.csv'
)

In [3]:
# Connection settings: taken from the first row of the credential table.
_credential_row = df.iloc[0]
ELASTIC_USER = _credential_row['username']
ELASTIC_PW = _credential_row['password']
ELASTIC_HOST = _credential_row['watsonx_discovery_url']  # watsonx Discovery endpoint
ELASTIC_PORT = _credential_row['port']  # port number
ELASTIC_CERT_FILE = "./elastic_certificate/es.cert"  # path to the certificate file

# Ingest pipeline, index and model names; each *_E5 name is an alias of the
# generic name bound on the same line.
INGEST_PIPELINE_NAME = INGEST_PIPELINE_NAME_E5 = "e5-pipeline"
INDEX_NAME_DOC = INDEX_NAME_E5 = "index-e5"
MODEL_ID = MODEL_ID_E5 = ".multilingual-e5-small"

In [4]:
# Create the Elasticsearch client: basic authentication, with the server
# certificate verified against ELASTIC_CERT_FILE.
client = Elasticsearch(
    f"{ELASTIC_HOST}:{ELASTIC_PORT}",  # Elasticsearch endpoint as "<host>:<port>"
    basic_auth=(ELASTIC_USER, ELASTIC_PW),
    ca_certs=ELASTIC_CERT_FILE,
    verify_certs=True,
    request_timeout=120,
)

In [5]:
# Connectivity check: request the cluster's info and print the response.
print(client.info())

{'name': 'm-0.a66ff265-f36e-48ca-baa3-23289dc4bda3.b51f17bc2e17458483740b0a4e9faede.c5kmhkid0ujpmrucb800.databases.appdomain.cloud', 'cluster_name': 'a66ff265-f36e-48ca-baa3-23289dc4bda3', 'cluster_uuid': 'k8AWPXqXT2GmN_ckkb4aTw', 'version': {'number': '8.15.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179', 'build_date': '2024-08-05T10:05:34.233336849Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Test search results with Standard Query

In [13]:
# Placeholder test query: replace the text between the triple quotes with
# the question to search for.
query = '''
YOUR QUERY HERE?
'''

In [32]:
def print_response(response):
    """Print every search hit and return all hits formatted as one string.

    Args:
        response: Search response mapping. Each entry of
            ``response['hits']['hits']`` must provide ``_score``, ``_id``
            and ``_source['text']``.

    Returns:
        str: The formatted records concatenated in hit order, or an empty
        string when there are no hits.

    Raises:
        KeyError: If a hit lacks one of the expected keys.
    """
    records = []
    for hit in response['hits']['hits']:
        score = hit['_score']
        text = hit['_source']['text']
        doc_id = hit['_id']  # named doc_id so the built-in id() is not shadowed

        print(f"Score: {score}\nId: {doc_id}")
        print(f"Text: {text}\n")

        # The layout (leading newline, 8-space indent, trailing blank line)
        # is the exact record format this function has always returned.
        records.append(
            f"\n        Score: {score},\n        Id: {doc_id},\n\n        {text}\n\n"
        )
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(records)

# Test search results with Semantic Query using the E5 embeddings

In [24]:
### Retrieving the answer by searching all the documents in the knowledge base

In [35]:
# kNN search over the whole index: the query text is embedded by the E5 model
# (via query_vector_builder) and matched against the passage embedding field.
response = client.search(
    index=INDEX_NAME_E5,
    _source=["title", "text"],
    knn={
        "field": "passage_embedding.predicted_value",
        "query_vector_builder": {
            "text_embedding": {
                "model_id": MODEL_ID_E5,
                "model_text": query,
            },
        },
        "k": 3,
        "num_candidates": 100,
        # No filter is applied, so all documents in the index are searched.
        # To restrict the search to a single document, add for example:
        # "filter": {
        #     "term": {
        #         "title": "document.pdf"
        #     }
        # }
    },
)

In [None]:
# Show the query and the retrieved hits, then save the formatted hits to a file.
print('query: ',query)
print('---------------------------------------------------------------------------------------------------------------------------------------------')
print('top-k retrieved documents: ')
all_records = print_response(response)
# Write as UTF-8 explicitly: the platform default encoding may be unable to
# represent non-ASCII characters in the retrieved text.
with open('./top-k-results.txt', 'w', encoding='utf-8') as file:
    file.write(all_records)


In [10]:
# Display the embedding that the E5 model produces for the query.
# `docs` is passed as a list of documents, which is the form documented for
# the trained-model inference API.
inference_response = client.ml.infer_trained_model(
    model_id=MODEL_ID_E5,
    docs=[{"text_field": query}],
)

# One document was sent, so the first inference result holds its embedding.
embedding_response = inference_response['inference_results'][0]['predicted_value']

print(embedding_response)
print('vector dimensions: ',len(embedding_response))

[0.06569431722164154, -0.060937199741601944, -0.06180936470627785, -0.05385785549879074, 0.0354963019490242, -0.04639774188399315, 0.019259199500083923, 0.057852692902088165, 0.06701675057411194, -0.002707934007048607, 0.048455655574798584, 0.04657995328307152, 0.05171820893883705, -0.04945571348071098, -0.01599232852458954, 0.07136876881122589, 0.06835756450891495, -0.07991820573806763, -0.05303341895341873, -0.06053845211863518, 0.03884929046034813, -0.006243299692869186, -0.02369079552590847, 0.03650105372071266, 0.05961357802152634, 0.04816785454750061, -0.0615050233900547, -0.006271469406783581, 0.014892980456352234, -0.03818492218852043, -0.022313157096505165, -0.03922266140580177, 0.06258231401443481, -0.05160447955131531, 0.022579243406653404, 0.02659033052623272, -0.04115267097949982, -0.03478269651532173, 0.07060214877128601, -0.029985500499606133, -0.02844456396996975, 0.019679032266139984, 0.07415658235549927, 0.04614618048071861, 0.08956695348024368, 0.05010518059134483, -

In [32]:
#########################################################################