In [None]:
!pip install elasticsearch

In [2]:
!docker cp es01:/usr/share/elasticsearch/config/certs/http_ca.crt .

Copy your HTTPS Certificate to your local working directory (IMPORTANT)

Test to see if your certificate is there

In [None]:
!cat http_ca.crt

In [1]:
import os
from elasticsearch import Elasticsearch
from tqdm.notebook import tqdm


In [2]:
# Password for the 'elastic' user generated by Elasticsearch
ELASTIC_PASSWORD = os.getenv('ELASTIC_PASSWORD') # Looks like this  "kC9vasoasdasdasdasdpQ2w" #Your Password from the initial steps of Elastic Search


In [4]:

# Create the client instance
es = Elasticsearch(
    "https://localhost:9200",
    ca_certs="http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD)
)


In [5]:
if es.ping():
    print("Connected to Elasticsearch cluster")
else:
    print("Could not connect to Elasticsearch")

Connected to Elasticsearch cluster


In [6]:
# Test your connection

In [7]:
# Index name
index_name = "wiki-summary"

# Document to be indexed
document = {
    "title": "Document Title",
    "text": "This is the content of the document."
}

# Index the document
response = es.index(index=index_name, body=document)

In [8]:
response

ObjectApiResponse({'_index': 'wiki-summary', '_id': 'I68wAo4BkRXLy0xEI7QI', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 5315391, '_primary_term': 1})

# Test with the first 10 documents

In [27]:
file_path = "/Users/work/Downloads/without-punctuation/without-punctuation.txt"  # Replace with the path to your file

# Open the file and iterate through each line
count=0
num_lines = sum(1 for line in open(file_path, "r"))


with open(file_path, "r") as file:
    for line in tqdm(file, total=num_lines, desc="Reading file", unit="lines"):
        count=count+1
        if count>10:
            break
        # Split the line based on the separator "|||"
        parts = line.strip().split("|||")
        
        # Assuming you want to access the first and second parts separately
        if len(parts) == 2:
            title = parts[0].strip()
            text = parts[1].strip()
            
            document = {
                "title": title,
                "text": text
            }
            response = es.index(index=index_name, body=document)
            
        else:
            # Handle lines that don't have exactly two parts separated by "|||"
            print("Invalid line format:", line)

Reading file:   0%|          | 0/5315384 [00:00<?, ?lines/s]

Perform a Query

In [29]:
response

ObjectApiResponse({'_index': 'wiki-summary', '_id': 'La8zAo4BkRXLy0xEvrQx', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 5315401, '_primary_term': 1})

In [None]:
file_path = "/Users/work/Downloads/without-punctuation/without-punctuation.txt"  # Replace with the path to your file

# Open the file and iterate through each line
num_lines = sum(1 for line in open(file_path, "r"))


with open(file_path, "r") as file:
    for line in tqdm(file, total=num_lines, desc="Reading file", unit="lines"):
        # Split the line based on the separator "|||"
        parts = line.strip().split("|||")
        
        # Assuming you want to access the first and second parts separately
        if len(parts) == 2:
            title = parts[0].strip()
            text = parts[1].strip()
            
            document = {
                "title": title,
                "text": text
            }
            response = es_client.index(index=index_name, body=document)
            
        else:
            # Handle lines that don't have exactly two parts separated by "|||"
            print("Invalid line format:", line)

Reading file:   0%|          | 0/5315384 [00:00<?, ?lines/s]

Invalid line format: unary numeral system ||| the unary numeral system is the bijective base-1 numeral system it is the simplest numeral system to represent natural numbers in order to represent a number n an arbitrarily chosen symbol representing 1 is repeated n times for examples the numbers 1 2 3 4 5 would be represented in this system as 1 11 111 1111 11111 these numbers should be distinguished from repunits which are also written as sequences of ones but have their usual decimal numerical interpretation this system is used in tallying for example using the tally mark | the number 3 is represented as ||| in east asian cultures the number three is represented as “ 三 ” a character that is drawn with three strokes



In [None]:
## Write your wrapper function into DSPY to return as the proper format

In [13]:
#Let say your index in Elastic Search is 
index_name = "wiki-summary"

In [24]:
import dspy
from typing import Optional

class elastic_rm(dspy.Retrieve):
    def __init__(self, es_client, es_index, es_field, k=3):
        """"
        A retrieval module that uses Elastic simple vector search to return the top passages for a given query.
        Assumes that you already have instanciate your ESClient.

        The code has been tested with ElasticSearch 8.12
        For more information on how to instanciate your ESClient, please refer to the official documentation.
        Ref: https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html

        Args:
            es_client (Elasticsearch): An instance of the Elasticsearch client.
            es_index (str): The name of the index to search.
            es_field (str): The name of the field to search.
            k (Optional[int]): The number of context strings to return. Default is 3.
        """
        super().__init__()
        self.k=k
        self.es_index=es_index
        self.es_client=es_client
        self.field=es_field
        

    def forward(self, query,k: Optional[int] = None) -> dspy.Prediction:
        """Search with Elastic Search - local or cloud for top k passages for query or queries
   

        Args:
            query_or_queries (Union[str, List[str]]): The query or queries to search for.
            k (Optional[int]): The number of context strings to return, if not already specified in self.k

        Returns:
            dspy.Prediction: An object containing the retrieved passages.
        """

        k = k if k is not None else self.k

        passages = []

        # Define the index to search
        index_name = self.es_index #the name of the index of your elastic-search-dump

        # Define the search query
        search_query = {
            "query": {
                "match": {
                    self.field: query  #took for granted that your index has : title, text as document format
                }
            }
        }

        # Perform the search
        response = self.es_client.search(index=index_name, body=search_query)

        for hit in response['hits']['hits']:

            #Uncomment for debug...
            # Retrieve the score
            #score = hit["_score"]
            # Retrieve other fields from the source
            #title = hit["_source"]["title"]
            text = hit["_source"]["text"]
            #print("Score: %.2f | Tile: %s | Text: %s" % (score,title, text))
            passages.append(text)
            if len(passages) == self.k:  # Break the loop once k documents are retrieved
                break

        return dspy.Prediction(passages=passages)

In [25]:
es_retriever = elastic_rm(es,es_index=index_name, es_field="text", k=3)

In [26]:
es_retriever("Who is abraham lincoln")

Prediction(
    passages=['mary lincoln may refer to mary lincoln crume 1775-1851 daughter of abraham lincoln captain and bathsheba herring and aunt of american president abraham lincoln mary todd lincoln 1818–1882 wife of american president abraham lincoln mary johnson bailey lincoln 1844–1921 american science teacher mary mamie lincoln 1869–1938 granddaughter of abraham lincoln mary lincoln beckwith 1898–1975 prominent descendant of abraham lincoln', 'abraham lincoln a history is an 1890 ten-volume account of the life and times of abraham lincoln written by john nicolay and john hay who were his personal secretaries during the american civil war', 'inauguration of abraham lincoln may refer to first inauguration of abraham lincoln 1861 second inauguration of abraham lincoln 1865']
)