In [17]:
from pymilvus import AnnSearchRequest
from langchain_community.embeddings import OllamaEmbeddings
from pymilvus import RRFRanker
from pymilvus import connections, Collection
import ast
import re
import json
from collections import defaultdict

# Open the Milvus connection. The Collection(...) calls in the search helpers
# below pass no explicit connection alias, so they presumably rely on this one.
connections.connect(
    host="localhost", # Replace with your Milvus server IP
    port="19530"
)
# Shared embedding model: prayers_search() and judgement_search() both call
# embeddings.embed_query() on it.
# NOTE(review): the stored vectors were presumably produced with the same
# "phi3" model; a different model would make the L2 distances meaningless — confirm.
embeddings = OllamaEmbeddings(model="phi3")



def prayers_search(search_text, limit=5):
    """Return the texts of the prayers most similar to ``search_text``.

    The query is embedded with the module-level ``embeddings`` model, searched
    against the ``prayerVector`` field of ``prayers_collection``, and the
    matching rows are then fetched by ``prayerId``.

    Args:
        search_text: Free-text query to embed and search for.
        limit: Maximum number of prayers to return.

    Returns:
        A list of ``prayerText`` strings ordered from best to worst match, or
        an empty list when the search returns no hits.
    """
    query_vector = embeddings.embed_query(search_text)
    request = AnnSearchRequest(
        data=[query_vector],          # Query vector
        anns_field="prayerVector",    # Vector field name
        param={
            # Must be identical to the metric used in the collection schema.
            "metric_type": "L2",
            "params": {"nprobe": 10},
        },
        limit=limit,                  # Candidates returned by this request
    )

    collection = Collection(name="prayers_collection")
    collection.load()
    res = collection.hybrid_search(
        [request],      # Only one request, so RRF reranking keeps its order
        RRFRanker(),
        limit=limit,    # Honour the caller's limit instead of a fixed 10
    )

    # Read the ids straight from the Hit objects (already best-first) rather
    # than parsing the printable representation of the result.
    ids = [hit.id for hit in res[0]]
    if not ids:
        return []

    # Assumes "prayerId" is the field storing the ids returned by the search.
    results = collection.query(
        expr=f"prayerId in {ids}",
        output_fields=["prayerId", "prayerText"],
    )

    # An `in` filter does not return rows in the order the ids were listed,
    # so restore the search ranking before dropping the ids.
    rank = {hit_id: position for position, hit_id in enumerate(ids)}
    ordered = sorted(results, key=lambda row: rank.get(row["prayerId"], len(rank)))
    return [row["prayerText"] for row in ordered]






In [18]:
def clean_judgements(results):
    """Merge judgement chunk rows into one text per file.

    Args:
        results: Iterable of dicts with the keys ``judgementFileName``,
            ``judgementChunkId`` and ``judgementText``.

    Returns:
        A list with one dict per file name, in order of first appearance:
        ``{'judgementFileName': ..., 'combinedJudgementText': ...}``. The text
        is the file's chunks in ascending chunk-id order, each followed by a
        single space, with surrounding whitespace stripped from the result.
    """
    # file name -> {chunk id -> accumulated text for that chunk}
    chunks_by_file = {}
    for row in results:
        file_name = row['judgementFileName']
        chunk_no = row['judgementChunkId']
        piece = row['judgementText']
        file_chunks = chunks_by_file.setdefault(file_name, {})
        # Rows sharing a chunk id are appended in the order they arrive.
        file_chunks[chunk_no] = file_chunks.get(chunk_no, '') + piece + " "

    merged = []
    for file_name, file_chunks in chunks_by_file.items():
        in_order = [file_chunks[chunk_no] for chunk_no in sorted(file_chunks)]
        merged.append({
            'judgementFileName': file_name,
            'combinedJudgementText': ''.join(in_order).strip(),
        })
    return merged

In [20]:
def judgement_search(search_text, limit=5):
    """Return the full text of the judgements most similar to ``search_text``.

    The query is embedded with the module-level ``embeddings`` model and
    searched against the ``judgementVector`` field of ``judgement_collection``.
    Every chunk of each file that produced a hit is then fetched and merged
    into one text per file with ``clean_judgements``.

    Args:
        search_text: Free-text query to embed and search for.
        limit: Maximum number of chunk hits used to select files.

    Returns:
        A list of ``{'judgementFileName', 'combinedJudgementText'}`` dicts,
        ordered by each file's best-ranked chunk, or an empty list when the
        search returns no hits.
    """
    query_vector = embeddings.embed_query(search_text)
    request = AnnSearchRequest(
        data=[query_vector],            # Query vector
        anns_field="judgementVector",   # Vector field name
        param={
            # Must be identical to the metric used in the collection schema.
            "metric_type": "L2",
            "params": {"nprobe": 10},
        },
        limit=limit,                    # Candidates returned by this request
    )

    collection = Collection(name="judgement_collection")
    collection.load()
    res = collection.hybrid_search(
        [request],      # Only one request, so RRF reranking keeps its order
        RRFRanker(),
        limit=limit,    # Honour the caller's limit instead of a fixed 10
    )

    # Read the ids straight from the Hit objects (already best-first) rather
    # than parsing the printable representation of the result.
    ids = [hit.id for hit in res[0]]
    if not ids:
        return []

    # Step 1: find which file each matching chunk belongs to.
    res_files = collection.query(
        expr=f"id in {ids}",
        output_fields=["id", 'judgementFileName'],
    )
    file_by_id = {row['id']: row['judgementFileName'] for row in res_files}

    # dict.fromkeys de-duplicates while keeping best-hit-first order.
    file_names = list(dict.fromkeys(
        file_by_id[hit_id] for hit_id in ids if hit_id in file_by_id
    ))
    if not file_names:
        return []

    # Step 2: fetch every chunk of those files so the full text can be rebuilt.
    results = collection.query(
        expr=f"judgementFileName in {file_names}",
        output_fields=["id", "judgementText", "judgementChunkId", 'judgementFileName'],
    )

    # The query returns rows in no particular relevance order, so put the
    # merged files back in the order of their best-ranked chunk.
    file_rank = {name: position for position, name in enumerate(file_names)}
    return sorted(
        clean_judgements(results),
        key=lambda doc: file_rank.get(doc['judgementFileName'], len(file_rank)),
    )

In [23]:
# First stage: look up stored prayers similar to the section description.
# prayer_result is a list of prayer texts (empty when nothing matched).
search_text = 'Section 437 of H MC Act Award costs'
prayer_result = prayers_search(search_text)

In [24]:
# Second stage: enrich the judgement query with the first prayer returned by
# the previous cell, falling back to an empty sample when there was none.
if prayer_result:
    prayer = prayer_result[0]
else:
    prayer = ''

search_text = f'''section: Section 437 of H MC Act Award costs

Sample prayer:{prayer}
'''

# Left as the last expression so the notebook displays the merged judgements.
judgement_search(search_text)

[{'judgementFileName': '43770.pdf',
  'combinedJudgementText': "KURIAN,J.\n1. Delay condoned.\n2. Leave granted.\n3. The appellant is aggrieved since the Third\nAdditional Family Court, Chennai was not taking steps to\ndispose of his case in spite of the direction of the High\nCourt in the judgment dated 29.06.2015 to dispose of the\ncase within a period of three months. When the matter\ncame up before this Court, the following order was\npassed: passed:\n“It is seen from the impugned Judgment that the\nHigh Court had directed the Third Additional\nFamily Court, Chennai, to dispose of H.M.O.P. No.\n1606 of 2014 within a period of three months from\nthe date of the impugned Judgment. The impugned\nJudgment was delivered on 29.06.2015.\nIt is reported that even after almost an year, the\nsaid case is not likely to be disposed of. said case is not likely to be disposed of.\nThe Registry is directed to call for a report from\nthe Third Additional Family Court, Chennai, as to\nwhat is the r

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF
# NOTE(review): absolute, machine-specific path — the cell only runs where
# this file exists; consider a configurable data directory.
loader = PyPDFLoader("/home/ubuntu/ram/code/data/act_2_of_1956.pdf")

# Load the PDF and split it into Document objects
# NOTE(review): load_and_split() appears to run a default text splitter on top
# of the per-page load, so `pages` may hold chunks rather than exactly one
# Document per PDF page — confirm against the LangChain loader docs.
pages = loader.load_and_split()

# Report how many Document objects were produced
total_pages = len(pages)
print(f"Total pages loaded: {total_pages}")

Total pages loaded: 599


In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Re-split the loaded documents into small overlapping chunks: at most 400
# units per chunk with a 50-unit overlap between neighbouring chunks.
# NOTE(review): the units are presumably characters (the splitter's default
# length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
splits = text_splitter.split_documents(pages)

In [8]:
# Display the first chunk to sanity-check the splitter output and its metadata.
splits[0]

Document(metadata={'source': '/home/ubuntu/ram/code/data/act_2_of_1956.pdf', 'page': 0}, page_content='THE GREATER HYDERABAD MUNICIPAL CORPORATION  \nACT, 1955.  \n(ACT NO. II OF 1956)  \nARRANGEMENT OF SECTIONS  \nSection s \n CHAPTER - I \nPreliminary  \n1. Short title, extent and commencement . \n2. Definitions.  \n3. Constitution of Corporation.  \n CHAPTER - II. \nThe Municipal Constitution.  \nMunicipal Authorities.  \n4. Municipal authorities charged with the execution of \nthe Act.')

In [9]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

In [10]:
# Connect to Milvus
# NOTE(review): same host/port as the connect call in the search-helper cell
# near the top of this notebook; the execution counts suggest the cells were
# run out of order, so one of the two calls is probably redundant — confirm
# which one a fresh top-to-bottom run needs.
connections.connect(
    host="localhost", # Replace with your Milvus server IP
    port="19530"
)

In [13]:
# collection_name = "section_collection"

# # Load the collection
# collection = Collection(name=collection_name)
# # 
# # Drop (delete) the collection
# collection.drop()