In [1]:
! pip install sqlalchemy elasticsearch langchain langchain-community



In [1]:
import os
import sqlite3
from sqlalchemy import create_engine, Column, Integer, String, Text
from sqlalchemy.orm import sessionmaker, declarative_base
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import ElasticVectorSearch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict, Any

In [2]:
ELASTICSEARCH_USERNAME = 'elastic'
# ELASTICSEARCH_PASSWORD = 'mg7eoG8kBAH0G3aALDNn'
ELASTICSEARCH_PASSWORD = 'OKPtlVrlZEVz=vwmz6Nj'
ELASTICSEARCH_HOST = 'https://localhost:9300'

In [3]:
es_client = Elasticsearch(
    hosts= ELASTICSEARCH_HOST,
    
    verify_certs=False,
    basic_auth=(
        ELASTICSEARCH_USERNAME,
        ELASTICSEARCH_PASSWORD,
    ),
)
if es_client.ping():
    print("Connected to Elasticsearch successfully!!!")
    
else:
    print("Could not Connect to Elastic Search after many retries")
    

Connected to Elasticsearch successfully!!!


  _transport = transport_class(


In [4]:
# === SQLite Configuration ===
DATABASE_URL = "sqlite:///parent_chunks.db"

In [5]:
# SQLAlchemy Base and Model
Base = declarative_base()

class ParentChunk(Base):
    __tablename__ = "parent_chunks"
    par_id = Column(Integer, primary_key=True, autoincrement=True)
    text = Column(Text, nullable=False)

In [6]:
# Setup SQLite engine and session
engine = create_engine(DATABASE_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
db_session = Session()


In [7]:
# === Step 1: Read Essay ===
with open("essay.txt", "r", encoding="utf-8") as file:
    content = file.read()

In [8]:
# Step 2: Split content into parent and child chunks

# Parent chunk splitting with ParentDocumentRetriever
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000,  add_start_index=True)

# Simple splitting for child chunks (400 characters)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap = 0, separators=["\n", ". ", ","], add_start_index=True)

In [9]:
# Split the document into parent chunks
parent_chunks = parent_splitter.split_text(content)

In [10]:
parent_chunks_with_id = []

for i, chunk in enumerate(parent_chunks):
    par = {"par_id": i, 
           "text": chunk}
    parent_chunks_with_id.append(par)

In [11]:
parent_chunks_with_id

[{'par_id': 0,
  'text': 'Australia, the "Land Down Under," is renowned for its unique landscapes, vibrant culture, and fascinating wildlife. With iconic landmarks like the Sydney Opera House and the Great Barrier Reef, it offers a blend of natural wonders and urban sophistication. The Outback’s vast red deserts contrast sharply with lush rainforests and golden beaches, making it a paradise for adventurers and nature lovers alike. Australia\'s indigenous heritage, rooted in the traditions of Aboriginal and Torres Strait Islander peoples, adds a profound cultural depth to the country.\n\nNestled in the heart of the Himalayas, Bhutan is a small, landlocked kingdom often referred to as the "Land of the Thunder Dragon." It is celebrated for its commitment to Gross National Happiness, prioritizing well-being over economic growth. Pristine landscapes, from snow-capped mountains to verdant valleys, frame ancient monasteries like the iconic Tiger’s Nest. Bhutan’s deep-rooted Buddhist tradition

In [12]:
child_id_counter = 0  # Initialize the counter for child IDs
child_chunks = []

In [13]:
for parent_chunk in parent_chunks_with_id:
    # For each parent chunk, generate child chunks
    p = parent_chunk['text']
    pid = parent_chunk['par_id']
    children = child_splitter.split_text(p)
    for child in children:
        child_chunks.append({
            'id': child_id_counter,  # Unique ID for child chunk
            'text': child,  # Child chunk content
            'parent_id': pid # Assign the parent ID to the child chunk
        })
        child_id_counter += 1 

In [14]:
# === Step 3: Store Parent Chunks in SQLite ===
for chunk in parent_chunks_with_id:
    parent_chunk = ParentChunk(par_id = chunk['par_id'] , text=chunk["text"])
    db_session.add(parent_chunk)
db_session.commit()

print("Parent chunks stored successfully in SQLite.")

Parent chunks stored successfully in SQLite.


In [15]:
child_chunks

[{'id': 0,
  'text': 'Australia, the "Land Down Under," is renowned for its unique landscapes, vibrant culture, and fascinating wildlife. With iconic landmarks like the Sydney Opera House and the Great Barrier Reef, it offers a blend of natural wonders and urban sophistication',
  'parent_id': 0},
 {'id': 1,
  'text': ". The Outback’s vast red deserts contrast sharply with lush rainforests and golden beaches, making it a paradise for adventurers and nature lovers alike. Australia's indigenous heritage, rooted in the traditions of Aboriginal and Torres Strait Islander peoples, adds a profound cultural depth to the country.",
  'parent_id': 0},
 {'id': 2,
  'text': 'Nestled in the heart of the Himalayas, Bhutan is a small, landlocked kingdom often referred to as the "Land of the Thunder Dragon." It is celebrated for its commitment to Gross National Happiness, prioritizing well-being over economic growth. Pristine landscapes, from snow-capped mountains to verdant valleys, frame ancient mo

In [69]:
# Name of the index to delete
index_name = "child_chunks_test"

try:
    # Check if the index exists
    if es_client.indices.exists(index=index_name):
        # Delete the index
        es_client.indices.delete(index=index_name)
        print(f"Index '{index_name}' deleted successfully.")
    else:
        print(f"Index '{index_name}' does not exist.")
except Exception as e:
    print(f"Error occurred: {e}")

Index 'child_chunks_test' deleted successfully.




In [70]:
# Get all indices and print their names
indices = es_client.cat.indices(format="json")

# Print the names of all indices
for index in indices:
    print(index['index'])

test_child_chunks
government
dev_khojai_collection
dev_khojai_policy_collection
government7




In [60]:
! pip install openai



In [61]:
from langchain_community.embeddings import OpenAIEmbeddings

In [62]:
from langchain.embeddings import OpenAIEmbeddings

In [63]:
# Embedding function from OpenAI
embedding_model = OpenAIEmbeddings(api_key = "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", model="text-embedding-3-small", dimensions =1536)

                    dimensions was transferred to model_kwargs.
                    Please confirm that dimensions is what you intended.


In [64]:
! pip install tiktoken



In [71]:
# Define the mapping for the child_chunks index
mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "integer"},  # ID field for child chunk
            "text": {"type": "text"},  # Store the text as keyword
            "parent_id": {"type": "keyword"},  # Parent ID field as keyword
            "embedding": {  # Dense vector field for embeddings
                "type": "dense_vector",
                "dims": 1536  # Specify the dimensions of the embedding (change to match your embedding size)
            }
        }
    }
}

In [72]:
# Create the index with the specified mapping
index_name = "child_chunks_test"

es_client.indices.create(index=index_name, body=mapping)
print(f"Index '{index_name}' created successfully.")



ConnectionTimeout: Connection timed out

In [25]:
from pprint import pprint

In [None]:
# Define the index name
index_name = "child_chunks_test"

# Define the match_all query
query = {
    "query": {
        "match_all": {}
    }
}

# Execute the search query
response = es_client.search(index=index_name, body=query)

# Print the results
print(f"Total Hits: {response['hits']['total']['value']}")
for hit in response['hits']['hits']:
    print(hit['_source']['text'])
    print(hit['_source']['parent_id'])
#     # print(i)
#     # Print the document source

Total Hits: 0




In [27]:
# Function to generate embeddings and prepare data for bulk ingestion
def gendata(child_chunks, batch_size=500):
    for chunk in child_chunks:
        embedding = embedding_model.embed_query(chunk['text'])  # Get embedding for the content
        yield {
            "_op_type": "index",
            "_index": "child_chunks_test",  # Index name in Elasticsearch
            "_id": chunk["id"],  # Unique ID for the document
            "_source": {
                "text": chunk["text"],  # Child chunk content
                "embedding": embedding,  # Embedding as a list of numbers
                "parent_id": chunk["parent_id"]  # Parent ID that the child chunk belongs to
            }
        }

In [None]:
success, failed = bulk(es_client, gendata(child_chunks))
print(f"Successfully indexed {success} child chunks into Elasticsearch.")
if failed:
    print(f"Failed to index {failed} chunks.")



Successfully indexed 30 child chunks into Elasticsearch.


In [29]:
len(parent_chunks_with_id)

4

In [None]:
# Define the match_all query to retrieve only text and parent_id
query = {
    "query": {
        "match_all": {}
    },
    "_source": ["text", "parent_id"]  # Specify the fields to retrieve
}

# Perform the query
response = es_client.search(index="child_chunks_test", body=query)

# Print the results
for hit in response['hits']['hits']:
    print(f"Text: {hit['_source']['text']}")
    print(f"Parent ID: {hit['_source']['parent_id']}")
    print("-" * 50)

Text: Australia, the "Land Down Under," is renowned for its unique landscapes, vibrant culture, and fascinating wildlife. With iconic landmarks like the Sydney Opera House and the Great Barrier Reef, it offers a blend of natural wonders and urban sophistication
Parent ID: 0
--------------------------------------------------
Text: . The Outback’s vast red deserts contrast sharply with lush rainforests and golden beaches, making it a paradise for adventurers and nature lovers alike. Australia's indigenous heritage, rooted in the traditions of Aboriginal and Torres Strait Islander peoples, adds a profound cultural depth to the country.
Parent ID: 0
--------------------------------------------------
Text: Nestled in the heart of the Himalayas, Bhutan is a small, landlocked kingdom often referred to as the "Land of the Thunder Dragon." It is celebrated for its commitment to Gross National Happiness, prioritizing well-being over economic growth. Pristine landscapes, from snow-capped mountain



In [49]:
child_chunks

[{'id': 0,
  'text': 'Australia, the "Land Down Under," is renowned for its unique landscapes, vibrant culture, and fascinating wildlife. With iconic landmarks like the Sydney Opera House and the Great Barrier Reef, it offers a blend of natural wonders and urban sophistication',
  'parent_id': 0},
 {'id': 1,
  'text': ". The Outback’s vast red deserts contrast sharply with lush rainforests and golden beaches, making it a paradise for adventurers and nature lovers alike. Australia's indigenous heritage, rooted in the traditions of Aboriginal and Torres Strait Islander peoples, adds a profound cultural depth to the country.",
  'parent_id': 0},
 {'id': 2,
  'text': 'Nestled in the heart of the Himalayas, Bhutan is a small, landlocked kingdom often referred to as the "Land of the Thunder Dragon." It is celebrated for its commitment to Gross National Happiness, prioritizing well-being over economic growth. Pristine landscapes, from snow-capped mountains to verdant valleys, frame ancient mo

In [47]:
text_response = es_client.search(
    index="child_chunks_test",
    body={
        "query": {
            "match_all": {}
        }
    }
)

print("The text response:", text_response)

The text response: {'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 30, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'child_chunks_test', '_id': '0', '_score': 1.0, '_source': {'text': 'Australia, the "Land Down Under," is renowned for its unique landscapes, vibrant culture, and fascinating wildlife. With iconic landmarks like the Sydney Opera House and the Great Barrier Reef, it offers a blend of natural wonders and urban sophistication', 'embedding': [0.016798522087728005, 0.026166118314384493, 0.034796937812017245, 0.08230854455406975, 0.010325407464503441, 0.014367156460511019, -0.0023958417250904544, 0.0012169981903113645, 0.005631083543584545, 0.013609329047628538, -0.005223224331930291, -0.03616523691944843, -0.06807821893562441, -0.008188753273953998, 0.035344257454989716, -0.007867728249632383, -0.057763337278711614, 0.03414436381842849, 0.023534770724848154, 0.00829400669324772, 0.0037



In [67]:
text_response = es_client.search(
    index="child_chunks_test",
    query={
            "match": {
                "text": {"query": "China"}
                }
            }
    )

print("The text response:", text_response)



ApiError: ApiError(503, 'search_phase_execution_exception', None)

In [44]:
if es_client.indices.exists(index="child_chunks_test"):
    print("Index exists.")
else:
    print("Index does not exist.")

Index exists.




In [40]:
mapping = es_client.indices.get_mapping(index="child_chunks_test")
print("Field mapping:", mapping["child_chunks_test"]["mappings"])

Field mapping: {'properties': {'embedding': {'type': 'dense_vector', 'dims': 1536, 'index': True, 'similarity': 'cosine'}, 'id': {'type': 'integer'}, 'parent_id': {'type': 'keyword'}, 'text': {'type': 'keyword'}}}




In [37]:
def rrf_search(user_input):
    # create embedding of user input
    # embedding_model = OpenAIEmbeddings(api_key = "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", model="text-embedding-3-small", dimensions =1536)
    token_vector = embedding_model.embed_query(user_input)
    
    # text search (match query)
    text_es_query = {
        "query": {
            "term": {
                "text": {
                    "value": user_input
                }
            }
        }
    }
    text_response = es_client.search(
        index="child_chunks_test",
        body=text_es_query
    )

    
    #KNN search (on embeddings)
    knn_es_query = {
        "knn": {
            "field": "embedding",
            "query_vector": token_vector,
            "k": 7,  # Number of nearest neighbors to retrieve
            "num_candidates": 100
        }
    }
    
    knn_response = es_client.search(
        index="child_chunks_test",
        body=knn_es_query
    )
    print("The knn response: ", knn_response)
    
    # Text search results
    text_data = []
    for hit in text_response['hits']['hits']:
        text_data.append({
            "id": hit["_id"],
            "text": hit["_source"].get("content", ""),  # Fetch content field
            "parent_id": hit["_source"].get("parent_id", ""),  # Fetch parent_id field
            "score": hit["_score"],
            "source": "text"
        })
    
    # KNN search results
    knn_data = []
    for hit in knn_response['hits']['hits']:
        knn_data.append({
            "id": hit["_id"],
            "text": hit["_source"].get("text", ""),  # Fetch text field
            "parent_id": hit["_source"].get("parent_id", ""),  # Fetch parent_id field
            "score": hit["_score"],
            "source": "knn"
        })
    
    # Combined results
    combined_data = text_data + knn_data
    
    # Apply RRF 
    k = 20  # RRF parameter
    rrf_scores = {}
    for item in combined_data:
        if item["id"] not in rrf_scores:
            rrf_scores[item["id"]] = 0
        # Calculate RRF score
        rank = 1 + item["score"]
        rrf_scores[item["id"]] += 1 / (k + rank)
    
    # Aggregate results and sort by RRF score
    aggregated_results = {}
    for item in combined_data:
        if item["id"] not in aggregated_results:
            aggregated_results[item["id"]] = {
            "id": item["id"],
            "text": item["text"],  # Child chunk content
            "parent_id": item["parent_id"],  # Parent ID
            "rrf_score": rrf_scores[item["id"]],
        }
    
    # Sort aggregated results by RRF score in descending order
    sorted_results = sorted(
        aggregated_results.values(),
        key=lambda x: x["rrf_score"],
        reverse=True
    )
    print("The sorted results are:", sorted_results)
    # Return the Answer of the top 5 results as context for LLM
    context = []

    # Ensure there are results to process
    if sorted_results:
        # Iterate through the top N results or fewer if less available
        context = []
        for i, result in enumerate(sorted_results[:2], start=1):
            # context.append(f"Chunk {i}: {result['text']}\nParent ID: {result['parent_id']}")
            context.append(result)
    else:
        context = "No relevant results found."
    return context
    
    # return sorted_results



In [34]:
a = rrf_search("Portugal, on the Iberian Peninsula, is a country of maritime heritage and scenic beauty.")

User input is:  Portugal, on the Iberian Peninsula, is a country of maritime heritage and scenic beauty.
The text response: {'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}}
The knn response:  {'took': 6, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 7, 'relation': 'eq'}, 'max_score': 0.9357196, 'hits': [{'_index': 'child_chunks_test', '_id': '28', '_score': 0.9357196, '_source': {'text': 'Portugal, on the Iberian Peninsula, is a country of maritime heritage and scenic beauty. Lisbon, its capital, charms with its historic trams, cobblestone streets, and vibrant neighborhoods. The Douro Valley, famous for its vineyards, and the Algarve, with its golden beaches, highlight Portugal’s natural diversity', 'embedding': [-0.012627121389034351, -0.0016893513159111433, 0.03417102456722646, 0.04



In [54]:
a

[{'id': '10',
  'text': 'France is synonymous with art, romance, and gastronomy. Paris, the "City of Light," enchants with the Eiffel Tower, the Louvre, and Notre Dame. Beyond the capital, regions like Provence, Normandy, and the French Riviera showcase diverse landscapes and cultural identities. France’s culinary tradition, from baguettes and croissants to fine wines and cheeses, is world-famous. The country’s historic',
  'parent_id': 1,
  'rrf_score': 0.04612634697408881},
 {'id': '12',
  'text': 'Germany, a land of innovation and tradition, is at the heart of Europe. Known for its engineering excellence and cultural contributions, it offers a mix of historic cities like Berlin and Munich alongside picturesque towns such as Heidelberg. The Black Forest, Rhine Valley, and Bavarian Alps highlight Germany’s natural beauty. Its rich history, from the medieval Holy Roman Empire to reunification',
  'parent_id': 1,
  'rrf_score': 0.04612123567295124}]

In [39]:
def get_parent_text_combined(data):
    """
    Retrieves unique parent_ids from a list of dictionaries, fetches their corresponding text from the database,
    and combines all parent texts into a single string separated by '\n\n'.

    Parameters:
        data (list): A list of dictionaries containing a 'parent_id' key.
        db_session (Session): An active SQLAlchemy session for database queries.

    Returns:
        str: A single string combining all parent texts, separated by '\n\n'.
    """
    db_session=Session()
    # Extract unique parent_ids
    unique_parent_ids = {item['parent_id'] for item in data}

    # Query the database for the texts corresponding to these parent_ids
    results = db_session.query(ParentChunk.text).filter(ParentChunk.par_id.in_(unique_parent_ids)).all()

    # Initialize an empty string and append parent texts separated by '\n\n'
    combined_text = "\n\n".join(text for (text,) in results)
    
    return combined_text

In [40]:
print(get_parent_text_combined(a))

Australia, the "Land Down Under," is renowned for its unique landscapes, vibrant culture, and fascinating wildlife. With iconic landmarks like the Sydney Opera House and the Great Barrier Reef, it offers a blend of natural wonders and urban sophistication. The Outback’s vast red deserts contrast sharply with lush rainforests and golden beaches, making it a paradise for adventurers and nature lovers alike. Australia's indigenous heritage, rooted in the traditions of Aboriginal and Torres Strait Islander peoples, adds a profound cultural depth to the country.

Nestled in the heart of the Himalayas, Bhutan is a small, landlocked kingdom often referred to as the "Land of the Thunder Dragon." It is celebrated for its commitment to Gross National Happiness, prioritizing well-being over economic growth. Pristine landscapes, from snow-capped mountains to verdant valleys, frame ancient monasteries like the iconic Tiger’s Nest. Bhutan’s deep-rooted Buddhist traditions shape its serene way of lif