In [6]:
import pymongo
import os
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.docling import DoclingReader
from dotenv import load_dotenv
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever



In [2]:
# Load environment variables
load_dotenv()

user = os.getenv("user")
pwd = os.getenv("pwd")

# Define embedding model
embed_model = OpenAIEmbedding()

# Read data from the provided source
reader = DoclingReader()

docs = reader.load_data("https://arxiv.org/pdf/2411.05442")

# MongoDB Atlas configuration
mongo_uri = f"mongodb+srv://{user}:{pwd}@cluster0.lymvb.mongodb.net/?retryWrites=true&w=majority"

mongo_client = pymongo.MongoClient(mongo_uri)


  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 116508.44it/s]


In [4]:
# Initialize the vector store
store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name="llamaindex",
    collection_name="test",
    vector_index_name="vector_index"
)
storage_context = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    docs, storage_context=storage_context, embed_model=embed_model
)

In [12]:
# Instantiate Atlas Vector Search as a retriever
vector_store_retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
# Pass the retriever into the query engine
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)
# Prompt the LLM
response = query_engine.query('what is intellbot?')
print(response)
print("\nSource documents: ")
print(response.source_nodes)

IntellBot is an advanced cyber security chatbot that leverages cutting-edge technologies like Large Language Models and Langchain. It is designed to provide tailored responses to user inquiries related to cyber security, including information on known vulnerabilities, recent cyber attacks, and emerging threats. IntellBot gathers data from various sources to create a comprehensive knowledge base, offering insights and advice to both cyber security professionals and the general public. By utilizing Large Language Models, IntellBot can deliver contextually relevant information across different domains and adapt to evolving conversational contexts, enhancing threat intelligence and incident response.

Source documents: 
[NodeWithScore(node=TextNode(id_='841c6a53-ccb5-4ecf-9c68-843581ddcc3c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='4bd580af-2b88-4c01-8c55-e15f027b0b73