# 7. Frameworks, Libraries and APIs

### Import Libraries

In [1]:
import getpass, os, pymongo, pprint
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient
from dotenv import load_dotenv

load_dotenv()

True

### Load Environment Variables

In [2]:
OPENAI_API_KEY: str | None = os.getenv(
    key="OPENAI_API_KEY",
    default=None,
)
MONGODB_CONNECTION_STRING: str | None = os.getenv(
    key="MONGODB_CONNECTION_STRING",
    default=None,
)

### Connect to MongoDB

In [3]:
# Connect to MongoDB
client = MongoClient(
    host=MONGODB_CONNECTION_STRING, tls=True, tlsAllowInvalidCertificates=True
)

In [4]:
mm_ai_db = client.get_database(name="mm_ai")
print(mm_ai_db)
mm_ai_db.list_collection_names()

Database(MongoClient(host=['mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017', 'mmdev-shard-00-00.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'mm_ai')


['test']

In [5]:
ca_coll = mm_ai_db.get_collection(name="test")
print(ca_coll)
ca_coll.list_search_indexes()

Collection(Database(MongoClient(host=['mmdev-shard-00-01.z7q8g.mongodb.net:27017', 'mmdev-shard-00-02.z7q8g.mongodb.net:27017', 'mmdev-shard-00-00.z7q8g.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-b4um4g-shard-0', tls=True, tlsallowinvalidcertificates=True, tlsdisableocspendpointcheck=True), 'mm_ai'), 'test')


<pymongo.synchronous.command_cursor.CommandCursor at 0x10da15b20>

In [6]:
vector_search_index: str = "vector_index"

### Fetch and split public PDF documents

In [63]:
from langchain_core.documents.base import Document
from pprint import pprint

URL: str = "https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP"

# Load Document
loader = PyPDFLoader(file_path=URL)
data: list[Document] = loader.load()

print(len(data), data)
pprint(data[0])

Document(metadata={'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 0}, page_content='MongoDB Atlas Best PracticesJanuary 2019\nA MongoDB White Paper\n')


In [64]:
# Split PDF into Documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=20,
)

docs = text_splitter.split_documents(documents=data)

print(len(docs), docs)
pprint(docs[0])

Document(metadata={'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 0}, page_content='MongoDB Atlas Best PracticesJanuary 2019\nA MongoDB White Paper')


### Create Vector Store

In [65]:
# Instantiate Embedding Model
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=3072,
    disallowed_special=(),
)

In [7]:
# Check if vector search index already exists
existing_indexes = list(ca_coll.list_search_indexes())
print(existing_indexes)

[{'id': '678372d5219f0d65a7001e98', 'name': 'vector_index', 'type': 'vectorSearch', 'status': 'READY', 'queryable': True, 'latestDefinitionVersion': {'version': 0, 'createdAt': datetime.datetime(2025, 1, 12, 7, 44, 21, 691000)}, 'latestDefinition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 3072, 'similarity': 'cosine'}, {'type': 'filter', 'path': 'page'}]}, 'statusDetail': [{'hostname': 'atlas-b4um4g-shard-00-02', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2025, 1, 12, 7, 44, 21)}, 'definition': {'fields': [{'type': 'vector', 'path': 'embedding', 'numDimensions': 3072, 'similarity': 'cosine'}, {'type': 'filter', 'path': 'page'}]}}}, {'hostname': 'atlas-b4um4g-shard-00-00', 'status': 'READY', 'queryable': True, 'mainIndex': {'status': 'READY', 'queryable': True, 'definitionVersion': {'version': 0, 'createdAt': datetime.datetime(2025, 1, 12, 7, 44, 

In [None]:
# ca_coll.drop_search_index(name="vector_index")

In [78]:
# Create the Vector Store
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=embedding_model,
    collection=ca_coll,
    index_name=vector_search_index,
)

In [77]:
# Create Index
from pymongo.operations import SearchIndexModel

# Define search index model
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 3072,
                "similarity": "cosine",
            },
            {
                "type": "filter",
                "path": "page",
            },
        ]
    },
    name=vector_search_index,
    type="vectorSearch",
)

ca_coll.create_search_index(model=search_index_model)
print("Vector search index created successfully.")

Vector search index created successfully.


### Langchain Semantic Search with Score

In [81]:
query: str = "MongoDB Atlas Security"

result = vector_store.similarity_search_with_score(
    query=query,
    k=3,
)
pprint(object=result)

[(Document(metadata={'_id': '678373aea320cfc6a18b74c8', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
  0.8972679376602173),
 (Document(metadata={'_id': '678370aaa320cfc6a18b7325', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
  0.89713454246521),
 (Document(metadata={'_id': '678373aea320cfc6a18b74cb', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='automatically enabled.\nReview the security section of the MongoDB Atlas\ndocumentation to learn more about each of 

### Semantic Search with Pre-Filtering
> Pre-filtering narrows down the Search Space before performing a more computationally intensive vector search. Benefits: Increased Performance, Improved Accuracy and Enhanced Query Relevance.

In [82]:
query: str = "MongoDB Atlas Security"

result = vector_store.similarity_search_with_score(
    query=query,
    k=3,
    pre_filter={
        "page": {
            "$eq": 17,
        },
    },
)
pprint(object=result)

[(Document(metadata={'_id': '678373aea320cfc6a18b74c8', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
  0.8972679376602173),
 (Document(metadata={'_id': '678370aaa320cfc6a18b7325', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
  0.89713454246521),
 (Document(metadata={'_id': '678373aea320cfc6a18b74cb', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='automatically enabled.\nReview the security section of the MongoDB Atlas\ndocumentation to learn more about each of 

### Implementing a basic RAG with Langchain
1. Setup a MongoDB Atlas Vector Search Retriever for Similarity-Based Search.
2. Return the 10 most relevant documents.
3. Utiliza a custom RAG prompt with a LLM to answer based on the retrieved documents.

In [86]:
# Instantiate Atlas Vector Search as a Retriever
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 3,
    },
)

retriever.get_relevant_documents(query=query)

[Document(metadata={'_id': '678373aea320cfc6a18b74c8', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
 Document(metadata={'_id': '678370aaa320cfc6a18b7325', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
 Document(metadata={'_id': '678373aea320cfc6a18b74cb', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='automatically enabled.\nReview the security section of the MongoDB Atlas\ndocumentation to learn more about each of the security\nfeatures discussed below.\nIP Whi

In [88]:
# Define a prompt template
template = """
Use the following pieces of context to answer the question at the end.If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
"""

custom_rag_prompt = PromptTemplate.from_template(template=template)

print(custom_rag_prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} template="\nUse the following pieces of context to answer the question at the end.If you don't know the answer, just say that you don't know, don't try to make up an answer.\n{context}\nQuestion: {question}\n"


In [89]:
llm = ChatOpenAI(
    name="gpt-4o",
    api_key=OPENAI_API_KEY,
)
print(llm)

name='gpt-4o' client=<openai.resources.chat.completions.Completions object at 0x1174687a0> async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x11744c6e0> root_client=<openai.OpenAI object at 0x11742b7d0> root_async_client=<openai.AsyncOpenAI object at 0x11745edb0> model_kwargs={} openai_api_key=SecretStr('**********')


In [90]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [95]:
print(docs[0])
print(format_docs(docs=docs[0:5]))

page_content='MongoDB Atlas Best PracticesJanuary 2019
A MongoDB White Paper' metadata={'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 0}
MongoDB Atlas Best PracticesJanuary 2019
A MongoDB White Paper

Table of Contents
1Introduction
2Preparing for a MongoDB Deployment
9Scaling a MongoDB Atlas Cluster
11Continuous Availability & Data Consistency
12Managing MongoDB
16Security

16Security
17Business Intelligence with MongoDB Atlas
18Considerations for Proofs of Concept
18MongoDB Stitch: Serverless Platform from MongoDB
19We Can Help
19Resources

Introduction
MongoDB Atlas provides all of the features of MongoDB,
without the operational heavy lifting required for any new
application. MongoDB Atlas is available on-demand

through a pay-as-you-go model and billed on an hourly
basis, letting you focus on what you do best.
It’s easy to get started – use a simple GUI to select the


In [97]:
# Construct a RAG Chain to answer question based on the data
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

In [99]:
# Prompt the Chain
question = "How can I secure my MongoDB Atlas cluster?"
answer = rag_chain.invoke(input=question)

print(f"QUESTION: {question}")
print(f"ANSWER: {answer}")

QUESTION: How can I secure my MongoDB Atlas cluster?
ANSWER: You can secure your MongoDB Atlas cluster by utilizing the security features such as IP whitelisting and other controls provided by MongoDB Atlas. For more information, review the security section of the MongoDB Atlas documentation.


In [100]:
# Return Source Documents
documents = retriever.get_relevant_documents(query=question)
print("Source Documents: ")
pprint(object=documents)

Source Documents: 
[Document(metadata={'_id': '678373aea320cfc6a18b74c8', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
 Document(metadata={'_id': '678370aaa320cfc6a18b7325', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='MongoDB Atlas features extensive capabilities to defend,\ndetect, and control access to MongoDB, offering among\nthe most complete security controls of any modern\ndatabase:'),
 Document(metadata={'_id': '678373aea320cfc6a18b74cb', 'source': 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP', 'page': 17}, page_content='automatically enabled.\nReview the security section of the MongoDB Atlas\ndocumentation to learn more about each of the security\nfeatures discu