# MongoDB Vector Search Demo

## Connect to MongoDB and OpenAI

In [None]:
import pymongo
from _secrets import OPENAI_API_KEY, MONGODB_URI
import os

# Export the key as an environment variable. The OpenAI-backed objects later in
# this notebook are constructed without an explicit key, so presumably they
# read it from here -- confirm.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
client = pymongo.MongoClient(MONGODB_URI)

# Names shared by the cells that follow.
db_name = 'vector_search_demo'
collection_name = 'documents'
index_name = 'vector_index'

db = client[db_name]
# Destructive: removes any existing collection with this name so that each run
# of the demo starts from an empty collection.
db.drop_collection(collection_name)

## Create Vector Search index

In [None]:
# Create the demo collection explicitly so the search index can be attached
# to it before any documents are inserted.
collection = db.create_collection(collection_name)

# Index definition: a 1536-dimension vector field at "embedding" compared by
# cosine similarity, plus "page" registered as a filter field.
vector_index = {
  "definition": {
    "fields": [
      {"numDimensions": 1536, "path": "embedding", "similarity": "cosine", "type": "vector"},
      {"path": "page", "type": "filter"},
    ],
  },
  "name": index_name,
  "type": "vectorSearch",
}

# Issue the createSearchIndexes command against the collection; the keyword
# argument becomes the "indexes" field of the command document.
db.command("createSearchIndexes", collection_name, indexes=[vector_index])

## Load sample PDF, chunk, and generate embeddings

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter

# Read the sample PDF that sits next to the notebook.
# NOTE(review): the loader presumably attaches a `page` metadata value to each
# document, which the `page` filter field of the index relies on -- confirm.
loader = PyPDFLoader("./sample.pdf")
data = loader.load()

# Split the loaded documents into overlapping chunks
# (chunk_size=512, chunk_overlap=100, as passed to TokenTextSplitter).
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=100)
docs = text_splitter.split_documents(data)

# Build the LangChain vector store over `collection`, bound to the search index
# named `index_name`. Per the LangChain API this embeds every chunk and writes
# it to the collection -- expect OpenAI API calls here.
# NOTE(review): disallowed_special=() presumably lets chunks that contain
# special-token text be embedded instead of raising -- confirm.
vector_search = MongoDBAtlasVectorSearch.from_documents(
  documents = docs,
  embedding = OpenAIEmbeddings(disallowed_special=()),
  collection = collection,
  index_name = index_name
)

## Convert text query to vector embedding

In [None]:
# Free-text query reused by the search cells that follow.
query = 'MongoDB Atlas security'
# Turn the query text into an embedding vector; it is used as `queryVector`
# in the raw aggregation cell.
query_embedding = OpenAIEmbeddings().embed_query(text=query)

# Last expression of the cell: the notebook displays the raw vector.
query_embedding

## Semantic search for relevant chunks of documents
### MongoDB Aggregation

In [None]:

# Stage 1: vector search on the "embedding" path through the named index,
# considering 40 candidates and keeping the 4 best matches.
vector_search_stage = {
  '$vectorSearch': {
    'index': index_name,
    'path': 'embedding',
    'queryVector': query_embedding,
    'numCandidates': 40,
    'limit': 4
  }
}

# Stage 2: return only the chunk text, dropping _id and everything else.
text_only_stage = {'$project': {'_id': 0, 'text': 1}}

res = collection.aggregate([vector_search_stage, text_only_stage])

# Materialize the cursor so the notebook displays the matching chunks.
list(res)

### LangChain Similarity Search

In [None]:
# Run the text query through the LangChain vector store; unlike the raw
# aggregation, no precomputed embedding is passed in here.
vector_search.similarity_search(query)

### Layer on filters and scores

In [None]:
# Same query, but also returning scores and restricted to chunks whose `page`
# value is 17 or 18. `page` is declared as a filter field in the index
# definition, which is what makes it usable in `pre_filter`.
vector_search.similarity_search_with_score(
  query = query,
  pre_filter = { "page": { "$in": [17, 18] } }
)

## Build a RAG chain

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

# Wrap the vector store as a retriever asking for up to 10 chunks per query.
# NOTE(review): `score_threshold` is passed together with
# search_type="similarity". LangChain documents that option for
# search_type="similarity_score_threshold", so the 0.75 cutoff may be silently
# ignored here -- confirm against the retriever documentation before relying
# on it.
retriever = vector_search.as_retriever(
  search_type = "similarity",
  search_kwargs = {"k": 10, "score_threshold": 0.75}
)

# Prompt text with two placeholders, {context} and {question}, which the chain
# fills in at invocation time.
template = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Take a deep breath and answer step by step.
{context}
Question: {question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)
# Chat model constructed with library defaults (no model name is given here).
llm = ChatOpenAI()

def format_docs(docs):
  """Concatenate the `page_content` of each document into one string.

  Consecutive documents are separated by a blank line; an empty input
  yields an empty string.
  """
  separator = "\n\n"
  page_texts = [document.page_content for document in docs]
  return separator.join(page_texts)

# Compose the RAG pipeline with LangChain's pipe syntax:
#   1. the dict supplies the prompt's two variables: "context" is the
#      retriever output run through format_docs, and "question" is the chain
#      input passed along as-is;
#   2. the prompt template is filled in;
#   3. the chat model answers;
#   4. the model output is converted to a plain string.
rag_chain = (
  { "context": retriever | format_docs, "question": RunnablePassthrough()}
  | custom_rag_prompt
  | llm
  | StrOutputParser()
)

## Prompt the chain

In [None]:
# The question string is the single input to the chain; it is used both for
# retrieval and as the {question} slot of the prompt.
question = "How can I secure my MongoDB clusters?"
rag_chain.invoke(question)

### Display the context used

In [None]:
# Run only the retrieval step for the same question to inspect which chunks
# the chain receives as context.
retriever.invoke(question)