<a href="https://colab.research.google.com/github/prakul/MongoDB-AI-Resources/blob/main/Langchain%2BMongoDB_%24vectorSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install prerequisite dependencies


In [None]:
# Install the third-party packages this notebook relies on. pypdf and tiktoken
# are not imported directly below; presumably they are needed by PyPDFLoader
# and the OpenAI embeddings wrapper respectively (confirm before removing).
!pip install langchain pypdf pymongo openai python-dotenv tiktoken

# Setup the environment

In [None]:
import os

from dotenv import load_dotenv
from pymongo import MongoClient

# Read settings from a `.env` file placed in the notebook root directory.
# The file is expected to provide both variables read below, e.g.
#   OPENAI_API_KEY="xxx"
#   MONGO_URI="xxx"
# override=True is passed so values in the file take precedence over
# variables that are already present in the process environment.
load_dotenv(override=True)

# Required secrets: indexing os.environ raises KeyError if one is missing.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
MONGO_URI = os.environ["MONGO_URI"]

# Atlas namespace, search index name and the document field holding vectors.
DB_NAME = "langchain-test-2"
COLLECTION_NAME = "test"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "default"
EMBEDDING_FIELD_NAME = "embedding"

# Handles shared by the LangChain cells further down.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
MONGODB_COLLECTION = db[COLLECTION_NAME]

ModuleNotFoundError: ignored

## INSERT DATA

In [None]:
from langchain.document_loaders import PyPDFLoader

# Source PDF on arXiv (the GPT-4 Technical Report, per the sample chunk
# printed a few cells below).
pdf_url = "https://arxiv.org/pdf/2303.08774.pdf"

loader = PyPDFLoader(pdf_url)
data = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded pages into small, non-overlapping chunks so that each
# chunk can be embedded and stored as its own document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(data)

In [None]:
docs[0]

Document(page_content='GPT-4 Technical Report\nOpenAI∗\nAbstract\nWe report the development of GPT-4, a large-scale, multimodal model which can\naccept image and text inputs and produce text outputs. While less capable than\nhumans in many real-world scenarios, GPT-4 exhibits human-level performance\non various professional and academic benchmarks, including passing a simulated\nbar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-', metadata={'source': '/tmp/tmp01sv5lz3/tmp.pdf', 'page': 0})

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# Embed every chunk with OpenAI and store the chunks in the Atlas collection.
# NOTE: each run of this cell inserts the documents again.
x = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)


## CREATE INDEX

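 Create an Atlas Search index named `default` on the collection via Atlas UI -> Search -> JSON Editor, using one of the two definitions below.
 The first indexes only the `embedding` field; the second additionally indexes `source` as a token field, which the filtered queries in this notebook match on.
 Reference: https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/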
 ```
  {
   "mappings": {
     "dynamic": true,
     "fields": {
       "embedding": {
         "dimensions": 1536,
         "similarity": "cosine",
         "type": "knnVector"
       }
     }
   }
 }
```

```
{
  "mappings": {
    "dynamic": true,
    "fields": {
      "embedding": {
        "dimensions": 1536,
        "similarity": "cosine",
        "type": "knnVector"
      },
      "source": [
        {
          "normalizer": "lowercase",
          "type": "token"
        }
      ]
    }
  }
}
```

# DATA QUERY

In [None]:
import pymongo

# Connection used by the raw MQL examples. The database and collection names
# come from the constants defined in the setup cell, so these queries always
# target the same namespace the documents were inserted into.
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

In [None]:
import openai

In [None]:
openai.api_key = os.getenv("OPENAI_API_KEY")

model = "text-embedding-ada-002"


def get_embedding(text: str) -> list[float]:
    """Return the OpenAI embedding vector for ``text``.

    The embedding model is taken from the module-level ``model`` name and the
    API key from ``openai.api_key``. An unpinned ``pip install openai``
    installs openai>=1.0, where ``openai.Embedding`` no longer works, so both
    client generations are supported here.

    Args:
        text: The string to embed.

    Returns:
        The embedding of ``text`` as a list of floats.
    """
    if hasattr(openai, "OpenAI"):
        # openai>=1.0 exposes the client class and returns typed objects.
        response = openai.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    # openai<1.0: legacy resource class with a dict-style response.
    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]


## $VECTORSEARCH MQL Query without Filter

In [None]:
query = "gpt-4"

# Unfiltered $vectorSearch: embed the query text, search the vector index,
# then project only the source path and the similarity score. The index name
# and vector field come from the setup constants rather than repeated
# literals, so they cannot drift from what the documents were inserted with.
results = collection.aggregate(
    [
        {
            "$vectorSearch": {
                "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
                "queryVector": get_embedding(query),
                "numCandidates": 200,
                "limit": 20,
                "path": EMBEDDING_FIELD_NAME,
            }
        },
        {
            "$project": {
                "_id": 0,
                "source": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]
)

for document in results:
    print(document)


{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9324517250061035}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9298850297927856}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9282032251358032}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9266623854637146}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9254909157752991}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9251236915588379}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9243311285972595}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9239716529846191}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9236046075820923}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9231781959533691}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9222846031188965}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9207495450973511}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9207268357276917}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 0.9202592968940735}
{'source': '/tmp/tmpevpslauk/tmp.pdf', 'score': 

## $VECTORSEARCH MQL Query with Filter

In [None]:
query = "gpt-4"

# Filtered $vectorSearch: same search as the unfiltered example, but only
# documents whose `source` equals the given path are eligible. The index name
# and vector field come from the setup constants rather than repeated
# literals, so they cannot drift from what the documents were inserted with.
results = collection.aggregate(
    [
        {
            "$vectorSearch": {
                "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
                "queryVector": get_embedding(query),
                "limit": 2,
                "numCandidates": 200,
                "path": EMBEDDING_FIELD_NAME,
                "filter": {
                    "source": {"$eq": "/tmp/tmpevpslauk/tmp1.pdf"},
                },
            }
        },
        {
            "$project": {
                "_id": 0,
                "source": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]
)

for document in results:
    print(document)


{'source': '/tmp/tmpevpslauk/tmp1.pdf', 'score': 0.9191571474075317}


## Langchain semantic search without filter

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# Build a LangChain vector store on top of the existing collection. The
# second argument is the namespace in "<database>.<collection>" form.
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    MONGO_URI,
    f"{DB_NAME}.{COLLECTION_NAME}",
    OpenAIEmbeddings(disallowed_special=()),
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)


In [None]:
query = "gpt-4"

# Ask the vector store for the 20 chunks most similar to the query.
results = vector_search.similarity_search(query=query, k=20)

# Print every returned Document in full (content and metadata).
for result in results:
    print(result)


## Langchain semantic search with filter

In [None]:
query = "gpt-4"

# Same semantic search, restricted to chunks whose `source` metadata equals
# the given path.
results = vector_search.similarity_search(
    query=query,
    k=20,
    pre_filter={"source": {"$eq": "/tmp/tmpevpslauk/tmp1.pdf"}},
)

# Print only the source path of each match.
for result in results:
    print(dict(result.metadata)["source"])


/tmp/tmpevpslauk/tmp1.pdf


-------------------------------------------------


## Langchain QA without filters

In [None]:
# Retriever for question answering without any metadata filter. It requests
# k=200 results and passes one extra pipeline stage, {"$limit": 25}, through
# `post_filter_pipeline`.
qa_retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 200,
        "post_filter_pipeline": [{"$limit": 25}],
    },
)


In [None]:
from langchain.prompts import PromptTemplate

# Prompt for the "stuff" QA chain: the retrieved chunks fill {context} and
# the user's query fills {question}.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

# "stuff" chain: the retrieved chunks are placed into PROMPT's {context} and
# sent to the LLM together with the question.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# The answer is kept under its own name so that `docs` (the list of document
# chunks built earlier) is not overwritten; otherwise re-running the insert
# cell would receive this result dict instead of the chunks.
qa_result = qa({"query": "gpt-4 compute requirements"})

print(qa_result["result"])
print(qa_result["source_documents"])

## Langchain QA with filters

In [None]:
# Retriever for question answering restricted to a single source file. It
# requests k=20 results, pre-filters on `source`, and passes one extra
# pipeline stage, {"$limit": 2}, through `post_filter_pipeline`.
qa_retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 20,
        "pre_filter": {
            "source": {"$eq": "/tmp/tmpevpslauk/tmp1.pdf"},
        },
        "post_filter_pipeline": [{"$limit": 2}],
    },
)


In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

# Same "stuff" chain as the unfiltered example, built on whichever
# `qa_retriever` is currently defined (the filtered one when the cells are
# run in order).
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# The answer is kept under its own name so that `docs` (the list of document
# chunks built earlier) is not overwritten; otherwise re-running the insert
# cell would receive this result dict instead of the chunks.
qa_result = qa({"query": "gpt-4 compute requirements"})

print(qa_result["result"])
# The retrieved chunks are available under qa_result["source_documents"].

## Langchain QA misc

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model with temperature 0, and the question used for the retrieval QA
# chain in the next cell.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
question = "How much better is GPT-4 in reducing hallucinations over GPT-3.5"


{'query': 'How much better is GPT-4 in reducing hallucinations over GPT-3.5',
 'result': "I'm sorry, but I don't have access to information about the specific capabilities or improvements of GPT-4 over GPT-3.5. My knowledge is based on GPT-3, and I don't have real-time updates on subsequent versions."}

In [None]:
# Retrieval-augmented QA using the vector store's default retriever settings.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vector_search.as_retriever(),
)
# Left as the final expression so the notebook displays the returned dict.
qa_chain({"query": question})


{'query': 'How much better is GPT-4 in reducing hallucinations over GPT-3.5',
 'result': 'GPT-4 scores 19 percentage points higher than GPT-3.5 in reducing hallucinations.'}