<a href="https://colab.research.google.com/github/prakul/MongoDB-AI-Resources/blob/main/Langchain_%2B_MongoDB_%2B_localEmbed_101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install prerequisite dependencies


In [8]:
!pip install langchain pypdf pymongo openai python-dotenv



In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/156.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m153.6/156.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.5.1


# Set up the environment

In [3]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient

# Load variables from a `.env` file in the notebook root directory into the
# process environment; override=True lets values from the file replace
# variables that are already set. Example `.env` contents:
#   MONGO_URI="xxx"
#   OPENAI_API_KEY="xxx"   # optional, see below
load_dotenv(override=True)

# Embeddings in this notebook come from a local Hugging Face model, so a
# missing OpenAI key must not abort setup. The value is None when unset;
# it is only relevant to the ChatOpenAI cell in the RAG section.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Required: raises KeyError when MONGO_URI is not defined.
MONGO_URI = os.environ["MONGO_URI"]

# Target database / collection and the name of the Atlas Vector Search index
# that has to be created on that collection.
DB_NAME = "langchain-test-3"
COLLECTION_NAME = "test2"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_search"

# Handle to the collection the chunks are written to and queried from.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
MONGODB_COLLECTION = db[COLLECTION_NAME]

# DATA PREPARATION

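Create an Atlas Vector Search index on the collection (Atlas UI -> Search -> Vector Search -> JSON Editor) with the definition below. `numDimensions` must equal the output size of the embedding model used to populate the `embedding` field.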
 ```
 {
   "fields":[
      {
         "type": "vector",
         "path": "embedding",
         "numDimensions": 1024,
         "similarity": "cosine"
      },
      {
         "type": "filter",
         "path": "page"
      }
   ]
}

```

In [7]:
from langchain.document_loaders import PyPDFLoader

# Source document for the knowledge base (an arXiv PDF; presumably the paper
# the GPT-4 questions further down refer to).
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"

# `data` holds whatever documents the loader returns for that PDF.
loader = PyPDFLoader(PDF_URL)
data = loader.load()

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into smaller chunks (chunk_size=500, no overlap
# between neighbouring chunks) before they are embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(data)

In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Hugging Face model used to embed both the stored chunks and the queries.
# The Atlas index's `numDimensions` has to match this model's output size.
EMBEDDING_MODEL_NAME = "thenlper/gte-large"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [14]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# Embed every chunk in `docs` with `embeddings` and write the results to
# MONGODB_COLLECTION, associated with the named Atlas Vector Search index.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again -- confirm before executing it more than once.
x = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=embeddings,
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)


# DATA QUERY

In [9]:
# Open a vector-store handle on the existing collection. This lets the query
# cells run even when the ingestion cell above was skipped in this session.

from langchain.vectorstores import MongoDBAtlasVectorSearch

# The collection is addressed by its "<database>.<collection>" namespace.
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    MONGO_URI,
    f"{DB_NAME}.{COLLECTION_NAME}",
    embeddings,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)


## SEMANTIC SEARCH

In [None]:
question = "How much better is GPT-4 in reducing hallucinations over GPT-3.5"

# Retrieve the stored chunks most similar to the question.
results = vector_search.similarity_search(question)

# End on the bare expression so the retrieved documents are displayed;
# an assignment alone produces no cell output.
results


## RAG (WIP)

In [None]:
from langchain_community.llms import HuggingFaceHub

# Generation settings handed to the model through `model_kwargs`.
zephyr_generation_kwargs = {
    "max_new_tokens": 512,
    "top_k": 30,
    "temperature": 0.1,
    "repetition_penalty": 1.03,
}

# Text-generation LLM backed by the zephyr-7b-beta repository.
llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    model_kwargs=zephyr_generation_kwargs,
)



In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
# Chat model used by the retrieval QA chain. Note that this rebinds `llm`,
# replacing the HuggingFaceHub instance created in the previous cell.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)


{'query': 'How much better is GPT-4 in reducing hallucinations over GPT-3.5',
 'result': "I'm sorry, but I don't have access to information about the specific capabilities or improvements of GPT-4 over GPT-3.5. My knowledge is based on GPT-3, and I don't have real-time updates on subsequent versions."}

In [None]:
# Question posed to the retrieval QA chain (same text as in the semantic
# search section).
question = (
    "How much better is GPT-4 in reducing hallucinations over GPT-3.5"
)


In [None]:
# Retrieval-augmented QA: the retriever fetches relevant chunks from the
# vector store and `llm` answers the question from them.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vector_search.as_retriever())

# `invoke` is the supported entry point; calling the chain object directly
# is deprecated in LangChain. The returned dict (query + result) is the
# cell's displayed output.
qa_chain.invoke({"query": question})


{'query': 'How much better is GPT-4 in reducing hallucinations over GPT-3.5',
 'result': 'GPT-4 scores 19 percentage points higher than GPT-3.5 in reducing hallucinations.'}