<a href="https://colab.research.google.com/github/ramahasiba/NLP/blob/LangChain/Build_a_Semantic_Search_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Build a Semantic Search Engine](https://python.langchain.com/docs/tutorials/retrievers/)

In [None]:
!pip install -qU langchain-chroma

In [None]:
!pip install langchain-community pypdf -q

## Setup

In [None]:
import os
from pprint import pprint
from dotenv import load_dotenv
import getpass

# Pull variables from a local .env file into the process environment.
# load_dotenv() reports "nothing loaded" through its boolean return value
# rather than by raising, so check the result instead of catching ImportError.
if not load_dotenv('.env'):
  print('No .env file found')

# Setup LangSmith to be able to inspect what exactly goes on inside the chain or agent
os.environ["LANGSMITH_TRACING"] = "true"
if "LANGSMITH_API_KEY" not in os.environ:
  os.environ["LANGSMITH_API_KEY"] = getpass.getpass(
      prompt = "Enter the Langsmith api key:"
  )

if "LANGSMITH_PROJECT" not in os.environ:
  # An empty answer at the prompt falls back to the "default" project.
  os.environ["LANGSMITH_PROJECT"] = getpass.getpass(
      prompt = "Enter langsmith project name: "
  ) or "default"

# os.environ only accepts str values, so assigning os.getenv(...) straight back
# raises TypeError when the variable is unset. Prompt for missing credentials
# instead, and leave the variable unset if the prompt is skipped.
for secret_name, secret_prompt in (
    ("GROQ_API_KEY", "Enter the Groq api key:"),
    ("HF_TOKEN", "Enter the Hugging Face token:"),
):
  if not os.environ.get(secret_name):
    entered_value = getpass.getpass(prompt=secret_prompt)
    if entered_value:
      os.environ[secret_name] = entered_value

## Documents and Document Loader

In [None]:
from langchain_core.documents import Document

# Build a couple of sample documents by hand; each one carries its text in
# page_content and a "source" entry in metadata.
documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

## Loading Documents

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF into a sequence of Document objects.
# PyPDFLoader was selected because it's fairly lightweight.
file_path = "/content/nke-10k-2023.pdf"
loader = PyPDFLoader(file_path=file_path)

# load() yields one Document object per PDF page
docs = loader.load()

print(len(docs))

In [None]:
# Peek at the first loaded page: the first 200 characters of text, a blank line, then its metadata
first_page = docs[0]
print(first_page.page_content[:200], end="\n\n")
print(first_page.metadata)  # metadata contains file name, page number and other information

The PyPDFLoader loads one Document object per PDF page.

## Splitting

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded pages into overlapping, character-based chunks
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,  # keep the character index at which each split document starts
    chunk_overlap=200,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(docs)

# show how many chunks were produced
len(all_splits)

## Embeddings

In [None]:
# Import from langchain_community, which this notebook installs and already uses
# for PyPDFLoader; `langchain.embeddings` is a legacy import path for this class.
# NOTE(review): newer releases recommend the separate `langchain-huggingface`
# package — consider switching if it is added to the install cell.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Name the model once and reuse it, so the variable and the call cannot drift apart
model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

## Vector Stores

In [None]:
from langchain_chroma import Chroma

# Create the vector store: a Chroma collection named "nike" that embeds text
# with `embeddings` and is given "./db" as its persist directory
vector_store = Chroma(
    persist_directory="./db",
    embedding_function=embeddings,
    collection_name="nike",
)

In [None]:
# index the documents into the vector store
# The store is persisted under ./db, so without stable ids every re-run of this
# cell would add another copy of each chunk and searches would return duplicates.
# Position-based ids are deterministic for the same input, so a re-run overwrites
# the existing entries instead of appending new ones.
split_ids = [f"nke-10k-2023-{position}" for position in range(len(all_splits))]
idx = vector_store.add_documents(documents=all_splits, ids=split_ids)

In [None]:
# Ask a natural-language question and print the first hit
query = "How many distribution centers does Nike have in the US?"
results = vector_store.similarity_search(query)

print(results[0])

Async query:

In [None]:
results = await vector_store.asimilarity_search("When was Nike incorporated?")

print(results[0])

Return with scores:

In [None]:
# different providers generate different scores
query = "What was Nike's revenue in 2023?"
results = vector_store.similarity_search_with_score(query)

# the first hit unpacks into a (document, score) pair
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

Return documents based on similarity to an embedded query:

In [None]:
# Embed the query text explicitly, then search with the resulting vector
query = "How were Nike's margins impacted in 2023?"
embedding = embeddings.embed_query(query)

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])

## Retrievers

In [None]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain

# creating a simple version of the retriever without subclassing Retriever
# building one around the similarity search method
@chain
def retriever(query: str) -> List[Document]:
  """Return the result of a k=1 similarity search on `vector_store` for `query`."""
  return vector_store.similarity_search(query, k=1)

retriever.batch([
    "How many distribution centers does Nike have in the US?",
    "When was Nike incorporated?"
])

# ------------------ Same retrieval via the vector store's built-in retriever ------------------
# The keyword must be spelled `search_kwargs`; with the previous misspelling the
# k=1 setting never reached the retriever.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

retriever.batch([
    "How many distribution centers does Nike have in the US?",
    "When was Nike incorporated?"
])