# Langchain With Vector Embeddings Search
LoadData->Transform->Embeddings->VectorStore->Retriever

In [4]:
import sys
sys.path.insert(1, '../../')
import init_creds as creds
 
AZURE_OPENAI_API_KEY = creds.get_api_key()
AZURE_OPENAI_ENDPOINT = creds.get_endpoint()
# print(AZURE_OPENAI_API_KEY)
# print(AZURE_OPENAI_ENDPOINT)
 
if not AZURE_OPENAI_API_KEY:
    raise ValueError("No AZURE_OPENAI_API_KEY set for Azure OpenAI API")
if not AZURE_OPENAI_ENDPOINT:
    raise ValueError("No AZURE_OPENAI_ENDPOINT set for Azure OpenAI API")

## Load the Data

In [5]:
#Code to import CSV file data for our tasks
from langchain.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(file_path='../materials/kindData.csv', csv_args={
    'delimiter': ',',
    'quotechar': '"',
    'fieldnames': ['Words']
})

#Assigning the data inside the csv to our variable here
data = loader.load()

#Display the data
print(data)

[Document(metadata={'source': '../materials/kindData.csv', 'row': 0}, page_content='Words: \ufeffWords'), Document(metadata={'source': '../materials/kindData.csv', 'row': 1}, page_content='Words: Elephant'), Document(metadata={'source': '../materials/kindData.csv', 'row': 2}, page_content='Words: Lion'), Document(metadata={'source': '../materials/kindData.csv', 'row': 3}, page_content='Words: Tiger'), Document(metadata={'source': '../materials/kindData.csv', 'row': 4}, page_content='Words: Dog'), Document(metadata={'source': '../materials/kindData.csv', 'row': 5}, page_content='Words: Cricket'), Document(metadata={'source': '../materials/kindData.csv', 'row': 6}, page_content='Words: Footbal'), Document(metadata={'source': '../materials/kindData.csv', 'row': 7}, page_content='Words: Tennis'), Document(metadata={'source': '../materials/kindData.csv', 'row': 8}, page_content='Words: Basketball'), Document(metadata={'source': '../materials/kindData.csv', 'row': 9}, page_content='Words: Ap

## Load Embeddings

#### Below cell is used for an embedding model that is open source from hugging face as we dont have an OpenAI API key else we could have used OpenAIEmbeddings directly for document search or some from Hugging Face Hub

In [6]:
# from langchain.embeddings import SentenceTransformerEmbeddings
# import os
# os.environ["SENTENCE_TRANSFORMERS_HOME"] = "sentence_transformers"
# emb_model = "all-MiniLM-L6-v2"
# embeddings = SentenceTransformerEmbeddings(model_name=emb_model,
#     cache_folder=os.getenv('SENTENCE_TRANSFORMERS_HOME'))

In [7]:
import os

os.environ["AZURE_OPENAI_API_KEY"] = AZURE_OPENAI_API_KEY
os.environ["AZURE_OPENAI_ENDPOINT"] = AZURE_OPENAI_ENDPOINT
os.environ["AZURE_OPENAI_API_VERSION"]="2024-07-01-preview"

from langchain_openai import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

## Load the vector db with embedding run on the data

In [8]:
# Create a vector store with a sample text
from langchain_core.vectorstores import InMemoryVectorStore

text = "LangChain is the framework for building context-aware reasoning applications"

vectorstore = InMemoryVectorStore.from_texts(
    [text],
    embedding=embeddings,
)

# Use the vectorstore as a retriever
retriever = vectorstore.as_retriever()

# Retrieve the most similar text
retrieved_documents = retriever.invoke("What is LangChain?")

# show the retrieved document's content
retrieved_documents[0].page_content

'LangChain is the framework for building context-aware reasoning applications'

In [9]:
%pip install faiss-cpu
%pip install langchain-chroma
%pip install chromadb
%pip install langchain-community
%pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
from langchain_chroma.vectorstores import Chroma
from langchain_community.vectorstores.faiss import FAISS
chromadb=Chroma.from_documents(data,embeddings)
faissdb=FAISS.from_documents(data,embeddings)

## Search vector db

In [None]:
query="lemon"
# Facebook AI Similarity Search (Faiss), a library that allows us to quickly search for multimedia documents
docsFoundFromFAISS=faissdb.similarity_search(query, k=10)
print("Results from FAISS!!!!!!!")
for i,item in enumerate(docsFoundFromFAISS):
    print(item.page_content)

In [None]:
print("-------------------------------------")
faissdb.docstore.__dict__

In [None]:
chromadb._collection.get(include=['documents','embeddings'])


## Retriever

In [None]:
retriever = faissdb.as_retriever(search_kwargs={"k":4})
retriever

In [None]:
retriever.get_relevant_documents("Cricket")