In [1]:
import os
import cohere
import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import Cohere
from dotenv import load_dotenv

In [2]:
# Step 1: Load environment variables (for Cohere API key)
# load_dotenv() comes from python-dotenv; it is called with no arguments here.
# NOTE(review): the value echoed under this cell (True) is the call's return
# value, presumably meaning a .env file was found and applied — confirm against
# the python-dotenv documentation.
load_dotenv()

True

In [3]:
# Step 2: Initialize Cohere client
# The API key is read from the environment (populated from .env by the
# load_dotenv() call above) instead of being hardcoded, so the secret never ends
# up in the notebook or in version control. Any key that was ever committed in
# this cell must be treated as leaked and rotated in the Cohere dashboard.
_cohere_api_key = os.getenv("COHERE_API_KEY")
if not _cohere_api_key:
    # Fail early with an actionable message rather than an opaque auth error later.
    raise RuntimeError(
        "COHERE_API_KEY is not set. Add it to your .env file or export it "
        "before running this cell."
    )
co = cohere.Client(_cohere_api_key)


In [4]:
# Step 3: PDF Reader function to load and process the PDF
def read_pdf(file_path):
    """Load a PDF through ``PyPDFLoader`` and hand back its loaded documents.

    Args:
        file_path: Location of the PDF file, passed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(file_path).load()`` produces.
    """
    return PyPDFLoader(file_path).load()

In [5]:
# Example: read a PDF from disk and report how many entries were loaded
pdf_path = r'D:\ChatBot_LLM3\sigma.pdf'  # Modify with your PDF file path
doc = read_pdf(pdf_path)
print(f"Total Pages in the document: {len(doc)}")

Total Pages in the document: 9


In [6]:
# Step 4: Chunk the document using a Recursive Text Splitter
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into smaller pieces with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split; forwarded to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 800).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        The result of ``split_documents(docs)`` on the configured splitter.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)

In [7]:
# Split the loaded PDF into chunks using chunk_data's default size and overlap
documents = chunk_data(doc)
print(f"Total Chunks Created: {len(documents)}")

Total Chunks Created: 31


In [8]:
from langchain.embeddings.base import Embeddings 
# Step 5: Cohere Embedding Function
class CohereEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by the notebook-level Cohere client ``co``.

    No model or input type is passed to ``co.embed``, so whatever the client
    defaults to is used for both documents and queries.
    """

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding per input text, in the same order. An empty input
            returns an empty list without calling the API.
        """
        if not texts:
            # Nothing to embed: skip the network round-trip entirely.
            return []
        response = co.embed(texts=texts)
        return response.embeddings

    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string and return its vector.

        Delegates to ``embed_documents`` so both paths share one code path.
        """
        return self.embed_documents([text])[0]
    


In [9]:
# Step 6: Store embeddings in FAISS
def store_faiss_embeddings(embeddings, documents):
    """Build an in-memory FAISS ``IndexFlatL2`` holding the given vectors.

    Args:
        embeddings: Non-empty sequence of equal-length numeric vectors
            (anything NumPy can turn into a 2-D array).
        documents: Unused by this function; kept so existing call sites
            continue to work.

    Returns:
        The populated ``faiss.IndexFlatL2`` index.

    Raises:
        ValueError: If ``embeddings`` is empty or is not a 2-D
            ``(n_vectors, dimension)`` array.
    """
    # FAISS operates on C-contiguous float32 matrices; a plain np.array() of
    # Python floats would be float64, so convert explicitly.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D collection of vectors, "
            f"got array with shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])  # exact search, L2 distance
    index.add(vectors)
    return index

In [10]:
# Embed every chunk's text with Cohere
cohere_embedder = CohereEmbeddings()
texts = [chunk.page_content for chunk in documents]
embeddings = cohere_embedder.embed_documents(texts)

In [11]:
import faiss
import numpy as np
# Build the FAISS index from the chunk embeddings
faiss_index = store_faiss_embeddings(embeddings, documents)

# Optionally, write the index to disk
index_path = "faiss_index.idx"
faiss.write_index(faiss_index, index_path)
print("FAISS Index has been created and saved.")

FAISS Index has been created and saved.


In [12]:
# Step 7: Search FAISS for relevant documents based on query
def search_faiss_index(query, k=5):
    """Return the FAISS row indices of the chunks closest to ``query``.

    Relies on the notebook-level ``cohere_embedder`` and ``faiss_index``.

    Args:
        query: Question text to embed and search for.
        k: Number of nearest neighbours to request (default 5, the value that
            was previously hard-coded).

    Returns:
        The index array returned by ``faiss_index.search`` (one row for the
        single query, ``k`` columns). Per the FAISS documentation, slots with
        no neighbour are filled with -1, so callers should skip negative
        entries before indexing into their document list.
    """
    # Use the query-specific embedding method and give FAISS the 2-D float32
    # matrix (1 x dimension) it expects.
    query_vector = np.ascontiguousarray(
        cohere_embedder.embed_query(query), dtype=np.float32
    ).reshape(1, -1)

    # Distances are not needed by callers; only the neighbour indices are returned.
    _distances, indices = faiss_index.search(query_vector, k=k)
    return indices

In [13]:
# Example query for retrieving matching chunks
# result_indices holds whatever search_faiss_index returns for this query; the
# output printed under this cell shows a single row of five chunk positions.
query = "give me a information about Akshay Residency"
result_indices = search_faiss_index(query)
print(f"Matching Chunk Indices: {result_indices}")

Matching Chunk Indices: [[23  3 10 15  4]]


In [14]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import Cohere

# Initialize the Cohere LLM. The API key is read from the environment (loaded
# from .env by load_dotenv() earlier in the notebook) rather than being written
# into the cell, so the secret is never stored in the notebook or its outputs.
# Any key that was previously committed here must be treated as leaked and rotated.
_cohere_api_key = os.getenv("COHERE_API_KEY")
if not _cohere_api_key:
    raise RuntimeError(
        "COHERE_API_KEY is not set. Add it to your .env file or export it "
        "before running this cell."
    )
llm = Cohere(cohere_api_key=_cohere_api_key)

# Load the QA chain ("stuff" puts all retrieved chunks into a single prompt)
qa_chain = load_qa_chain(llm, chain_type="stuff")


  llm = Cohere(cohere_api_key='<REDACTED>')  # Replace with your actual API key
stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  qa_chain = load_qa_chain(llm, chain_type="stuff")


In [15]:
# Get the matching documents based on the retrieved indices from FAISS.
# Per the FAISS documentation, result slots with no neighbour are filled with -1
# (e.g. when the index holds fewer vectors than requested). A bare documents[-1]
# would silently pick the LAST chunk, so negative placeholders are skipped.
matching_documents = [documents[i] for i in result_indices[0] if i >= 0]

In [16]:
# Chain.run() is deprecated in recent LangChain releases; invoke() is the
# supported entry point. It takes the inputs as one dict and returns a dict,
# with the combined-documents chain's answer stored under "output_text".
qa_result = qa_chain.invoke(
    {"input_documents": matching_documents, "question": query}
)
answer = qa_result["output_text"]
print(f"Answer: {answer}")

  answer = qa_chain.run(input_documents=matching_documents, question=query)


Answer:  Here's the information I have on file for Akshay Residency: 

1. Location: Wakad, Pune
2. Developer: Akshay Developers
3. Date of Completion: November 2017
4. Status: Completed
5. Connectivity: Close to Wakad Road, minutes from Mumbai-Pune Expressway
6. Amenities: 
- 24/7 Security
- Kids Play Area
- Parking
- Garden & Landscaping

I hope this information is helpful. Please let me know if there's anything else I can assist you with. 
