 **REQUIREMENTS**

In [1]:
!pip install langchain-community
!pip install "unstructured[all-docs]"
!pip install langchain_ollama
!pip install -U langchain-unstructured
!pip install pymupdf langchain
!pip install chromadb
!pip install -U langchain-openai
!pip install -U langchain langchain-openai chromadb sentence-transformers
!pip install langchain-cohere langchain-vectara
!pip install --upgrade langchain
!pip install langchain langchain-community flashrank
!pip install pytesseract pdf2image
!pip install pytesseract
!pip install pymupdf



Collecting langchain-unstructured
  Downloading langchain_unstructured-0.1.6-py3-none-any.whl.metadata (3.3 kB)
Collecting onnxruntime<=1.19.2,>=1.17.0 (from langchain-unstructured)
  Downloading onnxruntime-1.19.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Downloading langchain_unstructured-0.1.6-py3-none-any.whl (7.0 kB)
Downloading onnxruntime-1.19.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnxruntime, langchain-unstructured
  Attempting uninstall: onnxruntime
    Found existing installation: onnxruntime 1.21.0
    Uninstalling onnxruntime-1.21.0:
      Successfully uninstalled onnxruntime-1.21.0
Successfully installed langchain-unstructured-0.1.6 onnxruntime-1.19.2
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.

In [101]:
# --- Step 1: Imports ---

import os
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [117]:
# --- Step 2: EXTRACT TEXT FROM A DIGITAL/SCANNED PDF & CONVERT IT INTO DOCUMENT OBJECTS ---

import fitz  # PyMuPDF
from pdf2image import convert_from_path
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document


def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, falling back to OCR when needed.

    The embedded text layer is read first with PyMuPDF. If that yields no
    text, or raises, every page is rasterised with ``convert_from_path`` and
    passed to ``pytesseract.image_to_string`` instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text. On the digital path each page that produced text
        contributes its stripped text followed by a newline; on the OCR path
        each rasterised page contributes its OCR output followed by a newline.

    Raises:
        Errors raised by the OCR fallback are not caught and propagate to the
        caller. Errors from the digital path are printed and trigger the
        fallback.
    """
    try:
        # Try direct text extraction (digital PDFs)
        doc = fitz.open(pdf_path)
        try:
            page_texts = [page.get_text().strip() for page in doc]
        finally:
            # Release the document even when a page fails to parse.
            doc.close()

        text = "".join(page_text + "\n" for page_text in page_texts if page_text)
        if text.strip():
            return text
    except Exception as e:
        print(f"Digital extraction failed: {e}")

    # Fallback to OCR for scanned PDFs. The result is built from scratch so
    # that text recovered before a mid-document failure is not duplicated by
    # the OCR pass over the same pages.
    print("Falling back to OCR extraction...")
    images = convert_from_path(pdf_path)
    return "".join(pytesseract.image_to_string(img) + "\n" for img in images)

# --- Source PDF -> raw text ---
pdf_path = "The Alchemist by Paulo Coelho-1.pdf"
all_text = extract_text_from_pdf(pdf_path)

# --- Raw text -> overlapping chunks (1024 chars, 100 shared with the neighbour) ---
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1024)
chunks = splitter.split_text(all_text)
print("Chunks created:", len(chunks))

# --- Chunks -> LangChain Documents, dropping whitespace-only chunks ---
texts = [Document(page_content=piece) for piece in filter(str.strip, chunks)]
print("Texts created:", len(texts))


Chunks created: 225
Texts created: 225


In [118]:
# --- Step 3: Embed the chunks with HuggingFace and store them in a Chroma vector DB ---
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

chroma_db_path = "./alchemist_chroma"

# Chroma.from_documents adds to whatever collection already lives in
# persist_directory, so re-running this cell would store every chunk again and
# retrieval would return duplicates. Reuse the persisted index when one exists;
# delete the directory to force a rebuild (e.g. after changing the PDF or the
# chunking parameters).
if os.path.isdir(chroma_db_path) and os.listdir(chroma_db_path):
    vectorstore = Chroma(persist_directory=chroma_db_path, embedding_function=embedding_model)
    print("✅ Existing vector store loaded!")
else:
    # Create and persist
    vectorstore = Chroma.from_documents(texts, embedding_model, persist_directory=chroma_db_path)
    vectorstore.persist()
    print("✅ Vector store created and saved!")


✅ Vector store created and saved!


In [119]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.retrievers.multi_query import MultiQueryRetriever

In [120]:
# --- Step 4: MEMORY, LLM, MULTI-QUERY RETRIEVER AND QA CHAIN SETUP ---

# --- Conversation memory ---
# The keys mirror the QA chain in this cell: it is invoked with "question"
# and configured to answer under "result".
memory = ConversationBufferMemory(
    input_key="question",
    output_key="result",
    memory_key="chat_history",
    return_messages=True,
)

# --- Groq-hosted LLM, reached through Groq's OpenAI-compatible endpoint ---
# The API key comes from the environment; a missing GROQ_API_KEY raises KeyError.
groq_llm = ChatOpenAI(
    model_name="llama-3.1-8b-instant",
    temperature=0.7,
    base_url="https://api.groq.com/openai/v1",
    api_key=os.environ["GROQ_API_KEY"],
)

# --- Multi-query retriever over the Chroma store, driven by the same LLM ---
multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=groq_llm,
    retriever=vectorstore.as_retriever(),
)

# ✅ Hybrid Search Function
def hybrid_search(query: str, vectorstore, k=5, keyword=None):
    """Run a similarity search and move keyword hits to the front.

    Args:
        query: Text passed to ``vectorstore.similarity_search``.
        vectorstore: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` string.
        k: Number of results requested from the store and maximum returned.
        keyword: Optional case-insensitive substring. Results containing it
            are listed first; relative order within each group is kept.

    Returns:
        A list of at most ``k`` results. Only the documents returned by the
        similarity search are considered; the keyword never adds new ones.
    """
    candidates = vectorstore.similarity_search(query, k=k)

    # No keyword: the semantic ranking is returned untouched.
    if not keyword:
        return list(candidates)[:k]

    needle = keyword.lower()
    hits, others = [], []
    for candidate in candidates:
        bucket = hits if needle in candidate.page_content.lower() else others
        bucket.append(candidate)

    # Keyword hits first, then everything else, each in semantic order.
    return (hits + others)[:k]


# --- Conversational QA chain ---
# Wires the Groq LLM, the multi-query retriever and the chat memory together.
# The answer is exposed under "result" and the retrieved source documents are
# returned alongside it.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=groq_llm,
    retriever=multi_query_retriever,
    memory=memory,
    output_key="result",
    return_source_documents=True,
)

In [121]:
# --- Step 5: Ask Something ---
# The chain is invoked with the user turn under "question" and the reply is
# read back from "result", the output_key configured on qa_chain.
question = "What lesson does Santiago learn from the crystal merchant?"
response = qa_chain.invoke({"question": question})
print("Answer:\n", response["result"])

Answer:
 Santiago learns that true wealth is not just about material possessions, but about understanding and living in accordance with one's heart's desires. The crystal merchant, who has spent his life accumulating wealth and possessions, is shown to be unhappy and unsatisfied, despite his outward success. 

The merchant is too afraid to take a risk and sell all his crystals at a loss in order to fulfill his dream of going to Mecca, and instead, he lives a life of regret, stuck in his daily routine. 

Santiago, on the other hand, is inspired by the merchant's story and realizes that he needs to follow his own dreams and desires, rather than accumulating material wealth. The merchant's story serves as a cautionary tale, highlighting the importance of living in accordance with one's heart and pursuing one's passions, rather than simply accumulating wealth and possessions.
