In [26]:
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint,ChatHuggingFace
from langchain_core.prompts import PromptTemplate,ChatPromptTemplate
from langchain_core.messages import HumanMessage , SystemMessage
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
load_dotenv()

True

## Define ChatModel

In [3]:
llm = HuggingFaceEndpoint(
    repo_id="openai/gpt-oss-20b",
    task="text-generation",
)

model=ChatHuggingFace(llm=llm)

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


## PDF Loader

In [8]:
pdf_object = PyPDFLoader("E:\\PDF_Summerize\\how to talk anyone.pdf")

pdf_loader= pdf_object.lazy_load()

print(pdf_loader)


<generator object PyPDFLoader.lazy_load at 0x000001DB3D6931F0>


## TEXT Splitter

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)
docs = text_splitter.split_documents(pdf_loader)

docs

Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 120 0 (offset 0)
Ignoring wrong pointing object 171 0 (offset 0)
Ignoring wrong pointing object 217 0 (offset 0)
Ignoring wrong pointing object 256 0 (offset 0)
Ignoring wrong pointing object 305 0 (offset 0)
Ignoring wrong pointing object 336 0 (offset 0)
Ignoring wrong pointing object 374 0 (offset 0)
Ignoring wrong pointing object 431 0 (offset 0)
Ignoring wrong pointing object 470 0 (offset 0)
Ignoring wrong pointing object 509 0 (offset 0)
Ignoring wrong pointing object 547 0 (offset 0)
Ignoring wrong pointing object 580 0 (offset 0)
Ignoring wrong pointing object 621 0 (offset 0)
Ignoring wrong pointing object 662 0 (offset 0)
Ignoring wrong pointing object 710 0 (offset 0)
Ignoring wrong pointing object 741 0 (offset 0)
Ignoring wrong pointing object 762 0 (offset 0)
Ignoring wrong pointing object 783 0 (offset 0)
Ignoring wrong pointing object 831 0 (offset 0)
Ignoring wrong pointing object 862 0 (off

[Document(metadata={'producer': 'Mac OS X 10.12.3 Quartz PDFContext', 'creator': 'Safari', 'creationdate': "D:20171029160441Z00'00'", 'title': 'Untitled', 'author': 'Asadur Jaman Laskar', 'moddate': "D:20171029160441Z00'00'", 'source': 'E:\\PDF_Summerize\\how to talk anyone.pdf', 'total_pages': 364, 'page': 1, 'page_label': '2'}, page_content='Copyright © 2003 by Leil Lowndes. All rights reserved. Manufactured in the United States of America.\nExcept as permitted under the United States Copyright Act of 1976, no part of this publication may be\nreproduced or distributed in any form or by any means, or stored in a database or retrieval system, with-\nout the prior written permission of the publisher. \n0-07-143334-1\nThe material in this eBook also appears in the print version of this title: 0-07-141858-X \nAll trademarks are trademarks of their respective owners. Rather than put a trademark symbol after\nevery occurrence of a trademarked name, we use names in an editorial fashion only,

## Generate an Embedding

In [16]:
vector_store=Chroma.from_documents(documents=docs, embedding=embeddings, collection_name="pdf_vectors")

## Retrievers

In [51]:
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.25, "k": 3}
)

In [53]:
query=input("Enter your query: ")
docs = retriever.invoke(query)

# Extract page_content from each document and join them
context = "\n\n".join([doc.page_content for doc in docs])

print("Retrieved context:")
print(context)
print("\n" + "="*50 + "\n")

tem=ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant for providing answers based on context and query."),
    ("human", "Answer the question based on the context below.\n\n{context}\n\nQuestion: {question}")
])

# Format the messages properly
messages = tem.format_messages(context=context, question=query)

result=model.invoke(messages)

result.content


Retrieved context:
Want to learn more?
We hope you enjoy this McGraw-Hill eBook!  If you d like
more information about this book, its author, or related books
and websites, please click here.
DOI Page 6x9  10/2/02  1:33 PM  Page 1
,

Want to learn more?
We hope you enjoy this McGraw-Hill eBook!  If you d like
more information about this book, its author, or related books
and websites, please click here.
DOI Page 6x9  10/2/02  1:33 PM  Page 1
,




'The provided excerpt does not include the title of the book. It only contains a generic copyright notice and navigation prompt from a McGraw‑Hill eBook. To identify the book, you would need additional information such as the cover page, chapter headings, or any specific content that names the work.'

In [42]:
def query_from_user(query:str)->str:

    docs = retriever.invoke(query)

# Extract page_content from each document and join them
    context = "\n\n".join([doc.page_content for doc in docs])

    tem=ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant for providing answers based on context and query."),
    ("human", "Answer the question based on the context below.\n\n{context}\n\nQuestion: {question}")
    ])

# Format the messages properly
    messages = tem.format_messages(context=context, question=query)

    result=model.invoke(messages)

    return result.content.strip()