In [8]:
import os
import json
import hashlib
from pathlib import Path
from dotenv import load_dotenv

from langsmith import traceable

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain_core.messages import BaseMessage,AIMessage,HumanMessage,SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableLambda
from langgraph.graph import StateGraph,START,END
from langgraph.graph.message import add_messages
from typing import TypedDict,Annotated

os.environ['HF_HOME'] = 'D:/huggingface_cache'

load_dotenv()

PDF_PATH = "islr.pdf"  # change to your file
INDEX_ROOT = Path(".indices")
INDEX_ROOT.mkdir(exist_ok=True)


In [9]:


class ChatStateSchema(TypedDict):
    pdf_path: str 
    chunk_size: int 
    chunk_overlap: int 
    embed_model_name: str 
    force_rebuild: bool
    context:str
    message:Annotated[list[BaseMessage],add_messages]

graph = StateGraph(ChatStateSchema)
model = ChatGoogleGenerativeAI(model = 'gemini-2.0-flash-lite')


In [10]:

# ----------------- helpers (traced) -----------------
@traceable(name="load_pdf")
def load_pdf(path: str):
    return PyPDFLoader(path).load()  # list[Document]

@traceable(name="split_documents")
def split_documents(docs, chunk_size=1000, chunk_overlap=150):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(docs)

@traceable(name="build_vectorstore")
def build_vectorstore(splits, embed_model_name: str):
    emb = HuggingFaceEmbeddings(model=embed_model_name)
    return FAISS.from_documents(splits, emb)

# ----------------- cache key / fingerprint -----------------
def _file_fingerprint(path: str) -> dict:
    p = Path(path)
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return {"sha256": h.hexdigest(), "size": p.stat().st_size, "mtime": int(p.stat().st_mtime)}

def _index_key(pdf_path: str, chunk_size: int, chunk_overlap: int, embed_model_name: str) -> str:
    meta = {
        "pdf_fingerprint": _file_fingerprint(pdf_path),
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "embedding_model": embed_model_name,
        "format": "v1",
    }
    return hashlib.sha256(json.dumps(meta, sort_keys=True).encode("utf-8")).hexdigest()

# ----------------- explicitly traced load/build runs -----------------
@traceable(name="load_index", tags=["index"])
def load_index_run(index_dir: Path, embed_model_name: str):
    emb = HuggingFaceEmbeddings(model=embed_model_name)
    return FAISS.load_local(
        str(index_dir),
        emb,
        allow_dangerous_deserialization=True
    )

@traceable(name="build_index", tags=["index"])
def build_index_run(pdf_path: str, index_dir: Path, chunk_size: int, chunk_overlap: int, embed_model_name: str):
    docs = load_pdf(pdf_path)  # child
    splits = split_documents(docs, chunk_size=chunk_size, chunk_overlap=chunk_overlap)  # child
    vs = build_vectorstore(splits, embed_model_name)  # child
    index_dir.mkdir(parents=True, exist_ok=True)
    vs.save_local(str(index_dir))
    (index_dir / "meta.json").write_text(json.dumps({
        "pdf_path": os.path.abspath(pdf_path),
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "embedding_model": embed_model_name,
    }, indent=2))
    return vs

# ----------------- dispatcher (not traced) -----------------
def load_or_build_index(
    pdf_path: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    embed_model_name: str = "facebook/bart-base",
    force_rebuild: bool = False,
):
    key = _index_key(pdf_path, chunk_size, chunk_overlap, embed_model_name)
    index_dir = INDEX_ROOT / key
    cache_hit = index_dir.exists() and not force_rebuild
    if cache_hit:
        return load_index_run(index_dir, embed_model_name)
    else:
        return build_index_run(pdf_path, index_dir, chunk_size, chunk_overlap, embed_model_name)
    
def format_docs(docs):
    return "\n\n".join(d.page_content for d in docs)

@traceable(name="setup_pipeline", tags=["setup"])
def setup_pipeline(pdf_path: str, chunk_size=1000, chunk_overlap=150, embed_model_name="facebook/bart-base", force_rebuild=False):
    return load_or_build_index(
        pdf_path=pdf_path,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        embed_model_name=embed_model_name,
        force_rebuild=force_rebuild,
    )

@traceable(name="Context Finder")
def context_finder(state:ChatStateSchema):
    vectorstore = setup_pipeline(state['pdf_path'], state['chunk_size'], state['chunk_overlap'], state['embed_model_name'], state['force_rebuild'])
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})
    question = state['message'][-1].content
    docs =retriever.invoke(question)
    context = format_docs(docs)
    return {'context':context}

@traceable(name ='Chat Model')
def chat(state:ChatStateSchema):
    message_history = state['message']
    prompt = ChatPromptTemplate([
        ('placeholder', "{message_history}"),
        ('human',"Answer ONLY from the provided context. If not found, say you don't know. Based on the provided content and question\nquestion:{question}\n context:{context}")
        ])
    chain = prompt | model
    result= chain.invoke({'message_history':message_history[:-1],'question':message_history[-1].content,'context':state['context']})
    return {'message':[result]}


In [11]:


graph.add_node('context_finder',context_finder)
graph.add_node('chat',chat)

graph.add_edge(START,'context_finder')
graph.add_edge('context_finder','chat')
graph.add_edge('chat',END)


<langgraph.graph.state.StateGraph at 0x1e15baf7ce0>

In [12]:

workflow =graph.compile()


In [None]:

initial_state={
    'pdf_path':'uploaded_pdfs\islr.pdf',
    'chunk_size': 1000,
    'chunk_overlap': 150,
    'embed_model_name': "facebook/bart-base",
    'force_rebuild': False,
    'message':[HumanMessage('Who is the writer of this book')],
}
final_state =workflow.invoke(initial_state)

print(final_state)


  'pdf_path':'uploaded_pdfs\islr.pdf',
No sentence-transformers model found with name facebook/bart-base. Creating a new one with mean pooling.


{'pdf_path': 'uploaded_pdfs\\islr.pdf', 'chunk_size': 1000, 'chunk_overlap': 150, 'embed_model_name': 'facebook/bart-base', 'force_rebuild': False, 'context': 'Wage Income survey data for males in central Atlantic region of USA.\nWeekly 1,089 weekly stock market returns for 21 years.\nTABLE 1.1.A list of data sets needed to perform the labs and exercises in this\ntextbook. All data sets are available in the ISLR library, with the exception of\nBoston (part ofMASS)a n dUSArrests (part of the baseR distribution).\nIt contains a number of resources, including theR package associated with\nthis book, and some additional data sets.\nAcknowledgements\nA few of the plots in this book were taken from ESL: Figures 6.7, 8.3,\nand 10.12. All other plots are new to this book.\n\nthrough collaborations and as a member of the Institute of Medicine committee that \nled to the report Evolution of Translational Omics.\nTrevor Hastie and Robert Tibshirani are professors of statistics at Stanford Univers

In [15]:
final_state['message'][-1].content

'Gareth James, Daniela Witten, Trevor Hastie, and Robert Tibshirani'

In [7]:
while True:
    user_message =input("Chat Here:")

    print(f'User:{user_message}')

    if user_message.strip().lower() in ['exit','quit','bye']:
        break
    initial_state={
        'pdf_path':'islr.pdf',
        'chunk_size': 1000,
        'chunk_overlap': 150,
        'embed_model_name': "facebook/bart-base",
        'force_rebuild': False,
        'message':[HumanMessage(user_message)],
        }
    result = workflow.invoke(initial_state)

    print(f'AI:{result['message'][-1].content}')

User:who is the author of this book


No sentence-transformers model found with name facebook/bart-base. Creating a new one with mean pooling.


AI:G. James et al.
User:exit


In [None]:
# I want you to create a streamlit UI for a chatbot when one asks quetion question as well as asnwer should be displayed like a chat there should be a opition to upload a pdf during chat there should be a side bar with a option of new chat and also showing the old chats each chat should have a feature of rename provide full code 

In [None]:
# there is a flow in this code there a a pdf upload interface which is on the top after the heading chatbot ui when i type first query it is on top of my first query chat but when i type some thing else like next question it changes it position to so it stays on the top of my latest question

In [None]:
{'pdf_path': 'uploaded_pdfs\islr.pdf', 'chunk_size': 1000, 'chunk_overlap': 150, 'embed_model_name': 'facebook/bart-base', 'force_rebuild': False, 'context': 'through collaborations and as a member of the Institute of Medicine committee that \nled to the report Evolution of Translational Omics.\nTrevor Hastie and Robert Tibshirani are professors of statistics at Stanford University, and \nare co-authors of the successful textbook Elements of Statistical Learning. Hastie and \nTibshirani developed generalized additive models and wrote a popular book of that \ntitle. Hastie co-developed much of the statistical modeling soft ware and environment \nin R/S-PLUS and invented principal curves and surfaces. Tibshirani proposed the lasso \nand is co-author of the very successful An Introduction to the Bootstrap.\n9 781461 471370\nISBN 978-1-4614-7137-0\nSTS\n\n• Outstate : Out-of-state tuition\n• Room.Board : Room and board costs\n• Books : Estimated book costs\n• Personal : Estimated personal spending\n• PhD : Percent of faculty with Ph.D.’s\n• Terminal : Percent of faculty with terminal degree\n• S.F.Ratio : Student/faculty ratio\n• perc.alumni : Percent of alumni who donate\n• Expend : Instructional expenditure per student\n• Grad.Rate : Graduation rate\nBefore reading the data intoR, it can be viewed in Excel or a text\neditor.\n(a) Use the read.csv() function to read the data intoR.C a l lt h e\nloaded datacollege. Make sure that you have the directory set\nto the correct location for the data.\n(b) Look at the data using thefix() function. You should notice\nthattheﬁrstcolumnisjustthenameofeachuniversity.Wedon’t\nreally wantR to treat this as data. However, it may be handy to\nhave these names for later. Try the following commands:\n\n3.3 Other Considerations in the Regression Model 85\nxi =\n{\n1i f ith person is female\n−1i f ith person is male\nand use this variable in the regression equation. This results in the model\nyi = β0 +β1xi +ϵi =\n{\nβ0 +β1 +ϵi if ith person is female\nβ0 −β1 +ϵi if ith person is male.\nNow β0 can be interpreted as the overallaverage credit card balance (ig-\nnoring the gender eﬀect), andβ1 is the amount that females are above the\naverage and males are below the average. In this example, the estimate for\nβ0 would be 
519.665
,
h
a
l
f
w
a
y
b
e
t
w
e
e
n
t
h
e
m
a
l
e
a
n
d
f
e
m
a
l
e
a
v
e
r
a
g
e
s
o
f
\n
519.665,halfwaybetweenthemaleandfemaleaveragesof\n509.80 and 
529.53.
T
h
e
e
s
t
i
m
a
t
e
f
o
r
β
1
w
o
u
l
d
b
e
529.53.Theestimateforβ1wouldbe9.865, which is half of\n$19.73,the averagediﬀerence between females and males. It is important to\nnote that the ﬁnal predictions for the credit balances of males and females\nwill be identical regardless of the coding scheme used. The only diﬀerence\nis in the way that the coeﬃcients are interpreted.\nQualitative Predictors with More than Two Levels\n\n(c) Fit a ridge regression model on the training set, withλ chosen\nby cross-validation. Report the test error obtained.\n(d) Fit a lasso model on the training set, withλ chosen by cross-\nvalidation. Report the test error obtained, along with the num-\nber of non-zero coeﬃcient estimates.\n(e) Fit a PCR model on the training set, withM chosen by cross-\nvalidation. Report the test error obtained, along with the value\nof M selected by cross-validation.\n(f) Fit a PLS model on the training set, withM chosen by cross-\nvalidation. Report the test error obtained, along with the value\nof M selected by cross-validation.\n(g) Comment on the results obtained. How accurately can we pre-\ndict the number of college applications received? Is there much\ndiﬀerence among the test errors resulting from these ﬁve ap-\nproaches?\n10. Wehaveseenthatasthenumberoffeaturesusedinamodelincreases,\nthe trainingerrorwill necessarilydecrease,but the test errormaynot.\nWe will now explore this in a simulated data set.', 'message': [HumanMessage(content='who is the author of this book', additional_kwargs={}, response_metadata={}, id='da77eaf2-e869-4be6-83e5-45a972e5f2a9'), AIMessage(content='G. James et al.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash-lite', 'safety_ratings': []}, id='run--9af784d2-1a07-46f6-a5bc-2bcacf5d87ad-0', usage_metadata={'input_tokens': 1042, 'output_tokens': 7, 'total_tokens': 1049, 'input_token_details': {'cache_read': 0}}), HumanMessage(content='who is the author of the book', additional_kwargs={}, response_metadata={}, id='fd1c3987-eed8-4971-84d7-93e14facaaff'), AIMessage(content="I don't know.", additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash-lite', 'safety_ratings': []}, id='run--8e42849c-c9a7-4874-930e-68cde3ae02ab-0', usage_metadata={'input_tokens': 982, 'output_tokens': 7, 'total_tokens': 989, 'input_token_details': {'cache_read': 0}})]}