In [1]:
file_path = "uploads/python.pdf"
VECTOR_FOLDER = "vectors"
index_name = "python"

In [2]:
from langchain_community.document_loaders import PDFPlumberLoader

In [3]:
loader = PDFPlumberLoader(file_path)
extracted_documents = loader.load()

In [4]:
def preprocess_document(doc, has_header=True, has_footer=True):
    """
    Preprocess the document by optionally removing headers and footers.

    Parameters:
    - doc: The document object containing `page_content`.
    - has_header (bool): Whether to remove the header (first line).
    - has_footer (bool): Whether to remove the footer (last line).

    Returns:
    - The document with updated `page_content`.
    """
    lines = doc.page_content.split("\n")  # Split the content into lines
    # Determine start and end indices based on the flags
    start_idx = 1 if has_header else 0  # Skip the first line if has header
    end_idx = -1 if has_footer else None  # Skip the last line if has footer

    body_lines = lines[start_idx:end_idx]

    # Rejoin the lines and update the document content
    doc.page_content = "\n".join(body_lines)
    return doc

In [5]:
documents = [preprocess_document(doc) for doc in extracted_documents]

In [6]:
documents

[Document(metadata={'source': 'uploads/python.pdf', 'file_path': 'uploads/python.pdf', 'page': 0, 'total_pages': 119, 'Producer': 'PyPDF2'}, page_content=''),
 Document(metadata={'source': 'uploads/python.pdf', 'file_path': 'uploads/python.pdf', 'page': 1, 'total_pages': 119, 'Producer': 'PyPDF2'}, page_content='Prepared by Asif Bhat\nPython Tutorial\nIn [103]: import sys\nimport keyword\nimport operator\nfrom datetime import datetime\nimport os\nKeywords\nKeywords are the reserved words in Python and can\'t be used as an identifier\nIn [3]: print(keyword.kwlist) # List all Python Keywords\n[\'False\', \'None\', \'True\', \'and\', \'as\', \'assert\', \'async\', \'await\', \'break\', \'cl\nass\', \'continue\', \'def\', \'del\', \'elif\', \'else\', \'except\', \'finally\', \'for\', \'fr\nom\', \'global\', \'if\', \'import\', \'in\', \'is\', \'lambda\', \'nonlocal\', \'not\', \'or\',\n\'pass\', \'raise\', \'return\', \'try\', \'while\', \'with\', \'yield\']\nIn [4]: len(keyword.kwlist) # 

In [7]:
from langchain.text_splitter import CharacterTextSplitter

In [8]:
chunk_size = 300
chunk_overlap = 50
text_splitter = CharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split documents into chunks
chunks = text_splitter.split_documents(extracted_documents)

chunks

[Document(metadata={'source': 'uploads/python.pdf', 'file_path': 'uploads/python.pdf', 'page': 1, 'total_pages': 119, 'Producer': 'PyPDF2'}, page_content='Prepared by Asif Bhat\nPython Tutorial\nIn [103]: import sys\nimport keyword\nimport operator\nfrom datetime import datetime\nimport os\nKeywords\nKeywords are the reserved words in Python and can\'t be used as an identifier\nIn [3]: print(keyword.kwlist) # List all Python Keywords\n[\'False\', \'None\', \'True\', \'and\', \'as\', \'assert\', \'async\', \'await\', \'break\', \'cl\nass\', \'continue\', \'def\', \'del\', \'elif\', \'else\', \'except\', \'finally\', \'for\', \'fr\nom\', \'global\', \'if\', \'import\', \'in\', \'is\', \'lambda\', \'nonlocal\', \'not\', \'or\',\n\'pass\', \'raise\', \'return\', \'try\', \'while\', \'with\', \'yield\']\nIn [4]: len(keyword.kwlist) # Python contains 35 keywords\nOut[4]: 35\nIdentifiers\nAn identifier is a name given to entities like class, functions, variables, etc. It helps to differentiat

In [9]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline

In [10]:
EMBADDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
st_embeddings = HuggingFaceEmbeddings(model_name=EMBADDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
vectorstore = FAISS.from_documents(chunks, st_embeddings)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x11f582570>

In [12]:
vectorstore.save_local(folder_path=VECTOR_FOLDER, index_name=index_name)

In [13]:
vectorstore = FAISS.load_local(
            VECTOR_FOLDER,
            embeddings=st_embeddings,
            index_name=index_name,
            allow_dangerous_deserialization=True,
        )
 
retriever =  vectorstore.as_retriever()
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x123f39790>, search_kwargs={})

In [17]:
query = "What is List?"
retriever.invoke(input = query)

[Document(metadata={'source': 'uploads/python.pdf', 'file_path': 'uploads/python.pdf', 'page': 20, 'total_pages': 119, 'Producer': 'PyPDF2'}, page_content='In [426]: list4 = [\'one\',\'two\' , "three"] # List of strings\nIn [427]: list5 = [\'Asif\', 25 ,[50, 100],[150, 90]] # Nested Lists\nIn [428]: list6 = [100, \'Asif\', 17.765] # List of mixed data types\nIn [429]: list7 = [\'Asif\', 25 ,[50, 100],[150, 90] , {\'John\' , \'David\'}]\nIn [430]: len(list6) #Length of list\nOut[430]: 3\nList Indexing\nIn [432]: list2[0] # Retreive first element of the list\nOut[432]: 10\nIn [433]: list4[0] # Retreive first element of the list\nOut[433]: \'one\'\nIn [434]: list4[0][0] # Nested indexing - Access the first character of the first list elem\nOut[434]: \'o\'\nIn [435]: list4[-1] # Last item of the list\nOut[435]: \'three\'\nIn [436]: list5[-1] # Last item of the list\nOut[436]: [150, 90]\nList Slicing\nlocalhost:8889/notebooks/Documents/GitHub/Public/Python/Python.ipynb 20/118'),
 Document(m

In [18]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

In [40]:
LLM_CHECKPOINT_ID = "google/flan-t5-base"
LLM_TASK = "text2text-generation"  # "text-generation", "summarization"
LLM_TEMPERATURE = 0.2

In [41]:
tokenizer = AutoTokenizer.from_pretrained(
    LLM_CHECKPOINT_ID, return_tensors="pt"
)

In [42]:
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_CHECKPOINT_ID)

In [43]:
import torch

In [44]:
pipe = pipeline(
    task=LLM_TASK,
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    do_sample=True,
    temperature=LLM_TEMPERATURE,
)

pipe

Device set to use cpu


<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline at 0x11cbc9f10>

In [45]:
llm = HuggingFacePipeline(pipeline=pipe)
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x11cbc9f10>, model_id='google/flan-t5-base')

In [46]:
BOT_NAME = "d-chat"
PROMPT_INSTRUCTIONS = [
    "Answer the question based on the context provided.",
    "Do not use any external resources to answer the question.",
    "Do not provide any personal information.",
    "If you do not find the answer in the context, you can say 'I don't know'.",
]

In [47]:
PROMPT_INSTRUCTION_LITERALS = f"""Instructions:
{"\n".join(
    [f"{idx+1}. {instruction} " for idx, instruction in enumerate(PROMPT_INSTRUCTIONS)]
)}
"""
PROMPT_INSTRUCTION_LITERALS

"Instructions:\n1. Answer the question based on the context provided. \n2. Do not use any external resources to answer the question. \n3. Do not provide any personal information. \n4. If you do not find the answer in the context, you can say 'I don't know'. \n"

In [27]:
from langchain_core.prompts import ChatPromptTemplate

In [48]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", f"You are an AI assistant bot, named {BOT_NAME}."),
        ("system", "Answer questions based on the context provided."),
        ("system", "\n{context}"),
        ("system", PROMPT_INSTRUCTION_LITERALS),
        ("human", "{input}"),
    ]
)
prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are an AI assistant bot, named d-chat.'), additional_kwargs={}), SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='Answer questions based on the context provided.'), additional_kwargs={}), SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n{context}'), additional_kwargs={}), SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template="Instructions:\n1. Answer the question based on the context provided. \n2. Do not use any external resources to answer the question. \n3. Do not provide any personal information. \n4. If you do not find the answer in the context, you 

In [49]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
question_answer_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are an AI assistant bot, named d-chat.'), additional_kwargs={}), SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='Answer questions based on the context provided.'), additional_kwargs={}), SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n{context}'), additional_kwargs={}), SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template="Instructions:\n1. Answer the question based on the

In [50]:
chain = create_retrieval_chain(retriever, question_answer_chain)
chain


RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x123f39790>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are an AI assistant bot, named d-chat.'), additional_kwargs={}), SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types

In [51]:
response = chain.invoke({"input": query})
response

Token indices sequence length is longer than the specified maximum sequence length for this model (1438 > 512). Running this sequence through the model will result in indexing errors


{'input': 'What is List?',
 'context': [Document(metadata={'source': 'uploads/python.pdf', 'file_path': 'uploads/python.pdf', 'page': 20, 'total_pages': 119, 'Producer': 'PyPDF2'}, page_content='In [426]: list4 = [\'one\',\'two\' , "three"] # List of strings\nIn [427]: list5 = [\'Asif\', 25 ,[50, 100],[150, 90]] # Nested Lists\nIn [428]: list6 = [100, \'Asif\', 17.765] # List of mixed data types\nIn [429]: list7 = [\'Asif\', 25 ,[50, 100],[150, 90] , {\'John\' , \'David\'}]\nIn [430]: len(list6) #Length of list\nOut[430]: 3\nList Indexing\nIn [432]: list2[0] # Retreive first element of the list\nOut[432]: 10\nIn [433]: list4[0] # Retreive first element of the list\nOut[433]: \'one\'\nIn [434]: list4[0][0] # Nested indexing - Access the first character of the first list elem\nOut[434]: \'o\'\nIn [435]: list4[-1] # Last item of the list\nOut[435]: \'three\'\nIn [436]: list5[-1] # Last item of the list\nOut[436]: [150, 90]\nList Slicing\nlocalhost:8889/notebooks/Documents/GitHub/Public/Py

In [52]:
response = chain.invoke({"input": "what is all() method?"})
response

{'input': 'what is all() method?',
 'context': [Document(metadata={'source': 'uploads/python.pdf', 'file_path': 'uploads/python.pdf', 'page': 70, 'total_pages': 119, 'Producer': 'PyPDF2'}, page_content='It is normally used with Lambda functions to filter list, tuple, or sets.\nfilter() method takes two parameters:\nfunction - function tests if elements of an iterable returns true or false\niterable - Sequence which needs to be filtered, could be sets, lists, tuples, or any iterators\nSyntax:\nMap\nThe map() function applies a given function to each item of an iterable (list, tuple etc.) and\nreturns a list of the results.\nmap() function takes two Parameters :\nfunction : The function to execute for each item of given iterable.\niterable : It is a iterable which is to be mapped.\nReturns : Returns a list of the results after applying the given function to each item of a given\niterable (list, tuple etc.)\nSyntax:\nReduce\nlocalhost:8889/notebooks/Documents/GitHub/Public/Python/Python.i

In [53]:
response = chain.invoke({"input": "what is the use of footer?"})
response

{'input': 'what is the use of footer?',
 'context': [Document(metadata={'source': 'uploads/python.pdf', 'file_path': 'uploads/python.pdf', 'page': 70, 'total_pages': 119, 'Producer': 'PyPDF2'}, page_content='It is normally used with Lambda functions to filter list, tuple, or sets.\nfilter() method takes two parameters:\nfunction - function tests if elements of an iterable returns true or false\niterable - Sequence which needs to be filtered, could be sets, lists, tuples, or any iterators\nSyntax:\nMap\nThe map() function applies a given function to each item of an iterable (list, tuple etc.) and\nreturns a list of the results.\nmap() function takes two Parameters :\nfunction : The function to execute for each item of given iterable.\niterable : It is a iterable which is to be mapped.\nReturns : Returns a list of the results after applying the given function to each item of a given\niterable (list, tuple etc.)\nSyntax:\nReduce\nlocalhost:8889/notebooks/Documents/GitHub/Public/Python/Pyt