In [1]:
import os
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from collections import defaultdict

In [2]:
pdf_folder_path = './data/'


In [3]:
all_text = ""


In [4]:
for filename in os.listdir(pdf_folder_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder_path, filename)
        loader = PyMuPDFLoader(pdf_path)
        docs = loader.load()
        for doc in docs:
            all_text += doc.page_content + "\n" 

print(f"Total text length: {len(all_text)} characters.")

Total text length: 208574 characters.


In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)


In [6]:
chunks = text_splitter.split_text(all_text)


In [7]:
print(f"Number of chunks created: {len(chunks)}")
print(f"Sample chunk: {chunks[0]}")

Number of chunks created: 690
Sample chunk: Large Language Models: A Survey
Shervin Minaee, Tomas Mikolov, Narjes Nikzad, Meysam Chenaghlu
Richard Socher, Xavier Amatriain, Jianfeng Gao
Abstract—Large Language Models (LLMs) have drawn a
lot of attention due to their strong performance on a wide
range of natural language tasks, since the release of ChatGPT
in November 2022. LLMs’ ability of general-purpose language
understanding and generation is acquired by training billions of


In [8]:
docs = [Document(page_content=chunk) for chunk in chunks]


In [9]:
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [10]:
db2 = Chroma.from_documents(docs, embedding=embedding, persist_directory="./chroma_db/RRF")


In [11]:
db3 = Chroma(persist_directory="./chroma_db/RRF", embedding_function=embedding)


  warn_deprecated(


In [12]:
retriever = db3.as_retriever(search_kwargs={"k": 3})


In [13]:
llm = Ollama(model="llama3")


In [14]:
chain = RetrievalQA.from_chain_type(
    llm, retriever=retriever
)

In [15]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [16]:
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [17]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# Manually handle Chat history

In [18]:
chat_history = []


In [28]:
from langchain_core.messages import AIMessage, HumanMessage
question = "What is Search types discuss,only give types?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
# manually add in chathistory

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

In [29]:
ai_msg_1

{'input': 'What is Search types discuss,only give types?',
 'chat_history': [HumanMessage(content='What is Search types discuss,only give types?'),
  AIMessage(content="According to the provided context, search types discussed are:\n\n* Google search engine\n* Retrieval augmented generation (RAG)\n\nI don't know any other specific search types mentioned in this context.")],
 'context': [Document(page_content='the Google search engine as questions. An annotator\nis presented with a question along with a Wikipedia\npage from the top 5 search results, and annotates a\nlong answer (typically a paragraph) and a short answer'),
  Document(page_content='C. Augmenting LLMs through external knowledge - RAG\nOne of the main limitations of pre-trained LLMs is their\nlack of up-to-date knowledge or access to private or use-\ncase-specific information. This is where retrieval augmented\ngeneration (RAG) comes into the picture [164]. RAG, illus-\ntrated in figure 37, involves extracting a query from

In [30]:
second_question = "Explain any of above type you mentioned?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})
# manually add in chathistory

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_2["answer"]),
    ]
)

In [31]:
ai_msg_2

{'input': 'Explain any of above type you mentioned?',
 'chat_history': [HumanMessage(content='What is Search types discuss,only give types?'),
  AIMessage(content="According to the provided context, search types discussed are:\n\n* Google search engine\n* Retrieval augmented generation (RAG)\n\nI don't know any other specific search types mentioned in this context."),
  HumanMessage(content='What is Search types discuss,only give types?'),
  AIMessage(content="The two search types mentioned in the context are:\n\n1. A search engine (no specific one is mentioned, but Google is an example)\n2. Retrieval augmented generation (RAG)\n\nThere's no specific explanation provided for these search types in this context.")],
 'context': [Document(page_content='for the model to generate the final response. A RAG system\nincludes three important components: Retrieval, Generation,\nAugmentation [165].\na) RAG-aware prompting techniques: Because of the\nimportance of RAG to build advanced LLM systems

# Automate memory

In [37]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [38]:
conversational_rag_chain.invoke(
    {"input": "my nme is sosa . i have one question about : How i fine tune LLM?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  
)["answer"]

"To fine-tune an LLM, you can use instruction tuning, which involves providing specific prompts that align with the expectations humans would have when giving instructions. This approach helps improve the model's performance by scaling laws and has been shown to outperform original foundation models in many cases. The specific approach and datasets used for instruction tuning may vary, but generally speaking, it leads to better results."

In [39]:
conversational_rag_chain.invoke(
    {"input": "diffrenece between Rag and above texhnique?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  
)["answer"]

'RAG (Retrieval-Augmentation-Generation) is a system that includes three components: Retrieval, Generation, and Augmentation. The key difference between RAG and instruction tuning is that RAG involves retrieving relevant information from an external knowledge source, adding it to the original prompt, and then generating a response, whereas instruction tuning focuses on providing specific prompts that align with human expectations for task completion.'