In [1]:
import os
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from collections import defaultdict

In [2]:
pdf_folder_path = './data/'

In [3]:
all_text = ""

In [4]:
for filename in os.listdir(pdf_folder_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder_path, filename)
        loader = PyMuPDFLoader(pdf_path)
        docs = loader.load()
        for doc in docs:
            all_text += doc.page_content + "\n" 

print(f"Total text length: {len(all_text)} characters.")

Total text length: 208574 characters.


In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)

In [6]:
chunks = text_splitter.split_text(all_text)

In [7]:
print(f"Number of chunks created: {len(chunks)}")
print(f"Sample chunk: {chunks[0]}")

Number of chunks created: 690
Sample chunk: Large Language Models: A Survey
Shervin Minaee, Tomas Mikolov, Narjes Nikzad, Meysam Chenaghlu
Richard Socher, Xavier Amatriain, Jianfeng Gao
Abstract—Large Language Models (LLMs) have drawn a
lot of attention due to their strong performance on a wide
range of natural language tasks, since the release of ChatGPT
in November 2022. LLMs’ ability of general-purpose language
understanding and generation is acquired by training billions of


In [11]:
hyde_docs = [Document(page_content=chunk) for chunk in chunks]

In [9]:
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [12]:
db2 = Chroma.from_documents(hyde_docs, embedding=embedding, persist_directory="./chroma_db/HYDE")

In [13]:
db3 = Chroma(persist_directory="./chroma_db/HYDE", embedding_function=embedding)

  warn_deprecated(


In [14]:
retriever = db3.as_retriever(search_kwargs={"k": 3})

In [15]:
llm = Ollama(model="llama3")

In [25]:
template = """
You are a helpful AI assistant. Please answer the following question:

Question: {question}

Answer:
"""


In [26]:
from langchain import PromptTemplate


In [27]:
prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)

In [28]:
question = "What is Fine-tuining?"

In [31]:
formatted_prompt = prompt.format(question=question)

In [32]:
formatted_prompt

'\nYou are a helpful AI assistant. Please answer the following question:\n\nQuestion: What is Fine-tuining?\n\nAnswer:\n'

In [33]:
llm_answer = llm.invoke(formatted_prompt)

In [34]:
llm_answer

"Fine-tuning!\n\nFine-tuning refers to the process of adjusting or refining a pre-trained model, such as a neural network, to better fit a specific task or dataset. This technique involves using a small amount of labeled data and training algorithms to adapt the model's weights to the new task, without completely retraining it from scratch.\n\nIn other words, fine-tuning is like giving a model a gentle nudge in the right direction to help it learn new skills or improve its performance on a specific problem. This approach has been widely used in natural language processing (NLP) and computer vision tasks, where a pre-trained model can be fine-tuned for a particular domain or task, such as sentiment analysis, named entity recognition, or object detection.\n\nThe benefits of fine-tuning include:\n\n1. Reduced data requirements: Fine-tuning requires less labeled data compared to training from scratch.\n2. Improved performance: By leveraging the knowledge learned by the pre-trained model, f

In [43]:
similar_answer = retriever.get_relevant_documents(llm_answer)

In [44]:
similar_answer

[Document(page_content='Fine-tuning does not need to be performed to a single\ntask though, and there are different approaches to multi-task\nfine-tuning (see e.g. Mahabi et al. [132]). Fine-tuning to one\nor more tasks is known to improve results and reduce the\ncomplexity of prompt engineering, and it can serve as an\n2https://platform.openai.com/docs/guides/fine-tuning'),
 Document(page_content='perform specific tasks. In order for the foundation model to be\nuseful it needed to be fine-tuned to a specific task with labeled\ndata (so-called supervised fine-tuning or SFT for short). For\nexample, in the original BERT paper [24], the model was fine-\ntuned to 11 different tasks. While more recent LLMs no longer\nrequire fine-tuning to be used, they can still benefit from task\nor data-specific fine-tuning. For example, OpenAI reports that'),
 Document(page_content='Journal of Machine Learning Research, vol. 23, no. 1, pp. 5232–5270,\n2022.\n[132]\nR. K. Mahabadi, S. Ruder, M. Dehghani

In [45]:
template1 = """
You are a helpful AI assistant. Please answer the following question according to my context:

question : {question}
Context: {similar_answer}

Answer:
"""


In [49]:
prompt = PromptTemplate(
    input_variables=["question","similar_answer"],
    template=template1,
)

In [50]:
formatted_prompt = prompt.format(question=question,similar_answer=similar_answer)

In [51]:
formatted_prompt

"\nYou are a helpful AI assistant. Please answer the following question according to my context:\n\nquestion : What is Fine-tuining?\nContext: [Document(page_content='Fine-tuning does not need to be performed to a single\\ntask though, and there are different approaches to multi-task\\nfine-tuning (see e.g. Mahabi et al. [132]). Fine-tuning to one\\nor more tasks is known to improve results and reduce the\\ncomplexity of prompt engineering, and it can serve as an\\n2https://platform.openai.com/docs/guides/fine-tuning'), Document(page_content='perform specific tasks. In order for the foundation model to be\\nuseful it needed to be fine-tuned to a specific task with labeled\\ndata (so-called supervised fine-tuning or SFT for short). For\\nexample, in the original BERT paper [24], the model was fine-\\ntuned to 11 different tasks. While more recent LLMs no longer\\nrequire fine-tuning to be used, they can still benefit from task\\nor data-specific fine-tuning. For example, OpenAI reports 

In [52]:
llm_answer_according_to_context  = llm.invoke(formatted_prompt)

In [53]:
llm_answer_according_to_context

"Based on the provided context, Fine-tuning refers to a process where a pre-trained foundation model is adapted or specialized to perform specific tasks by training it on labeled data. This involves adjusting the model's parameters to better suit the requirements of the target task(s), which can lead to improved results and reduced complexity in prompt engineering."