In [8]:
from langchain.document_loaders import PyPDFLoader

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
import os

In [11]:
file_path = os.path.join(os.getcwd(), "data", "sample.pdf")

In [12]:
loader = PyPDFLoader(file_path)

In [13]:
documents = loader.load()

In [15]:
len(documents)

77

In [20]:
##this is an experimental value there is not deteministic value for chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
    length_function=len,
)

In [21]:
docs = text_splitter.split_documents(documents)
len(docs)

765

In [25]:
print(docs[0].metadata)
print(docs[0].page_content)

{'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/home/viren/document_portal/notebook/data/sample.pdf', 'total_pages': 77, 'page': 0, 'page_label': '1'}
Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗ Louis Martin† Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller
Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou
Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev


In [71]:
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


In [35]:
load_dotenv()

True

In [39]:
embedding_model = GoogleGenerativeAIEmbeddings(model="embedding-001")

In [63]:
llm=ChatGroq(model="deepseek-r1-distill-llama-70b")

In [40]:
vectorestore = Chroma.from_documents(
    docs,
    embedding_model)

In [47]:
relavant_docs = vectorestore.similarity_search("What are the scores of the Llama2 models?")

In [48]:
relavant_docs[0].page_content

'2 models and others open-source models.\nStandard Benchmarks. In Table 20, we show results on several standard benchmarks.\nCode Generation. In Table 21, we compare results ofLlama 2with popular open source models on the\nHuman-Eval and MBPP code generation benchmarks.\nWorld Knowledge. We evaluate theLlama 2model together with other open-source models on the Natu-\nralQuestions and TriviaQA benchmarks (Table 22).'

In [76]:
retriever = vectorestore.as_retriever(search_kwargs={"k": 10})

In [77]:
retriever.get_relevant_documents("What are the scores of the Llama2 models?")

[Document(metadata={'keywords': '', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'page_label': '48', 'title': '', 'source': '/home/viren/document_portal/notebook/data/sample.pdf', 'producer': 'pdfTeX-1.40.25', 'total_pages': 77, 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'moddate': '2023-07-20T00:30:36+00:00', 'page': 47, 'subject': '', 'trapped': '/False', 'author': ''}, page_content='2 models and others open-source models.\nStandard Benchmarks. In Table 20, we show results on several standard benchmarks.\nCode Generation. In Table 21, we compare results ofLlama 2with popular open source models on the\nHuman-Eval and MBPP code generation benchmarks.\nWorld Knowledge. We evaluate theLlama 2model together with other open-source models on the Natu-\nralQuestions and TriviaQA benchmarks (Table 22).'),
 Document(metadata={'author': '', 'creator': 'LaTeX with hyperref', 'producer': 'pdfTeX-1.

In [54]:
prompt_template = """
Answer the question based on the context provided below. If the context does not contain the answer, say "I don't know".

Context: {context}

Question: {question}

Answer:
"""

In [55]:
from langchain.prompts import PromptTemplate

In [56]:
Prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

In [57]:
Prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\nAnswer the question based on the context provided below. If the context does not contain the answer, say "I don\'t know".\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:\n')

In [62]:
parser = StrOutputParser()

In [64]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

In [80]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | Prompt
    | llm
    | StrOutputParser()
)

In [83]:
rag_chain.invoke("Can you give me reward model usage in Llama2 models?")

"<think>\nOkay, so I need to figure out the reward model usage in Llama2 models based on the provided context. Let me read through the context carefully to extract the relevant information.\n\nFirst, I see that the context mentions the reward model several times. It says the reward model takes a model response and its prompt, including previous contexts, and outputs a scalar score indicating quality aspects like helpfulness and safety. This score is used as a reward to optimize Llama 2-Chat during RLHF (Reinforcement Learning with Human Feedback) to align better with human preferences and improve helpfulness and safety.\n\nIt also talks about how the reward models were trained using human annotations collected over time. These annotations were done weekly, and as more data was collected, the reward models improved. There's mention of evaluating model variants with different percentages of safety data using these reward models. \n\nThe context notes that the reward model's accuracy can 