# Building RAG from Scratch
Reference: https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval/

## Model Configuration

In [2]:
import os
import warnings

# Hugging Face model id of the embedding model.
SENTENCE_TRANSFORMER = 'sentence-transformers/all-MiniLM-L6-v2'
# Model name passed to the Groq LLM client.
LLM = 'llama3-70b-8192'
# SECURITY: never commit API keys in a notebook. The key is read from the
# GROQ_API_KEY environment variable; any key that was previously committed
# here must be treated as leaked and revoked.
LLM_API_KEY = os.environ.get('GROQ_API_KEY')
if not LLM_API_KEY:
    warnings.warn(
        "GROQ_API_KEY is not set; export it before running the LLM cells."
    )
# Path to the PDF that gets ingested.
DOCUMENT = '../data/llama2.pdf'

## Load Model

In [3]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq

# Embedding model: used to embed both the document chunks and the queries.
embed_model = HuggingFaceEmbedding(model_name=SENTENCE_TRANSFORMER)
# LLM client: passed to the query engine for response synthesis.
llm = Groq(model=LLM, api_key=LLM_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm


## Initialize Postgres

In [4]:
import os

import psycopg2
from psycopg2 import sql

# Connection settings; these names are reused when the vector store is created.
db_name = "vector_db"
host = "localhost"
# PGPASSWORD overrides the local-development default so real credentials
# do not need to be written into the notebook.
password = os.environ.get("PGPASSWORD", "watchtek")
port = "5432"
user = "watchtek"

# Connect to the "postgres" database, since the target database is about to
# be dropped and recreated.
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
try:
    # DROP/CREATE DATABASE cannot run inside a transaction block.
    conn.autocommit = True
    with conn.cursor() as c:
        # Compose the identifier with psycopg2.sql rather than string
        # interpolation so the database name is always quoted correctly.
        # WARNING: this wipes any existing database called `db_name`.
        c.execute(
            sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(db_name))
        )
        c.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name)))
finally:
    # Release the admin connection; it is only needed for the DDL above.
    conn.close()

In [5]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Vector store client for the database created above, reusing the same
# connection settings.
vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="llama2_paper",
    embed_dim=384,  # must equal the embedding model's output size (384 for all-MiniLM-L6-v2)
)

## Build an Ingestion Pipeline from Scratch

### 1. Load Data

In [6]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
# Read the PDF at DOCUMENT; `documents` is iterated one document at a time
# in the splitting step.
loader = PyMuPDFReader()
documents = loader.load(file_path=DOCUMENT)

### 2. Use a Text Splitter to Split Documents

In [7]:
from llama_index.core.node_parser import SentenceSplitter
# Splitter used to break each document's text into chunks.
# NOTE(review): chunk_size is presumably measured in tokens — confirm against the LlamaIndex docs.
text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

In [8]:
# Split each document on its own so every chunk can be traced back to the
# index of its source document (needed to inject doc metadata in step 3).
chunks_per_doc = [text_parser.split_text(doc.text) for doc in documents]

# Flat list of all chunks, in document order.
text_chunks = [chunk for doc_chunks in chunks_per_doc for chunk in doc_chunks]

# Parallel list: doc_idxs[i] is the index of the document text_chunks[i] came from.
doc_idxs = [
    doc_idx
    for doc_idx, doc_chunks in enumerate(chunks_per_doc)
    for _ in doc_chunks
]

### 3. Manually Construct Nodes from Text Chunks

In [9]:
from llama_index.core.schema import TextNode

# Build one TextNode per chunk and attach the metadata of its source document.
nodes = []
for text_chunk, doc_idx in zip(text_chunks, doc_idxs):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idx]
    # Copy the dict: assigning src_doc.metadata directly would make every node
    # from the same document share one dict object, so editing one node's
    # metadata would silently change all of them (and the source document).
    node.metadata = dict(src_doc.metadata)
    nodes.append(node)

### 4. Generate Embeddings for each Node

In [10]:
# Embed each node's content (rendered with metadata_mode="all") and store the
# resulting vector on the node itself.
for node in nodes:
    node_text = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(node_text)

### 5. Load Nodes into a Vector Store

In [11]:
# Insert the embedded nodes into the vector store; the returned list of node
# IDs is displayed as the cell output.
vector_store.add(nodes)

['ff4123c6-70b1-4610-b197-d092c5e735e2',
 'a8933268-99b9-40f8-a113-3f9defac8892',
 '41441b8c-bcba-4cb5-b95d-77c9f58992fe',
 '6fbf587c-aaf6-4ceb-bcdb-acdb9173cb31',
 '97bcc565-6323-4258-aa8a-c2d067c7a078',
 'f35b8cf1-e8a4-4e5d-aedd-a6f400e7e065',
 'a57e4c07-54a2-4a75-a6d3-09fd4b91f23e',
 'dbe825c6-0601-46c9-b577-16871884eb97',
 'a9ce81a1-3067-480f-a695-9cb384ae82ab',
 'e0f05157-a246-4133-8cd7-315ea84dc615',
 'a3f36b54-acc5-4b08-9978-1ade12d657df',
 '83e837ab-9dbe-4180-a0d0-8251fe08ed1a',
 'ffa24fee-d089-42f5-959d-7ed8829ea928',
 '870c07d5-18b0-4a95-8a00-6426a07422a1',
 '6c685d08-c2e1-48c5-8a81-47b37b29ae1e',
 'fc00a99f-bf1f-4878-99c7-2d544c2af443',
 'e64e2813-024b-4063-bc31-4e4f2c6f016c',
 'ac065e82-24cc-43ff-be4f-96895f67c418',
 '842299f9-f138-4898-ad98-632a88516d49',
 '223a0df0-2b05-49e1-8dc6-70b4641168d9',
 'c68b3b51-e4b3-4508-9418-e7a7da8c3d5b',
 'ebc5574d-b95c-4578-89bc-424b9fbec351',
 'e265cc43-9955-4eb2-a933-31a9266c472b',
 'ad1028b8-2934-4fc0-b9a3-1ef0048779e4',
 '8dc5ab37-7084-

## Build Retrieval Pipeline from Scratch

In [14]:
# Sample question used to exercise the retrieval steps.
query_str = "Can you tell me about the key concepts for safety finetuning"

### 1. Generate a Query Embedding

In [15]:
# Embed the query with the same model that embedded the document chunks.
query_embedding = embed_model.get_query_embedding(query_str)

### 2. Query the Vector Database

In [16]:
# construct vector store query
from llama_index.core.vector_stores import VectorStoreQuery

# Retrieval mode: "default" is active; "sparse" and "hybrid" are the
# commented-out alternatives.
query_mode = "default"
# query_mode = "sparse"
# query_mode = "hybrid"

# Request the 2 most similar entries for the query embedding.
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, similarity_top_k=2, mode=query_mode
)

In [17]:
# returns a VectorStoreQueryResult
query_result = vector_store.query(vector_store_query)
# Show the text of the first returned node as a sanity check.
print(query_result.nodes[0].get_content())

4
Safety
In this section, we dive deeper into the important topic of safety measurements and mitigations. We first
discuss our safety investigations into pretraining data and pretrained models (Section 4.1). Next, we describe
the process of our safety alignment (Section 4.2), explaining how we collected safety-related annotations and
utilized SFT and RLHF, and present experimental results. Then, we discuss the red teaming we performed to
further understand and improve model safety (Section 4.3). Finally, we present quantitative safety evaluations
of Llama 2-Chat (Section 4.4). We also share a model card in the Appendix, in Table 52.
4.1
Safety in Pretraining
It is important to understand what is in the pretraining data both to increase transparency and to shed
light on root causes of potential downstream issues, such as potential biases. This can inform what, if any,
downstream mitigations to consider, and help guide appropriate model use. In this section, we analyze the
pretraining da

### 3. Parse Result into a Set of Nodes

In [18]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

# Pair each returned node with its similarity score; the score stays None
# when the query result carries no similarities.
nodes_with_scores = [
    NodeWithScore(
        node=node,
        score=(
            None
            if query_result.similarities is None
            else query_result.similarities[index]
        ),
    )
    for index, node in enumerate(query_result.nodes)
]

### 4. Put into a Retriever

In [19]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List


class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store.

    Embeds the incoming query with the supplied embedding model, queries the
    supplied vector store, and pairs each returned node with its similarity
    score.
    """

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Store the retrieval dependencies and settings.

        Args:
            vector_store: Vector store that is queried for similar nodes.
            embed_model: Object exposing ``get_query_embedding(str)``.
            query_mode: Mode forwarded to ``VectorStoreQuery``.
            similarity_top_k: Number of results requested per query.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve the nodes most similar to the query.

        Args:
            query_bundle: Bundle whose ``query_str`` is embedded and searched.

        Returns:
            One ``NodeWithScore`` per returned node; ``score`` is ``None``
            when the store reports no similarities.
        """
        # Use the instance's own embedding model and vector store. Referring
        # to the notebook-level globals here would silently ignore whatever
        # was passed to the constructor.
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = self._vector_store.query(vector_store_query)

        similarities = query_result.similarities
        return [
            NodeWithScore(
                node=node,
                score=None if similarities is None else similarities[index],
            )
            for index, node in enumerate(query_result.nodes)
        ]

In [20]:
# Wrap the vector store and embedding model in the custom retriever,
# returning the top 2 matches per query.
retriever = VectorDBRetriever(
    vector_store, embed_model, query_mode="default", similarity_top_k=2
)

## Plug this into our RetrieverQueryEngine to synthesize a response

In [21]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Combine the custom retriever with the LLM into a query engine.
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [22]:
# New question for the end-to-end run (rebinds the earlier query_str).
query_str = "How does Llama 2 perform compared to other open-source models?"

# Run the query through the engine and print the response text.
response = query_engine.query(query_str)
print(str(response))

Llama 2 70B outperforms all open-source models.


In [23]:
# Inspect the text of the first source node attached to the response.
print(response.source_nodes[0].get_content())

Additionally, Llama 2 70B model outperforms all open-source models.
In addition to open-source models, we also compare Llama 2 70B results to closed-source models. As shown
in Table 4, Llama 2 70B is close to GPT-3.5 (OpenAI, 2023) on MMLU and GSM8K, but there is a significant
gap on coding benchmarks. Llama 2 70B results are on par or better than PaLM (540B) (Chowdhery et al.,
2022) on almost all benchmarks. There is still a large gap in performance between Llama 2 70B and GPT-4
and PaLM-2-L.
We also analysed the potential data contamination and share the details in Section A.6.
Benchmark (shots)
GPT-3.5
GPT-4
PaLM
PaLM-2-L
Llama 2
MMLU (5-shot)
70.0
86.4
69.3
78.3
68.9
TriviaQA (1-shot)
–
–
81.4
86.1
85.0
Natural Questions (1-shot)
–
–
29.3
37.5
33.0
GSM8K (8-shot)
57.1
92.0
56.5
80.7
56.8
HumanEval (0-shot)
48.1
67.0
26.2
–
29.9
BIG-Bench Hard (3-shot)
–
–
52.3
65.7
51.2
Table 4: Comparison to closed-source models on academic benchmarks. Results for GPT-3.5 and GPT-4
are from OpenAI