In [None]:
# --- Cell 1: Setup ---
import fitz
import re 
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- Cell 2: Load PDF ---
def load_pdf(path):
    """Extract text from PDF."""
    doc = fitz.open(path)
    return "\n".join(page.get_text() for page in doc)

pdf_text = load_pdf("../data/RAW.pdf")
print(pdf_text[:300])  # preview


Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗
Louis Martin†
Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernand


In [4]:
# --- Cell 3: Clean text ---
def clean_text(text):
    text = re.sub(r'\n+', '\n', text)           # collapse newlines
    text = re.sub(r'[ \t]+', ' ', text)         # collapse spaces
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)  # remove page numbers
    return text.strip()

cleaned_text = clean_text(pdf_text)
print(cleaned_text[:300])  # preview


Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗
Louis Martin†
Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernand


In [5]:
# --- Cell 4: Chunk text ---
def chunk_text(text, chunk_size=800, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end].strip())
        start += chunk_size - overlap
    return chunks

chunks = chunk_text(cleaned_text)
print(f"Chunks created: {len(chunks)}")
print(chunks[0][:300])


Chunks created: 376
Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗
Louis Martin†
Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernand


In [6]:
# --- Cell 5: Embed chunks ---
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode(chunks, convert_to_numpy=True)

vector_store = {
    "chunks": chunks,
    "embeddings": embeddings
}

print(f"Stored {len(chunks)} chunks with embeddings of dim {embeddings.shape[1]}")


Stored 376 chunks with embeddings of dim 384


In [8]:
# --- Cell 6: Retriever ---
def retrieve(query, top_k=3):
    query_emb = embed_model.encode([query], convert_to_numpy=True)
    sims = np.dot(vector_store["embeddings"], query_emb.T).squeeze()
    idx = np.argsort(-sims)[:top_k]
    return [vector_store["chunks"][i] for i in idx]

# Test retrieval
results = retrieve("Who are the authors of LLaMA 2?", top_k=2)
for r in results:
    print("---")
    print(r[:300])


---
e at https://ai.meta.
com/resources/models-and-libraries/llama/. Those who use Llama 2 must comply with the terms of
the provided license and our Acceptable Use Policy, which prohibit any uses that would violate applicable
policies, laws, rules, and regulations.
We also provide code examples to help
---
te has been in
English, and has not covered, nor could it cover all scenarios. For these reasons, as with all LLMs,
Llama 2’s potential outputs cannot be predicted in advance, and the model may in some instances
produce inaccurate or objectionable responses to user prompts. Therefore, before deployi


In [9]:
import google.generativeai as genai

genai.configure(api_key="AIzaSyDUOZunebNGQMazQoQCPycKHeHhdYDQEc0")  


In [None]:
model = genai.GenerativeModel("gemini-1.5-flash") 


In [11]:
response = model.generate_content("Explain Retrieval-Augmented Generation in 2 lines")
print(response.text)


Retrieval-Augmented Generation (RAG) combines large language models with external knowledge sources.  It retrieves relevant information to augment the model's context before generating a response, improving accuracy and reducing hallucinations.



In [15]:
# --- Cell 7: RAG QA with Gemini ---
def rag_answer(query):
    context = "\n\n".join(retrieve(query, top_k=3))
    prompt = f"Answer the question using the context below. If context is not enough, say so.\n\n{context}\n\nQ: {query}\nA:"
    
    response = model.generate_content(prompt)
    return response.text

print(rag_answer("how many trillion tokens of data as this provides a good performance–cost trade-off, up-sampling the most factual sourcein an effort to increase knowledge and dampen hallucinations. "))


The provided text states that 2 trillion tokens of data were used.



In [14]:
# --- Cell 8: Test RAG ---
questions = [
    "Who are the authors of LLaMA 2?",
    "What is the purpose of LLaMA 2?",
    "How is LLaMA 2 fine-tuned?"
]

for q in questions:
    print("="*50)
    print(rag_answer(q))


The provided text does not name the authors of Llama 2.  It mentions Meta's release of the model but doesn't list individual contributors.

Based on the provided text, LLaMA 2 is a large language model released openly for both research and commercial use.  The purpose is to encourage responsible AI innovation.  However, the text also emphasizes that users must comply with licensing and acceptable use policies and that safety testing and tuning are crucial before deployment to mitigate potential risks of inaccurate or objectionable outputs.

The provided text states that Llama 2-Chat is a fine-tuned version of Llama 2, showing improvements in truthfulness and toxicity.  However, it does not describe the specific fine-tuning methods used.



## ✅ RAG Pipeline Progress

***1. Data ingestion:*** Loaded `RAW.pdf` (LLaMA 2 research paper).  

***2. Chunking:*** Split the PDF text into smaller overlapping chunks.  

***3. Embeddings:*** Encoded chunks into vectors using `sentence-transformers`.  

At this stage → we already have a working in-memory vector store.  

---

## ⚙️ Next Steps to Complete the Pipeline

***4. Vector DB:*** Using a Python dict (ok for small scale).  
For production: **FAISS, Pinecone, Weaviate, or Chroma**.  

***5. Retrieval:*** Implemented with the `retrieve()` function.  

***6. Reranking (optional):*** Not added yet, can improve ordering later.  

***7. Prompt building:*** Implemented in **Cell 7** with a structured prompt.  

***8. LLM call:*** Using **Gemini Flash API**.  

***9. Output formatting:*** Returning **answer + sources** for transparency.  
