In [1]:
from pathlib import Path
import shutil

# Create the working directories the later cells read from and write to:
# "data" holds the source PDF, "chunks" the chunk dump, "vectordb" the index.
for folder_name in ("data", "chunks", "vectordb"):
    Path(folder_name).mkdir(exist_ok=True)

In [2]:
import os, json, faiss
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Load PDF
# Run text extraction exactly once per page (it is the costly step) and keep
# only the pages that actually yielded text.
pdf = PdfReader("data/AI Training Document.pdf")
page_texts = [page.extract_text() for page in pdf.pages]
text = "\n".join([page_text for page_text in page_texts if page_text])
# Normalise whitespace: strip every line and drop the blank ones.
text = "\n".join([line.strip() for line in text.splitlines() if line.strip()])

# Chunk
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
chunks = splitter.split_text(text)
if not chunks:
    # Fail early with a clear message: with no chunks there is nothing to
    # embed or index, and the later steps would only fail less readably.
    raise ValueError(
        "No extractable text found in 'data/AI Training Document.pdf'; nothing to index."
    )
with open("chunks/chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f)

# Embed
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks)

# Save to FAISS
# The index dimensionality is taken from the embedding matrix's second axis.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
faiss.write_index(index, "vectordb/index.faiss")
# Row i of the index corresponds to chunks[i]; persist the list next to the
# index so search results can be mapped back to text.
with open("vectordb/chunk_map.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f)

print("Preprocessing complete. Chunks and index saved.")


Preprocessing complete. Chunks and index saved.


In [3]:
import faiss
import json
from sentence_transformers import SentenceTransformer

class Retriever:
    """Find the stored text chunks closest to a query.

    Args:
        index_path: Path of a FAISS index file, read with ``faiss.read_index``.
        chunk_path: Path of a JSON file holding the list of chunk strings;
            position ``i`` in the list must correspond to vector ``i`` in the index.
        model_name: SentenceTransformer model used to embed queries. It should be
            the same model that produced the indexed vectors.
    """

    def __init__(self, index_path, chunk_path, model_name="all-MiniLM-L6-v2"):
        self.index = faiss.read_index(index_path)
        with open(chunk_path, "r", encoding="utf-8") as f:
            self.chunks = json.load(f)
        self.model = SentenceTransformer(model_name)

    def retrieve(self, query, k=3):
        """Return up to ``k`` chunks, ordered as the index returns them.

        Args:
            query: The question text to embed and search for.
            k: Maximum number of chunks to return.

        Returns:
            A list of chunk strings. It may be shorter than ``k`` when the
            index holds fewer than ``k`` vectors.
        """
        emb = self.model.encode([query])
        _, neighbour_ids = self.index.search(emb, k)
        # FAISS fills missing result slots with -1. Indexing a Python list with
        # -1 would silently return the *last* chunk, so drop any id that does
        # not point at a real entry.
        chunk_count = len(self.chunks)
        return [self.chunks[i] for i in neighbour_ids[0] if 0 <= i < chunk_count]


In [None]:
import google.generativeai as genai

class Generator:
    """Answer questions with a Gemini model using supplied context.

    Args:
        api_key: Gemini API key. When omitted, the key is read from the
            ``GOOGLE_API_KEY`` or ``GEMINI_API_KEY`` environment variable.
        model_name: Name of the generative model to use.

    Raises:
        RuntimeError: If no API key is passed and neither environment
            variable is set.
    """

    def __init__(self, api_key=None, model_name="gemini-1.5-flash"):
        # Local import so this cell does not depend on another cell's imports.
        import os

        # Never embed the key in the notebook: source and outputs get shared.
        # NOTE(review): a literal key was previously committed in this cell;
        # treat it as leaked and rotate it.
        key = api_key or os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
        if not key:
            raise RuntimeError(
                "No Gemini API key found. Pass api_key=... or set the "
                "GOOGLE_API_KEY environment variable."
            )
        genai.configure(api_key=key)
        self.model = genai.GenerativeModel(model_name)

    def generate(self, context, question):
        """Build the prompt from ``context`` and ``question`` and return the model's answer.

        Args:
            context: Text the model should base its answer on.
            question: The user's question.

        Returns:
            The response text with surrounding whitespace stripped.
        """
        prompt = f"Use the context below to answer:\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"
        response = self.model.generate_content(prompt)
        return response.text.strip()

In [19]:
class RAGPipeline:
    """Retrieve the chunks relevant to a query, then generate an answer from them."""

    def __init__(self):
        # The retriever reads the index and chunk map saved under "vectordb/".
        self.retriever = Retriever("vectordb/index.faiss", "vectordb/chunk_map.json")
        self.generator = Generator()

    def run(self, query):
        """Answer ``query``; returns ``(answer, retrieved_chunks)``."""
        retrieved = self.retriever.retrieve(query)
        # Chunks are separated by a blank line inside the prompt context.
        answer = self.generator.generate("\n\n".join(retrieved), query)
        return answer, retrieved

# Initialize
# Build the pipeline once: this reads the FAISS index and chunk map, and sets up
# the embedding model and the Gemini client. Later cells reuse `rag` per query.
rag = RAGPipeline()


In [23]:
# Ask one question and show both the generated answer and its supporting chunks.
query = "what is the purpose of this Document?"
answer, source_chunks = rag.run(query)

print(f"Answer:\n {answer}")
print("\nSource Chunks:\n")
for chunk in source_chunks:
    print(f"- {chunk}")


Answer:
 The provided text is a snippet of a legal document, likely a User Agreement.  Its purpose is to define the user's responsibilities regarding the content they provide, ensuring they have the necessary rights and that the content is accurate, appropriate, and legal.  It also mentions arbitration as a potential dispute resolution method.

Source Chunks:

- an improper purpose (as measured by the standards set forth in Federal Rule of Civil
Procedure 11(b)). To the extent, following a presentation on the merits, on it s own motion or
a party’s, and after affording a reasonable opportunity to respond, an arbitrator determines
- connection with our, those assignees', and those subl icensees' use of that content in connection with
our provision, expansion, and promotion of our Services.
You represent and warrant that, for all such content you provide, you own or otherwise control all
- necessary rights to do so and to meet your obligations under this User Agreement. You represent and

In [None]:


# comment