In [2]:
from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from pathlib import Path

# Folder that holds the source PDFs, relative to the notebook's working directory.
DATA_DIR = Path("data")

# Paths are built with pathlib instead of hand-written backslashes: sequences
# such as "\c", "\d", "\G" and "\L" are invalid string escapes (a SyntaxWarning
# on recent Python versions) and backslash separators only work on Windows.
# The CCPA file previously pointed at "docs\..." while the recorded loader
# output reports it under "data\...", so all four files now share DATA_DIR.
file_paths = [
    str(DATA_DIR / pdf_name)
    for pdf_name in (
        "ccpa_california.pdf",
        "ddpa.pdf",
        "GDPR-EU.pdf",
        "LGPD-english.pdf",
    )
]

In [8]:
# `docs` is not assigned by any earlier cell in this notebook, so a fresh
# "Restart Kernel -> Run All" would stop here with a NameError. Load every PDF
# listed in `file_paths` explicitly.
# NOTE(review): assumes the original (missing) cell used PyPDFLoader.load(),
# which matches the page-level metadata shown in the recorded output - confirm.
docs = [page for path in file_paths for page in PyPDFLoader(path).load()]

# Preview only the first couple of pages; displaying the whole list floods the
# notebook with the full text of every PDF.
docs[:2]

[Document(metadata={'producer': 'Adobe PDF Library 24.4.48', 'creator': 'Acrobat PDFMaker 24 for Word', 'creationdate': '2024-12-16T10:44:55-08:00', 'author': 'California Privacy Protection Agency', 'comments': '', 'company': '', 'contenttypeid': '0x010100E35E276DB9447C448D1AB74E91CC0E53', 'mediaserviceimagetags': '', 'moddate': '2025-09-17T15:56:59-07:00', 'sourcemodified': 'D:20241216184448', 'subject': 'California Consumer Privacy Act of 2018', 'title': 'California Consumer Privacy Act of 2018', 'source': 'data\\ccpa_california.pdf', 'total_pages': 65, 'page': 0, 'page_label': '1'}, page_content='Page 1 of 65 \nCALIFORNIA CONSUMER PRIVACY ACT OF 2018 \neffective 01/01/2025 – SB 1223, AB 1008, AB 1824 update \nposted to cppa.ca.gov January 2025 \nContents \n1798.100.  General Duties of Businesses that Collect Personal Information................................. 3 \n1798.105.  Consumers’ Right to Delete Personal Information....................................................... 4 \n1

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [10]:
# Chunker for the loaded pages. Separators are tried in order: paragraph
# breaks, line breaks, then sentence boundaries (whitespace that follows
# '.', '?' or '!').
#
# `is_separator_regex=True` is required for the last entry: by default the
# splitter escapes every separator and matches it literally, so the
# look-behind pattern would never match and sentence-level splitting would
# silently never happen. The two newline separators mean the same thing when
# read as regular expressions.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", r"(?<=[.?!])\s+"],
    is_separator_regex=True,
    chunk_size=200,    # target maximum chunk length
    chunk_overlap=20,  # amount shared between neighbouring chunks
)

In [11]:
# Split every loaded document into chunks. Despite the name, `text` is the
# resulting sequence of chunk objects; later cells read `.page_content` and
# `.metadata` from each item.
text = splitter.split_documents(docs)

In [12]:
# Sanity check: character length of the first chunk's text.
len(text[0].page_content)

156

In [13]:
from sentence_transformers import SentenceTransformer

# Load a pretrained Sentence Transformer model.
# Note: despite its name, `embeddings` holds the model object (later cells call
# `embeddings.encode(...)` on it), not precomputed vectors.
embeddings = SentenceTransformer("all-MiniLM-L6-v2") #384 dimensions

In [14]:
import chromadb
# Create a Chroma client with default settings.
# NOTE(review): presumably this client is in-memory only, so the index has to
# be rebuilt in every kernel session - confirm against the Chroma docs.
chroma_client = chromadb.Client()

In [15]:
# Use get_or_create so this cell can be re-run safely: `create_collection`
# raises once a collection with this name already exists on the client.
collection = chroma_client.get_or_create_collection(name="offline-rag-bot-v1")

In [16]:
# Embed and index the chunks in batches instead of one encode/add call per
# chunk. Ids, documents and metadata are the same as with per-chunk insertion
# (the id is the chunk's position in `text`); only the number of calls changes.
INDEX_BATCH_SIZE = 256

for batch_start in range(0, len(text), INDEX_BATCH_SIZE):
    batch = text[batch_start:batch_start + INDEX_BATCH_SIZE]
    batch_texts = [chunk.page_content for chunk in batch]

    # Encoding a list returns one embedding row per input text.
    batch_embeddings = embeddings.encode(batch_texts)

    collection.add(
        ids=[str(batch_start + offset) for offset in range(len(batch))],
        documents=batch_texts,
        embeddings=batch_embeddings.tolist(),
        metadatas=[chunk.metadata for chunk in batch],
    )


In [77]:
from langchain_ollama import OllamaLLM


# LLM handle used for answer generation; temperature 0 is requested so the
# sampling settings stay fixed between runs.
llm = OllamaLLM(model="gemma3:270m", temperature=0)



In [78]:
# Quick smoke test: confirm the `llm` object returns a completion.
llm.invoke("Hi How Are you?")

'I am doing well, thank you for asking! How are you today?\n'

In [79]:
def chatbot(query: str, n_results: int = 5) -> str:
    """Answer a question using retrieval-augmented generation.

    The query is embedded with the notebook-level `embeddings` model, the
    closest chunks are fetched from the Chroma `collection`, and the `llm` is
    prompted to answer strictly from those chunks.

    Args:
        query: The user's question. The literal word "exit" (any casing,
            surrounding whitespace ignored) ends the conversation.
        n_results: How many chunks to retrieve as context. Defaults to 5.

    Returns:
        The model's answer with surrounding whitespace stripped, "Goodbye!"
        for the exit command, or "I don't know." when the query is blank or
        nothing was retrieved.
    """
    normalized_query = query.strip()

    if normalized_query.lower() == "exit":
        return "Goodbye!"

    # A blank query has nothing to retrieve against; skip the embedding and
    # LLM calls instead of answering from arbitrary chunks.
    if not normalized_query:
        return "I don't know."

    # ---- embed query ----
    query_embedding = embeddings.encode(query).tolist()

    # ---- query chroma ----
    result = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=["documents"]
    )

    # One inner list is expected per query embedding. Guard against a missing,
    # None, or empty "documents" entry, and drop empty/None chunks so the join
    # below cannot fail.
    document_batches = result.get("documents") or [[]]
    retrieved_chunks = [chunk for chunk in (document_batches[0] or []) if chunk]

    if not retrieved_chunks:
        return "I don't know."

    # ---- build context ----
    context = "\n\n".join(retrieved_chunks)

    # ---- RAG prompt ----
    # The prompt text is kept flush-left on purpose: any indentation inside
    # the triple-quoted f-string would be sent to the model.
    prompt = f"""
You are a legal documentation assistant.

Rules:
- Answer ONLY using the provided context
- If the relevant context is not present, say "I don't know"
- Do NOT add external knowledge
- Write a COMPLETE sentence
- Provide a clear definition or explanation, not a short phrase

Context:
{context}

Question:
{query}

Answer (in 2-4 complete sentences):
"""

    answer = llm.invoke(prompt).strip()
    return answer


In [81]:
# Example: run a single question through the retrieval + generation function.
chatbot('Who is responsible for complying with data protection laws?')

'The responsible party for complying with data protection laws is the **controller**.'

In [80]:
import csv
import re
import os
# ----------------------------
# CONFIG
# ----------------------------

MODEL_NAME = "gemma3_270m"   # label used for the output filename and progress banner; change if needed
OUTPUT_DIR = "eval_outputs"  # directory that receives the evaluation CSV

# Create the output directory from the constant rather than a second
# hard-coded literal, so changing OUTPUT_DIR cannot leave the two out of sync.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Evaluation questions, asked in order by run_evaluation().
QUESTIONS = [
    "What is considered personal data?",
    "What rights do individuals have over their personal data?",
    "What does consent mean in data protection?",
    "What happens if an organization violates data protection law?",
    "What is the right to erasure?",
    "Explain about Article 52 in GDPR",
    "Explain Consumers’ Right to Delete Personal Information",
    "What are the specific penalties for a Data Fiduciary that fails to take reasonable security safeguards to prevent a personal data breach?",
    "What rights does a Data Principal have regarding the correction, completion, and erasure of their personal data?",
    "What are the 'legitimate uses' for which a Data Fiduciary may process personal data without obtaining fresh consent?",
    "What are the ten fundamental principles that personal data processing activities must observe under the LGPD?",
    "Under what specific conditions can sensitive personal data be processed without the data subject's consent in LGPD?",
    "What administrative sanctions and fines can the National Supervisory Authority (ANPD) impose for infractions of the Law?",
    "What are the legal grounds for the international transfer of personal data to foreign countries or organizations?",
    "How is the California Privacy Protection Agency governed, and who is responsible for appointing its five-member board?",
    "Under what circumstances must the California Privacy Protection Agency adjust the monetary thresholds in the CCPA, and which specific sections are subject to these adjustments?",
    "What are the specific qualifications and skills required for members of the agency board, and what employment restrictions apply to them after they leave office?",
    "What are the specific conditions under which a controller or processor is required to designate a Data Protection Officer (DPO)?",
    "What are the primary investigative and corrective powers granted to independent supervisory authorities to enforce the Regulation?",
    "What criteria must the European Commission consider when assessing whether a third country ensures an 'adequate level' of data protection for international transfers?"
]

# ----------------------------
# UTILS
# ----------------------------

def sanitize_filename(name: str) -> str:
    """Return `name` with every character outside letters, digits, '.', '_' and '-' replaced by '_'."""
    unsafe_char = re.compile(r"[^a-zA-Z0-9._-]")
    cleaned = unsafe_char.sub("_", name)
    return cleaned

# ----------------------------
# MAIN EVAL LOGIC
# ----------------------------

def run_evaluation():
    """Ask every question in QUESTIONS and save the answers to a CSV file.

    Each question is passed to `chatbot`. If a call raises, the failure is
    recorded as an "ERROR: ..." answer so one bad question does not abort the
    whole run. The results are written to
    `<OUTPUT_DIR>/<sanitized MODEL_NAME>.csv` with the columns `question` and
    `answer`; an existing file of the same name is overwritten.
    """
    # Make sure the target directory exists even if this function is called
    # without the config cell's directory-creation side effect having run
    # (or after the directory was removed).
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    filename = f"{sanitize_filename(MODEL_NAME)}.csv"
    filepath = f"{OUTPUT_DIR}/{filename}"

    rows = []

    print(f"Running evaluation with model: {MODEL_NAME}\n")

    for i, question in enumerate(QUESTIONS, start=1):
        print(f"[{i}/{len(QUESTIONS)}] Question: {question}")

        # Deliberate best-effort: keep going and record the error text in
        # place of the answer.
        try:
            answer = chatbot(question)
        except Exception as e:
            answer = f"ERROR: {str(e)}"

        rows.append({
            "question": question,
            "answer": answer
        })

    # ----------------------------
    # WRITE CSV
    # ----------------------------

    # newline="" lets the csv module control line endings itself.
    with open(filepath, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["question", "answer"]
        )
        writer.writeheader()
        writer.writerows(rows)

    print(f"\n✅ Evaluation saved to: {filepath}")

# ----------------------------
# RUN
# ----------------------------

# Run the evaluation when this code is executed as the main module; the guard
# keeps it from running if the code is imported from elsewhere.
if __name__ == "__main__":
    run_evaluation()


Running evaluation with model: gemma3_270m

[1/20] Question: What is considered personal data?
[2/20] Question: What rights do individuals have over their personal data?
[3/20] Question: What does consent mean in data protection?
[4/20] Question: What happens if an organization violates data protection law?
[5/20] Question: What is the right to erasure?
[6/20] Question: Explain about Article 52 in GDPR
[7/20] Question: Explain Consumers’ Right to Delete Personal Information
[8/20] Question: What are the specific penalties for a Data Fiduciary that fails to take reasonable security safeguards to prevent a personal data breach?
[9/20] Question: What rights does a Data Principal have regarding the correction, completion, and erasure of their personal data?
[10/20] Question: What are the 'legitimate uses' for which a Data Fiduciary may process personal data without obtaining fresh consent?
[11/20] Question: What are the ten fundamental principles that personal data processing activities mu