<a href="https://colab.research.google.com/github/royam0820/Gemini-File-Search-API/blob/main/RAG_synthetic_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=3671a51f689e15e8a85504d0718a03d9d59cac7e437bbb4bf34d45e623b0f331
  Stored in directory: /root/.cache/pip/wheels/6e/62/11/dc73d78e40a218ad52e7451f30166e94491be013a7850b5d75
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [None]:
import os
import csv
import random
from fpdf import FPDF

# Configuration
# Folder (relative to the current working directory) that receives every generated file.
OUTPUT_DIR = "rag_demo_data"
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================================
# 1. GENERATE: Meeting Notes (Unstructured PDF)
# Goal: Test Semantic Search & Nuance
# ==========================================
def create_meeting_notes():
    """Render the Q3 brainstorming transcript into a PDF inside OUTPUT_DIR.

    The transcript is free-form dialogue; the agreed launch price ($199 with
    6 months of the Pro app) and the new reset-button location are stated in
    the conversation and the action items. Prints the path of the created file.
    """
    target_path = os.path.join(OUTPUT_DIR, "Meeting_Notes_Q3_Brainstorm.pdf")

    transcript = (
        "Internal Meeting Transcript - Q3 Brainstorming",
        "Date: August 15th, 2024",
        "Attendees: Sarah (Product), Mike (Sales), Elena (Engineering)",
        "",
        "Sarah: Okay, let's talk about the Lumina Sphere. We need to lock down the launch pricing.",
        "Mike: I'm looking at the market. If we go over $200, we lose the college demographic. I suggest $149.",
        "Sarah: $149? That barely covers the BOM cost with the new glass casing. Elena, what's our margin there?",
        "Elena: With the new glass, we're sitting at $110 per unit cost. $149 is too risky. We'd have zero wiggle room for marketing.",
        "Mike: Fine. But if we do $199, we need to bundle the 'Pro' app subscription for free.",
        "Sarah: I can agree to that. Let's tentatively set it at $199 with 6 months of Pro app included.",
        "Elena: Also, just a heads up, the engineering team decided to move the reset button. It's no longer on the back.",
        "Mike: Where is it? Support needs to know.",
        "Elena: It's hidden under the base rubber pad to prevent accidental resets.",
        "",
        "Action Items:",
        "- Sarah to finalize pricing model at $199.",
        "- Mike to update marketing assets.",
    )

    doc = FPDF()
    doc.add_page()
    doc.set_font("Arial", size=12)

    # One multi_cell call per entry; the empty strings separate the header,
    # the dialogue and the action items.
    for paragraph in transcript:
        doc.multi_cell(0, 10, paragraph)

    doc.output(target_path)
    print(f"Created: {target_path}")

# ==========================================
# 2. GENERATE: User Manual (Layout/Visual PDF)
# Goal: Test Instruction Following & Layout
# ==========================================
def create_user_manual():
    """Build the Lumina Sphere user manual PDF inside OUTPUT_DIR.

    The document has a centered title, a bulleted technical-specification
    section and a textual device-layout section that stands in for a diagram.
    Prints the path of the created file.
    """
    manual = FPDF()
    manual.add_page()

    def start_section(heading):
        # Bold 12pt heading on its own line, then switch to the 11pt body font.
        manual.set_font("Arial", 'B', 12)
        manual.cell(0, 10, heading, 0, 1)
        manual.set_font("Arial", size=11)

    # Title, centered, followed by a vertical gap of 10.
    manual.set_font("Arial", 'B', 16)
    manual.cell(0, 10, "Lumina Sphere - User Manual v1.0", 0, 1, 'C')
    manual.ln(10)

    # Specs section: one dash-prefixed row per specification.
    start_section("1. Technical Specifications")
    spec_rows = (
        "Input Voltage: 5V / 2A (USB-C)",
        "Battery Capacity: 4000mAh",
        "Connectivity: Wi-Fi 6, Bluetooth 5.2",
        "Water Resistance: IP67",
    )
    for row in spec_rows:
        manual.cell(0, 7, "- " + row, 0, 1)

    manual.ln(5)

    # Layout section: prose description simulating a visual component.
    start_section("2. Device Layout (Reference Diagram A)")
    layout_text = "\n".join([
        "The Lumina Sphere features a minimalist design. ",
        "[DIAGRAM PLACEHOLDER]",
        "1. Top Dome: Touch sensitive area for brightness control.",
        "2. Main Body: Ambient light diffuser.",
        "3. Base: Non-slip rubber pad.",
        "NOTE: The Factory Reset button is located UNDERNEATH the base rubber pad (Label 3).",
    ])
    manual.multi_cell(0, 7, layout_text)

    target_path = os.path.join(OUTPUT_DIR, "Lumina_Sphere_User_Manual_v1.pdf")
    manual.output(target_path)
    print(f"Created: {target_path}")

# ==========================================
# 3. GENERATE: Inventory Data (CSV)
# Goal: Test Reasoning & Math
# ==========================================
def create_csv_data():
    """Write the Q3 sales-projection table to a CSV file inside OUTPUT_DIR.

    The file holds one header row plus four data rows (two products, two
    regions each). Prints the path of the created file.
    """
    csv_path = os.path.join(OUTPUT_DIR, "Q3_Sales_Projections.csv")

    columns = ("SKU", "Product_Name", "Region", "Stock_Level", "Unit_Price_USD", "Power_Req_Volts")
    records = (
        ("LUM-001", "Lumina Sphere", "North America", 5000, 199, 5),
        ("LUM-001", "Lumina Sphere", "Europe", 1200, 219, 5),
        # The 3V trap: the legacy accessory lists 3 volts, unlike the 5-volt Sphere rows.
        ("ACC-999", "Travel Pack (Legacy)", "North America", 2000, 49, 3),
        ("ACC-999", "Travel Pack (Legacy)", "Europe", 500, 55, 3),
    )

    # newline='' hands line-ending control to the csv module.
    with open(csv_path, 'w', newline='') as out:
        csv.writer(out).writerows((columns, *records))

    print(f"Created: {csv_path}")

# ==========================================
# 4. GENERATE: Legal Compliance (Long Text)
# Goal: Test Context Window & Retrieval
# ==========================================
def create_long_text():
    """Write a long plain-text compliance document inside OUTPUT_DIR.

    The file consists of a title line and 50 numbered "pages" of identical
    filler text. A single regional-restriction notice (the needle in the
    haystack) is written right after the filler of page 42. Prints the path
    of the created file.
    """
    page_count = 50
    needle_page = 42

    # Filler text to make it long: the same two sentences repeated 20 times per page.
    boilerplate = "The LuminaCorp compliance framework adheres to international standards ISO-9001 and ISO-27001. All devices must maintain radio frequency limits within designated bands. " * 20

    # THE NEEDLE IN THE HAYSTACK
    restriction_notice = (
        "\n*** SPECIAL REGIONAL RESTRICTION ***\n"
        "Due to local environmental legislation 'Protocol 99' regarding specific lithium recycling mandates, "
        "the Lumina Sphere CANNOT be sold or shipped to the state of Vermont.\n"
        "*** END RESTRICTION ***\n\n"
    )

    txt_path = os.path.join(OUTPUT_DIR, "Lumina_Global_Compliance_Regulations.txt")
    with open(txt_path, 'w') as out:
        out.write("LUMINA GLOBAL COMPLIANCE DOC - CONFIDENTIAL\n\n")

        for page_no in range(1, page_count + 1):
            out.write(f"--- PAGE {page_no} ---\n" + boilerplate + "\n\n")
            if page_no == needle_page:
                out.write(restriction_notice)

    print(f"Created: {txt_path}")

# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    # Run every generator in order; each one prints the path it created.
    print("Generating RAG Demo Assets...")
    create_meeting_notes()
    create_user_manual()
    create_csv_data()
    create_long_text()
    # Report the configured folder rather than a hard-coded copy of its name,
    # so the message stays correct if OUTPUT_DIR is changed.
    print(f"\nDone! Files are in the '{OUTPUT_DIR}' folder.")

Generating RAG Demo Assets...
Created: rag_demo_data/Meeting_Notes_Q3_Brainstorm.pdf
Created: rag_demo_data/Lumina_Sphere_User_Manual_v1.pdf
Created: rag_demo_data/Q3_Sales_Projections.csv
Created: rag_demo_data/Lumina_Global_Compliance_Regulations.txt

Done! Files are in the 'rag_demo_data' folder.


In [None]:
import os
import shutil
from google.colab import files

# Define the directory to be downloaded.
# NOTE(review): assumes the generation cell ran with /content as the working
# directory (presumably the Colab default) - confirm if the cwd was changed.
directory_to_download = "/content/rag_demo_data"

# Define the output zip file name
zip_filename = "rag_demo_data.zip"

# Fail early with a clear message if the data has not been generated yet,
# instead of relying on how make_archive treats a missing directory.
if not os.path.isdir(directory_to_download):
    raise FileNotFoundError(
        f"Directory not found: {directory_to_download}. Run the data generation cell first."
    )

# Create a zip archive of the directory.
# make_archive appends the ".zip" extension itself, so it receives the bare name;
# splitext strips only the trailing extension (str.replace would drop every ".zip").
shutil.make_archive(os.path.splitext(zip_filename)[0], 'zip', directory_to_download)

print(f"Successfully created {zip_filename}")

# Offer the zip file for download
try:
    files.download(zip_filename)
    print("Your download should start shortly.")
except Exception as e:
    # Best effort: the archive already exists on disk, so point the user at it.
    print(f"An error occurred during download: {e}")
    print(f"You can manually download {os.path.abspath(zip_filename)} from the files pane.")

Successfully created rag_demo_data.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Your download should start shortly.
