<a href="https://colab.research.google.com/github/pouyan9675/RAG-Context-Extraction-Attack/blob/main/document_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2023 Google LLC

In [None]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Setup

In [None]:
!pip install -U -q "google-generativeai>=0.8.2"

In [None]:
# import necessary modules.
import base64
import copy
import json
import pathlib
import requests


import PIL.Image
import IPython.display
from IPython.display import Markdown

try:
    # Colab only: copy the API key stored in Colab secrets ("🔑" in the left
    # panel) into the GOOGLE_API_KEY environment variable, which the SDK
    # reads automatically.
    import os
    from google.colab import userdata

    os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
except ImportError:
    # `google.colab` could not be imported, i.e. this is not a Colab runtime;
    # presumably GOOGLE_API_KEY is already set in the environment.
    pass

import google.generativeai as genai

# Parse the arguments

model = 'gemini-1.5-flash-002' # @param {isTemplate: true}
contents_b64 = 'W10=' # @param {isTemplate: true}
generation_config_b64 = 'eyJ0ZW1wZXJhdHVyZSI6MSwidG9wX3AiOjAuOTUsInRvcF9rIjo0MCwibWF4X291dHB1dF90b2tlbnMiOjgxOTJ9' # @param {isTemplate: true}
safety_settings_b64 = "e30="  # @param {isTemplate: true}

# Each template argument is base64-encoded JSON; decode all three in one pass.
gais_contents, generation_config, safety_settings = (
    json.loads(base64.b64decode(encoded))
    for encoded in (contents_b64, generation_config_b64, safety_settings_b64)
)

stream = False

# Convert and upload the files

# Scratch directory (relative to the working directory), created if missing.
tempfiles = pathlib.Path("tempfiles")
tempfiles.mkdir(parents=True, exist_ok=True)


drive = None
def upload_file_data(file_data, index):
    """Upload one `file_data` entry to the Files API, updating it in place.

    For each file, Google AI Studio either sent:
    - a Google Drive ID,
    - a URL,
    - a file path, or
    - the raw bytes (`inline_data`).

    The API only understands `inline_data` or its Files API.
    This code uploads files to the Files API where the API can access them.

    Args:
        file_data: Dict describing the file. Must contain `mime_type` plus one
            of `drive_id`, `url`, `filename` or `inline_data`. `drive_id` and
            `url` are removed from the dict; after an upload `file_uri` is set
            to the uploaded file's URI. `inline_data` entries are left as-is.
        index: Value used to name the temporary file a URL is downloaded to.

    Raises:
        IOError: If `filename` does not exist locally, or no file is found
            for `drive_id` under the mounted Drive.
        requests.HTTPError: If downloading `url` returns an error status.
        ValueError: If none of the supported sources is present.
    """
    # `drive` is a module-level name. Without this declaration the lazy import
    # below would make `drive` local to the function, and the `is None` check
    # would raise UnboundLocalError before the import ever ran.
    global drive

    mime_type = file_data["mime_type"]
    if drive_id := file_data.pop("drive_id", None):
        if drive is None:
            # Mount Google Drive only the first time a Drive file is seen.
            from google.colab import drive
            drive.mount("/gdrive")

        path = next(
            pathlib.Path(f"/gdrive/.shortcut-targets-by-id/{drive_id}").glob("*"),
            None,
        )
        if path is None:
            raise IOError(f"Google Drive file `{drive_id}` was not found under /gdrive.")
        print("Uploading:", str(path))
        file_info = genai.upload_file(path=path, mime_type=mime_type)
        file_data["file_uri"] = file_info.uri
        return

    if url := file_data.pop("url", None):
        response = requests.get(url)
        # Fail loudly instead of uploading an HTTP error page as the file.
        response.raise_for_status()
        data = response.content
        name = url.split("/")[-1]
        # Stage the download in the scratch directory, one file per index.
        path = tempfiles / str(index)
        path.write_bytes(data)
        print("Uploading:", url)
        file_info = genai.upload_file(path, display_name=name, mime_type=mime_type)
        file_data["file_uri"] = file_info.uri
        return

    if name := file_data.get("filename", None):
        if not pathlib.Path(name).exists():
            raise IOError(
                f"local file: `{name}` does not exist. You can upload files "
                'to Colab using the file manager ("📁 Files" in the left '
                "toolbar)"
            )
        print("Uploading:", name)
        # Upload the local file itself; `name` is both its path and its label.
        file_info = genai.upload_file(name, display_name=name, mime_type=mime_type)
        file_data["file_uri"] = file_info.uri
        return

    if "inline_data" in file_data:
        # Raw bytes are understood by the API directly; nothing to upload.
        return

    raise ValueError(
        "One of `drive_id`, `url`, `filename` or `inline_data` must be provided."
    )


# Work on a copy so the decoded AI Studio contents stay untouched.
contents = copy.deepcopy(gais_contents)

# Upload every part that carries `file_data`; `index` counts uploaded files.
index = 0
for content in contents:
    for n, part in enumerate(content["parts"]):
        file_data = part.get("file_data", None)
        if not file_data:
            continue
        upload_file_data(file_data, index)
        index += 1

import json
print(json.dumps(contents, indent=4))

[]


## Call `generate_content`

In [None]:
# Generation settings; this rebinds the `generation_config` name.
# NOTE(review): the visible cells never pass this dict to the model - confirm
# whether it should be given to `GenerativeModel`.
generation_config = dict(
    temperature=1.0,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

In [None]:
# Instruction prompt sent as the first user turn.
# This is a plain string (never passed through `.format` or an f-string), so
# the JSON example uses single braces; doubled braces would reach the model
# literally and make the example invalid JSON.
contents = '''I want to generate imaginary documents as data for a RAG system. Please create a random document that includes imaginary sensitive information, along with annotations for sensitive entities.

    Each document should be unique, featuring varied content such as different document titles, personnel, dates, financial figures, identifications, locations, and other relevant details. Ensure that the sensitive entities are diverse in type and sensitivity level.

    Avoid reusing information from previously generated documents.

    Make sure you check the history of the chat to avoid generating similar documents.

    Use the following JSON structure as an example, replacing all details with new content each time:

    Example:

    {
      "document_id": "doc_XXXX",
      "document_title": "Your Document Title Here",
      "document_content": "This is a placeholder for the document content, which should include various sensitive information about projects, budgets, personnel, locations, and more.",
      "sensitive_entities": [
        {
          "entity": "Entity Name",
          "type": "entity_type",
          "sensitivity": "sensitivity_level"
        },
        {
          "entity": "Another Entity",
          "type": "another_entity_type",
          "sensitivity": "another_sensitivity_level",
          "associated_person": "Associated Person Name"
        }
      ]
    }

    P.S. Make sure the text of your response text start with "```json". Also, make sure you do not use any characters or symbol (like un-escaped backslash) that leads to en error.
    '''

In [None]:
# Seed conversation: a single user turn carrying the instruction prompt.
history = [dict(role="user", parts=contents)]

In [None]:
from IPython.display import display
from IPython.display import Markdown

# Model handle built from the `model` name chosen in the setup cell.
gemini = genai.GenerativeModel(model_name=model)

# Upper bound of the generation loop (one request per document).
num_documents = 1000

# Titles of the documents generated so far.
document_titles = []

def ordinal(n):
    """Return `n` with its English ordinal suffix, e.g. 1st, 2nd, 3rd, 11th."""
    # Numbers whose last two digits fall in 10..13 always take "th".
    in_teens = 10 <= n % 100 <= 13
    suffix = "th" if in_teens else {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"

# The instruction prompt as it stands before any "P.S.2" reminder is added.
# Every iteration rebuilds the first turn from this copy; appending to the
# already-extended text would repeat all earlier reminders (and title lists)
# and make the prompt grow on every pass.
base_prompt = history[0]["parts"]

for cardinal_num in range(1, num_documents + 1):
    # TODO: a new chat session is started for every document (flagged as wrong
    # by the original author); consider reusing a single session.
    if document_titles:
        reminder = f"\n\nP.S.2 Remember: make sure to avoid generating projects with document titles similar to the documents that already have been generated, which are : {document_titles}."
        history[0]["parts"] = base_prompt + reminder

    # NOTE(review): model turns in `history` store the parsed dict as `parts`;
    # confirm the SDK converts that into the intended chat content.
    chat_session = gemini.start_chat(history=history)

    ordinal_num = ordinal(cardinal_num)

    input_prompt = f"Generate the {ordinal_num} document."
    print(f"Generating the {ordinal_num} document...")

    history.append({"role": "user", "parts": input_prompt})

    response = chat_session.send_message(input_prompt)

    try:
        # Drop the surrounding code fence, then the "json" language tag as a
        # prefix (str.strip("json") would strip a character set, not a word).
        payload = response.text.strip().strip("`")
        if payload.startswith("json"):
            payload = payload[len("json"):]
        response_text_json = json.loads(payload)
        # Validate before recording anything, so a response without a title
        # (or one that is not a JSON object) cannot leave `history` half-updated.
        title = response_text_json["document_title"]
    except (json.JSONDecodeError, KeyError, TypeError):
        print(f"Error parsing JSON for document {ordinal_num}. Skipping to the next document.")
        # Remove the unanswered user prompt so the history stays paired.
        if history and history[-1]["parts"] == input_prompt:
            history.pop()
    else:
        history.append({"role": "model", "parts": response_text_json})
        document_titles.append(title)

print(document_titles)

print("\nDone.")

In [None]:
# Number of model turns recorded in the history.
sum(1 for item in history if item.get('role') == 'model')

100

In [None]:
# Collect the `parts` of every model turn, in order.
responses = []
for turn in history:
    if turn.get('role') == 'model':
        responses.append(turn['parts'])
len(responses)

100

In [None]:
!pwd

/content


In [None]:
# Mount Google Drive at /content/drive so files can be written under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def format_three_digit(number):
    """Return `number` as text, zero-padded on the left to at least 3 characters."""
    text = f"{number!s}"
    return text.zfill(3)

# Write every generated document to its own JSON file on Google Drive.
output_dir = pathlib.Path("/content/drive/MyDrive/CS789_project/json_data")
# Create the target folder on first use instead of failing on the first write.
output_dir.mkdir(parents=True, exist_ok=True)

for i, item in enumerate(responses, start=1):
    print(f"Processing document {format_three_digit(i)}...")
    # Titles are model-generated; a "/" in one would be read as a directory
    # separator and make `open` fail, so replace it in the file name only.
    safe_title = str(item['document_title']).replace("/", "_")
    filename = f"{output_dir}/doc_{format_three_digit(i)}__{safe_title}.json"  # Create a unique filename for each object
    with open(filename, "w") as file:
        json.dump(item, file, indent=4)

Processing document 001...
Processing document 002...
Processing document 003...
Processing document 004...
Processing document 005...
Processing document 006...
Processing document 007...
Processing document 008...
Processing document 009...
Processing document 010...
Processing document 011...
Processing document 012...
Processing document 013...
Processing document 014...
Processing document 015...
Processing document 016...
Processing document 017...
Processing document 018...
Processing document 019...
Processing document 020...
Processing document 021...
Processing document 022...
Processing document 023...
Processing document 024...
Processing document 025...
Processing document 026...
Processing document 027...
Processing document 028...
Processing document 029...
Processing document 030...
Processing document 031...
Processing document 032...
Processing document 033...
Processing document 034...
Processing document 035...
Processing document 036...
Processing document 037...
P

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://ai.google.dev/gemini-api/docs"><img src="https://ai.google.dev/static/site-assets/images/docs/notebook-site-button.png" height="32" width="32" />Docs on ai.google.dev</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/google-gemini/cookbook/blob/main/quickstarts"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />More notebooks in the Cookbook</a>
  </td>
</table>