## Generating Synthetic Dataset

In [None]:
%%capture

!pip install crewai
!pip install datasets

In [None]:
from datasets import load_dataset

# Fetch the dataset from the Hugging Face Hub, then keep the "train" split
# ordered by its "index" column so positional access is stable between runs.
data = load_dataset("LightFury9/gretelai_synthetic_pii_finance_english")
train_data = data["train"].sort("index")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Shared verbosity switch passed to the agents and the crew defined below.
verb: bool = True

In [None]:
import os

# Read the Gemini key from the environment so the secret never has to be pasted
# into (and saved with) the notebook. The original '..' placeholder is kept as a
# last-resort fallback so the variable is always a string.
google_api_key = (
    os.environ.get("GEMINI_API_KEY")
    or os.environ.get("GOOGLE_API_KEY")
    or '..'
)

In [None]:
from crewai import LLM

# Single LLM handle (Gemini 1.5 Flash, authenticated with google_api_key)
# that is passed as `llm=` to each agent in this notebook.
llm = LLM(model="gemini/gemini-1.5-flash", api_key=google_api_key)

In [None]:
from crewai import Agent, Task, Crew, Process
from langchain_openai import ChatOpenAI

In [None]:
# Agent that finds PII in the input text and structures it as JSON
# (separate arrays for individuals and organizations).
pid_extraction_agent = Agent(
    llm=llm,
    verbose=verb,
    allow_delegation=True,
    role="PID Identification Agent",
    goal=(
        "Extract and structure all personally identifiable information (PII) from the input text into a JSON format. "
        "The output should be a well-formatted JSON containing separate arrays for individuals and organizations."
    ),
    backstory=(
        "A data extraction specialist skilled in identifying and structuring PII data. You meticulously "
        "analyze text to find every mention of people and organizations, capturing all relevant details "
        "about each entity found."
    ),
)

# Task for the extraction agent: describe the expected JSON layout and hand over
# the document through the `{text}` placeholder (filled from the kickoff inputs).
# NOTE(review): the layout below contains no curly braces -- presumably left out so
# they are not confused with `{text}`-style placeholders; confirm this is intended.
pid_identification_task = Task(
    description=(
        "Analyze the input text and output a JSON structure with the following format:\n"
        "\n"
        "  'individuals': [\n"
        "    \n"
        "      'id': '1',\n"
        "      'name': 'full_name',\n"
        "      'title': 'job_title_if_mentioned',\n"
        "      'email': 'email_if_found',\n"
        "      'phone': 'phone_if_found',\n"
        "      'organization': 'affiliated_organization_if_mentioned',\n"
        "      'context': 'brief_description_of_role_in_document'\n"
        "    \n"
        "  ],\n"
        "  'organizations': [\n"
        "    \n"
        "      'id': '1',\n"
        "      'name': 'company_name',\n"
        "      'type': 'company_type_if_mentioned',\n"
        "      'context': 'brief_description_of_organization_role'\n"
        "    \n"
        "  ]\n"
        # A plain blank line separates the template from the document. The previous
        # "\\n\n" emitted a literal backslash-n into the prompt.
        "\n"
        "Input text: {text}"
    ),
    agent=pid_extraction_agent,
    expected_output="A JSON object containing structured arrays of individuals and organizations found in the text."
)

In [None]:
# Agent that invents synthetic stand-ins for the people and organizations
# listed in the extraction JSON.
persona_generation_agent = Agent(
    llm=llm,
    verbose=verb,
    allow_delegation=True,
    role="Persona Generation Agent",
    goal=(
        "Generate synthetic personas to replace each individual and organization identified in the input JSON, "
        "maintaining industry context and relationship dynamics while ensuring complete anonymization."
    ),
    backstory=(
        "An expert in creating realistic synthetic identities who understands industry-specific naming "
        "conventions, organizational structures, and professional relationships. You ensure generated "
        "personas maintain the same relationship dynamics and contextual relevance as the original entities."
    ),
)

# Task for the persona agent. It receives the identification task's output as
# context and must return synthetic data in the same JSON layout.
persona_generation_task = Task(
    agent=persona_generation_agent,
    context=[pid_identification_task],
    description=(
        "Using the JSON structure from the PII Identification task, generate a matching JSON structure with "
        "synthetic personas. For each entity:\n"
        "1. Maintain the same JSON structure but with new synthetic data\n"
        "2. Ensure professional titles match industry norms\n"
        "3. Generate realistic emails following common business email patterns\n"
        "4. Create plausible phone numbers with appropriate area codes\n"
        "5. Maintain organizational hierarchies and relationships\n"
        "6. Ensure company names are realistic for the industry\n\n"
        "Output the synthetic personas in the same JSON format as the input."
    ),
    expected_output="A JSON object containing synthetic personas matching the structure of the identified PIDs.",
)

In [None]:
# Agent that rewrites the original text, swapping each identified entity for
# its synthetic counterpart.
replacement_agent = Agent(
    llm=llm,
    verbose=verb,
    allow_delegation=True,
    role="Replacement Agent",
    goal=(
        "Systematically replace all identified PIDs in the original text with their synthetic counterparts, "
        "ensuring natural language flow and maintaining all contextual relationships."
    ),
    backstory=(
        "A language specialist skilled in seamless text transformation. You excel at replacing sensitive "
        "information while preserving document tone, style, and readability. You understand the importance "
        "of maintaining consistency across all references to the same entity."
    ),
)

# Task for the replacement agent. Context is the outputs of the identification
# and persona tasks; the source document arrives via the `{text}` placeholder.
replacement_task = Task(
    agent=replacement_agent,
    context=[pid_identification_task, persona_generation_task],
    description=(
        "Using the original text and the synthetic personas JSON:\n"
        "1. Create a mapping of original entities to their synthetic replacements\n"
        "2. Replace all instances of each entity, including partial name references\n"
        "3. Update any related information (emails, phone numbers, etc.)\n"
        "4. Maintain consistency for multiple references to the same entity\n"
        "5. Preserve formatting, punctuation, and document structure\n\n"
        "Original text: {text}"
    ),
    expected_output="The original text with all PIIs replaced by synthetic equivalents.",
)

In [None]:
# Agent that reviews the rewritten document for accuracy, coherence and flow.
# Its goal embeds the original document through the `{text}` placeholder.
quality_assurance_agent = Agent(
    role="Quality Assurance Agent",
    goal=(
        "Evaluate the synthesized data for accuracy, coherence, and natural language flow, "
        # Trailing space added: adjacent literals are concatenated as-is, so the
        # goal previously read "...replaced text.Original text is ...".
        "ensuring the final document reads seamlessly with all replaced text. "
        "Original text is {text}"
    ),
    backstory=(
        "An experienced editor and quality analyst, capable of detecting inconsistencies, "
        "syntax issues, and context mismatches. Your expertise ensures the final contract "
        "appears professionally written, with no traces of synthetic manipulation."
    ),
    verbose=verb,
    allow_delegation=True,
    llm=llm
)

# Final task: the QA agent checks the replacement task's output against the
# original document (supplied via `{text}`) and returns the validated text.
quality_assurance_task = Task(
    agent=quality_assurance_agent,
    context=[replacement_task],
    description=(
        "Verify the accuracy, coherence, and readability of the modified contract, ensuring the replaced text "
        "flows naturally. Check for syntax, grammar, and context consistency, and flag any issues. Original text is {text}"
    ),
    expected_output="A validated, finalized contract that is ready for inclusion in the synthetic dataset.",
)

In [None]:
# Agent intended to coordinate the other four agents when the crew runs with
# the hierarchical process.
manager_agent = Agent(
    role="Manager Agent",
    goal=(
        "Oversee the entire process of PID identification, synthetic persona generation, replacement, "
        "and quality assurance. Ensure outputs meet the expected standards and tasks are delegated properly."
    ),
    backstory=(
        "An organizational expert skilled in coordinating workflows and ensuring tasks are completed "
        "sequentially and accurately. You have a sharp eye for detecting errors and redirecting work for refinement."
    ),
    # Use the shared flag like every other agent, so one switch controls logging.
    verbose=verb,
    allow_delegation=True,
    llm=llm
)

# # Manager Task
# manager_task = Task(
#     description=(
#         "Supervise the PID identification, persona generation, replacement, and quality assurance tasks. "
#         "Coordinate between agents, validate outputs, and ensure all steps are completed successfully. "
#         "Provide a consolidated final output."
#     ),
#     agent=manager_agent,
#     context=[pid_identification_task, persona_generation_task, replacement_task, quality_assurance_task],
#     expected_output="A finalized document with all PIIs replaced by synthetic personas, validated for accuracy and readability."
# )

In [None]:
# Assemble the four worker agents and their tasks into one crew that runs under
# the hierarchical process, coordinated by `manager_agent`.
pid_processing_crew = Crew(
    agents=[pid_extraction_agent, persona_generation_agent, replacement_agent, quality_assurance_agent],
    tasks=[pid_identification_task, persona_generation_task, replacement_task, quality_assurance_task],
    verbose=verb,
    process=Process.hierarchical,
    # `manager_agent` is an Agent, not a language model, so it must be passed as
    # `manager_agent`; `manager_llm` is for supplying an LLM instead.
    manager_agent=manager_agent
)



In [None]:
# First sample of the sorted train split. Row-first indexing reads just this
# record instead of pulling the whole 'generated_text' column to take one item.
input_text = train_data[0]['generated_text']

In [None]:
# Smoke test on a single document: the 'text' key fills the `{text}`
# placeholders used in the task and agent prompts.
input_data = {'text': input_text}

finalized_contract = pid_processing_crew.kickoff(inputs=input_data)
print("Finalized Contract with Synthetic PIDs:", finalized_contract)

[1m[95m# Agent:[00m [1m[92mManager Agent[00m
[95m## Task:[00m [92mAnalyze the input text and output a JSON structure with the following format:

  'individuals': [
    
      'id': '1',
      'name': 'full_name',
      'title': 'job_title_if_mentioned',
      'email': 'email_if_found',
      'phone': 'phone_if_found',
      'organization': 'affiliated_organization_if_mentioned',
      'context': 'brief_description_of_role_in_document'
    
  ],
  'organizations': [
    
      'id': '1',
      'name': 'company_name',
      'type': 'company_type_if_mentioned',
      'context': 'brief_description_of_organization_role'
    
  ]
\n
Input text: Subject: Mandatory Company-Wide Diversity and Inclusion Training

Hello Team,

I hope this email finds you well. I am writing to announce a mandatory company-wide Diversity and Inclusion training session. This initiative is an essential part of our ongoing efforts to foster a welcoming and inclusive workplace for all employees.

The training 

In [None]:
# Echo the sample text currently held in `input_text` as the cell output.
input_text

'Subject: Mandatory Company-Wide Diversity and Inclusion Training\n\nHello Team,\n\nI hope this email finds you well. I am writing to announce a mandatory company-wide Diversity and Inclusion training session. This initiative is an essential part of our ongoing efforts to foster a welcoming and inclusive workplace for all employees.\n\nThe training will take place on the 15th of next month from 9:00 AM to 5:00 PM. The session will be held in the conference room at our headquarters located at 456 Elm Street. A map and further instructions will be provided closer to the date.\n\nIn preparation for the training, please ensure your attendance by confirming your availability via email to diversity@company.com with the following details:\n\n- Full Name: Rodolfo Gori\n- Passport Number: 240900144\n- Street Address: 173 Clark Drive\n\nYour cooperation in this matter is greatly appreciated. The training is a crucial step in our commitment to creating a more diverse and inclusive work environmen

In [None]:
# Half-open range [start_index, end_index) of dataset rows for the batch loop.
start_index, end_index = 0, 100

In [None]:
import time  # used for the pause below; not imported anywhere else in the visible notebook

# Run the crew over rows [start_index, end_index) of the train split and write
# each result to "<row index>.txt" in the current working directory.
for i in range(start_index, end_index):
    # Row-first indexing reads only this record rather than the whole column.
    input_text = train_data[i]['generated_text']
    input_data = {'text': input_text}

    finalized_contract = pid_processing_crew.kickoff(inputs=input_data)
    synthetic_text = finalized_contract.raw

    # Save the processed synthetic text to a .txt file with the index as the file name.
    # Explicit UTF-8 so non-ASCII names do not depend on the platform's default encoding.
    with open(f"{i}.txt", "w", encoding="utf-8") as file:
        file.write(synthetic_text)

    print(f"Processed and saved Data Point {i + 1} in {i}.txt")

    # Fixed 60-second pause between documents
    # (presumably to stay within API rate limits -- confirm).
    time.sleep(60)