## Dataset Cleaning

In [None]:
%%capture

!pip install crewai
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the dataset by repo id; the result is indexed by split name below.
data = load_dataset('LightFury9/gretelai_synthetic_pii_finance_english')
# Work with the 'train' split only, ordered by its "index" column.
train_data = data['train'].sort("index")

In [None]:
# Values of `document_type` to exclude from the cleaned dataset. Despite the
# name, these are document types (they are compared against x['document_type']
# in the filter cell), not column names.
# NOTE(review): presumably excluded because they are machine-format documents
# rather than natural-language text — confirm.
useless_columns = [
    "MT940", "SWIFT Message", "XBRL", "BAI Format",
    "Currency Exchange Rate Sheet",
    "EDI", "FIX Protocol", "Financial Data Feed", "FpML", "CSV",
]

In [None]:
# Keep only the rows whose document_type is not in the exclusion list.
filtered_train_data = train_data.filter(
    lambda row: row['document_type'] not in useless_columns
)

In [None]:
# Convert to a pandas DataFrame so scikit-learn can split it in the next cell.
filtered_train_data_pd = filtered_train_data.to_pandas()

In [None]:
from sklearn.model_selection import train_test_split
# Split the filtered frame into two halves, stratified on document_type, with a
# fixed random_state so the split is repeatable.
# NOTE(review): only train_df is used in the cells visible here; test_df is not
# referenced again in this part of the notebook.
train_df, test_df = train_test_split(
    filtered_train_data_pd,
    test_size=0.5,
    stratify=filtered_train_data_pd['document_type'],
    random_state = 42,
)

In [None]:
# Number of rows per document_type in train_df, sorted ascending by count.
train_df.groupby('document_type').size().reset_index(name='count').sort_values('count')

Unnamed: 0,document_type,count
43,Securities Prospectus,107
10,Credit Card Statement,116
16,Employment Contract,118
5,Compliance Certificate,120
17,Financial Aid Application,121
19,Financial Forecast,121
11,Cryptocurrency Transaction Report,123
41,Renewal Reminder,123
38,Real Estate Loan Agreement,124
20,Financial Regulatory Compliance Report,124


In [None]:
train_df.shape

(9507, 16)

In [None]:
from datasets import Dataset

In [None]:
# Convert the sampled DataFrame back into a `datasets.Dataset`. This rebinds
# `data`, which previously held the result of load_dataset(...).
data = Dataset.from_pandas(train_df)

In [None]:
data

Dataset({
    features: ['level_0', 'index', 'document_type', 'document_description', 'expanded_type', 'expanded_description', 'language', 'language_description', 'domain', 'generated_text', 'pii_spans', 'conformance_score', 'quality_score', 'toxicity_score', 'bias_score', 'groundedness_score', '__index_level_0__'],
    num_rows: 9507
})

In [None]:
# Drop the '__index_level_0__' column that appears in the features list after
# the pandas -> Dataset conversion. The membership check keeps this cell
# idempotent: re-running it after the column is already gone is a no-op
# instead of an error.
if '__index_level_0__' in data.column_names:
    data = data.remove_columns(['__index_level_0__'])

In [None]:
# Upload `data` to the Hub as a split named 'cleaned' in the same dataset
# repository the raw data was loaded from.
# NOTE(review): `hf_write` is not defined in any cell visible here, so this
# raises NameError on a fresh kernel unless it is set elsewhere — confirm.
data.push_to_hub(
    'LightFury9/gretelai_synthetic_pii_finance_english',
    split = 'cleaned',
    token=hf_write
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/LightFury9/gretelai_synthetic_pii_finance_english/commit/2bc87d17f4027cdc6b59467ad5c6670c74c078f1', commit_message='Upload dataset', commit_description='', oid='2bc87d17f4027cdc6b59467ad5c6670c74c078f1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Verbosity flag; passed as `verbose=` to both agents and to the crew.
verb = True

In [None]:
import os

# Read the Gemini API key from the environment instead of writing it into the
# notebook. Falls back to an empty string (the previous placeholder value) when
# neither variable is set.
google_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY", "")

In [None]:
from crewai import LLM

# Single LLM handle shared by both agents defined below; configured with the
# Gemini 1.5 Flash model id and the key held in `google_api_key`.
llm = LLM(
    model="gemini/gemini-1.5-flash",
    api_key=google_api_key
)

In [None]:
from crewai import Agent, Task, Crew, Process

In [None]:
# First-pass agent: instructed to replace all PII and organization names in the
# input text with realistic synthetic values (never placeholders). Delegation is
# disabled and it uses the shared `llm`.
pii_processor = Agent(
    role="PII Processing Expert",
    goal=(
        "Identify and replace all personally identifiable information (PII) and organization names in the input text "
        "with realistic synthetic data while maintaining context and relationships. "
        "Generate actual synthetic values for ALL PII fields and EVERY organization name - never use placeholders or brackets. "
        "Every single piece of PII and organization name must be replaced with a realistic, properly formatted value. "
        "Pay special attention to company names, ensuring ALL business entities are replaced consistently."
    ),
    backstory=(
        "An expert in both PII and business entity detection and synthetic data generation. You excel at identifying "
        "sensitive information, company names, and creating realistic replacements that maintain document coherence. "
        "You are particularly skilled at recognizing and replacing organization names in various formats (full names, "
        "abbreviations, trade names, etc.). You always generate actual synthetic values - never placeholders. "
        "For every type of PII and organization name, you create properly formatted, realistic synthetic data "
        "following the correct patterns for that type of information."
    ),
    verbose=verb,
    allow_delegation=False,
    llm=llm
)

# Task executed by `pii_processor`. The description enumerates what to detect
# (organization identifiers, personal identifiers), how to replace it, and the
# consistency rules. `{original_text}` is a template placeholder; the
# processing loop supplies it through the `inputs` dict given to `kickoff`.
processing_task = Task(
    description=(
        "Process the input text by:\n"
        "1. Identifying and replacing ALL organization names and PII including:\n"
        "   ORGANIZATION IDENTIFIERS:\n"
        "   - Full company/enterprise names\n"
        "   - Business abbreviations and trade names\n"
        "   - Subsidiary company names\n"
        "   - Organization divisions or department names\n"
        "   - Brand names associated with companies\n"
        "   - Industry-specific company identifiers\n"
        "   - Company registration numbers\n"
        "   - Corporate email domains\n\n"
        "   PERSONAL IDENTIFIERS:\n"
        "   - Names (replace with realistic full names)\n"
        "   - Addresses (replace with actual street addresses)\n"
        "   - Email addresses (create proper email addresses)\n"
        "   - Phone numbers (generate proper phone numbers with area codes)\n"
        "   - Job titles (use actual job titles)\n"
        "   - Passport numbers (generate proper format passport numbers)\n"
        "   - Social Security Numbers (use proper SSN format)\n"
        "   - Account numbers (maintain proper format)\n"
        "   - Any other identifying information\n\n"
        "2. Replace each piece of information with ACTUAL synthetic data:\n"
        "   ORGANIZATION REPLACEMENTS:\n"
        "   - Generate realistic company names appropriate to the industry context\n"
        "   - Maintain consistency in company name usage throughout the document\n"
        "   - Create matching synthetic email domains for company emails\n"
        "   - Ensure subsidiary names align with parent company names\n"
        "   - Replace any company-specific product or service names\n\n"
        "   PII REPLACEMENTS:\n"
        "   - DO NOT use placeholders like [SYNTHETIC_X] or [REDACTED]\n"
        "   - Generate proper formatted values for every field\n"
        "   - For passport numbers, generate actual 9-character alphanumeric codes\n"
        "   - For SSNs, generate properly formatted 9-digit numbers\n"
        "   - For phone numbers, use real area codes and proper formatting\n"
        "   - For addresses, create complete, realistic street addresses\n\n"
        "3. Ensure consistency and formatting:\n"
        "   - Maintain proper capitalization and formatting\n"
        "   - Keep the same format as the original text\n"
        "   - Ensure all references to the same entity use the same synthetic value\n"
        "   - Maintain relationships between companies and their employees\n"
        "   - Preserve industry context and business relationships\n\n"
        "Input text: {original_text}"
    ),
    agent=pii_processor,
    expected_output="Text with all organization names and PII replaced by actual synthetic data, no placeholders"
)

# Second-pass agent: instructed to check that no original organization names,
# PII, or bracketed placeholders remain, and to replace anything it finds with
# proper synthetic values. Same LLM and settings as the first agent.
pii_verifier = Agent(
    role="PII and Organization Verification Agent",
    goal=(
        "Thoroughly verify that all organization names and PII have been removed and replaced with actual synthetic values. "
        "Pay special attention to company names in all forms (full names, abbreviations, subsidiaries). "
        "If any placeholders or improperly formatted synthetic data is found, replace them with proper "
        "synthetic values. Ensure no original company names or bracketed placeholders remain in the text."
    ),
    backstory=(
        "A meticulous data privacy specialist with expertise in both business entity and PII detection. "
        "You have a keen eye for spotting original company names, business identifiers, and personal information. "
        "You ensure all synthetic data follows proper formatting rules and appears realistic. "
        "You're particularly skilled at ensuring consistency in company name replacements across documents. "
        "You never allow placeholder text or original organization names to remain in the final output."
    ),
    verbose=verb,
    allow_delegation=False,
    llm=llm
)

# Task executed by `pii_verifier`. It is given the processing task as context
# (`context=[processing_task]`) and also embeds the original text through the
# `{original_text}` placeholder, so the prompt contains both the original and
# the processed version.
verification_task = Task(
    description=(
        "Verify and correct the processed text by:\n"
        "1. Scanning for any remaining original information:\n"
        "   - Company names in any form (full names, abbreviations, trade names)\n"
        "   - Subsidiary or department names\n"
        "   - Corporate email domains\n"
        "   - Personal identifiers and PII\n\n"
        "2. Checking for placeholders like [SYNTHETIC_X] or [REDACTED]\n\n"
        "3. Verifying all synthetic data is properly formatted:\n"
        "   - Company names should be realistic and industry-appropriate\n"
        "   - Email domains should match synthetic company names\n"
        "   - Passport numbers should be actual 9-character codes\n"
        "   - Phone numbers should have proper area codes and formatting\n"
        "   - Addresses should be complete and realistic\n"
        "   - All numeric identifiers should follow proper patterns\n\n"
        "4. Ensuring consistency:\n"
        "   - Same company should have same synthetic name throughout\n"
        "   - Employee emails should match their company's domain\n"
        "   - Subsidiary names should align with parent company names\n\n"
        "5. Replace any found original names, placeholders, or improper synthetic data with realistic values\n"
        "6. Ensure document flow and formatting is maintained\n\n"
        "Return ONLY the corrected text with all proper synthetic values.\n\n"
        "Original text: {original_text}\n"
    ),
    agent=pii_verifier,
    context=[processing_task],
    expected_output="Text with all organization names and PII replaced by proper synthetic data, no placeholders"
)

In [None]:
# Two-step pipeline: the processing task runs first, then the verification
# task, in the order listed (Process.sequential).
pii_crew = Crew(
    agents=[pii_processor, pii_verifier],
    tasks=[processing_task, verification_task],
    verbose=verb,
    process=Process.sequential
)

In [None]:
# Half-open window [start_index, end_index) of row positions to process:
# `count` rows starting at `start_index`.
start_index, count = 6, 1
end_index = count + start_index

In [None]:
import time

# Run the two-step PII crew over rows [start_index, end_index) of `train_data`
# and write each result to "<index>.txt" in the working directory.
# NOTE(review): this reads from `train_data` (the unfiltered, index-sorted
# split), not from the filtered/sampled subset built earlier — confirm intended.
for i in range(start_index, end_index):
    # Fetch the row once. The previous column-first form
    # (train_data['col'][i]) looked up a whole column three times per iteration.
    row = train_data[i]
    input_text = row['generated_text']
    input_data = {'original_text': input_text}

    finalized_contract = pii_crew.kickoff(inputs=input_data)
    synthetic_text = finalized_contract.raw

    file_name = f"{row['index']}.txt"
    # Explicit UTF-8 so non-ASCII characters in the generated text are written
    # the same way regardless of the platform's default encoding.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(synthetic_text)

    print(f"Processed and saved Data Point {file_name}")

    # Pause between rows (presumably to stay under an API rate limit — confirm).
    time.sleep(15)

In [None]:
# !rm -rf *.txt

In [None]:
# Sorted list of the distinct document_type values present in train_data.
unique_categories = sorted(set(train_data['document_type']))

import random

def save_examples_to_txt(dataset, doc_type, num_examples=5, seed=None):
    """Write a random sample of one document type's texts to ``<doc_type>.txt``.

    The file is created in the current working directory. Each example's
    ``generated_text`` is followed by a separator line of 50 ``=`` characters.

    Args:
        dataset: Object exposing ``filter(fn)``, ``len()`` and integer row
            indexing, whose rows are mappings with ``'document_type'`` and
            ``'generated_text'`` keys.
        doc_type: ``document_type`` value to select; also used as the output
            file name stem.
        num_examples: Maximum number of examples to write. Fewer are written
            when the document type has fewer rows.
        seed: Optional seed for the sampling. ``None`` (the default) keeps the
            previous behaviour of drawing from the module-level ``random``
            state; any other value makes the selection reproducible.

    Returns:
        None. Prints a one-line status message; when no rows match, nothing is
        written.
    """
    filtered_data = dataset.filter(lambda row: row['document_type'] == doc_type)

    total_examples = len(filtered_data)

    if total_examples == 0:
        print(f"No examples found for document type '{doc_type}'.")
        return

    examples_to_show = min(num_examples, total_examples)
    # A dedicated generator is used only when a seed is given, so unseeded
    # calls still consume the global random state exactly as before.
    rng = random if seed is None else random.Random(seed)
    selected_indices = rng.sample(range(total_examples), examples_to_show)

    file_name = f"{doc_type}.txt"

    # Explicit UTF-8 so non-ASCII characters in the texts are written the same
    # way regardless of the platform's default encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        for idx in selected_indices:
            f.write(filtered_data[idx]['generated_text'])
            f.write("\n" + "="*50 + "\n")

    print(f"Saved {examples_to_show} examples to '{file_name}'")

In [None]:
# unique_categories

In [None]:
# Write up to five sample texts per document type, for the categories from
# position 30 onward in the sorted category list.
for doc_type_name in unique_categories[30:]:
    save_examples_to_txt(train_data, doc_type=doc_type_name, num_examples=5)

Saved 5 examples to 'Health Insurance Claim Form.txt'
Saved 5 examples to 'ISDA Definition.txt'
Saved 5 examples to 'IT support ticket.txt'
Saved 5 examples to 'Insurance Claim Form.txt'
Saved 5 examples to 'Insurance Policy.txt'
Saved 5 examples to 'Investment Prospectus.txt'
Saved 5 examples to 'Loan Agreement.txt'
Saved 5 examples to 'Loan Application.txt'
Saved 5 examples to 'MT940.txt'
Saved 5 examples to 'Mortgage Amortization Schedule.txt'
Saved 5 examples to 'Mortgage Contract.txt'
Saved 5 examples to 'Payment Confirmation.txt'
Saved 5 examples to 'Pension Plan Agreement.txt'
Saved 5 examples to 'Policyholder's Report.txt'
Saved 5 examples to 'Privacy Policy.txt'
Saved 5 examples to 'Product Disclosure Statement.txt'
Saved 5 examples to 'Real Estate Loan Agreement.txt'
Saved 5 examples to 'Regulatory Compliance Guide.txt'
Saved 5 examples to 'Regulatory Filing.txt'
Saved 5 examples to 'Renewal Reminder.txt'
Saved 5 examples to 'SWIFT Message.txt'
Saved 5 examples to 'Safety Dat