In [None]:
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import openai
from openai import AzureOpenAI
import requests
import time
import os
import json
import requests
import subprocess
from openai import OpenAI
import random
from multiprocessing import Lock
from typing import List, Tuple, Dict, Any, Optional

In [None]:
# Directory that holds the cached access token file.
KEYS_DIR = 'keys'
# Credentials for LLM calls. Export CLIENT_ID / CLIENT_SECRET in the environment
# so real secrets are never saved inside the notebook; when the variables are
# not set, the values fall back to the previous empty placeholders.
CLIENT_ID = os.environ.get('CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET', '')
# exist_ok=True removes the check-then-create race of a separate isdir() test.
os.makedirs(KEYS_DIR, exist_ok=True)

In [3]:
def get_openai_token(p_token_url, p_client_id, p_client_secret, p_scope):
    """
    Get an access token using the client-credentials grant.

    The token is cached in ``KEYS_DIR/openai_key.json`` and reused for
    15 minutes (900 seconds) to avoid frequent token requests.

    Args:
        p_token_url: URL the token request is POSTed to.
        p_client_id: Value sent as ``client_id``.
        p_client_secret: Value sent as ``client_secret``.
        p_scope: Value sent as ``scope``.

    Returns:
        The ``access_token`` string, taken from the cache when it has not
        expired yet, otherwise from a fresh token response.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status
            (raised by ``raise_for_status``).
    """
    cache_path = os.path.join(KEYS_DIR, 'openai_key.json')
    try:
        with open(cache_path) as f:
            key = json.load(f)
        if time.time() < key['expire_at']:
            return key["access_token"]
    except (OSError, ValueError, KeyError, TypeError):
        # Missing, unreadable or malformed cache file: fall through and fetch a
        # fresh token. Anything else (e.g. KeyboardInterrupt) is not swallowed.
        pass
    # NOTE(review): the cache is not keyed by client id or scope, so a cached
    # token is returned even if the credentials passed in have changed.
    response = requests.post(
        p_token_url,
        data={"grant_type": "client_credentials", "client_id": p_client_id,
              "client_secret": p_client_secret, "scope": p_scope},
        # Without a timeout a stalled token endpoint would hang the notebook.
        timeout=30,
    )
    response.raise_for_status()
    access_token = response.json()["access_token"]
    # Make sure the cache directory exists even if the setup cell was skipped.
    os.makedirs(KEYS_DIR, exist_ok=True)
    with open(cache_path, 'w') as f:
        json.dump({
            "access_token": access_token,
            'expire_at': time.time() + 900
        }, f, indent=2)
    return access_token

In [4]:
def get_openai_client(model):
    """
    Build an AzureOpenAI client for inference.

    Fetches an access token with the notebook-level CLIENT_ID / CLIENT_SECRET,
    mirrors the connection settings onto attributes of the ``openai`` module,
    and returns a client created with the same token, API version and endpoint.

    Args:
        model: Not used inside this function.

    Returns:
        An ``AzureOpenAI`` client instance.
    """
    endpoint = "https://prod.api.nvidia.com/llm/v1/azure/"
    api_version = "2025-04-01-preview"

    access_token = get_openai_token(
        "https://prod.api.nvidia.com/oauth/api/v1/ssa/default/token",
        CLIENT_ID,
        CLIENT_SECRET,
        "azureopenai-readwrite",
    )

    # Module-level settings are assigned only after the token was obtained.
    openai.api_type = "azure"
    openai.api_base = endpoint
    openai.api_version = api_version
    openai.api_key = access_token

    return AzureOpenAI(
        api_key=access_token,
        api_version=api_version,
        azure_endpoint=endpoint,
    )

In [5]:
def get_llm_response(model,messages,temperature=1.0,return_raw_response=False,tools=None,max_length=1024):
    """
    Send a chat completion request to a supported model and return its reply.

    Args:
        model: Model name; must be one of the supported names listed in the body.
        messages: A list of chat messages, or a plain string that is wrapped
            as a single user message.
        temperature: Forwarded as ``temperature``.
        return_raw_response: When true, return the whole completion object
            instead of the text of the first choice.
        tools: Forwarded as ``tools``.
        max_length: Forwarded as ``max_completion_tokens``.

    Returns:
        The first choice's message content, or the raw completion object when
        ``return_raw_response`` is true.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported_models = (
        'o3', 'o3-mini', 'gpt-4o', 'o3-high',
        'gpt-5', 'gpt-5-mini', 'gpt-4.1', 'gpt-4o-mini',
    )
    if model not in supported_models:
        raise ValueError(f"Model {model} is not supported yet")

    chat_messages = (
        [{'role': 'user', 'content': messages}]
        if isinstance(messages, str)
        else messages
    )
    completion = get_openai_client(model=model).chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=temperature,
        tools=tools,
        max_completion_tokens=max_length,
    )
    return completion if return_raw_response else completion.choices[0].message.content

### Generate data schema

In [6]:
# Send the stored table prompt to the model and display the raw reply.
with open('prompts/table_prompt.txt') as prompt_file:
    table_prompt = prompt_file.read()

response = get_llm_response(
    model="gpt-5",
    messages=table_prompt,
    max_length=40000,
)
response

'["customers", "customer_profiles", "customer_addresses", "customer_contacts", "customer_documents", "kyc_checks", "aml_screenings", "sanctions_screenings", "pep_screenings", "customer_risk_ratings", "customer_relationships", "customer_preferences", "customer_segments", "accounts", "account_balances", "account_transactions", "account_statements", "account_fees", "account_interest_accruals", "account_limits", "account_flags", "account_closures", "account_holds", "cards", "card_accounts_links", "card_transactions", "card_disputes", "card_chargebacks", "card_fraud_alerts", "card_limits", "card_issuance", "card_tokenization", "loans", "loan_applications", "loan_disbursements", "loan_repayments", "loan_schedules", "loan_collateral", "loan_delinquencies", "loan_modifications", "loan_interest_accruals", "payments", "payment_instructions", "payment_beneficiaries", "wires", "ach_transfers", "bill_payments", "standing_orders", "direct_debits", "internal_transfers", "branches", "atms", "merchants

In [7]:
# Select fields to generate data: send the schema prompt to the model and
# save the reply, unmodified, as schema.txt.

with open('prompts/schema_prompt.txt') as prompt_file:
    schema_prompt = prompt_file.read()

response = get_llm_response(
    model="gpt-5",
    messages=schema_prompt,
    max_length=40000,
)

with open("schema.txt", "w") as schema_file:
    schema_file.write(response)

In [8]:
# Convert schema to data models for easy checking: the model's reply is
# written, unmodified, to data_model.py.

with open('prompts/data_model_prompt.txt') as prompt_file:
    data_model_prompt = prompt_file.read()

response = get_llm_response(
    model="gpt-5",
    messages=data_model_prompt,
    max_length=40000,
)

with open('data_model.py', 'w') as model_file:
    model_file.write(response)

In [9]:
# Check whether the data model is aligned with the schema.
# Both generated files are read back and embedded in a yes/no question.
with open('schema.txt') as schema_file:
    schema = schema_file.read()
with open('data_model.py') as model_file:
    data_model = model_file.read()

prompt = f'''
Schema:
{schema}

Data model:
{data_model}

Start your answer with yes if:
1. The fields and types in data model is aligned with the definitions in schema.
2. The fields and types are consistent in the data model, e.g., same type for the same variable across different places.

Start your answer with no otherwise.'''

response = get_llm_response(
    model="o3",
    messages=prompt,
    max_length=40000,
)
response

'no – several key mismatches exist\n\nMain mis-alignments between the JSON schema and the pydantic data model\n\n1. Client\n   • Schema splits residence address out as residence.* whereas the model embeds address inside contact.  \n   • Schema identity field is dob; model uses date_of_birth.  \n   • Risk-profile block (pep, kyc_level, sanctions_screened) is absent from the model.  \n   • Model uses authorized_beneficiaries; schema has beneficiary_refs.\n\n2. Account\n   • Schema allows kinds “brokerage, credit_card”; model restricts AccountKind to "checking | savings".  \n   • Schema field names routing and number; model uses routing_no and tail_digits.  \n   • Balance object (available / ledger) and card_refs are missing in the model.  \n   • AccountStatus enum differs: schema "active" while model "open".  \n   • opened_on exists only in model.\n\n3. Card\n   • Schema fields brand, status, limits missing in model; model instead has issuer, network.  \n   • Network enum limited to visa

### Generate database content

In [10]:
# To diversify the generated data, we encourage LLMs to focus on different perspectives.
# Here the perspective is the country of the bank, so we first ask for a list of countries.
subject_prompt = '''List 100 major countries.
Use the following format to output:
```
["country1", "country2", ...]
```'''

response = get_llm_response(
    model="gpt-5",
    messages=subject_prompt,
    max_length=40000,
)
response

'[\n  "United States",\n  "Canada",\n  "Mexico",\n  "Brazil",\n  "Argentina",\n  "Chile",\n  "Colombia",\n  "Peru",\n  "Venezuela",\n  "Ecuador",\n  "Bolivia",\n  "Paraguay",\n  "Uruguay",\n  "Costa Rica",\n  "Panama",\n  "Guatemala",\n  "Honduras",\n  "El Salvador",\n  "Nicaragua",\n  "Cuba",\n  "Dominican Republic",\n  "Haiti",\n  "Jamaica",\n  "Trinidad and Tobago",\n  "Bahamas",\n  "United Kingdom",\n  "Germany",\n  "France",\n  "Italy",\n  "Spain",\n  "Portugal",\n  "Netherlands",\n  "Belgium",\n  "Switzerland",\n  "Austria",\n  "Poland",\n  "Czechia",\n  "Slovakia",\n  "Hungary",\n  "Romania",\n  "Bulgaria",\n  "Greece",\n  "Sweden",\n  "Norway",\n  "Denmark",\n  "Finland",\n  "Ireland",\n  "Ukraine",\n  "Russia",\n  "Belarus",\n  "China",\n  "India",\n  "Indonesia",\n  "Pakistan",\n  "Bangladesh",\n  "Japan",\n  "South Korea",\n  "North Korea",\n  "Vietnam",\n  "Thailand",\n  "Myanmar",\n  "Malaysia",\n  "Singapore",\n  "Philippines",\n  "Sri Lanka",\n  "Nepal",\n  "Afghanistan"

In [12]:
# Generate database content based on data model and schema.
# We generate 3 records each time to enhance the quality.
# To generate multiple database entries, we generate multiple times.

with open('prompts/db_entry_prompt.txt') as f:
    db_entry_prompt = f.read()

response = get_llm_response(model="gpt-5",messages=db_entry_prompt,max_length=40000)

# The JSON is expected inside a ``` fence. Use the text after the first fence
# when one is present, otherwise fall back to the whole reply, so an unfenced
# reply no longer fails with an IndexError.
fence_parts = response.split("```")
response = (fence_parts[1] if len(fence_parts) > 1 else response).strip()
# A ```json opening fence leaves a "json" language tag in front of the payload;
# no valid JSON document starts with those letters, so removing it is safe.
if response[:4].lower() == "json":
    response = response[4:]

# Parse before touching the output file, so a malformed reply does not leave
# an empty databases/0.json behind.
db_entries = json.loads(response)

os.makedirs('databases', exist_ok=True)
with open("databases/0.json","w") as f:
    json.dump(db_entries,f,indent=2)

In [13]:
# Check whether the database content is natural, reasonable and consistent.
with open("databases/0.json") as f:
    database_content = json.load(f)

# Embed the records as JSON text. Interpolating the parsed object directly puts
# its Python repr (single quotes, None/True/False) into the prompt instead of
# the JSON stored in the file.
database_json = json.dumps(database_content, indent=2)
db_check_prompt1 = f'''{database_json}

Please check whether the content is natural and reasonable.
Please also check whether content is consistent, e.g., client id is the same across different places if it refers to the same client.
Start your answer with yes if all requirements are satisfied, and start your answer with no otherwise.'''
response = get_llm_response(model="gpt-4o",messages=db_check_prompt1,max_length=2000)
response

"Yes, after reviewing the content, it appears that the content is natural, reasonable, and consistent.  \n\nHere are the key checks performed:\n\n1. **Consistency of Client References**:\n   - Each client ID (e.g., `CL-id001-01`, `CL-id001-02`, `CL-id001-03`) is consistently used across the relevant sections like `clients`, `transactions`, `loans`, and `beneficiaries`. No mismatches were found.\n\n2. **Logical Connections**:\n   - Each transaction (`TX-id002-01`, `TX-id002-02`, `TX-id002-03`) properly references an account under the correct client.\n   - Loans (`LN-id003-01`, `LN-id003-02`, `LN-id003-03`) are linked to their respective clients and accounts, and repayment schedules are accounted for appropriately.\n   - Beneficiaries (`BF-id004-01`, `BF-id004-02`, `BF-id004-03`) belong to their respective clients, and the details of their accounts were cross-verified.\n\n3. **Reasonableness of Data**:\n   - Dates are logical and align with the timeline of events. For instance, disbursem

In [14]:
# Check whether the database content is aligned with the schema and data model.
# All three artifacts are read back as plain text and embedded in one yes/no question.
with open('schema.txt') as schema_file:
    schema = schema_file.read()
with open('data_model.py') as model_file:
    data_model = model_file.read()
with open("databases/0.json") as db_file:
    database_content = db_file.read()

db_check_prompt2 = f'''
Schema:
{schema}

Data model:
{data_model}

Database content:
{database_content}

Start your answer with yes if the database content is aligned with the fields and type definitions in the schema and data model.
Start your answer with no otherwise.'''

response = get_llm_response(
    model="o3-mini",
    messages=db_check_prompt2,
    max_length=20000,
)
response

'No. One clear misalignment is in the loans data. For example, the loan identified as LN-id003-01 has its "collateral" field set to null, but the corresponding Loan model requires a Collateral (i.e. it isn’t defined as optional). This deviation means that the provided database content would not validate successfully against the data model as defined.'

### Generate tools

In [None]:
# To generate more tools, we sample LLMs multiple times and aggregate the results.
# This cell performs one such call and writes the reply, unmodified, to tools.py.

with open('prompts/tool_prompt.txt') as prompt_file:
    tool_prompt = prompt_file.read()

response = get_llm_response(
    model='gpt-5',
    messages=tool_prompt,
    max_length=40000,
)

with open('tools.py', 'w') as tools_file:
    tools_file.write(response)

### Generate tasks

In [None]:
# To diversify generated tasks, we first generate intents (meta task).
# We tune the prompt to control the difficulty of the tasks.
# The prompt has no placeholders, so it is a plain string rather than an f-string;
# the misspelled instructions ("Porpose", "mutiple") sent to the model are corrected.
subject_prompt = '''Propose 100 realistic purposes in bank commonly seen in daily life.
I prefer complicated purposes that require multiple steps to solve.
Each purpose should have only a few words.
Use the following format to output:
[
    "purpose 1",
    "purpose 2",
    ...
]
'''

response = get_llm_response(model='gpt-5',messages=subject_prompt,max_length=40000)
response

'[\n    "Mortgage refinancing",\n    "Renovation mortgage preapproval",\n    "Small business loan",\n    "Open LLC account",\n    "Setup payroll ACH",\n    "International wire transfer",\n    "Large currency exchange",\n    "Open joint trust",\n    "Estate distribution",\n    "Power of attorney",\n    "Escrow disbursement",\n    "Apply for HELOC",\n    "Student loan consolidation",\n    "Fraud dispute resolution",\n    "Credit limit increase",\n    "Reconcile account discrepancy",\n    "Setup merchant services",\n    "SBA loan application",\n    "Open custodial account",\n    "Transfer account ownership",\n    "Close dormant account",\n    "Bankers draft request",\n    "Issue certified check",\n    "Stop payment order",\n    "Conditional recurring transfer",\n    "Link external accounts",\n    "Resolve returned deposit",\n    "Replace debit card",\n    "Update beneficiary",\n    "Wire authorization setup",\n    "Open high-yield savings",\n    "Real estate escrow",\n    "Business credit

In [None]:
# Generate tasks based on selected purpose, tool set, database content.
# We can control the difficulty of the generated tasks by controlling the size of the selected tool set.
with open('prompts/task_prompt.txt') as prompt_file:
    task_prompt = prompt_file.read()

response = get_llm_response(
    model='gpt-5',
    messages=task_prompt,
    max_length=20000,
)

# Keep the text after the last '<start>' and before the first '<end>' that
# follows it; a missing marker leaves that side of the reply untouched.
task = response.rpartition('<start>')[2].partition('<end>')[0]
task = json.loads(task)

with open('task.json', 'w') as task_file:
    json.dump(task, task_file, indent=2)

### Evolve task

In [21]:
# We evolve tasks by optionally adding more constraints, requiring more steps or tools, linking more databases, etc.

with open('prompts/evolve_prompt.txt') as prompt_file:
    evolve_prompt = prompt_file.read()

response = get_llm_response(
    model='gpt-5',
    messages=evolve_prompt,
    max_length=20000,
)

# Keep the text after the last '<start>' and before the first '<end>' that
# follows it; a missing marker leaves that side of the reply untouched.
task = response.rpartition('<start>')[2].partition('<end>')[0]
task = json.loads(task)

with open('task1.json', 'w') as task_file:
    json.dump(task, task_file, indent=2)