In [1]:
# Enable autoreload so edited local modules (e.g. `const`) are re-imported before each cell runs
%reload_ext autoreload
%autoreload 2


In [2]:
from const import gl_codes, departments
import pandas as pd

# Build a DataFrame from the GL code records imported from `const`; its
# `name` and `description` columns are read later when a code is sampled.
codes = pd.DataFrame(gl_codes)
# Bare last expression: renders the frame as this cell's output.
codes


Unnamed: 0,name,description
0,7100-INSURANCE,"All business insurance including cyber, liabil..."
1,7300-CONSULTING,External consulting services including technic...
2,7400-TRAINING,Professional development including certificati...
3,8100-DATA-SERVICES,"Data services including subscriptions, API acc..."
4,8200-AI-ML,Artificial intelligence and machine learning s...
5,8300-CUSTOMER-PLATFORM,Customer-facing platforms including support sy...
6,9100-TELECOM,"Telecommunications including internet service,..."
7,9200-FACILITY-OPS,"Facility operations including maintenance, cle..."
8,1600-FINANCIAL-SERVICES,Financial service fees including banking charg...
9,1650-MEMBERSHIPS,Professional memberships and industry associat...


In [3]:
import instructor
import openai
from pydantic import BaseModel
from faker import Faker
import random
from asyncio import Semaphore
import json

# Async OpenAI client wrapped by instructor; the wrapped client is the one
# that receives the `response_model` / `context` arguments further down.
client = instructor.from_openai(openai.AsyncOpenAI())
# Faker instance. NOTE(review): not referenced in the cells visible here.
fake = Faker()

# Structured-output schema handed to the LLM call as `response_model`; the
# generated transactions are later serialised with `model_dump_json()`.
# NOTE(review): documented with `#` comments rather than a docstring on
# purpose - pydantic presumably surfaces class docstrings in the JSON schema,
# which would change what is sent to the model. Confirm before adding one.
class Transaction(BaseModel):
    transaction_name: str  # merchant name, per the system prompt
    department: str  # department that made the purchase
    category: str  # presumably the GL code the transaction falls under - TODO confirm
    amount: float  # charge amount
    location: str  # where the transaction originates

# Load previously generated transactions (one JSON document per line) to use
# as few-shot examples. The context manager closes the file handle, and blank
# lines are dropped so they cannot break JSON parsing downstream. Each entry
# is still the raw line, trailing newline included.
with open("./generated.jsonl") as f:
    prev_transactions = [line for line in f if line.strip()]

# Candidate origins for generated transactions, hoisted so the list is built once.
_LOCATIONS = ["New York", "San Francisco", "Chicago", "Los Angeles","Austin","Seattle","Miami","Boston","Atlanta","Dallas","Houston","Phoenix","San Diego","San Jose","London","Calgary","Alberta","Vancouver","Toronto","Montreal","Cape Town","Sydney","Melbourne","Tokyo","Shanghai","Beijing","Hong Kong","Singapore"]

# Upper bound on how many previous transactions are shown to the model as examples.
_MAX_EXAMPLES = 3


async def generate_synthetic_query(code,description,sem:Semaphore):
    """Ask the LLM for one synthetic transaction for the given GL code.

    A random amount (1-1000, two decimals), location and department are drawn
    locally and injected into the system prompt; up to three previously
    generated transactions are included in the user message as examples.

    Args:
        code: GL code name the transaction should be classified under.
        description: Description of that GL code.
        sem: Semaphore held for the whole request, limiting concurrent calls.

    Returns:
        The result of `client.chat.completions.create(...)` called with
        `response_model=Transaction`.
    """
    async with sem:
        amount = round(random.uniform(1, 1000), 2)
        location = random.choice(_LOCATIONS)
        department = random.choice(departments)
        # Sample raw lines first and parse only those, instead of decoding the
        # whole history on every call. Capping the sample size avoids the
        # ValueError random.sample raises when fewer lines than requested exist.
        sample_size = min(_MAX_EXAMPLES, len(prev_transactions))
        samples = [
            json.loads(line)
            for line in random.sample(prev_transactions, sample_size)
        ]
        return await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                # The {{ ... }} placeholders are template variables whose names
                # match the keys of `context` below; this is not an f-string.
                {"role": "system", "content": "Generate a fake business transaction that will be classified under {{ code }} ( Code description: {{ description }}). It should originate from {{ location }}, involve a charge of {{amount}} and have been made by the {{ department }} department. The name of the transaction should be the merchant itself (Eg. Amazon, Beirut bakery etc). Please use a descriptive and unique name - do not call it literally what it is"},
                {
                    "role": "user",
                    # Plain f-string: `samples` is interpolated as the repr of a list of dicts.
                    "content": f"Here are some examples of previous transactions: {samples}. Make sure your transaction is similar to these examples if relevant but not exactly the same. Company names should be unique and not be the same as the examples."
                }
            ],
            context={
                "code": code,
                "location": location,
                "amount": amount,
                "department": department,
                "description": description,
            },
            response_model=Transaction,
        )

In [4]:
from tqdm.asyncio import tqdm_asyncio as asyncio

async def generate_transactions(n_transactions: int = 20, max_concurrency: int = 10):
    """Generate synthetic transactions concurrently, one per randomly chosen GL code.

    Args:
        n_transactions: How many transactions to request (defaults to 20).
        max_concurrency: Size of the semaphore shared by all requests, i.e.
            the maximum number of requests in flight at once (defaults to 10).

    Returns:
        The gathered results of `generate_synthetic_query`, one per request.
    """
    sem = Semaphore(max_concurrency)
    tasks = []

    for _ in range(n_transactions):
        # Pick a GL code row uniformly at random.
        code_row = codes.iloc[random.randrange(len(codes))]
        # Calling the coroutine function only creates the coroutine; nothing
        # is awaited until the gather below.
        tasks.append(
            generate_synthetic_query(code_row['name'], code_row['description'], sem)
        )

    # `asyncio` in this cell is tqdm's `tqdm_asyncio` (see the import above),
    # so this gather also renders a progress bar while awaiting the tasks.
    transactions = await asyncio.gather(*tasks)
    return transactions

# Run the async function
transactions = await generate_transactions()
with open('transactions.jsonl', 'w') as f:
    for transaction in transactions:
        f.write(transaction.model_dump_json() + '\n')

# TODO: convert `transactions` to a DataFrame (no code follows this comment yet)

100%|██████████| 20/20 [00:02<00:00,  7.06it/s]
