In [25]:
from pydantic import BaseModel
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI
import pandas as pd

In [12]:
# Schema for one synthetic medical-billing record; it is handed to the data
# generator further down as its `output_schema`.
class MedicalBilling(BaseModel):
    # Who the bill belongs to.
    patient_id: int
    patient_name: str
    # Codes are kept as strings (the sample values, e.g. "J20.9", are not numeric).
    diagnosis_code: str
    procedure_code: str
    # Monetary amounts.
    total_charge: float
    insurance_claim_amount: float
    

# Sample Data
To guide the synthetic data generator, it helps to provide a few realistic examples of the records we want it to produce.

In [13]:
# Few-shot examples shown to the LLM, one billing record per entry.
# Each record is built from adjacent string literals so it stays a single line:
# a triple-quoted string wrapped across source lines would carry the line break
# and the cell's indentation into the middle of a field in the prompt.
examples = [
    {
        "example": (
            "Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: J20.9, "
            "Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"
        )
    },
    {
        "example": (
            "Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis Code: M54.5, "
            "Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"
        )
    },
    {
        "example": (
            "Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: E11.9, "
            "Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"
        )
    },
]

# Create a Prompt Template
The generator doesn't magically know how to create our data; we need to guide it. We do this by creating a prompt template. This template helps instruct the underlying language model on how to produce synthetic data in the desired format.

In [14]:
# Template that renders one few-shot example verbatim via its "example" key.
# NOTE: this rebinds OPENAI_TEMPLATE, shadowing the object of the same name
# imported from langchain_experimental in the imports cell.
OPENAI_TEMPLATE = PromptTemplate(template="{example}", input_variables=["example"])

# Few-shot prompt assembled from the library-provided prefix and suffix plus the
# sample records; "subject" and "extra" are filled in when generate() is called.
prompt_template = FewShotPromptTemplate(
    example_prompt=OPENAI_TEMPLATE,
    examples=examples,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)

In [15]:
# Wire the pieces together: the few-shot prompt, a chat model, and the
# MedicalBilling schema that generated records are returned as.
# temperature=1.2 is a high setting -- presumably chosen for more varied output.
synthetic_data_generator = create_openai_data_generator(
    prompt=prompt_template,
    llm=ChatOpenAI(temperature=1.2),
    output_schema=MedicalBilling,
)

In [18]:
# Standalone chat client with the same temperature as the one passed to the
# data generator; in the visible cells it is only used to inspect the model name.
llm = ChatOpenAI(temperature=1.2)

In [19]:
# No model was passed to ChatOpenAI, so this shows the name the client defaulted to.
print(llm.model_name)  # default model


gpt-3.5-turbo


In [16]:
# Run the generator 10 times; `subject` and `extra` fill the prompt's two
# input variables. This cell calls the OpenAI API.
synthetic_results = synthetic_data_generator.generate(
    runs=10,
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    subject="medical_billing",
)



In [9]:
import json

# Scratch check of a JSON round trip: serialise a small dict to a JSON string.
A = dict(a=1, b=2)
j = json.dumps(A)

In [11]:
# Parse the JSON string from the previous cell back into a Python object
# and display its type.
decode = json.loads(j)
type(decode)

dict

In [None]:
# Display the generated records returned by the generator.
synthetic_results

In [26]:
# Pydantic v2: `model_dump()` converts each generated record into a plain dict.
# Collect one dict per record, then build the table in a single call.
billing_rows = [record.model_dump() for record in synthetic_results]
df = pd.DataFrame(billing_rows)

In [27]:
# Render the synthetic records as a table (one row per generated record).
df

Unnamed: 0,patient_id,patient_name,diagnosis_code,procedure_code,total_charge,insurance_claim_amount
0,654321,Sophia Black,A09.9,99204,400.0,300.0
1,123456,Oliver White,B01.2,99203,250.0,200.0
2,987654,Ezekiel Green,F32.9,99213,350.0,275.0
3,987654,Daisy Silver,C46.9,99212,300.0,250.0
4,987654,Charlotte Singh,I86.9,99214,400.0,320.0
5,123456,Xavier Ortiz,M54.5,99215,450.0,350.0
6,987654,George Washington,K30.9,99213,350.0,280.0
7,987654321,Zephyr Montgomery,T80.3,99212,300.0,240.0
8,246810,Juniper Thomas,L21.0,99214,400.0,320.0
9,135792,Bartholomew McAllister,F32.9,99215,500.0,400.0


In [20]:
# Peek at the first two raw records as returned by the generator.
print(synthetic_results[:2])

[MedicalBilling(patient_id=654321, patient_name='Sophia Black', diagnosis_code='A09.9', procedure_code='99204', total_charge=400.0, insurance_claim_amount=300.0), MedicalBilling(patient_id=123456, patient_name='Oliver White', diagnosis_code='B01.2', procedure_code='99203', total_charge=250.0, insurance_claim_amount=200.0)]
