In [2]:
%pip install --upgrade --quiet  langchain langchain_experimental langchain-openai
# Set env var OPENAI_API_KEY or load from a .env file:
# import dotenv
# dotenv.load_dotenv()


Note: you may need to restart the kernel to use updated packages.


In [3]:

from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI

In [4]:
SYNTHETIC_FEW_SHOT_PREFIX

'This is a test about generating synthetic data about {subject}. Examples below:'

In [5]:
SYNTHETIC_FEW_SHOT_SUFFIX

'Now you generate synthetic data about {subject}. Make sure to {extra}:'

In [6]:
OPENAI_TEMPLATE

PromptTemplate(input_variables=['example'], template='{example}')

In [7]:
class MedicalBilling(BaseModel):
    patient_id:int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float

In [8]:
examples = [
    {
        "example": """Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: 
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""
    },
    {
        "example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis 
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""
    },
    {
        "example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: 
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""
    },
]

In [9]:
prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

In [11]:
prompt_template.pretty_print()

This is a test about generating synthetic data about [33;1m[1;3m{subject}[0m. Examples below:

Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: 
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350

Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis 
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120

Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: 
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250

Now you generate synthetic data about [33;1m[1;3m{subject}[0m. Make sure to [33;1m[1;3m{extra}[0m:


In [13]:
OPENAI_API_KEY =""

In [14]:
synthetic_data_generator = create_openai_data_generator(
    output_schema = MedicalBilling,
    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY),
    prompt = prompt_template
)

In [15]:
synthetic_results = synthetic_data_generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=10,
)

In [16]:
synthetic_results

[MedicalBilling(patient_id=987654, patient_name='Sophia Rodriguez', diagnosis_code='F32.9', procedure_code='99204', total_charge=400.0, insurance_claim_amount=300.0),
 MedicalBilling(patient_id=123456, patient_name='Aloysius Throckmorton', diagnosis_code='G47.0', procedure_code='99203', total_charge=250.0, insurance_claim_amount=200.0),
 MedicalBilling(patient_id=456789, patient_name='Octavia Worthington', diagnosis_code='M54.5', procedure_code='99213', total_charge=350.0, insurance_claim_amount=275.0),
 MedicalBilling(patient_id=789012, patient_name='Barnaby Snodgrass', diagnosis_code='I10', procedure_code='99214', total_charge=300.0, insurance_claim_amount=250.0),
 MedicalBilling(patient_id=987654, patient_name='Zephyrine Applegate', diagnosis_code='F32.9', procedure_code='99204', total_charge=400.0, insurance_claim_amount=320.0),
 MedicalBilling(patient_id=123456, patient_name='Dmitriy Yermilov', diagnosis_code='R05', procedure_code='99203', total_charge=275.0, insurance_claim_amoun

In [17]:
#other ways 

In [20]:
from langchain_experimental.synthetic_data import (
    DatasetGenerator,
    create_data_generation_chain,
)
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY)

In [21]:
chain = create_data_generation_chain(llm)

In [22]:
chain({"fields": ["blue", "yellow"], "preferences": {}})

  warn_deprecated(


{'fields': ['blue', 'yellow'],
 'preferences': {},
 'text': 'The vibrant blue sky was dotted with fluffy yellow clouds, creating a picturesque scene perfect for a lazy afternoon picnic.'}

In [23]:
chain({"fields": ["idli", "dosa"], "preferences": {}})

{'fields': ['idli', 'dosa'],
 'preferences': {},
 'text': 'The aroma of freshly steamed idli and crispy golden dosa wafted through the air, tempting me to indulge in a South Indian feast fit for a king.'}

In [24]:
chain(
    {
        "fields": {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
        "preferences": None,
    }
)

{'fields': {'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
 'preferences': None,
 'text': 'Tom Hanks, known for his iconic roles in movies such as "Forrest Gump" and "Green Mile," has captivated audiences worldwide with his exceptional talent and versatility in the film industry.'}

In [25]:
#Extraction from Generated data 

In [34]:
inp = [
    {
        "Actor": "Tom Hanks",
        "Film": [
            "Forrest Gump",
            "Saving Private Ryan",
            "The Green Mile",
            "Toy Story",
            "Catch Me If You Can",
        ],
    },
    {
        "Actor": "Tom Hardy",
        "Film": [
            "Inception",
            "The Dark Knight Rises",
            "Mad Max: Fury Road",
            "The Revenant",
            "Dunkirk",
        ],
    },
]

generator = DatasetGenerator(llm, {"style": "informal", "minimal length": 500})
dataset = generator(inp)

In [27]:
from typing import List

from langchain.chains import create_extraction_chain_pydantic
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field

In [28]:
class Actor(BaseModel):
    Actor: str = Field(description="name of an actor")
    Film: List[str] = Field(description="list of names of films they starred in")

In [35]:
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
parser = PydanticOutputParser(pydantic_object=Actor)

prompt = PromptTemplate(
    template="Extract fields from a given text.\n{format_instructions}\n{text}\n",
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

_input = prompt.format_prompt(text=dataset[0]["text"])
output = llm(_input.to_string())

parsed = parser.parse(output)
parsed

  warn_deprecated(


Actor(Actor='Tom Hanks', Film=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Toy Story', 'Catch Me If You Can'])