In [39]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)

from langchain_openai import AzureChatOpenAI
from util.config import load_configuration

In [40]:
# Connection settings are read from a properties file so that no endpoint or
# credential is hard-coded in the notebook.
PROP_FILE = 'app.properties'
configs = load_configuration(PROP_FILE)


def _read_setting(name):
    """Return the ``.data`` payload of the entry stored under ``name`` in ``configs``.

    Args:
        name: Key to look up in the loaded configuration.

    Returns:
        Whatever the entry's ``.data`` attribute holds.

    Raises:
        KeyError: If ``configs.get(name)`` yields ``None``. Without this check
            a missing key only surfaces as an ``AttributeError`` on ``None``,
            which does not say which setting is absent.
    """
    entry = configs.get(name)
    if entry is None:
        raise KeyError(f"{name!r} is not defined in {PROP_FILE!r}")
    return entry.data


API_TYPE = _read_setting("API_TYPE")
API_VERSION = _read_setting("API_VERSION")
API_KEY = _read_setting("API_KEY")
API_BASE = _read_setting("API_BASE")
LLM_ENGINE_GPT35_16K = _read_setting("LLM_ENGINE_GPT35_16K")

In [41]:
# Single chat-model client shared by every generator and chain below. The
# deployment, endpoint, API version and key all come from the values loaded
# from the properties file.
llm = AzureChatOpenAI(
    azure_deployment=LLM_ENGINE_GPT35_16K,
    azure_endpoint=API_BASE,
    openai_api_version=API_VERSION,
    api_key=API_KEY,
)

In [42]:
# Schema of one synthetic billing record. It is handed to
# `create_openai_data_generator` as `output_schema` in a later cell, and the
# generated results are instances of this class.
# NOTE(review): documented with `#` comments on purpose; pydantic is expected
# to copy a class docstring into the generated schema description, which would
# change what is sent to the model. Confirm before adding one.
class MedicalBilling(BaseModel):
    patient_id: int  # numeric identifier (six digits in the few-shot examples)
    patient_name: str  # patient's full name
    diagnosis_code: str  # e.g. "J20.9" in the few-shot examples
    procedure_code: str  # e.g. "99203" in the few-shot examples
    total_charge: float  # billed amount (quoted in dollars in the examples)
    insurance_claim_amount: float  # claimed amount (quoted in dollars in the examples)

In [43]:
# Few-shot examples shown to the model, one billing record per entry.
# Each record is assembled from adjacent string literals so that it stays a
# single clean line: a triple-quoted literal wrapped across source lines would
# embed a line break and the source indentation in the middle of the record
# (e.g. between "Diagnosis Code:" and the code itself) and leak it into the
# prompt.
examples = [
    {
        "example": (
            "Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: J20.9, "
            "Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"
        )
    },
    {
        "example": (
            "Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis Code: M54.5, "
            "Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"
        )
    },
    {
        "example": (
            "Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: E11.9, "
            "Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"
        )
    },
]

In [44]:
# Per-example template: each few-shot entry is rendered as its `example`
# value and nothing else.
# NOTE: this assignment rebinds the OPENAI_TEMPLATE name that the import cell
# also brings in from langchain_experimental.
OPENAI_TEMPLATE = PromptTemplate(template="{example}", input_variables=["example"])

# Full few-shot prompt: library-provided prefix, the rendered examples, then
# the library-provided suffix. `subject` and `extra` are the two inputs left
# to be supplied when the generator is run.
prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=OPENAI_TEMPLATE,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)

In [45]:
# Combine the chat model, the few-shot prompt and the target record schema
# into one generator object.
synthetic_data_generator = create_openai_data_generator(
    llm=llm,
    prompt=prompt_template,
    output_schema=MedicalBilling,
)

In [46]:
# Request two synthetic records. `subject` and `extra` are the two input
# variables declared on the few-shot prompt.
synthetic_results = synthetic_data_generator.generate(
    runs=2,
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
)

In [47]:
# Show the generated records as the cell output.
synthetic_results

[MedicalBilling(patient_id=987654, patient_name='Samantha Johnson', diagnosis_code='C61', procedure_code='99204', total_charge=600.0, insurance_claim_amount=450.0),
 MedicalBilling(patient_id=123456, patient_name='Elijah Davis', diagnosis_code='F32.9', procedure_code='99215', total_charge=400.0, insurance_claim_amount=320.0)]

In [48]:
## Alternate implementation

# Second approach: instead of filling a fixed record schema, generate a piece
# of free text from arbitrary `fields` plus optional style `preferences`
# (see the calls in the following cells).
from langchain_experimental.synthetic_data import DatasetGenerator
from langchain_experimental.synthetic_data import create_data_generation_chain

# Reuses the Azure chat model configured earlier in the notebook.
chain = create_data_generation_chain(llm)

In [49]:
# Generate one sentence that mentions both colours, written as a weather
# forecast. The chain is run through `invoke` rather than by calling the
# object directly: direct calls go through the deprecated `Chain.__call__`
# path, and `invoke` matches the `llm.invoke` call used later in this notebook.
response = chain.invoke(
    {
        "fields": {"colors": ["blue", "yellow"]},
        "preferences": {"style": "Make it in a style of a weather forecast."},
    }
)
response

{'fields': {'colors': ['blue', 'yellow']},
 'preferences': {'style': 'Make it in a style of a weather forecast.'},
 'text': "In today's weather forecast, we can expect a vibrant blend of colors as the sky showcases a breathtaking combination of blue and yellow hues, creating a mesmerizing palette that will surely captivate the eyes of all who gaze upon it."}

In [50]:
# Same chain with no style preferences at all (`preferences` is None).
# The chain is run through `invoke` rather than by calling the object
# directly: direct calls go through the deprecated `Chain.__call__` path, and
# `invoke` matches the `llm.invoke` call used later in this notebook.
response = chain.invoke(
    {
        "fields": {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
        "preferences": None,
    }
)
response

{'fields': {'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
 'preferences': None,
 'text': 'Tom Hanks, renowned for his remarkable talent, has graced the silver screen with his unforgettable performances in iconic movies such as "Forrest Gump" and "Green Mile".'}

In [51]:
# Same fields again, now asking for a gossip style and a minimum length.
# The chain is run through `invoke` rather than by calling the object
# directly: direct calls go through the deprecated `Chain.__call__` path, and
# `invoke` matches the `llm.invoke` call used later in this notebook.
response = chain.invoke(
    {
        "fields": {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
        "preferences": {"minimum_length": 200, "style": "gossip"},
    }
)
response

{'fields': {'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
 'preferences': {'minimum_length': 200, 'style': 'gossip'},
 'text': 'Did you hear the latest gossip? Tom Hanks, the incredible actor known for his iconic roles in movies like "Forrest Gump" and "Green Mile," has once again captured the hearts of audiences with his exceptional talent and undeniable charm.'}

In [52]:
## Generating exemplary dataset for extraction benchmarking purposes

# One input record per actor: the actor's name plus a list of their films.
# The generated passages are later used as input for field extraction.
inp = [
    {"Actor": actor_name, "Film": film_titles}
    for actor_name, film_titles in (
        (
            "Tom Hanks",
            [
                "Forrest Gump",
                "Saving Private Ryan",
                "The Green Mile",
                "Toy Story",
                "Catch Me If You Can",
            ],
        ),
        (
            "Tom Hardy",
            [
                "Inception",
                "The Dark Knight Rises",
                "Mad Max: Fury Road",
                "The Revenant",
                "Dunkirk",
            ],
        ),
    )
]

# The second argument carries the writing preferences for the generated text;
# the captured output shows them echoed under `preferences` on each result.
generator = DatasetGenerator(llm, {"style": "informal", "minimal length": 500})
dataset = generator(inp)

In [53]:
# Show the generated entries (each holds `fields`, `preferences` and `text`).
dataset

[{'fields': {'Actor': 'Tom Hanks',
   'Film': ['Forrest Gump',
    'Saving Private Ryan',
    'The Green Mile',
    'Toy Story',
    'Catch Me If You Can']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hanks, the acclaimed actor known for his roles in films such as "Forrest Gump," "Saving Private Ryan," "The Green Mile," "Toy Story," and "Catch Me If You Can," has captivated audiences with his exceptional talent, effortlessly transitioning from a lovable yet simple-minded character in "Forrest Gump" to a courageous and dedicated soldier in "Saving Private Ryan," and then portraying the compassionate and empathetic prison guard in "The Green Mile." However, it is his iconic voice acting as the beloved cowboy Woody in the animated classic "Toy Story" that has forever endeared him to both children and adults alike. With his versatility and undeniable charisma, Tom Hanks has solidified his place as one of the most versatile and enduring actors of our time,

In [54]:
## Extraction from generated examples

from typing import List

from langchain.chains import create_extraction_chain_pydantic
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field

# Target schema for extraction. The field names mirror the "Actor" / "Film"
# keys of the records used to generate the dataset, so a parsed result can be
# compared against the original input. The `description` values are runtime
# data attached to each field, not comments.
class Actor(BaseModel):
    Actor: str = Field(description="name of an actor")
    Film: List[str] = Field(description="list of names of films they starred in")

In [55]:
# Build the parser before its first use. The prompt below embeds the parser's
# format instructions, so relying on a `parser` left over from an earlier
# kernel session makes this cell fail with NameError on Restart & Run All.
parser = PydanticOutputParser(pydantic_object=Actor)

# Extraction prompt: a fixed instruction, the parser's format instructions
# (bound once via `partial_variables`), then the passage to extract from.
prompt = PromptTemplate(
    template="Extract fields from a given text.\n{format_instructions}\n{text}\n",
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Render the prompt for the first generated passage and send it to the model.
_input = prompt.format_prompt(text=dataset[0]["text"])
output = llm.invoke(_input.to_string())
output

AIMessage(content='{\n  "Actor": "Tom Hanks",\n  "Film": [\n    "Forrest Gump",\n    "Saving Private Ryan",\n    "The Green Mile",\n    "Toy Story",\n    "Catch Me If You Can"\n  ]\n}', response_metadata={'token_usage': {'completion_tokens': 50, 'prompt_tokens': 378, 'total_tokens': 428}, 'model_name': 'gpt-35-turbo-16k', 'system_fingerprint': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}, id='run-1040be29-a6f5-4f0f-bb0c-20f5ea80fd6e-0')

In [56]:
# Parse the model's reply (the message `content`) into an `Actor` instance
# and display it.
parser = PydanticOutputParser(pydantic_object=Actor)
parsed = parser.parse(output.content)
parsed

Actor(Actor='Tom Hanks', Film=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Toy Story', 'Catch Me If You Can'])