# Structured Output In LangChain

In [2]:
!pip install -q -U langchain langchain-openai langchain_community langchain-mistralai transformers bitsandbytes accelerate jsonformer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.8/321.8 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.1/127.1 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.4/327.4 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [3]:
from typing import List
from pydantic import BaseModel, Field

## Pydantic Syntax

Pydantic data classes are a blend of Python's data classes with the validation power of Pydantic.

They offer a concise way to define data structures while ensuring that the data adheres to specified types and constraints.

In standard python you would create a class like this:

In [4]:
class User:
    def __init__(self, name: str, age: int, email: str):
        self.name = name
        self.age = age
        self.email = email

In [5]:
foo = User(name="Joe",age=32, email="joe@gmail.com")

In [6]:
foo.name

'Joe'

In [7]:
foo = User(name="Joe",age="bar", email="joe@gmail.com")

In [8]:
foo.age

'bar'

In [9]:
class pUser(BaseModel):
    name: str
    age: int
    email: str

In [10]:
foo_p = pUser(name="Jane", age=32, email="jane@gmail.com")

In [11]:
foo_p.name

'Jane'

**Note**: The next cell is expected to fail.

In [12]:
foo_p = pUser(name="Jane", age="bar", email="jane@gmail.com")

ValidationError: 1 validation error for pUser
age
  Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='bar', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/int_parsing

In [13]:
class Class(BaseModel):
    students: List[pUser]

In [14]:
obj = Class(
    students=[pUser(name="Jane", age=32, email="jane@gmail.com")]
)

In [15]:
obj

Class(students=[pUser(name='Jane', age=32, email='jane@gmail.com')])

**Example**

In [16]:
from google.colab import userdata
api_key = userdata.get('API_KEY')

In [17]:
from typing import List
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI

In [18]:
model = ChatOpenAI(model='meta-llama/llama-3-8b-instruct:free' , temperature=0, api_key=api_key, base_url="https://openrouter.ai/api/v1")

In [19]:
# Define your desired data structure.
class Joke(BaseModel):
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        if field[-1] != "?":
            raise ValueError("Badly formed question!")
        return field


# And a query intented to prompt a language model to populate the data structure.
joke_query = "Tell me a joke."

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Joke)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | model | parser

chain.invoke({"query": joke_query})

Joke(setup="Why don't scientists trust atoms?", punchline='Because they make up everything!')

In [20]:
# Here's another example, but with a compound typed field.
class Actor(BaseModel):
    name: str = Field(description="name of an actor")
    film_names: List[str] = Field(description="list of names of films they starred in")


actor_query = "Generate the filmography for a random actor."

parser = PydanticOutputParser(pydantic_object=Actor)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | model | parser

chain.invoke({"query": actor_query})

Actor(name='Tom Hanks', film_names=['Forrest Gump', 'Philadelphia', 'Cast Away', 'Apollo 13', 'Saving Private Ryan'])

# HuggingFace Model

In [31]:
from transformers import BitsAndBytesConfig
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from jsonformer.format import highlight_values
from jsonformer.main import Jsonformer

In [22]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [23]:
class HuggingFaceLLM:
    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.1"):
        self.model = AutoModelForCausalLM.from_pretrained(model_name ,use_cache=True, device_map="auto",trust_remote_code=True, quantization_config=quantization_config)
        #trust_remote_code=True for phi
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_cache=True)

    def generate(self, prompt, max_length=1024):
        json = {
            "type": "object",
            "properties": {
                "full_name": {"type": "string"},
                "phone_number": {"type": "string"},
                "address": {"type": "string"},

            }
        }

        builder = Jsonformer(
            model=self.model,
            tokenizer=self.tokenizer,
            json_schema=json,
            prompt= prompt
        )

        print("Creating output...")
        output = builder()

        return output

**You can test different LLM models from HuggingFace**

In [24]:
# llm = HuggingFaceLLM(model_name ="mistralai/Mistral-7B-Instruct-v0.2")
# llm = HuggingFaceLLM(model_name ="microsoft/Phi-3-mini-4k-instruct")
# llm = HuggingFaceLLM(model_name ="microsoft/phi-1_5")
# llm = HuggingFaceLLM(model_name ="google/gemma-2b-it") #"google/gemma-2b-it"
llm = HuggingFaceLLM(model_name ="mistralai/Mistral-7B-Instruct-v0.1")  # Choose the desired Hugging Face model

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [25]:
text = """Sarah Jones (555-867-5309), a baker with a passion for sourdough, recently moved to her new apartment at 123 Elm Street with her neighbor,
 Miguel Rodriguez , a talented musician known for his lively salsa performances. They often chat with Mrs. Lee , the friendly owner of the cozy bookstore across the street."""

In [28]:
template = """For the provided text, extract the following information:

 full name, phone number, address

The text is: ```{input_text}```
"""

# Fill in the placeholders in the template
formatted_template = template.format(input_text=text)

In [32]:
# Generate text using the formatted template

results = llm.generate(formatted_template)
print(results)

Creating output...
{'full_name': 'Sarah Jones', 'phone_number': '555-867-53', 'address': '123 Elm Street'}


# Getting structured output by using ResponseSchema in JSON format

In [33]:
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import PromptTemplate
from langchain_mistralai import ChatMistralAI
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
import json
from langchain.output_parsers import ResponseSchema, StructuredOutputParser


You can use other llm models instead of Mistral

In [34]:
from google.colab import userdata
api_key = userdata.get('API_KEY')

In [35]:
llm= ChatMistralAI(model="mistralai/mistral-7b-instruct:free", api_key = api_key , endpoint = "https://openrouter.ai/api/v1")

In [36]:
# We define each piece of information to be extracted using ResponseSchema
full_name_schema = ResponseSchema(name="full_name",
                             description="Extract full name")
phone_number_schema = ResponseSchema(name="phone_number",
                             description="Extract phone number")
address_schema = ResponseSchema(name="address",
                             description="Extract complete address")

# list of schemas
response_schemas = [full_name_schema, phone_number_schema, address_schema ]

In [37]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [38]:
# The output parser can be added to the prompt by format_instructions
print(output_parser.get_format_instructions())

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"full_name": string  // Extract full name
	"phone_number": string  // Extract phone number
	"address": string  // Extract complete address
}
```


In [39]:
template = """Extract the following data from text:

 full name, phone number, address

{format_instructions}

{input_text}
"""

In [40]:
# Construct a Langchain Chain to connect the prompt template with the LLM and Pydantic parser
prompt = PromptTemplate.from_template(template, partial_variables={"format_instructions": output_parser.get_format_instructions()})

chain = prompt | llm | output_parser

In [41]:
text = """Sarah Jones (555-867-5309), a baker with a passion for sourdough, recently moved to her new apartment at 123 Elm Street with her neighbor,
 Miguel Rodriguez , a talented musician known for his lively salsa performances. They often chat with Mrs. Lee , the friendly owner of the cozy bookstore across the street."""

In [42]:
result = chain.invoke({"input_text": text})

print(result)
out_json = json.dumps(str(result), indent=4)

with open('output.json', 'w') as outfile:
  outfile.write(out_json)

{'full_name': 'Sarah Jones', 'phone_number': '555-867-5309', 'address': '123 Elm Street'}


# Output parser with Pydantic

You can define a class with fields which you want to get in output in JSON format

In [43]:
class Person_info(BaseModel):

    full_name: str = Field(default=None, description="Extract full name")
    phone_number: str = Field(
        default=None, description="Extract phone number"
    )
    address: str = Field(
        default=None, description="Extract complete address"
    )

parser = JsonOutputParser(pydantic_object=Person_info)

In [44]:
template = """For the provided text, extract the following information:

 full name, phone number, address

{format_instructions}

The text is: ```{input_text}```
"""

In [45]:
prompt = PromptTemplate.from_template(template, partial_variables={"format_instructions": parser.get_format_instructions()})

chain = prompt | llm | parser

In [46]:
result = chain.invoke({"input_text": text})

print(result)
out_json = json.dumps(str(result), indent=4)

with open('output2.json', 'w') as outfile:
  outfile.write(out_json)

{'full_name': 'Sarah Jones', 'phone_number': '555-867-5309', 'address': '123 Elm Street'}
