<a href="https://colab.research.google.com/github/sampathk-hps/langchain-fundamentals-colab/blob/main/LangChain_4_Build_an_Extraction_Chain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install --upgrade langchain-core

Collecting langchain-core
  Downloading langchain_core-0.3.77-py3-none-any.whl.metadata (3.2 kB)
Downloading langchain_core-0.3.77-py3-none-any.whl (449 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m449.5/449.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-core
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.3.76
    Uninstalling langchain-core-0.3.76:
      Successfully uninstalled langchain-core-0.3.76
Successfully installed langchain-core-0.3.77


## Schema

In [None]:
from typing import Optional
from pydantic import BaseModel, Field

class Person(BaseModel):
    # The string literal right below is the class docstring. It is sent to the
    # LLM as the description of the Person schema, so a clear wording can help
    # to improve extraction results.
    """ Information about a person. """

    # Every attribute is optional and defaults to None; each `description`
    # is the per-field hint that accompanies the schema.
    name: Optional[str] = Field(
        default=None,
        description="The name of the person.",
    )
    age: Optional[int] = Field(
        default=None,
        description="The age of the person.",
    )
    hair_color: Optional[str] = Field(
        default=None,
        description="The color of the person's hair if known",
    )
    height_in_meters: Optional[str] = Field(
        default=None,
        description="Height measured in meters",
    )

## Extractor

In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# A custom prompt carries the extraction instructions plus any extra context:
#   * reference examples can be added to the template to improve quality;
#   * extra input variables (e.g. metadata about the document the text came
#     from) can be introduced to give the model more context.

prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. Only extract relevant "
            "information from the text. If you do not know the value of an "
            "attribute asked to extract, return null for the attribute's value.",
        ),
        # To supply reference examples (see the how-to on improving
        # performance with them), a placeholder could be inserted here:
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

## LLM

In [None]:
%pip install -qU "langchain-perplexity"

In [None]:
import getpass
import os

# Prompt for the Perplexity API key only when PPLX_API_KEY is missing or empty
# in the environment; getpass keeps the typed value out of the cell output.
if not os.environ.get("PPLX_API_KEY"):
    os.environ["PPLX_API_KEY"] = getpass.getpass("Perplexity API Key:")

from langchain.chat_models import init_chat_model

# Chat model shared by all extraction cells below: Perplexity's "sonar".
llm = init_chat_model(model="sonar", model_provider="perplexity")

Perplexity API Key:··········


In [None]:
# Wrap the chat model so that its replies are returned as `Person` objects.
structured_llm = llm.with_structured_output(schema=Person)

## Test

In [None]:
# Sample sentence: it states a height in feet and a hair color, but no age.
text = "Alan Smith is 6 feet tall and has blond hair."
# Fill the {text} slot of the prompt, then run the structured-output model.
prompt = prompt_template.invoke({"text": text})
response = structured_llm.invoke(prompt)
# NOTE(review): the recorded run returned age=44 and hair_color=None for this
# text -- treat extracted values as model guesses and verify them.
print(response)

name='Alan Smith' age=44 hair_color=None height_in_meters='1.78 m'


In [None]:
# Show the extracted pydantic object as a plain dict.
response.model_dump()

{'name': 'Alan Smith',
 'age': 44,
 'hair_color': None,
 'height_in_meters': '1.78 m'}

In [None]:
from typing import List, Optional

from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # The docstring above doubles as the description of the Person schema that
    # is sent to the LLM; good wording there can improve extraction results.
    #
    # This definition replaces the `Person` class declared earlier in the
    # notebook and, unlike that one, has no `age` field.
    #
    # Field conventions:
    #   * every field is Optional, so the model may decline to extract it;
    #   * every field carries a `description`, which the LLM also sees.
    name: Optional[str] = Field(
        default=None,
        description="The name of the person",
    )
    hair_color: Optional[str] = Field(
        default=None,
        description="The color of the person's hair if known",
    )
    height_in_meters: Optional[str] = Field(
        default=None,
        description="Height measured in meters",
    )


class Data(BaseModel):
    """Extracted data about people."""

    # Container schema: wrapping `Person` in a list lets one call return
    # several people -- or none at all, as an empty list.
    people: List[Person]

In [None]:
# Switch the output schema to `Data` so several people can be returned at once.
structured_llm = llm.with_structured_output(schema=Data)
# Two people are mentioned; Anna's hair color is only given relative to Jeff's.
text = "My name is Jeff, my hair is black and i am 6 feet tall. Anna has the same color hair as me."
prompt = prompt_template.invoke({"text": text})
response = structured_llm.invoke(prompt)
# Last expression of the cell: display the result as a plain dict.
response.model_dump()

{'people': [{'name': 'Jeff',
   'hair_color': 'black',
   'height_in_meters': '1.83'},
  {'name': 'Anna', 'hair_color': 'black', 'height_in_meters': None}]}

## Tool Calling

In [None]:
from langchain_core.utils.function_calling import tool_example_to_messages

# Reference examples as (input text, expected extraction) pairs.
examples = [
    # A text that mentions nobody: the expected result is an empty list.
    ("The ocean is vast and blue. It's more than 20,000 feet deep.", Data(people=[])),
    # A text that names a person but gives neither hair color nor height.
    (
        "Fiona traveled far from France to Spain.",
        Data(
            people=[
                Person(name="Fiona", hair_color=None, height_in_meters=None),
            ]
        ),
    ),
]

# Flatten every reference example into its chat-message sequence.
messages = []

for txt, tool_call in examples:
    # Closing AI remark for the example; this final message is optional for
    # some providers.
    ai_response = "Detected people." if tool_call.people else "Detected no people."
    messages += tool_example_to_messages(txt, [tool_call], ai_response=ai_response)

In [None]:
# Print every generated example message in LangChain's readable layout.
for message in messages:
    message.pretty_print()


The ocean is vast and blue. It's more than 20,000 feet deep.
Tool Calls:
  Data (50eefcbc-620f-487c-90fd-2c4b43e2e518)
 Call ID: 50eefcbc-620f-487c-90fd-2c4b43e2e518
  Args:
    people: []

You have correctly called this tool.

Detected no people.

Fiona traveled far from France to Spain.
Tool Calls:
  Data (bad6ff17-19ea-4944-872f-4867865e5dbf)
 Call ID: bad6ff17-19ea-4944-872f-4867865e5dbf
  Args:
    people: [{'name': 'Fiona', 'hair_color': None, 'height_in_meters': None}]

You have correctly called this tool.

Detected people.


Let's compare performance with and without these messages. For example, let's pass a message for which we intend no people to be extracted:

In [None]:
# A sentence that mentions no person: the intended extraction is "nobody".
message_no_extraction = {
    "role": "user",
    "content": "The solar system is large, but earth has only 1 moon.",
}

# Baseline call without any reference examples.
# Use the `Data` wrapper rather than `Person`: a `Person` result is always
# exactly one person object, so it cannot express "no people found", whereas
# `Data(people=[])` can. `Data` is also the schema the reference examples use,
# which keeps the with/without-examples comparison like for like.
structured_llm = llm.with_structured_output(schema=Data)
structured_llm.invoke([message_no_extraction])

Person(name='Earth', hair_color=None, height_in_meters=None)

In [None]:
# The Perplexity chat model rejects the tool-result messages produced by
# `tool_example_to_messages` ("TypeError: Got unknown type ... tool_call_id=..."),
# so the tool-call transcript cannot be passed to it as is. Re-express every
# reference example as a plain user/assistant turn whose assistant content is
# the expected extraction serialised as JSON.
few_shot_messages = []
for example_text, example_output in examples:
    few_shot_messages.append({"role": "user", "content": example_text})
    few_shot_messages.append(
        {"role": "assistant", "content": example_output.model_dump_json()}
    )

# The examples are `Data` payloads, so the output schema has to be `Data` as
# well; it is also the schema that can represent "no people" (an empty list).
structured_llm = llm.with_structured_output(schema=Data)
structured_llm.invoke(few_shot_messages + [message_no_extraction])


TypeError: Got unknown type content='You have correctly called this tool.' tool_call_id='50eefcbc-620f-487c-90fd-2c4b43e2e518'