# Tagging and Extraction Using Bedrock tools

In [1]:
import json
import boto3
import panel as pn  # GUI

bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
)

In [2]:
from typing import List
from pydantic import BaseModel, Field
from langchain_aws.function_calling import convert_to_anthropic_tool

In [3]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [4]:
convert_to_anthropic_tool(Tagging)

{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'input_schema': {'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language'],
  'type': 'object'}}

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain_aws import ChatBedrock


In [6]:
model = ChatBedrock(
    model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
    model_kwargs=dict(temperature=0)
)

In [7]:
tagging_tools = [convert_to_anthropic_tool(Tagging)]

In [8]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [9]:
model_with_tools = model.bind(
    tools=tagging_tools, 
    tool_choice={"type": "tool", "name": "Tagging"}
)

In [10]:
tagging_chain = prompt | model_with_tools

In [11]:
tagging_chain.invoke({"input": "I love langchain"})

AIMessage(content='', additional_kwargs={'usage': {'prompt_tokens': 412, 'completion_tokens': 50, 'total_tokens': 462}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, response_metadata={'usage': {'prompt_tokens': 412, 'completion_tokens': 50, 'total_tokens': 462}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, id='run-158e8b44-881f-401f-8dc0-ae5936f55a9e-0', tool_calls=[{'name': 'Tagging', 'args': {'sentiment': 'pos', 'language': 'en'}, 'id': 'toolu_bdrk_01MyVBP2AHS2A8rL4votg53q', 'type': 'tool_call'}], usage_metadata={'input_tokens': 412, 'output_tokens': 50, 'total_tokens': 462})

In [12]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'usage': {'prompt_tokens': 415, 'completion_tokens': 50, 'total_tokens': 465}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, response_metadata={'usage': {'prompt_tokens': 415, 'completion_tokens': 50, 'total_tokens': 465}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, id='run-e577ee1a-4464-4180-9597-dd0a0e3a8860-0', tool_calls=[{'name': 'Tagging', 'args': {'sentiment': 'neg', 'language': 'it'}, 'id': 'toolu_bdrk_0149sVQhNmTwmwmma9qHVbYe', 'type': 'tool_call'}], usage_metadata={'input_tokens': 415, 'output_tokens': 50, 'total_tokens': 465})

## Parsers

There are a couple of options for output parsers

In [13]:
from langchain.output_parsers.openai_tools import JsonOutputKeyToolsParser
from langchain_aws.function_calling import ToolsOutputParser

In [14]:
tagging_chain = prompt | model_with_tools | JsonOutputKeyToolsParser(
    key_name="Tagging", first_tool_only=True)

In [15]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

{'language': 'it', 'sentiment': 'neg'}

In [16]:
aws_tagging_chain = prompt | model_with_tools | ToolsOutputParser(first_tool_only=True, args_only=True)

In [17]:
aws_tagging_chain.invoke({"input": "non mi piace questo cibo"})

{'sentiment': 'neg', 'language': 'it'}

## Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [36]:
from typing import Optional
class Person(BaseModel):
    """Information about a person ."""
    name: str = Field(description="person's name")
    age: Optional[int] = Field(description="person's age")

In [37]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people in JSON format")

In [38]:
convert_to_anthropic_tool(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'input_schema': {'properties': {'people': {'description': 'List of info about people in JSON format',
    'items': {'description': 'Information about a person .',
     'properties': {'name': {'description': "person's name", 'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age"}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [39]:
extraction_tools = [convert_to_anthropic_tool(Information)]
extraction_model = model.bind(
    tools=extraction_tools,
    tool_choice={"type": "tool", "name": "Information"}
)

In [40]:
extraction_model.invoke("Joe is 30")

AIMessage(content='', additional_kwargs={'usage': {'prompt_tokens': 433, 'completion_tokens': 58, 'total_tokens': 491}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, response_metadata={'usage': {'prompt_tokens': 433, 'completion_tokens': 58, 'total_tokens': 491}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, id='run-16858d21-db7f-47b6-900e-e746c7e19d9e-0', tool_calls=[{'name': 'Information', 'args': {'people': [{'name': 'Joe', 'age': 30}]}, 'id': 'toolu_bdrk_013sXB8cEBvD1RgDBgK3JhHB', 'type': 'tool_call'}], usage_metadata={'input_tokens': 433, 'output_tokens': 58, 'total_tokens': 491})

In [41]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'usage': {'prompt_tokens': 438, 'completion_tokens': 81, 'total_tokens': 519}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, response_metadata={'usage': {'prompt_tokens': 438, 'completion_tokens': 81, 'total_tokens': 519}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, id='run-9b340817-52ef-49e5-bf97-438df0385a1f-0', tool_calls=[{'name': 'Information', 'args': {'people': '[\n  {\n    "name": "Joe",\n    "age": 30\n  },\n  {\n    "name": "Martha",\n    "age": <UNKNOWN>\n  }\n]'}, 'id': 'toolu_bdrk_01G4BvEwCWg2RWnB8iZfeD3b', 'type': 'tool_call'}], usage_metadata={'input_tokens': 438, 'output_tokens': 81, 'total_tokens': 519})

In [43]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info."),
    ("human", "{input}")
])

In [44]:
extraction_chain = prompt | extraction_model

In [45]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

AIMessage(content='', additional_kwargs={'usage': {'prompt_tokens': 456, 'completion_tokens': 78, 'total_tokens': 534}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, response_metadata={'usage': {'prompt_tokens': 456, 'completion_tokens': 78, 'total_tokens': 534}, 'stop_reason': 'tool_use', 'model_id': 'anthropic.claude-3-5-sonnet-20240620-v1:0'}, id='run-ff94d0c0-d40b-439a-af0f-876c8207353e-0', tool_calls=[{'name': 'Information', 'args': {'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}, 'id': 'toolu_bdrk_01BHDTMkTTrCNrbCtr8bhL9G', 'type': 'tool_call'}], usage_metadata={'input_tokens': 456, 'output_tokens': 78, 'total_tokens': 534})

In [76]:
extraction_chain = prompt | extraction_model | JsonOutputKeyToolsParser(
    key_name="Information")

In [77]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}]

In [67]:
aws_extraction_chain = prompt | extraction_model | ToolsOutputParser(first_tool_only=True, args_only=True)

In [68]:
aws_extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}

In [69]:
def people(input):
    return input["people"]

In [70]:
full_extraction_chain = prompt | extraction_model | ToolsOutputParser(first_tool_only=True, args_only=True) | people

In [71]:
full_extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]

## Doing it for real

We can apply tagging to a larger body of text.

For example, let's load this blog post and extract tag information from a sub-set of the text.

In [80]:
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader(
    "https://lilianweng.github.io/posts/2023-06-23-agent/", 
    header_template={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'} )
documents = loader.load()

In [88]:
doc = documents[0]

In [89]:
page_content = doc.page_content[:10000]

In [None]:
print(page_content[:1000])

In [91]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [92]:
overview_tagging_tool = [
    convert_to_anthropic_tool(Overview)
]
tagging_model = model.bind(
    tools=overview_tagging_tool,
    tool_choice={"type": "tool", "name": "Overview"}
)
tagging_chain = prompt | tagging_model | ToolsOutputParser(first_tool_only=True, args_only=True)

In [93]:
tagging_chain.invoke({"input": page_content})

{'summary': 'This article discusses LLM-powered autonomous agents, focusing on three key components: planning, memory, and tool use. It covers techniques for task decomposition, self-reflection, and memory management in these agent systems. The article explores various approaches like Chain of Thought, Tree of Thoughts, ReAct, Reflexion, Chain of Hindsight, and Algorithm Distillation for improving agent performance and learning capabilities.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, task decomposition, self-reflection, Chain of Thought, Tree of Thoughts, ReAct, Reflexion, Chain of Hindsight, Algorithm Distillation'}

In [94]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [95]:
paper_extraction_tool = [
    convert_to_anthropic_tool(Info)
]
extraction_model = model.bind(
    tools=paper_extraction_tool,
    tool_choice={"type": "tool", "name": "Info"}
)
extraction_chain = prompt | extraction_model | ToolsOutputParser(first_tool_only=True, args_only=True)

In [98]:
extraction_chain.invoke({"input": page_content})["papers"]

[{'title': 'Chain of Thought Prompting Elicits Reasoning in Large Language Models',
  'author': 'Jason Wei et al.'},
 {'title': 'Tree of Thoughts: Deliberate Problem Solving with Large Language Models',
  'author': 'Shunyu Yao et al.'},
 {'title': 'LLM+P: Empowering Large Language Models with Optimal Planning Proficiency',
  'author': 'Bo Liu et al.'},
 {'title': 'ReAct: Synergizing Reasoning and Acting in Language Models',
  'author': 'Shunyu Yao et al.'},
 {'title': 'Reflexion: Language Agents with Verbal Reinforcement Learning',
  'author': 'Noah Shinn and Beck Labash'},
 {'title': 'Chain-of-Hindsight: Learning from Human Feedback with Large Language Models',
  'author': 'Xueyan Zou et al.'},
 {'title': 'Algorithm Distillation: A Method for Learning Reinforcement Learning Algorithms from Behavioral Data',
  'author': 'Michael Laskin et al.'},
 {'title': 'RL^2: Fast Reinforcement Learning via Slow Reinforcement Learning',
  'author': 'Yan Duan et al.'}]

In [99]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [100]:
extraction_chain = prompt | extraction_model |  ToolsOutputParser(first_tool_only=True, args_only=True)

In [102]:
extraction_chain.invoke({"input": page_content})["papers"]

[{'title': 'Chain of thought prompting elicits reasoning in large language models',
  'author': 'Wei et al.'},
 {'title': 'Tree of Thoughts: Deliberate Problem Solving with Large Language Models',
  'author': 'Yao et al.'},
 {'title': 'LLM+P: Empowering Large Language Models with Optimal Planning Proficiency',
  'author': 'Liu et al.'},
 {'title': 'ReAct: Synergizing Reasoning and Acting in Language Models',
  'author': 'Yao et al.'},
 {'title': 'Reflexion: an autonomous agent with dynamic memory and self-reflection',
  'author': 'Shinn & Labash'},
 {'title': 'Chain-of-Hindsight: Learning from Feedback Sequences to Improve Outputs',
  'author': 'Liu et al.'},
 {'title': 'Algorithm Distillation: A Method for Learning Reinforcement Learning Algorithms from Behavioral Data',
  'author': 'Laskin et al.'},
 {'title': 'RL^2: Fast Reinforcement Learning via Slow Reinforcement Learning',
  'author': 'Duan et al.'}]

In [103]:
extraction_chain.invoke({"input": "hi"})

{'papers': []}

In [104]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [105]:
splits = text_splitter.split_text(doc.page_content)

In [106]:
len(splits)

15

In [107]:
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [108]:
flatten([[1, 2], [3, 4]])

[1, 2, 3, 4]

In [None]:
print(splits[0])

In [110]:
from langchain.schema.runnable import RunnableLambda

In [111]:
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [112]:
prep.invoke("hi")

[{'input': 'hi'}]

In [126]:
papers = RunnableLambda(
    lambda input: [e['papers'] for e in input]
)


In [127]:
chain = prep | extraction_chain.map() | papers | flatten

In [128]:
chain.invoke(doc.page_content)

[{'title': 'Chain of thought', 'author': 'Wei et al.'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al.'},
 {'title': 'LLM+P', 'author': 'Liu et al.'},
 {'title': 'ReAct', 'author': 'Yao et al.'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash'},
 {'title': 'Reflexion: Language Agents with Verbal Reinforcement Learning',
  'author': 'Shinn & Labash'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al.'},
 {'title': 'Algorithm Distillation: A Method for Learning Algorithms from Data',
  'author': 'Laskin et al.'},
 {'title': 'Algorithm Distillation: A Method for Learning Downstream-Optimal Representations for Reinforcement Learning',
  'author': 'Laskin et al.'},
 {'title': 'RL^2: Fast Reinforcement Learning via Slow Reinforcement Learning',
  'author': 'Duan et al.'},
 {'title': 'The Magical Number Seven, Plus or Minus Two: Some Limits on Our Capacity for Processing Information',
  'author': 'Miller'},
 {'title': 'MRKL', 'author': 'Karpas et al.'},
 {'title': 'TALM', 'autho