# Import library

In [1]:
import os
import yaml

with open("../conf/service.dev.yaml", 'r') as f:
    configs = yaml.safe_load(f)
os.environ['OPENAI_API_KEY'] = configs['openai']['api_key']

# Tagging and Extraction Using OpenAI functions

# Tagging
- https://python.langchain.com/docs/use_cases/tagging

In [2]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
# from langchain_core.utils.function_calling import convert_to_openai_function

In [3]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`") 
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [4]:
convert_pydantic_to_openai_function(Tagging)

  warn_deprecated(


{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'type': 'object',
  'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language']}}

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [6]:
model = ChatOpenAI(temperature=0)  # deterministic

In [7]:
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]

In [8]:
prompt = ChatPromptTemplate.from_messages([
    ('system', "Think carefully, and then tag the text as instructed"),
    ('user', "{input}")
])

In [9]:
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={'name': "Tagging"}  # force it
)

In [10]:
tagging_chain = prompt | model_with_functions

In [11]:
tagging_chain.invoke({'input': "I love langchain"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "sentiment": "pos",\n  "language": "en"\n}', 'name': 'Tagging'}})

In [12]:
tagging_chain.invoke({'input': "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "sentiment": "neg",\n  "language": "it"\n}', 'name': 'Tagging'}})

In [13]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

In [14]:
tagging_chain.invoke({'input': "non mi piace questo cibo"})

{'sentiment': 'neg', 'language': 'it'}

# Extraction
Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [15]:
from typing import Optional

class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    age: Optional[int] = Field(description="person's age")


class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [16]:
convert_pydantic_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'type': 'object',
  'properties': {'people': {'description': 'List of info about people',
    'type': 'array',
    'items': {'description': 'Information about a person.',
     'type': 'object',
     'properties': {'name': {'description': "person's name", 'type': 'string'},
      'age': {'description': "person's age", 'type': 'integer'}},
     'required': ['name']}}},
  'required': ['people']}}

In [17]:
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(
    functions=extraction_functions,
    function_call={'name': "Information"}
)

In [18]:
extraction_model.invoke("Joe is 30")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 30\n    }\n  ]\n}', 'name': 'Information'}})

In [19]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 30\n    },\n    {\n      "name": "Martha",\n      "age": 0\n    }\n  ]\n}', 'name': 'Information'}})

- Martha의 나이가 0가 아님에도 0로 **추정**하는 것을 방지하기 위해 prompt를 수정

In [20]:
prompt = ChatPromptTemplate.from_messages([
    ('system', "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ('human', "{input}")
])

In [21]:
extraction_chain = prompt | extraction_model

In [22]:
extraction_chain.invoke({'input': "Joe is 30, his mom is Martha"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 30\n    },\n    {\n      "name": "Martha"\n    }\n  ]\n}', 'name': 'Information'}})

In [23]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [24]:
extraction_chain.invoke({'input': "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]}

- `people` 은 그저 list로 `person`을 뽑아내기 위한 용도로 필요없는 key라서 생략할 것

In [25]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [26]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name='people')
extraction_chain.invoke({'input': "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]

# More real example

In [27]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [28]:
doc = documents[0]
page_content = doc.page_content[:10000]

In [29]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written.")
    keywords: str = Field(description="Provide keywords.")

In [30]:
overview_tagging_functions = [
    convert_pydantic_to_openai_function(Overview)
]
tagging_model = model.bind(
    functions=overview_tagging_functions,
    function_call={'name': "Overview"}
)
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [31]:
tagging_chain.invoke({'input': page_content})

{'summary': 'This article discusses the concept of building autonomous agents powered by LLM (large language model) as their core controller. It explores the key components of such agent systems, including planning, memory, and tool use. It also covers various techniques for task decomposition and self-reflection in LLM-powered agents.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, task decomposition, self-reflection'}

In [32]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]

class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [33]:
convert_pydantic_to_openai_function(Info)

{'name': 'Info',
 'description': 'Information to extract',
 'parameters': {'type': 'object',
  'properties': {'papers': {'type': 'array',
    'items': {'description': 'Information about papers mentioned.',
     'type': 'object',
     'properties': {'author': {'type': 'string'}},
     'required': ['title']}}},
  'required': ['papers']}}

In [34]:
paper_extraction_functions = [
    convert_pydantic_to_openai_function(Info)
]
extraction_model = model.bind(
    functions=paper_extraction_functions,
    function_call={'name': "Info"}
)
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name='papers')

In [35]:
extraction_chain.invoke({'input': page_content})

[{'author': 'Lilian Weng'}]

- 정확도가 낮다. prompt를 수정하여 좀 더 원하는 방향으로 가이드해보자.

In [36]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article.

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ('system', template),
    ('human', "{input}")
])

In [37]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name='papers')

- 근데 왜 `title` 이 안 나오지?

In [38]:
extraction_chain.invoke({'input': page_content})

[{'author': 'Wei et al. 2022'},
 {'author': 'Yao et al. 2023'},
 {'author': 'Liu et al. 2023'},
 {'author': 'Shinn & Labash 2023'},
 {'author': 'Laskin et al. 2023'},
 {'author': 'Duan et al. 2017'}]

# More text processing
한 번에 넣기 너무 긴 text는 잘라서 넣고, 나중에 합쳐야된다.

In [39]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [40]:
splits = text_splitter.split_text(doc.page_content)

In [41]:
len(splits)

14

In [42]:
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [43]:
from langchain.schema.runnable import RunnableLambda

In [44]:
prep = RunnableLambda(lambda x: [{'input': doc} for doc in text_splitter.split_text(x)])

In [45]:
prep.invoke("hi")

[{'input': 'hi'}]

In [46]:
chain = prep | extraction_chain.map() | flatten 

- 기본적으로 5개가 병렬적으로 처리된다.

In [47]:
chain.invoke(doc.page_content)

[{'author': 'Wei et al. 2022'},
 {'author': 'Yao et al. 2023'},
 {'author': 'Liu et al. 2023'},
 {'author': 'Shinn & Labash 2023'},
 {'author': 'Shinn & Labash, 2023'},
 {'author': 'Liu et al. 2023'},
 {'author': 'Laskin et al. 2023'},
 {'author': 'Laskin et al. 2023'},
 {'author': 'Duan et al. 2017'},
 {'author': 'Miller 1956'},
 {'author': 'LSH'},
 {'author': 'ANNOY'},
 {'author': 'HNSW'},
 {'author': 'FAISS'},
 {'author': 'ScaNN'},
 {'author': 'Karpas et al. 2022'},
 {'author': 'Parisi et al. 2022'},
 {'author': 'Schick et al. 2023'},
 {'author': 'Shen et al. 2023'},
 {'author': 'API-Bank (Li et al. 2023)'},
 {'author': 'ChemCrow (Bran et al. 2023)'},
 {'author': 'Boiko et al. (2023)'},
 {'author': 'Park, et al. (2023)'},
 {'author': 'Park et al.'},
 {'author': 'John Smith'},
 {'author': 'Jane Doe'},
 {'author': 'John Doe'},
 {'author': 'Jane Smith'},
 {'author': 'John Smith'},
 {'author': 'Jane Doe'},
 {'author': 'Wei et al.'},
 {'author': 'Yao et al.'},
 {'author': 'Liu et al.'},
