## Import library

In [1]:
# ✅ Standard Libraries
import os
import time
from uuid import uuid4
from typing import Literal, Optional,List
from typing_extensions import TypedDict

# ✅ Third-Party Utilities
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

# ✅ LangChain Core
from langchain_core.tools import tool
from langchain_core.messages import convert_to_messages
from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig,RunnableLambda

# ✅ LangChain Models
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_google_vertexai import ChatVertexAI

# ✅ LangChain MongoDB Integration
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever

# ✅ LangChain Tools / Plugins
from langchain_tavily import TavilySearch
from langchain.output_parsers.openai_functions import (
    JsonOutputFunctionsParser,
    PydanticOutputFunctionsParser
)
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.prompts import ChatPromptTemplate, PromptTemplate

# ✅ LangGraph
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import create_react_agent
from langchain_community.callbacks import get_openai_callback

# ✅ Pydantic for Schema
from pydantic import BaseModel, Field

In [73]:
# def pretty_print_message(message, indent=False):
#     pretty_message = message.pretty_repr(html=True)
#     if not indent:
#         print(pretty_message)
#         return

#     indented = "\n".join("\t" + c for c in pretty_message.split("\n"))
#     print(indented)


# def pretty_print_messages(update, last_message=False):
#     is_subgraph = False
#     if isinstance(update, tuple):
#         ns, update = update
#         # skip parent graph updates in the printouts
#         if len(ns) == 0:
#             return

#         graph_id = ns[-1].split(":")[0]
#         print(f"Update from subgraph {graph_id}:")
#         print("\n")
#         is_subgraph = True

#     for node_name, node_update in update.items():
#         update_label = f"Update from node {node_name}:"
#         if is_subgraph:
#             update_label = "\t" + update_label

#         print(update_label)
#         print("\n")

#         messages = convert_to_messages(node_update["messages"])
#         if last_message:
#             messages = messages[-1:]

#         for m in messages:
#             pretty_print_message(m, indent=is_subgraph)
#         print("\n")

## Setup

In [77]:
# ✅ Load variables from .env file
load_dotenv()

# ✅ Access API keys and URIs
openai_api_key = os.getenv("OPENAI_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")

# ✅ Raise error if any key is missing
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set in .env file")
if not tavily_api_key:
    raise ValueError("TAVILY_API_KEY is not set in .env file")

# ✅ Optional: Set as environment variables for downstream tools
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["TAVILY_API_KEY"] = tavily_api_key


## Import test data

In [48]:
data = pd.read_csv("../test.csv")

display(data.shape)
display(data.head(5))

(499, 2)

Unnamed: 0,id,query
0,36deab86-cfd3-48b5-9bea-a36c1b0e63a8,"ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โ..."
1,2b5bbd26-45e8-4768-ab8a-b5dc1d153ab7,Answer the question with the appropriate optio...
2,8a722080-bc16-49db-89c9-100cd61cd28a,Answer the question with the appropriate optio...
3,75316e95-88f4-4fef-83b9-dde0aa52889a,"ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โ..."
4,bcca13bc-2675-4645-82cc-7e4c412ed294,"Given the data and tweets, could you project w..."


In [57]:
for i in range(5):
    print(data.iloc[i,1])
    print('\n')

ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โปรดตอบด้วยคำตอบที่ถูกต้อง A, B, C หรือ D เท่านั้น อย่าใช้คำฟุ่มเฟือยหรือให้ข้อมูลเพิ่มเติม

คำถาม: ______ สถานที่ทำงานเกี่ยวข้องกับการเสริมสร้างศักยภาพให้พนักงาน ตัวอย่างเช่น 'job enrichment' ที่พนักงานได้รับขอบเขตที่ใหญ่ขึ้นในการตัดสินใจว่าจะจัดระเบียบงานของตนอย่างไร หรือ 'job enlargement' ที่พนักงานได้รับมอบหมายงานที่หลากหลายมากขึ้น

ตัวเลือกคำตอบ: A: Re-invigorating, B: Re-flourishing, C: Revitalizing, D: Rehumanizing

คำตอบ:


Answer the question with the appropriate options A, B, C and D. Please respond with the exact answer A, B, C or D only. Do not be verbose or provide extra information. 
Question: Who of these is the entrepreneur?
Answer Choices: A: Barack Obama, B: James Dyson, C: Damien Hirst, D: Mo Farah 
Answer:


Answer the question with the appropriate options A, B, C and D. Please respond with the exact answer A, B, C or D only. Do not be verbose or provide extra information. 
Question: According to COSO, which of the follow

## Create Agent

### Only LLM

In [None]:
class OutputFormat(BaseModel):
    answer: Literal["A", "B", "C", "D"] = Field(..., description="Choose the best answer from the multiple choices")
    reason: str = Field(..., description="A paragraph summarizing that leading to your conclusion")

# 2) Define input TypedDict
class InputFormat(TypedDict):
    id: str
    query: str

# 3) RCRC‐style prompt
prompt = """
<role>
You are a financial expert assistant trained to answer finance-related multiple-choice questions.
</role>

<context>
id of question: {id}
question and choice: {query}
</context>

<result>
Return the following fields:
- answer: one of "A", "B", "C", or "D"
- reason: A compact paragraph summarizing why choose the final answer
</result>

<constraint>
- the answer have to be only one of "A", "B", "C", or "D"
- the reason should output more than 32 tokens and less than 256 tokens
</constraint>
"""

# 4) Initialize the LLM
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# 5) Convert Pydantic schema to OpenAI function spec
function_spec = convert_pydantic_to_openai_function(OutputFormat)

# 6) Build the classification chain
chain = (
    PromptTemplate.from_template(prompt)
    | llm.bind(
        functions=[function_spec],
        function_call={"name": function_spec["name"]}
    )
    | PydanticOutputFunctionsParser(pydantic_schema=OutputFormat)
)


In [75]:
## test
response = chain.invoke({
    "id": data.loc[0, "id"],
    "query": data.loc[0, "query"]
})

response

OutputFormat(answer='D', reason="The term 'Rehumanizing' best fits the context of enhancing employee potential through job enrichment and job enlargement. This concept emphasizes restoring the human element in work by allowing employees greater autonomy and a broader range of responsibilities, which aligns with the idea of empowering them in their roles. The other options, while related to revitalization or improvement, do not specifically capture the essence of enhancing the human experience in the workplace as effectively as 'Rehumanizing' does.")

In [76]:
results = []

for _, row in data.iterrows():
    input_data = InputFormat(id=row["id"], query=row["query"])
    result = chain.invoke(input_data)
    results.append([result.answer,result.reason])

    break

results_df = pd.DataFrame(results)
final_df = pd.concat([data, results_df], axis=1)

display(results_df)
display(final_df)

Unnamed: 0,0,1
0,D,The term 'Rehumanizing' is most appropriate in...


Unnamed: 0,id,query,0,1
0,36deab86-cfd3-48b5-9bea-a36c1b0e63a8,"ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โ...",D,The term 'Rehumanizing' is most appropriate in...
1,2b5bbd26-45e8-4768-ab8a-b5dc1d153ab7,Answer the question with the appropriate optio...,,
2,8a722080-bc16-49db-89c9-100cd61cd28a,Answer the question with the appropriate optio...,,
3,75316e95-88f4-4fef-83b9-dde0aa52889a,"ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โ...",,
4,bcca13bc-2675-4645-82cc-7e4c412ed294,"Given the data and tweets, could you project w...",,
...,...,...,...,...
494,c9dd262e-405c-4078-baae-262aa48ddcc8,Answer the question with the appropriate optio...,,
495,73c720b5-1101-4790-af52-3366823e1d32,"ตอบคำถามโดยใช้ตัวเลือกที่เหมาะสม A, B, C และ D...",,
496,357db18f-d872-416e-a07f-753099853d9c,"ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โ...",,
497,2d8b1419-1c46-4e83-892a-081fb417de38,Scrutinize the data and tweets to envisage if ...,,


### Search Engine

#### Tavily

In [80]:
search_tool = TavilySearch(
    max_results=5,
    topic="general",
)
tool.invoke("What's a 'node' in LangGraph?")

{'query': "What's a 'node' in LangGraph?",
 'follow_up_questions': None,
 'answer': None,
 'images': [],
 'results': [{'title': 'Nodes and Edges | langchain-ai/langgraph-101 | DeepWiki',
   'url': 'https://deepwiki.com/langchain-ai/langgraph-101/2.2-nodes-and-edges',
   'content': 'Nodes and Edges | langchain-ai/langgraph-101 | DeepWiki Nodes and Edges Nodes and Edges What are Nodes and Edges? In LangGraph, a graph is composed of nodes connected by edges to form a directed workflow. Nodes are the workhorses of LangGraph - they are Python functions that receive the current graph state as input, perform operations, and return updates to that state. Edges define the flow of execution between nodes in a LangGraph. graph_builder.add_edge("retrieve_documents", "generate_response") Conditional edges use a function to determine the next node based on the current state. Building a Graph with Nodes and Edges graph_builder.add_node("retrieve_documents", retrieve_documents) graph_builder.add_edge(

#### Keyword search tools

In [83]:
class OutputFormat(BaseModel):
    keyword_seach: str = Field(..., description="A compact keyword for searching in web seach tools to get more context for answering the question")

keyword_search_prompt = """
<Role>
You are a search engine assistant. Your job is to create effective search phrases for financial or academic questions.

<context>
question and choice: {query}
</context>

<result>
Return the following fields:
- keyword_search: A compact keyword for searching in web seach tools to get more context for answering the question
</result>

Generate a concise **search keyword phrase** that would help someone find the correct answer to this question using Google or Bing.
Use key financial or domain-specific terms (avoid full sentence).
"""

# 4) Initialize the LLM
keyword_search_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# 5) Convert Pydantic schema to OpenAI function spec
keyword_search_function_spec = convert_pydantic_to_openai_function(OutputFormat)

# 6) Build the classification chain
keyword_search_chain = (
    PromptTemplate.from_template(keyword_search_prompt)
    | keyword_search_llm.bind(
        functions=[keyword_search_function_spec],
        function_call={"name": keyword_search_function_spec["name"]}
    )
    | PydanticOutputFunctionsParser(pydantic_schema=OutputFormat)
)


In [None]:
keyword_search_chain_tool = keyword_search_chain.as_tool(
    name="keyword_search_extractor",
    description="Find keywords for seach engine from question and answer choice",
)

keyword_search_chain_tool.invoke({"query": data.loc[0, "query"]})

OutputFormat(keyword_seach='job enrichment job enlargement workplace empowerment')

### Seach engine agent

In [91]:
class OutputFormat(BaseModel):
    answer: Literal["A", "B", "C", "D"] = Field(..., description="Choose the best answer from the multiple choices")
    reason: str = Field(..., description="A paragraph summarizing that leading to your conclusion")

# 2) Define input TypedDict
class InputFormat(TypedDict):
    id: str
    query: str

search_engine_prompt = """
<Role>
You are a financial expert assistant trained to answer finance-related multiple-choice questions which you have the search engine tools to get more context of question and answer.

<Context>
id of question: {id}
question and choice: {query}

Use internal tools in this exact order:
1. 🔧 Keyword search tools to get the search keyword from question and choice
2. 🔧 Web search tools to get more context from keyword search

<result>
Return only a JSON object with:
- answer: one of "A", "B", "C", or "D"
- reason: A compact paragraph summarizing why choose the final answer
</result>

<constraint>
- the answer have to be only one of "A", "B", "C", or "D"
- the reason should output more than 32 tokens and less than 256 tokens
</constraint>
"""


llm = ChatOpenAI(model="gpt-4.1", temperature=0.0)
web_search_agent = create_react_agent(
    model=llm,
    tools=[
        keyword_search_chain_tool,
        search_tool
    ],
    prompt=search_engine_prompt,
    name="web_search_agent",
    response_format=OutputFormat,
    #debug=True,
)

In [94]:
1

1

In [93]:
response = web_search_agent.invoke(
    {
        "messages": [
            (
                data.iloc[0,0],
                data.iloc[0,1]
            )
        ]
    }
)

print(response)

ValueError: Unexpected message type: '36deab86-cfd3-48b5-9bea-a36c1b0e63a8'. Use one of 'human', 'user', 'ai', 'assistant', 'function', 'tool', 'system', or 'developer'.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/MESSAGE_COERCION_FAILURE 

### RAG