## Import library & Setup

In [1]:
# ✅ Standard Libraries
import os
import time
from uuid import uuid4
from typing import Literal, Optional,List
from typing_extensions import TypedDict

# ✅ Third-Party Utilities
import numpy as np
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

# ✅ LangChain Core
from langchain_core.tools import tool
from langchain_core.messages import convert_to_messages
from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig,RunnableLambda

# ✅ LangChain Models
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_google_vertexai import ChatVertexAI

# ✅ LangChain MongoDB Integration
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever

# ✅ LangChain Tools / Plugins
from langchain_tavily import TavilySearch
from langchain.output_parsers.openai_functions import (
    JsonOutputFunctionsParser,
    PydanticOutputFunctionsParser
)
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.prompts import ChatPromptTemplate, PromptTemplate

# ✅ LangGraph
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import create_react_agent
from langchain_community.callbacks import get_openai_callback

# ✅ Pydantic for Schema
from pydantic import BaseModel, Field

In [2]:
# ✅ Load variables from .env file
load_dotenv()

# ✅ Access API keys and URIs
openai_api_key = os.getenv("OPENAI_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")

# ✅ Raise error if any key is missing
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set in .env file")
if not tavily_api_key:
    raise ValueError("TAVILY_API_KEY is not set in .env file")

# ✅ Optional: Set as environment variables for downstream tools
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["TAVILY_API_KEY"] = tavily_api_key


In [3]:
def pretty_print_message(message, indent=False):
    pretty_message = message.pretty_repr(html=True)
    if not indent:
        print(pretty_message)
        return

    indented = "\n".join("\t" + c for c in pretty_message.split("\n"))
    print(indented)


def pretty_print_messages(update, last_message=False):
    is_subgraph = False
    if isinstance(update, tuple):
        ns, update = update
        # skip parent graph updates in the printouts
        if len(ns) == 0:
            return

        graph_id = ns[-1].split(":")[0]
        print(f"Update from subgraph {graph_id}:")
        print("\n")
        is_subgraph = True

    for node_name, node_update in update.items():
        update_label = f"Update from node {node_name}:"
        if is_subgraph:
            update_label = "\t" + update_label

        print(update_label)
        print("\n")

        messages = convert_to_messages(node_update["messages"])
        if last_message:
            messages = messages[-1:]

        for m in messages:
            pretty_print_message(m, indent=is_subgraph)
        print("\n")

## Read test data

In [4]:
data = pd.read_csv("../test.csv")

display(data.shape)
display(data.head(5))

(499, 2)

Unnamed: 0,id,query
0,36deab86-cfd3-48b5-9bea-a36c1b0e63a8,"ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โ..."
1,2b5bbd26-45e8-4768-ab8a-b5dc1d153ab7,Answer the question with the appropriate optio...
2,8a722080-bc16-49db-89c9-100cd61cd28a,Answer the question with the appropriate optio...
3,75316e95-88f4-4fef-83b9-dde0aa52889a,"ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โ..."
4,bcca13bc-2675-4645-82cc-7e4c412ed294,"Given the data and tweets, could you project w..."


In [5]:
for i in range(5):
    print(data.iloc[i,1])
    print('\n')

ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โปรดตอบด้วยคำตอบที่ถูกต้อง A, B, C หรือ D เท่านั้น อย่าใช้คำฟุ่มเฟือยหรือให้ข้อมูลเพิ่มเติม

คำถาม: ______ สถานที่ทำงานเกี่ยวข้องกับการเสริมสร้างศักยภาพให้พนักงาน ตัวอย่างเช่น 'job enrichment' ที่พนักงานได้รับขอบเขตที่ใหญ่ขึ้นในการตัดสินใจว่าจะจัดระเบียบงานของตนอย่างไร หรือ 'job enlargement' ที่พนักงานได้รับมอบหมายงานที่หลากหลายมากขึ้น

ตัวเลือกคำตอบ: A: Re-invigorating, B: Re-flourishing, C: Revitalizing, D: Rehumanizing

คำตอบ:


Answer the question with the appropriate options A, B, C and D. Please respond with the exact answer A, B, C or D only. Do not be verbose or provide extra information. 
Question: Who of these is the entrepreneur?
Answer Choices: A: Barack Obama, B: James Dyson, C: Damien Hirst, D: Mo Farah 
Answer:


Answer the question with the appropriate options A, B, C and D. Please respond with the exact answer A, B, C or D only. Do not be verbose or provide extra information. 
Question: According to COSO, which of the follow

## Create Agent

### Only LLM

In [18]:
class OutputFormat(BaseModel):
    answer: Literal["A", "B", "C", "D"] = Field(..., description="Choose the best answer from the multiple choices")
    reason: str = Field(..., description="A paragraph summarizing that leading to your conclusion")

# 2) Define input TypedDict
class InputFormat(TypedDict):
    id: str
    query: str

# 3) RCRC‐style prompt
prompt = """
<role>
You are a financial expert trained to answer finance-related multiple-choice questions.
</role>

<context>
ID: {id}
{query}
</context>
"""
# <result>
# Return JSON with:
# - answer: "A", "B", "C", or "D"
# - reason: A concise explanation justifying the answer (32–256 tokens)
# </result>

# <constraint>
# - Only one answer: "A", "B", "C", or "D"
# - Reason length: >32 and <256 tokens
# </constraint>

# 4) Initialize the LLM
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# 5) Convert Pydantic schema to OpenAI function spec
function_spec = convert_pydantic_to_openai_function(OutputFormat)

# 6) Build the classification chain
chain = (
    PromptTemplate.from_template(prompt)
    | llm
    
    # .bind(
    #     functions=[function_spec],
    #     function_call={"name": function_spec["name"]}
    # )
    # | PydanticOutputFunctionsParser(pydantic_schema=OutputFormat)
)


In [19]:
print(data.loc[0, "query"])

ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โปรดตอบด้วยคำตอบที่ถูกต้อง A, B, C หรือ D เท่านั้น อย่าใช้คำฟุ่มเฟือยหรือให้ข้อมูลเพิ่มเติม

คำถาม: ______ สถานที่ทำงานเกี่ยวข้องกับการเสริมสร้างศักยภาพให้พนักงาน ตัวอย่างเช่น 'job enrichment' ที่พนักงานได้รับขอบเขตที่ใหญ่ขึ้นในการตัดสินใจว่าจะจัดระเบียบงานของตนอย่างไร หรือ 'job enlargement' ที่พนักงานได้รับมอบหมายงานที่หลากหลายมากขึ้น

ตัวเลือกคำตอบ: A: Re-invigorating, B: Re-flourishing, C: Revitalizing, D: Rehumanizing

คำตอบ:


In [23]:
## test
response = chain.invoke({
    "id": 1234,
    "query": data.loc[0, "query"]
})

response

AIMessage(content='D', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 1, 'prompt_tokens': 204, 'total_tokens': 205, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BlzncUS9HJ6WrN8ZVDuIbGFpe7R3B', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--6230df16-582d-476c-b7be-706d95f5e770-0', usage_metadata={'input_tokens': 204, 'output_tokens': 1, 'total_tokens': 205, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [None]:
results = []
i = 0
for _, row in data.iterrows():
    input_data = InputFormat(id=row["id"], query=row["query"])
    result = chain.invoke(input_data)
    results.append([result.content])

    time.sleep(0.2)
    # i += 1
    # if i >= 3:
    #     break

results_df = pd.DataFrame(results)
final_df = pd.concat([data, results_df], axis=1)

# display(results_df)
display(final_df)

In [17]:
len(results)

140

### LLM + Search Engine

#### Keyword search tools

In [13]:
class OutputFormat(BaseModel):
    keyword_seach: str = Field(..., description="A compact keyword for searching in web seach tools to get more context for answering the question")

keyword_search_prompt = """
<role>
You are a search assistant. Your task is to generate precise search keywords for financial or academic multiple-choice questions.
</role>

<context>
Question and choice: {query}
</context>

<result>
Return JSON with:
- keyword_search: A concise phrase optimized for Google or Bing (no full sentences; use financial/domain terms only)
</result>

Generate a concise **search keyword phrase** that would help someone find the correct answer to this question using Google or Bing.
Use key financial or domain-specific terms (avoid full sentence).
"""

# 4) Initialize the LLM
keyword_search_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# 5) Convert Pydantic schema to OpenAI function spec
keyword_search_function_spec = convert_pydantic_to_openai_function(OutputFormat)

# 6) Build the classification chain
keyword_search_chain = (
    PromptTemplate.from_template(keyword_search_prompt)
    | keyword_search_llm.bind(
        functions=[keyword_search_function_spec],
        function_call={"name": keyword_search_function_spec["name"]}
    )
    | PydanticOutputFunctionsParser(pydantic_schema=OutputFormat)
)

In [14]:
print(data.loc[0, "query"])

ตอบคำถามด้วยตัวเลือกที่เหมาะสม A, B, C และ D โปรดตอบด้วยคำตอบที่ถูกต้อง A, B, C หรือ D เท่านั้น อย่าใช้คำฟุ่มเฟือยหรือให้ข้อมูลเพิ่มเติม

คำถาม: ______ สถานที่ทำงานเกี่ยวข้องกับการเสริมสร้างศักยภาพให้พนักงาน ตัวอย่างเช่น 'job enrichment' ที่พนักงานได้รับขอบเขตที่ใหญ่ขึ้นในการตัดสินใจว่าจะจัดระเบียบงานของตนอย่างไร หรือ 'job enlargement' ที่พนักงานได้รับมอบหมายงานที่หลากหลายมากขึ้น

ตัวเลือกคำตอบ: A: Re-invigorating, B: Re-flourishing, C: Revitalizing, D: Rehumanizing

คำตอบ:


In [15]:
keyword_search_chain_tool = keyword_search_chain.as_tool(
    name="keyword_search_extractor",
    description="Find keywords for seach engine from question and answer choice",
)
keyword_search_result = keyword_search_chain_tool.invoke({"query": data.loc[0, "query"]})

keyword_search_result

  keyword_search_chain_tool = keyword_search_chain.as_tool(


OutputFormat(keyword_seach='job enrichment job enlargement workplace empowerment')

In [16]:
# data.loc[0, "query"]

In [82]:
keyword_search_result.keyword_seach

'job enrichment job enlargement workplace empowerment'

#### Tavily Search

In [17]:
search_tool = TavilySearch(
    max_results=3,
    topic="general",
)

search_result = search_tool.invoke("job enrichment job enlargement workplace empowerment")

In [18]:
search_result

{'query': 'job enrichment job enlargement workplace empowerment',
 'follow_up_questions': None,
 'answer': None,
 'images': [],
 'results': [{'title': 'Job enrichment: The key to unlocking employee motivation',
   'url': 'https://www.businessmanagementdaily.com/81696/job-enrichment-strategies/',
   'content': "Job enrichment techniques improve an employee's sense of satisfaction and empowerment. On the other hand, job enlargement has a singular focus: increasing the number of tasks a job has to combat",
   'score': 0.83678174,
   'raw_content': None},
  {'title': 'What is Employee Empowerment? Job Involvement & Culture - ASQ',
   'url': 'https://asq.org/quality-resources/employee-empowerment',
   'content': "Employee empowerment varies based on an organization's culture and work design. However, empowerment is based on the concepts of job enlargement and job enrichment. Job enlargement differs from job enrichment in that job enlargement is horizontal expansion and job enrichment is con

#### Final search engine agent

In [None]:
class OutputFormat(BaseModel):
    answer: Literal["A", "B", "C", "D"] = Field(..., description="Choose the best answer from the multiple choices")
    reason: str = Field(..., description="A paragraph summarizing that leading to your conclusion")

# 2) Define input TypedDict
class InputFormat(TypedDict):
    id: str
    query: str

web_search_prompt = """
<role>
You are a financial expert assistant trained to answer finance-related multiple-choice questions using internal search tools.
</role>

<context>
Question ID: {id}  
Question and choice: {query}

Use the tools in this order:
1. 🔧 Keyword search → extract search phrase from the input question and choice  
2. 🔧 Search tool → retrieve context from the search results to get content and url

Get context from Web search to help finding the correct answer
</context>

<result>
Return JSON with:
- answer: "A", "B", "C", or "D"
- reason: A concise explanation justifying the chosen answer (32–256 tokens)
</result>

<constraint>
- Only one valid answer: "A", "B", "C", or "D"
- Reason must be >32 and <256 tokens
- Find keyword from input query only. Don't generate own question that related to real question
</constraint>
"""

llm = ChatOpenAI(model="gpt-4.1", temperature=0.0)
# 6) Build the classification chain
web_search_agent = (
    PromptTemplate.from_template(web_search_prompt)
    | (llm
       .bind(
        functions=[function_spec],
        function_call={"name": function_spec["name"]}))
    | PydanticOutputFunctionsParser(pydantic_schema=OutputFormat)
)

In [None]:
# llm = ChatOpenAI(model="gpt-4.1", temperature=0.0)  
# tools = [search_tool, keyword_search_chain_tool]
# llm_with_tools = llm.bind_tools(tools)
# result = llm_with_tools.invoke("how many type of cat in the world")

# result

AIMessage(content='There are two main ways to interpret your question:\n\n1. **Wild Cat Species:**  \nThere are about **38–41 recognized species of wild cats** in the world. This includes big cats like lions, tigers, leopards, and cheetahs, as well as smaller wild cats such as the serval, caracal, ocelot, and many others.\n\n2. **Domestic Cat Breeds:**  \nIf you are asking about **domestic cat breeds**, the number varies depending on the cat registry:\n- The **International Cat Association (TICA)** recognizes over **70 breeds**.\n- The **Cat Fanciers’ Association (CFA)** recognizes **45 breeds**.\n- The **Fédération Internationale Féline (FIFe)** recognizes **48 breeds**.\n\n**Summary:**  \n- **Wild cat species:** ~38–41  \n- **Domestic cat breeds:** 40–70+ (depending on the registry)\n\nIf you want a list or more details about either wild cat species or domestic cat breeds, let me know!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 215,

In [None]:
# print(result.content)

There are two main ways to interpret your question:

1. **Wild Cat Species:**  
There are about **38–41 recognized species of wild cats** in the world. This includes big cats like lions, tigers, leopards, and cheetahs, as well as smaller wild cats such as the serval, caracal, ocelot, and many others.

2. **Domestic Cat Breeds:**  
If you are asking about **domestic cat breeds**, the number varies depending on the cat registry:
- The **International Cat Association (TICA)** recognizes over **70 breeds**.
- The **Cat Fanciers’ Association (CFA)** recognizes **45 breeds**.
- The **Fédération Internationale Féline (FIFe)** recognizes **48 breeds**.

**Summary:**  
- **Wild cat species:** ~38–41  
- **Domestic cat breeds:** 40–70+ (depending on the registry)

If you want a list or more details about either wild cat species or domestic cat breeds, let me know!


In [None]:
web_search_result = web_search_agent.invoke({"id":"123"
                                             ,"query":"แมวมีกี่ขา A.1 B.2 C.3 D.4"})

web_search_result

ValidationError: 1 validation error for OutputFormat
query
  Field required [type=missing, input_value={'answer': 'D', 'reason':...ยทั่วไป'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing

### RAG