In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Presumably this supplies the OpenAI / LangSmith credentials used by later cells — confirm.
load_dotenv()

In [None]:
from langchain_openai import ChatOpenAI

# Chat model shared by the notebook; generate() pipes the RAG prompt into it.
llm = ChatOpenAI(model='gpt-4o-mini')

In [None]:
import nest_asyncio

# Patch asyncio so event loops can be nested. The zerox cell below calls
# asyncio.run(...), which would otherwise be refused inside the notebook's
# already-running loop.
nest_asyncio.apply()

In [None]:
import asyncio
import os

from pyzerox import zerox

# --- zerox configuration ---------------------------------------------------
# zerox must be given a vision-capable model. Identifiers for providers other
# than OpenAI are listed at https://docs.litellm.ai/docs/providers

# Vision model handed to zerox (OpenAI).
model = "gpt-4o-mini"

# System prompt override for the vision model; None means "no override".
# Example override:
# custom_system_prompt = "For the below PDF page, do something..something..."
custom_system_prompt = None

# Extra keyword arguments some models may require; forwarded to zerox as **kwargs.
kwargs = dict()

# Define main async entrypoint
async def main(
    file_path: str = "/Users/ksj/MyProjects/llm/inflearn-langgrath-lecture/docs/income_tax.pdf",
    output_dir: str = "/Users/ksj/MyProjects/llm/inflearn-langgrath-lecture/docs",
):
    """Run zerox on a PDF and return zerox's result.

    The input and output locations used to be hard-coded inside the body; they
    are now keyword parameters whose defaults are the original values, so
    ``main()`` behaves exactly as before while other files can be processed
    without editing the function.

    Args:
        file_path: Path of the PDF to convert.
        output_dir: Directory passed to zerox as ``output_dir``.

    Returns:
        Whatever ``zerox(...)`` returns for the given file.
    """
    # model, custom_system_prompt and kwargs come from the configuration
    # statements earlier in this cell.
    return await zerox(
        file_path=file_path,
        model=model,
        output_dir=output_dir,
        custom_system_prompt=custom_system_prompt,
        select_pages=None,
        **kwargs
    )


# Run the conversion coroutine to completion. asyncio.run() is normally refused
# inside a running event loop; the earlier nest_asyncio.apply() cell permits it here.
result = asyncio.run(main())

# Print the value returned by zerox.
print(result)

In [None]:
import markdown
from bs4 import BeautifulSoup

# Markdown input and the plain-text file derived from it.
# NOTE(review): the zerox cell above is pointed at income_tax.pdf, while these
# paths name real_estate_tax files — confirm this is the intended document.
file_path_markdown = "/Users/ksj/MyProjects/llm/inflearn-langgrath-lecture/docs/real_estate_tax.md"
file_path_text = "/Users/ksj/MyProjects/llm/inflearn-langgrath-lecture/docs/real_estate_tax.txt"

# Step 1: read the markdown source (read mode is open()'s default).
with open(file_path_markdown, encoding='utf-8') as md_file:
    md_content = md_file.read()

# Step 2: render markdown to HTML, then let BeautifulSoup strip the markup
# so only the text remains.
html_content = markdown.markdown(md_content)
soup = BeautifulSoup(html_content, 'html.parser')
text_content = soup.get_text()

# Step 3: write the extracted text to the .txt file.
with open(file_path_text, mode='w', encoding='utf-8') as txt_file:
    txt_file.write(text_content)

print("Markdown converted to plain text successfully!")

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the Chroma vector store below.
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used by the loader cells below: chunks of up to 1500 characters with
# a 100-character overlap, splitting on blank lines first and then on single
# newlines.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n'],
    chunk_size=1500,
    chunk_overlap=100,
)

In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Load the markdown file and split it with text_splitter.
# NOTE: the next cell reassigns both `loader` and `documents`, so when the
# notebook is run top to bottom these values are replaced before the vector
# store is built.
loader = UnstructuredMarkdownLoader(file_path_markdown)
documents = loader.load_and_split(text_splitter)

In [None]:
from langchain_community.document_loaders import TextLoader

# The .txt file is written with encoding='utf-8' earlier in the notebook. Read it
# back with the same encoding rather than the platform default, so the text is
# decoded the same way on every OS.
loader = TextLoader(file_path_text, encoding='utf-8')
# Split the loaded text into chunks; these `documents` are what gets indexed below.
documents = loader.load_and_split(text_splitter)

In [None]:
from langchain_chroma import Chroma

# Embed `documents` and store them in a Chroma collection persisted under
# ./real_estate_tax_collections.
# NOTE(review): re-running this cell against an existing persist directory
# presumably adds the same chunks again — confirm before re-executing.
vector_store = Chroma.from_documents(
    collection_name='real_estate_tax_collections',
    persist_directory='./real_estate_tax_collections',
    embedding=embeddings,
    documents=documents,
)

In [None]:
# Retriever over the vector store, configured with k=4 (four documents per query).
# Used by the retrieve() node.
retriever = vector_store.as_retriever(search_kwargs={'k': 4})

In [None]:
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class AgentState(TypedDict):
    """State dictionary passed between the graph nodes defined in this notebook."""

    # User question; read by retrieve() and generate().
    query: str
    # Documents returned by the retriever; written by retrieve(), read by generate().
    context: List[Document]
    # Model response; written by generate().
    answer: str

In [None]:
def retrieve(state: AgentState) -> AgentState:
    """
    'retrieve' node.

    Searches the vector store for documents related to the user's question.

    Args:
        - state(AgentState): current agent state; must contain the user's 'query'

    Returns:
        - AgentState: state update carrying the retrieved documents under 'context'
    """
    # Only the key this node produces is returned.
    return {'context': retriever.invoke(state['query'])}

In [None]:
from langsmith import Client

# LangSmith client used to fetch the shared RAG prompt.
client = Client()
# Pull the "rlm/rag-prompt" prompt; generate() invokes it with the keys
# 'question' and 'context'.
# NOTE(review): include_model=True may return the prompt already bundled with a
# model if one is saved alongside it, and generate() pipes the result into `llm`
# again — confirm this flag is wanted.
prompt = client.pull_prompt("rlm/rag-prompt", include_model=True)

In [None]:
def generate(state: AgentState) -> AgentState:
    """
    'generate' node.

    Produces an answer from the user's question and the retrieved documents.

    Args:
        - state(AgentState): current agent state holding the user's 'query' and
          the retrieved 'context'

    Returns:
        - AgentState: state update carrying the answer text under 'answer'
    """
    rag_chain = prompt | llm
    ai_message = rag_chain.invoke({
        'question': state['query'],
        'context': state['context'],
    })

    # The chat model returns a message object, whereas AgentState declares
    # `answer: str`. Store the message text so the state matches its schema.
    return {'answer': ai_message.content}

In [None]:
from langgraph.graph import StateGraph, START, END

# Builder for a graph whose shared state has the AgentState schema.
graph_builder = StateGraph(AgentState)

# Nodes: register each node function under the name the edges refer to.
graph_builder.add_node('retrieve', retrieve)
graph_builder.add_node('generate', generate)

# Edges: a linear pipeline START -> retrieve -> generate -> END.
graph_builder.add_edge(START, 'retrieve')
graph_builder.add_edge('retrieve', 'generate')
graph_builder.add_edge('generate', END)

# Alternative wiring kept for reference (not executed): build the same pipeline
# with add_sequence.
# sequence_graph_builder = StateGraph(AgentState).add_sequence([retrieve, generate])
# sequence_graph_builder.add_edge(START, 'retrieve')
# sequence_graph_builder.add_edge('generate', END)

In [None]:
# Compile the builder into the runnable graph that is invoked at the end of the notebook.
graph = graph_builder.compile()

In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
# NOTE(review): draw_mermaid_png() may need network access to render — confirm
# for offline use.
display(Image(graph.get_graph().draw_mermaid_png()))

In [None]:
# Question (Korean): "What is the income tax for a resident with an annual
# salary of 50 million won?"
# NOTE(review): the vector store above is built from the real_estate_tax files
# while this question is about income tax — confirm the intended document.
query = "연봉 5천만원인 거주자의 소득세는 얼마인가요?"
# Only 'query' is supplied; the retrieve and generate nodes add 'context' and 'answer'.
initial_state = {'query': query}

# Run the graph; as the cell's last expression, the returned state is displayed.
graph.invoke(initial_state)