In [None]:
# Install the PDF loader, the LangChain community / text-splitter packages, zerox and nest_asyncio.
# NOTE(review): later cells import dotenv, langchain_openai, langgraph and langchain.hub, which no
# %pip cell visible in this notebook installs — confirm the environment already provides them.
%pip install pypdf langchain-community langchain-text-splitters py-zerox nest_asyncio

In [None]:
from langchain_community.document_loaders import PyPDFLoader

pdf_file_path = '.././income_tax.pdf'

loader = PyPDFLoader(file_path=pdf_file_path)

pages=[]

async for page in loader.alazy_load():
    pages.append(page)


In [None]:
from dotenv import load_dotenv

# Load environment variables from a .env file.
# NOTE(review): presumably this supplies the OpenAI API key used by later cells — confirm.
load_dotenv()

In [None]:
import nest_asyncio

# Patch asyncio so that the asyncio.run(...) call in a later cell can be made from inside the notebook.
nest_asyncio.apply()

In [None]:
from pyzerox import zerox
import os
import json
import asyncio

# Extra keyword arguments forwarded verbatim to zerox() by main(); empty here.
kwargs = {}

# Passed to zerox() as custom_system_prompt; None here.
custom_system_prompt = None

# Model name passed to zerox().
model = "gpt-4o-mini"

async def main():
    """Run zerox over the income-tax PDF and return its result.

    Reads the notebook-level ``model``, ``custom_system_prompt`` and ``kwargs``.
    ``output_dir`` is ``./documents`` and ``select_pages`` is ``None``.
    """
    zerox_options = dict(
        file_path="../income_tax.pdf",
        model=model,
        output_dir="./documents",
        custom_system_prompt=custom_system_prompt,
        select_pages=None,
    )
    return await zerox(**zerox_options, **kwargs)

# Drive the main() coroutine to completion and keep its return value.
# NOTE(review): asyncio.run() inside a notebook depends on nest_asyncio.apply() having run first.
result = asyncio.run(main())

print(result)

In [None]:
# Install the Markdown extra of unstructured plus nltk, ahead of the UnstructuredMarkdownLoader cell.
%pip install -q "unstructured[md]" nltk

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Shared splitter reused by every load_and_split() call in this notebook:
# chunk_size 1500 with chunk_overlap 100, splitting on blank lines first, then single newlines.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
    separators=["\n\n", "\n"]
)



In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document

# Markdown file under ./documents, the same directory main() passes to zerox as output_dir.
# NOTE(review): presumably this is the file zerox produced from the PDF — confirm the name.
markdown_path ="./documents/income_tax.md"
# Rebinds `loader` (previously the PDF loader) to a Markdown loader.
loader = UnstructuredMarkdownLoader(markdown_path)
# Load the Markdown and split it into chunks with the shared text_splitter.
document_list = loader.load_and_split(text_splitter)

In [None]:
# Spot-check one chunk; index 43 is hard-coded and raises IndexError if fewer chunks exist.
document_list[43]

In [None]:
# Install the packages for the Markdown -> plain-text conversion cell.
# NOTE(review): html2text is installed but not imported anywhere in the visible notebook.
%pip install -q markdown html2text beautifulsoup4

In [None]:
import markdown
from bs4 import BeautifulSoup

# Destination for the plain-text version of the Markdown file.
text_path = './documents/income_tax.txt'

# Read the Markdown source (markdown_path is defined in an earlier cell).
with open(markdown_path, 'r', encoding='utf-8') as md_file:
    md_content = md_file.read()

# Render the Markdown to HTML so the markup can be stripped by an HTML parser.
html_content = markdown.markdown(md_content)

# Drop all tags and keep only the text nodes.
soup = BeautifulSoup(html_content, 'html.parser')
text_content = soup.get_text()

# Write the extracted text as UTF-8.
with open(text_path, 'w', encoding='utf-8') as txt_file:
    txt_file.write(text_content)

print("Markdown converted to plain text successfully!")

In [None]:
from langchain_community.document_loaders import TextLoader

# Load the plain-text export written by the previous cell. The name must be `loader`
# (it was misspelled `lader`), otherwise the next line silently re-uses the earlier
# Markdown loader. The file was written as UTF-8, so read it back with the same
# encoding rather than the platform default.
loader = TextLoader(text_path, encoding='utf-8')
# Re-split with the shared text_splitter; this replaces the Markdown-based document_list.
document_list = loader.load_and_split(text_splitter)

In [None]:
# Spot-check one chunk; index 39 is hard-coded and raises IndexError if fewer chunks exist.
document_list[39]

In [None]:
# Install the Chroma integration used for the vector store below.
%pip install -q langchain-chroma

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the Chroma vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
from langchain_chroma import Chroma

# Embed every chunk in document_list and store it in a Chroma collection persisted on disk.
# NOTE(review): re-running this cell against the same persist_directory may add the same
# chunks again — confirm, and clear ./income_tax_collection if duplicates show up.
vector_store = Chroma.from_documents(
    documents = document_list,
    embedding = embeddings,
    collection_name = "income_tax_collection",
    persist_directory = "./income_tax_collection"
)


In [None]:
# Retriever over the vector store, configured with k=3.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [None]:
# Sample question (Korean): income tax for an employee with an 80-million-won annual salary.
query = '연봉 8천만원 직장인의 소득세는?'

In [None]:
# Sanity-check retrieval on its own before wiring it into the graph.
retriever.invoke(query)

In [None]:
from typing_extensions import List, TypedDict

class AgentState(TypedDict):
    """State dictionary passed between the graph's nodes."""
    # Question text; read by both retrieve() and generate().
    query: str
    # Documents returned by the retriever; written by retrieve(), read by generate().
    context: List[Document]
    # Value written by generate().
    answer: str

In [None]:
from langgraph.graph import StateGraph

# Builder for the graph; AgentState is its state schema.
graph_builder = StateGraph(AgentState)


In [None]:
def retrieve(state: AgentState):
    """Look up documents for ``state['query']`` with the notebook-level retriever.

    Returns:
        A partial state update ``{'context': <retrieved documents>}``.
    """
    return {'context': retriever.invoke(state['query'])}

In [None]:
from langchain import hub
from langchain_openai import ChatOpenAI

# Pull the "rlm/rag-prompt" prompt from the hub; generate() fills its `question` and `context` inputs.
prompt = hub.pull("rlm/rag-prompt")

# Chat model that generate() pipes the prompt into.
llm = ChatOpenAI(model="gpt-4o")

In [None]:
def generate(state: AgentState):
    """Answer ``state['query']`` using ``state['context']`` via the prompt and LLM.

    Returns:
        A partial state update ``{'answer': <answer text>}``. Only the message
        text is stored, so the value matches ``AgentState.answer: str`` rather
        than holding the whole chat message object.
    """
    context = state['context']
    query = state['query']
    rag_chain = prompt | llm
    response = rag_chain.invoke({'question': query, 'context': context})
    # The chat model returns a message object; keep just its text.
    return {'answer': response.content}

In [None]:
# Register the two node functions under the names used by the edge definitions.
graph_builder.add_node('retrieve', retrieve)
graph_builder.add_node('generate', generate)

In [None]:
from langgraph.graph import START , END

# Linear flow: START -> retrieve -> generate -> END.
graph_builder.add_edge(START, 'retrieve')
graph_builder.add_edge('retrieve', 'generate')
graph_builder.add_edge('generate', END)

# Compile the builder into the graph object invoked at the end of the notebook.
graph = graph_builder.compile()


In [None]:
from IPython.display import Image, display

# Render the compiled graph as a Mermaid PNG and show it inline.
display(Image(graph.get_graph().draw_mermaid_png()))

In [None]:
# Alternative construction: add_sequence() is given the node functions in execution order.
# NOTE(review): node names are presumably taken from the function names, since later
# edges refer to 'retrieve' — confirm.
sequence_graph_builder = StateGraph(AgentState).add_sequence([retrieve, generate])

In [None]:
# add_sequence() has already connected retrieve -> generate, so only the entry and
# exit edges are needed. The exit edge must leave the last node ('generate');
# pointing 'retrieve' at END would leave 'generate' with no outgoing edge.
sequence_graph_builder.add_edge(START, 'retrieve')
sequence_graph_builder.add_edge('generate', END)

In [None]:
# Compile the sequence-based builder into its own graph object.
sequence_graph = sequence_graph_builder.compile()

In [None]:
# Render the sequence-based graph for visual comparison with the graph drawn earlier.
display(Image(sequence_graph.get_graph().draw_mermaid_png()))

In [None]:
# Only 'query' is supplied up front; retrieve() and generate() return the 'context' and 'answer' keys.
initial_state = {'query': query}
# Run the graph; the returned state is shown as the cell output.
graph.invoke(initial_state)