In [19]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from unstructured.partition.pptx import partition_pptx
from unstructured.staging.base import dict_to_elements

import chromadb

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [2]:

DLAI_API_KEY = ""
DLAI_API_URL = ""

s = UnstructuredClient(
    api_key_auth=DLAI_API_KEY,
    server_url=DLAI_API_URL,
)


In [3]:
%ls

CoT.pdf                                embedded-images-tables.pdf
RAG_ON_UNSTRUCTURED.ipynb              image_in_pdf_preprocess.ipynb
[34mchroma_tmp[m[m/                            medium_blog.html
donut_paper.pdf                        msft_openai.pptx
donut_readme.md                        process_unstructured_file_types.ipynb
donut_slide.pptx                       unstructured_chromadb.ipynb
el_nino.html                           winter-sports.epub
el_nino.pdf


In [6]:
filename = "donut_paper.pdf"
with open(filename,"rb") as f:
    files =  shared.Files(
        content=f.read(),
        file_name=filename
    )
req = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    hi_res_model_name="yolox",
    pdf_infer_table_structure=True,
    skip_infer_table_types=[],
)
res = s.general.partition(req)
pdf_elements= dict_to_elements(res.elements)

In [7]:
pdf_elements[2:5]

[<unstructured.documents.elements.NarrativeText at 0x16b408650>,
 <unstructured.documents.elements.Title at 0x1699aa310>,
 <unstructured.documents.elements.Title at 0x175ad5610>]

In [11]:
tables = [el for el in pdf_elements if el.category=='Table']
tables[0].metadata.text_as_html

'<table><tr><td>NAVER CLOVA</td><td>2NAVER Search</td><td>3SNAVER AI Lal</td></tr></table>'

In [12]:
reference_title = [
    el for el in pdf_elements
    if el.text == "References"
    and el.category == "Title"
][0]

In [13]:
reference_title.to_dict()
references_id = reference_title.id

for element in pdf_elements:
    if element.metadata.parent_id == references_id:
        print(element)
        break

pdf_elements = [el for el in pdf_elements if el.metadata.parent_id != references_id]

1. Afzal, M.Z., Capobianco, S., Malik, M.I., Marinai, S., Breuel, T.M., classification with Dengel, A., Liwicki, M.: Deepdocclassifier: Document deep convolutional neural network. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 1111–1115 (2015). https://doi.org/10.1109/ICDAR.2015.7333933 1, 4, 14


In [14]:
headers = [el for el in pdf_elements if el.category == "Header"]
pdf_elements = [el for el in pdf_elements if el.category != "Header"]

In [15]:
###PPT

In [16]:
filename = "donut_slide.pptx"
pptx_elements = partition_pptx(filename=filename)

In [17]:
filename = "donut_readme.md"
md_elements = partition_md(filename=filename)

INFO: Reading document from string ...
INFO: Reading document ...
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no attribute type
INFO: HTML element instance has no

In [42]:
elements = chunk_by_title(pdf_elements + pptx_elements + md_elements)

In [43]:
documents = []
for element in elements:
    metadata= element.metadata.to_dict()
    del metadata["languages"]
    metadata['source'] = metadata['filename']
    documents.append(Document(page_content=element.text,metadata=metadata))

In [66]:
%set_env OPENAI_API_KEY=

env: OPENAI_API_KEY=


In [54]:
embeddings = OpenAIEmbeddings()

In [55]:
documents

[Document(page_content='OCR-free Document Understanding Transformer\n\n2 2 0 2\n\nGeewook Kim1∗, Teakgyu Hong4†, Moonbin Yim2†, Jeongyeon Nam1, Jinyoung Park5†, Jinyeong Yim6†, Wonseok Hwang7†, Sangdoo Yun3, Dongyoon Han3, and Seunghyun Park1\n\nt c O 6\n\n1NAVER CLOVA 4Upstage\n\n3NAVER AI Lab', metadata={'filename': 'donut_paper.pdf', 'filetype': 'application/pdf', 'page_number': 1, 'orig_elements': 'eJzVVF1PFTEQ/SvNPl+1X9ttfSNoREQwcMEQJGTazq4b7rabvbvqlfjfnUUwxGCCTze+NOmZMx9nZtqLmwJX2GEar9pYvGSFKWUoQXPhnNYeldRYaQ9OCGu9FqJYsKLDESKMQPybom5XmKDD2TnmNI1XPfQ4PO9jPXNn87jpb83Q96s2wNjm9OLOvILUTNDgmuwXBaamuCS0J+QqTZ3HgXDxg6ARv41zjKPd42f1gMhe5TDNdbPTFHFYj5Bimxq2HCCt6zx05Dq73aVetuMKCwr0p17rVOViLI01wULQUgmrRFnZ0inqhd2G3ltk+IeZPGyQZJJxJh+KP01UBDZ5aL9jXM68RxohXBU4N0ogVLzmwVQViMo5W1tNnaj/u0a8Qfya8zV713bi0ySlqBZsiXDdbCa2l1OjCeSSL9j7nJNvEztvO3mP7SMxNnSwQ+iojv02bfJE+/UBhuvyN4vQmTi7mnvwY05rpLx7X0lidY+e0CXmzM6npBa0vBSd0rI9mK+0u+wEKfznzZRuU4iH8zuEYaDmfcG/zS4665Abj9yGUIcyaF6byAGDtKRbb/vRjiywI2ae9CBRld575V

In [56]:
vstore = Chroma.from_documents(documents,embeddings)

INFO: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [57]:
retriever = vstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6}
)

In [58]:
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x3089484d0>, search_kwargs={'k': 6})

In [59]:
from langchain.prompts.prompt import PromptTemplate
from langchain_openai import OpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

In [60]:
template = """You are an AI assistant for answering questions about the Donut document understanding model.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about Donut, politely inform them that you are tuned to only answer questions about Donut.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
prompt = PromptTemplate(template=template, input_variables=["question", "context"])
prompt



In [61]:
llm = OpenAI(temperature=0)

doc_chain = load_qa_with_sources_chain(llm, chain_type="map_reduce")

In [62]:
doc_chain



In [63]:
question_generator_chain = LLMChain(llm=llm, prompt=prompt)

In [64]:
qa_chain = ConversationalRetrievalChain(
    retriever=retriever,
    question_generator=question_generator_chain,
    combine_docs_chain=doc_chain,
)

In [65]:
qa_chain.invoke({
    "question": "How does Donut compare to other document understanding models?",
    "chat_history": []
})["answer"]

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"


ValueError: Document prompt requires documents to have metadata variables: ['source']. Received document with missing metadata: ['source'].