# Using Unstructured to Extract Information from Scanned PDFs
- https://unstructured.io/
- https://unstructured-io.github.io/unstructured/index.html
- https://docs.unstructured.io/api-reference/api-services/python-sdk


In [None]:
%%capture
# Install Unstructured with the "all-docs" extra; %%capture hides the installer output.
%pip install "unstructured[all-docs]"

In [None]:
# Warning control: ignore all Python warnings so they do not clutter the cell outputs
import warnings
warnings.filterwarnings('ignore')

In [None]:
from IPython.display import JSON

import json

from unstructured.partition.html import partition_html
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements, elements_to_json

In [None]:
# Show the metadata pip has recorded for the installed unstructured package
%pip show unstructured

In [None]:
import unstructured.partition

# Print the built-in help text for the unstructured.partition package
help(unstructured.partition)


In [None]:
from unstructured.partition.pdf import partition_pdf

# Path to the PDF to parse
filename = "data/gpt4all.pdf"

# Partition the PDF with the default settings.
# `elements` is a List[Element] holding every element found in the document's pages.
elements = partition_pdf(filename=filename)

In [None]:
# Display the list of elements returned by partition_pdf
elements

In [None]:
# Number of elements returned by partition_pdf
len(elements)

In [None]:
# Turn each element into a plain dict, then pretty-print the whole list as JSON
element_dict = [element.to_dict() for element in elements]
output = json.dumps(element_dict, indent=2)
print(output)

In [None]:
# Collect the distinct values of the 'type' field across all element dicts
unique_types = {item['type'] for item in element_dict}

print(unique_types)

In [None]:
from unstructured.partition.pdf import partition_pdf

# Path to the scanned version of the PDF
filename = "data/scanned_gpt4all.pdf"

# Partition the scanned PDF with the default settings.
# `elements` is a List[Element] holding every element found in the document's pages.
elements = partition_pdf(filename=filename)

In [None]:
# Display the elements returned for the scanned PDF
elements

In [None]:
# Number of elements returned for the scanned PDF
len(elements)

In [None]:
# Turn each element into a plain dict, then pretty-print the whole list as JSON
element_dict = [element.to_dict() for element in elements]
output = json.dumps(element_dict, indent=2)
print(output)

### Okay, scanned PDF extraction works.

##### However, we don't see any `Table` elements: the table information was not extracted as we expected, so let's use a different strategy.

### Table extraction from PDF
- Now let’s say that your PDF has tables and you want to preserve their structure.
- You will have to set the [strategy](https://unstructured-io.github.io/unstructured/best_practices/strategies.html) parameter to `hi_res` (the cell below also passes `infer_table_structure=True`). This uses a combination of computer vision and Optical Character Recognition (OCR) to extract the tables and maintain their structure.
  It returns both the text and the HTML of each table, which is very useful for rendering the tables or passing them to an LLM.

> Note: For even better table extraction Unstructured offers an API that improves upon the existing open source models.

> Depending on your machine, you might run into different module / library issues; these links might help:
- https://stackoverflow.com/questions/59690698/modulenotfounderror-no-module-named-lzma-when-building-python-using-pyenv-on
- https://unstructured-io.github.io/unstructured/installation/full_installation.html

In [None]:
from unstructured.partition.pdf import partition_pdf

# Re-run the partitioning with the hi_res strategy and table-structure inference enabled
elements = partition_pdf(
    filename=filename,
    strategy='hi_res',
    infer_table_structure=True,
)

In [None]:
# Number of elements returned by the hi_res run
len(elements)

In [None]:

# Pretty-print the hi_res elements as JSON
element_dict = [element.to_dict() for element in elements]
output = json.dumps(element_dict, indent=2)
print(output)

# Distinct values of the 'type' field found in this run
unique_types = {item['type'] for item in element_dict}
print(unique_types)

In [None]:
# Keep only the elements whose category is "Table"
tables = [el for el in elements if el.category == "Table"]

# Guard against an empty result so that a document with no detected tables
# produces a readable message instead of an IndexError on tables[0].
if tables:
    first_table = tables[0]
    print(first_table.text)
    print(first_table.metadata.text_as_html)
else:
    print("No Table elements were found in the partitioned document.")

In [None]:
# Display the list of Table elements
tables

In [None]:
# Number of Table elements found
len(tables)

In [None]:
# Plain-text content of the first table
tables[0].text

In [None]:
# Metadata object attached to the first table (it carries text_as_html, used below)
tables[0].metadata

### Now comes the most interesting part (using the extracted data in the most efficient way)

- It's helpful to have an HTML representation of the table so that you can pass the information to an LLM while maintaining the table structure.

In [None]:
# HTML representation of the first table, read from the element's metadata
table_html = tables[0].metadata.text_as_html

In [None]:
# Display the raw HTML string
table_html

In [None]:
# Pretty-print the table HTML stored in the metadata field

from io import StringIO
from lxml import etree

# Parse the markup with blank text removed, then re-serialize it with indentation
parser = etree.XMLParser(remove_blank_text=True)
file_obj = StringIO(table_html)
tree = etree.parse(file_obj, parser)
pretty_bytes = etree.tostring(tree, pretty_print=True)
print(pretty_bytes.decode())

In [None]:
# Render the table HTML inline in the notebook.
# HTML is imported from the public IPython.display module (the same module the
# notebook's JSON import uses) rather than from the internal IPython.core.display.
from IPython.display import HTML
HTML(table_html)

#### Now, let's plug in LangChain to summarize these tables using `Llama3` via `Ollama`
#### [Ollama Playlist](https://www.youtube.com/playlist?list=PLz-qytj7eIWX-bpcRtvkixvo9fuejVr8y)

In [None]:
%%capture
# Install the LangChain packages used below; %%capture hides the installer output.
# NOTE(review): a later cell imports from `langchain.chains`, but `langchain` itself
# is not listed here — confirm it is installed (e.g. as a dependency of these packages).
%pip install langchain-ollama langchain_core langchain_community

In [None]:
from langchain_ollama import ChatOllama
from langchain_core.documents import Document
from langchain.chains.summarize import load_summarize_chain

In [None]:
# IPython introspection: `??` shows the documentation (and source, when available) for ChatOllama
ChatOllama??

First run the Ollama server  
http://localhost:11434

In [None]:
# Chat model backed by the Ollama server
llm = ChatOllama(model="llama3.1:8b")

# Build a "stuff" summarization chain and run it on the table HTML wrapped in a single Document
chain = load_summarize_chain(llm, chain_type="stuff")
table_document = Document(page_content=table_html)
output = chain.invoke([table_document])

In [None]:
# Inspect the full result returned by chain.invoke
output

In [None]:
# Print only the 'output_text' entry of the chain result
print(output['output_text'])

#### Convert to pandas df

In [None]:
# Install pandas, used below to turn the HTML table into a DataFrame
%pip install pandas

In [None]:
from io import StringIO

import pandas as pd

# Convert the HTML table to pandas DataFrames; read_html returns a list of DataFrames.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated since pandas 2.1.
dfs = pd.read_html(StringIO(table_html))

In [None]:
# Display the list of DataFrames parsed from the HTML
dfs

In [None]:

# Assuming there's only one table, take the first DataFrame from the list
df = dfs[0]

# Print the DataFrame's contents
print(df)


In [None]:
# (rows, columns) of the extracted table
df.shape

In [None]:
# Preview the first rows of the DataFrame
df.head()