In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import groq
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
import fitz
import pdfplumber
from io import BytesIO
from PIL import Image
from groq import Groq
import base64

In [3]:
groq.api_key = "api-key"

# Disable parallelism for HuggingFace tokenizers to avoid deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"


groq.api_key = "api-key"


# Function to get structured data from the image using Groq
def get_image_data_from_groq(base64_image):
    client = Groq(api_key = "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")

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": 
                        f"""
                        ### INSTRUCTION:
                        The image is from a page in a document.
                        Your job is to extract complete information from the image and return the data in a structured format. It may contain tables, graphs and piecharts or anything.
                        ### WITHOUT ANY ADDITION COMMENT, INTRODUCTORY OR CONCLUDING REMARKS (NO PREAMBLE):
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        model="llama-3.2-90b-vision-preview",
    )

    return chat_completion.choices[0].message.content

# Function to encode the image
def encode_image(image_bytes):
    return base64.b64encode(image_bytes).decode('utf-8')

def nested_list_to_string(nested_list):
    result = []
    for sublist in nested_list:
        for inner_list in sublist:
            # Convert all items to strings, handling None values
            formatted_row = ', '.join(str(item) if item is not None else '' for item in inner_list)
            result.append(formatted_row)
    return '\n'.join(result)


# Function to extract text and images from PDF files
def extract_pdf_text_and_images(pdf_paths):
    """Extract text and image-based content from a list of PDF files using PyMuPDF."""
    pdf_texts = []
    for pdf_path in pdf_paths:
        try:
            with pdfplumber.open(pdf_path) as pdf:
                doc = fitz.open(pdf_path)
                for count, page in enumerate(pdf.pages, start=1):
                    page_number = count
                    text = page.extract_text()
                    tables = nested_list_to_string(page.extract_tables())

                    # Initialize data structure for the current page
                    page_data = {
                        'page_number': page_number,
                        'text': text,
                        'tables': "",
                        'images': tables
                    }

                    # Extract images from the page using PyMuPDF
                    fitz_page = doc.load_page(page_number - 1)
                    images = fitz_page.get_images(full=True)
                    for img_index, img in enumerate(images):
                        xref = img[0]
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image = Image.open(BytesIO(image_bytes))
    
                        # Check image dimensions
                        if image.width < 2 or image.height < 2:
                            continue
                        
                        try:
                            # Encode image and get structured data using Groq
                            encoded_image = encode_image(image_bytes)
                            llm_img_data = get_image_data_from_groq(encoded_image)
                            page_data['images'] += f"Image: {llm_img_data}"
                        except Exception as e:
                            print(f"Error extracting data from image on page {page_number}, image {img_index + 1}: {e}")

                    
                    # Append the page data to the overall pdf_texts list
                    pdf_texts.append(page_data)
        except Exception as e:
            print(f"Error reading {pdf_path}: {e}")
    
    return pdf_texts

In [5]:
pdf_paths = ["example.pdf"]
extracted_data = extract_pdf_text_and_images(pdf_paths)
print("data extracted")

data extracted


In [9]:
print(extracted_data[18])

{'page_number': 19, 'text': 'Example from Psychology\nWhat do you notice\nis different in this\ngraph than the\nothers reviewed so\nfar?', 'tables': '', 'images': ''}


In [13]:
import chromadb

chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="document")

# Add data to the collection
for page in extracted_data:
    data = f"Page_Number: PAGE {page['page_number']}\n\n Page_Image_Description: {page['images']}\n\n TABLE: Table {page['tables']}\n\n Page_Text: {page['text']}\n\n"

    collection.add(
        documents=[data],
        metadatas={"page_number": f"page {page['page_number']}"},
        ids=[f"doc_{page['page_number']}"]
    )
print("Data successfully added to ChromaDB!")

Insert of existing embedding ID: doc_1
Add of existing embedding ID: doc_1
Insert of existing embedding ID: doc_2
Add of existing embedding ID: doc_2
Insert of existing embedding ID: doc_3
Add of existing embedding ID: doc_3
Insert of existing embedding ID: doc_4
Add of existing embedding ID: doc_4
Insert of existing embedding ID: doc_5
Add of existing embedding ID: doc_5
Insert of existing embedding ID: doc_6
Add of existing embedding ID: doc_6
Insert of existing embedding ID: doc_7
Add of existing embedding ID: doc_7
Insert of existing embedding ID: doc_8
Add of existing embedding ID: doc_8
Insert of existing embedding ID: doc_9
Add of existing embedding ID: doc_9
Insert of existing embedding ID: doc_10
Add of existing embedding ID: doc_10
Insert of existing embedding ID: doc_11
Add of existing embedding ID: doc_11
Insert of existing embedding ID: doc_12
Add of existing embedding ID: doc_12
Insert of existing embedding ID: doc_13
Add of existing embedding ID: doc_13
Insert of existin

Data successfully added to ChromaDB!


In [9]:
results = collection.query(
    query_texts=["From page 6 get the tabular data"], # Chroma will embed this for you
    n_results=7 # how many results to return
)
print(results)


{'ids': [['doc_14', 'doc_1', 'doc_3', 'doc_5', 'doc_4', 'doc_15', 'doc_6']], 'embeddings': None, 'documents': [['Page_Number: PAGE 14\n\n Page_Image_Description: \n\n TABLE: Table \n\n Page_Text: • If given a table of data, we should be able to plot it. Below is\nsome sample data; plot the data with x on the x-axis and y on the\ny-axis.\nx y\n0 0\n1 3\n2 6\n3 9\n4 12\n5 15\n6 18\n7 21\n8 24\n\n', 'Page_Number: PAGE 1\n\n Page_Image_Description: \n\n TABLE: Table \n\n Page_Text: Tables, Charts, and\nGraphs\nwith Examples from History, Economics,\nEducation, Psychology, Urban Affairs and\nEveryday Life\nREVISED: MICHAEL LOLKUS 2018\n\n', 'Page_Number: PAGE 3\n\n Page_Image_Description: \n\n TABLE: Table \n\n Page_Text: Tables, Charts, and\nGraphs Basics\n\n', 'Page_Number: PAGE 5\n\n Page_Image_Description: \n\n TABLE: Table \n\n Page_Text: Types of Visual\nRepresentations of Data\n\n', 'Page_Number: PAGE 4\n\n Page_Image_Description: \n\n TABLE: Table \n\n Page_Text: \uf075 We use chart

In [11]:
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq.api_key)

query = "From page 6 get the tabular data"

page = collection.query(
    query_texts=[query], # Chroma will embed this for you
    n_results=7 # how many results to return
)['documents'][0]

prompt = ChatPromptTemplate.from_template(
        """
        ### PAGE CONTENT:
        {page}
        ### INSTRUCTION:
        You are an assistant tasked with providing relevant information from the above page content based on the following query: {query}.
        ### (NO PREAMBLE):

        """
    )
query_chain = prompt | llm | StrOutputParser()

query_result = query_chain.invoke({"page":page, "query": query})
print(query_result)

From page 6, the tabular data is:

| Year | All Industries | Manufacturing | Finance, Insurance, Real Estate, Rental, Leasing | Arts, Entertainment, Recreation, Accommodation, and Food Service | Other |
| --- | --- | --- | --- | --- | --- |
| 2010 | 26093515 | 4992521 | 4522451 | 964032 | 15614511 |
| 2011 | 27535971 | 5581942 | 4618678 | 1015238 | 16320113 |
| 2012 | 28663246 | 5841608 | 4797313 | 1076249 | 16948076 |
| 2013 | 29601191 | 5953299 | 5031881 | 1120496 | 17495515 |
| 2014 | 30895407 | 6047477 | 5339678 | 1189646 | 18318606 |
| 2015 | 31397023 | 5829554 | 5597018 | 1283813 | 18686638 |
