In [1]:
!pip install unstructured
!pip install pdfminer.six
!pip install pi-heif
!pip install openai

Collecting unstructured
  Downloading unstructured-0.16.8-py3-none-any.whl.metadata (24 kB)
Collecting chardet (from unstructured)
  Using cached chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting lxml (from unstructured)
  Using cached lxml-5.3.0-cp310-cp310-win_amd64.whl.metadata (3.9 kB)
Collecting nltk (from unstructured)
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting requests (from unstructured)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting beautifulsoup4 (from unstructured)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting emoji (from unstructured)
  Using cached emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting dataclasses-json (from unstructured)
  U



Collecting pdfminer.six
  Using cached pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Using cached pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20240706
Collecting pi-heif
  Downloading pi_heif-0.21.0-cp310-cp310-win_amd64.whl.metadata (6.7 kB)
Collecting pillow>=10.1.0 (from pi-heif)
  Using cached pillow-11.0.0-cp310-cp310-win_amd64.whl.metadata (9.3 kB)
Downloading pi_heif-0.21.0-cp310-cp310-win_amd64.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------- ----- 1.6/1.8 MB 10.5 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 10.1 MB/s eta 0:00:00
Using cached pillow-11.0.0-cp310-cp310-win_amd64.whl (2.6 MB)
Installing collected packages: pillow, pi-heif
Successfully installed pi-heif-0.21.0 pillow-11.0.0
Collecting openai
  Downloading openai-1.55.3-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=

In [2]:
import os
from unstructured.partition.pdf import partition_pdf

# location of the PDF article to parse (relative path)
pdf_file_path = "../datasets/pdf_files/adult_data_article.pdf"

# partition the PDF into elements using the "hi_res" strategy
raw_pdf_elements = partition_pdf(filename=pdf_file_path, strategy="hi_res")


# function to extract tables and texts from the raw pdf elements
def extract_text_and_tables(raw_pdf_elements):
    """
    Split partitioned PDF elements into text chunks and tables.

    An element is classified by looking at the string form of its type:
    anything whose type string contains
    "unstructured.documents.elements.Table" is a table, otherwise anything
    containing "unstructured.documents.elements.CompositeElement" is a text.
    Elements matching neither marker are dropped.

    Args:
        raw_pdf_elements: iterable of partitioned PDF elements

    Returns:
        (texts, tables): two lists holding ``str(element)`` for every text
        and table element, in their original order
    """
    table_marker = "unstructured.documents.elements.Table"
    # NOTE(review): CompositeElement presumably only shows up when a chunking
    # strategy was applied while partitioning - confirm, otherwise `texts`
    # will always come back empty.
    text_marker = "unstructured.documents.elements.CompositeElement"

    texts, tables = [], []
    for item in raw_pdf_elements:
        type_repr = str(type(item))

        # the table check comes first, so it wins if both markers match
        if table_marker in type_repr:
            bucket = tables
        elif text_marker in type_repr:
            bucket = texts
        else:
            continue

        bucket.append(str(item))

    return texts, tables


# split the partitioned PDF elements into text chunks and tables
found_texts, found_tables = extract_text_and_tables(raw_pdf_elements)

ModuleNotFoundError: No module named 'unstructured_inference'

In [None]:
from openai import OpenAI
import pandas as pd


# function to take tables as input and then summarize them
def tables_summarize(row, client=None):
    """
    Use an LLM to write a short text summary of the table stored in one row.

    Designed to be used with ``DataFrame.apply(..., axis=1)``. The row that is
    passed in is never modified: pandas does not support functions that
    mutate the object handed to ``apply``, so the summary is written to a
    copy of the row and that copy is returned.

    Args:
        row: pandas dataframe row, including the attribute table
        client: optional object exposing ``chat.completions.create`` (for
            example an already constructed ``OpenAI`` client that should be
            shared across rows). When omitted, a new ``OpenAI`` client is
            created from the ``OPENAI_API_KEY`` environment variable.
    Returns:
        a copy of ``row`` with the additional entry ``table_summary`` holding
        the generated summary text
    """
    summary_prompt = f"""You are an assistant tasked with summarizing tables. \
                    Give a concise summary of the table. Table chunk: {row.table}"""

    # Only build a client when the caller did not supply one
    if client is None:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Ask the chat model for the summary of this table
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": summary_prompt}],
        temperature=0.7,
        max_tokens=150,
    )

    # Store the text of the first choice on a copy so the input row stays intact
    summarized_row = row.copy()
    summarized_row["table_summary"] = response.choices[0].message.content

    return summarized_row


# build a one-column dataframe from the extracted tables and let
# tables_summarize process every row
tables_df = (
    pd.DataFrame(found_tables, columns=["table"])
    .apply(tables_summarize, axis=1)
)

In [None]:
# example question to ask about the content of the extracted table
user_question = "What are the education levels of the people working in Sales?"


def build_prompt_and_generate_answer(user_question, found_table, client=None):
    """
    Answer a user's question with an LLM, using one table as the context.

    The prompt combines the question with the table's generated summary and
    its content, and is sent as a single user message to the chat model.

    Parameters:
        user_question: the question asked by the user
        found_table: the table context to generate the answer from; must
            provide the attributes ``table_summary`` and ``table``
        client: optional object exposing ``chat.completions.create`` (for
            example an already constructed ``OpenAI`` client to reuse). When
            omitted, a new ``OpenAI`` client is created from the
            ``OPENAI_API_KEY`` environment variable.

    Returns:
        answered_question: the answer to the user's question
    """
    # define the question prompt
    question_prompt = f"""You are an assistant using the content from PDFs \
                        to answer questions. Below you can find the \
                        user's question and relevant context. Please use the \
                        context to generate an answer to the user's question.
                        
                        # User question: {user_question}

                        # Context: 
                        
                        ## Table summary: 
                        {found_table.table_summary}

                        ## Table content: 
                        {found_table.table}"""

    # only build a client when the caller did not supply one
    if client is None:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # request the chat completion for the assembled prompt
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": question_prompt}],
        temperature=0.7,
        max_tokens=150,
    )

    # the answer is the text of the first returned choice
    answered_question = response.choices[0].message.content

    return answered_question


# answer the user's question, using the first row of tables_df as the context
answered_question = build_prompt_and_generate_answer(user_question, tables_df.iloc[0])

print(answered_question)

When a user asks for information like this, we attach both the table and its generated summary to the prompt. The code snippet below shows how to build a simple prompt from the table and its text summary in order to answer the user's question.

In [None]:
# example question to ask about the content of the extracted table
user_question = "What are the education levels of the people working in Sales?"


# NOTE(review): a function with this same name is already defined in an
# earlier code cell of this notebook; this later definition replaces it once
# the cell has been run.
def build_prompt_and_generate_answer(user_question, found_table, client=None):
    """
    Answer a user's question with an LLM, using one table as the context.

    The prompt combines the question with the table's generated summary and
    its content, and is sent as a single user message to the chat model.

    Parameters:
        user_question: the question asked by the user
        found_table: the table context to generate the answer from; must
            provide the attributes ``table_summary`` and ``table``
        client: optional object exposing ``chat.completions.create`` (for
            example an already constructed ``OpenAI`` client to reuse). When
            omitted, a new ``OpenAI`` client is created from the
            ``OPENAI_API_KEY`` environment variable.

    Returns:
        answered_question: the answer to the user's question
    """
    # define the question prompt
    question_prompt = f"""You are an assistant using the content from PDFs \
                        to answer questions. Below you can find the \
                        user's question and relevant context. Please use the \
                        context to generate an answer to the user's question.
                        
                        # User question: {user_question}

                        # Context: 
                        
                        ## Table summary: 
                        {found_table.table_summary}

                        ## Table content: 
                        {found_table.table}"""

    # only build a client when the caller did not supply one
    if client is None:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # request the chat completion for the assembled prompt
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": question_prompt}],
        temperature=0.7,
        max_tokens=150,
    )

    # the answer is the text of the first returned choice
    answered_question = response.choices[0].message.content

    return answered_question


# answer the user's question, using the first row of tables_df as the context
answered_question = build_prompt_and_generate_answer(user_question, tables_df.iloc[0])

print(answered_question)