# New Table Preprocessing


## Required Libraries and Helper Functions

In [1]:
import os
import json
import re
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
import base64
from dotenv import load_dotenv
import openai
import pandas as pd
import shutil
from database_manager import MySQLDB
from pydantic import BaseModel
from textwrap import dedent




# Load environment variables (including the OpenAI key) from a local .env file.
load_dotenv()
API_KEY_CGPT = os.getenv('API_KEY_CGPT')
# Only report whether a key was found. Printing the key itself stores the
# secret in the saved notebook output, where it is easily shared or committed.
print(f"Loaded API KEY ChatGPT: {'yes' if API_KEY_CGPT else 'NO - API_KEY_CGPT is not set'}")
openai.api_key = API_KEY_CGPT
# Default model used by chat_gpt_api_call_image.
MODEL = "gpt-4o"

Loaded API KEY ChatGPT: [REDACTED]


In [39]:
def call_gpt_api_with_single_prompt(instructions, prompt, model="gpt-4o-2024-08-06", max_tokens=2500, response_format=None, img_path=None,detail='high'):
    """
    Send one system message and one user message to the GPT API and return the reply.

    Parameters:
    - instructions: System message that sets the context (e.g., "You are an AI assistant that analyzes tables").
    - prompt: User message or query. It is passed through textwrap.dedent before being sent.
    - model: The GPT model to be used (default is "gpt-4o-2024-08-06").
    - max_tokens: Maximum number of tokens for the response (default is 2500).
    - response_format: Optional structured-output schema forwarded to the API
      (this file passes pydantic BaseModel subclasses). Left out of the request when None.
    - img_path: Optional path to an image file. When given, the file is base64-encoded
      and attached to the user message as a data URL declared as image/png.
    - detail: Value of the image "detail" field (default 'high'). Only used when img_path is given.

    Returns:
    - The message content of the first choice, or None if the API call raised an
      exception (the error is printed).
    """
    user_text = dedent(prompt)

    if img_path:
        # Multimodal user message: the text part first, then the image part.
        base64_image = encode_image(img_path)
        user_content = [
            {'type': 'text', 'text': user_text},
            {'type': 'image_url', 'image_url': {'url': f"data:image/png;base64,{base64_image}", 'detail': detail}},
        ]
    else:
        # Text-only user message.
        user_content = user_text

    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_content},
    ]

    # Build the request once; response_format is only sent when the caller supplied one.
    request_kwargs = {"model": model, "messages": messages, "max_tokens": max_tokens}
    if response_format is not None:
        request_kwargs["response_format"] = response_format

    try:
        response = openai.beta.chat.completions.parse(**request_kwargs)
        # Extract and return the response content
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort contract kept from the original: report the error and return None.
        print(f"Error during GPT API call: {e}")
        return None

def encode_image(image_path):
    """
    Read a file in binary mode and return its contents as base64 text.

    Parameters:
    - image_path: Path of the file to read.

    Returns:
    - The base64 encoding of the file's bytes, decoded to a str.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

## Define Meta-Prompt

In [40]:
# Structured-output schema for the meta-prompt step. It is passed as
# response_format to call_gpt_api_with_single_prompt, and the reply is then
# parsed with json.loads and read by these three keys.
# NOTE: documented with comments rather than a docstring on purpose; pydantic
# presumably copies a class docstring into the generated schema - TODO confirm.
class meta_prompt_reponse(BaseModel):
    # Summary of the table's main idea (the prompt asks for it in German).
    main_idea: str
    # Generated system instructions to pair with generated_prompt.
    instruction_set: str
    # Generated prompt used afterwards to extract row-level descriptions.
    generated_prompt: str

# System message for the meta-prompt step.
instruction_set = """You are an AI assistant specialized in analyzing tables from regulatory documents. Your task is to:
1. Analyze the provided table and summarize its main idea or purpose concisely in German.
2. Generate a new tailored prompt that can be used to extract detailed row-level data from the table in a structured way for retrieval purposes.

The new prompt should guide the system to extract each row's data, ensuring important values like percentages, units, and any special notes are included.
"""
# User message for the meta-prompt step; the three tasks map onto the three
# fields of meta_prompt_reponse.
prompt = """Please analyze the following table and complete the following three tasks:

1. Summarize the main idea of the table in **German**, focusing on key points such as the regulations, data categories, or the overall content it covers.

2. Generate a **new set of system instructions** to pair up with the generated prompt. It should give clear instructions to the system, specifying their role and expertise. Set the behaviour to pair up with the generated prompt. The descriptions have to be in **german**.
   
3. Generate a **new prompt** that can be used to extract a detailed **row-level description** from the table. This new prompt should be optimized for retrieving important values like **percentages, units, and special notes**. Ensure that the description is suitable for embedding and later retrieval.

The goal is to make the row extraction as efficient and informative as possible. 
"""
# Sample table image used for this experiment (alternative sample kept for reference).
# img_path = '/Users/rodolfocacacho/Documents/Documents/MAI/Master Thesis/Code/rag_clean_v2/data/documents/output/Richtlinie BEG EM/Richtlinie BEG EM (2023-12-21)/tables/fileoutpart4.png'
img_path = '/Users/rodolfocacacho/Documents/Documents/MAI/Master Thesis/Code/rag_clean_v2/data/documents/output/Liste förderfähigen Anlagen - Biomasse/BEG EM Liste förderfähigen Biomasse (2021-02-17)/part0/tables/fileoutpart24.png'

response = call_gpt_api_with_single_prompt(instructions=instruction_set,
                                prompt=prompt,
                                img_path=img_path,
                                response_format=meta_prompt_reponse)

# call_gpt_api_with_single_prompt returns None when the API call fails.
# Stop here with a clear message instead of an opaque TypeError from json.loads.
if response is None:
    raise RuntimeError('Meta-prompt request failed: the GPT API call returned no response')

response_l = json.loads(response)
for field_name, field_value in response_l.items():
    print(f'{field_name} - {field_value}')

main_idea - Die Tabelle listet förderfähige Heizkesselmodelle des Herstellers Hargassner auf. Sie beschreibt die Nennwärmeleistung, den Kesselwirkungsgrad, CO-Emissionen, Staubemissionen und CO-Meßteil bei verschiedenen Anlagentypen. Diese Informationen sind entscheidend für die Förderung nach BAFA-Richtlinien.
instruction_set - Analysiere Tabellen aus regulatorischen Dokumenten und extrahiere detaillierte, zeilenbezogene Informationen. Achte auf wichtige Werte wie Prozentsätze, Einheiten und besondere Hinweise. Strukturiere die Daten für spätere Verwendung und Abrufbarkeit.
generated_prompt - Extrahiere die folgenden Informationen zu jedem Anlagentyp aus der Tabelle: Hersteller, Anlagentyp, Nennwärmeleistung (kW), Kesselwirkungsgrad (%), CO bei Nennlast (mg/m³), Staub bei Nennlast (mg/m³), CO-Meßteil (mg/m³). Achte darauf, alle Einheiten und wichtigen Hinweise zu berücksichtigen.


In [41]:
# Pull the three meta-prompt results out of the parsed response:
# the table summary, the generated extraction prompt, and its system instructions.
table_description, prompt_ext_table, system_ext_instructions = (
    response_l['main_idea'],
    response_l['generated_prompt'],
    response_l['instruction_set'],
)


## Extract Meta-Prompt and Table Description

In [42]:
# Structured-output schema for one table row: a single free-text description.
class row_description(BaseModel):
    # Text describing one row; read back as row['row_description'] after json.loads.
    row_description: str

# Structured-output schema for the row-extraction step: the list of row descriptions.
# NOTE(review): this class rebinds the module-level name `table_description`, which
# earlier in this file holds the 'main_idea' summary string. After this definition
# the string is no longer reachable under that name - confirm which meaning later
# code expects before renaming either one.
class table_description(BaseModel):
    # One row_description entry per extracted table row.
    rows: list[row_description]

# Run the generated prompt and system instructions against the same table image,
# asking for the structured row list.
response_rows = call_gpt_api_with_single_prompt(img_path=img_path,
                                                response_format=table_description,
                                                prompt=prompt_ext_table,
                                                instructions=system_ext_instructions)

# call_gpt_api_with_single_prompt returns None when the API call fails.
# Stop here with a clear message instead of an opaque TypeError from json.loads.
if response_rows is None:
    raise RuntimeError('Row extraction request failed: the GPT API call returned no response')

response_rows_l = json.loads(response_rows)


In [43]:
# Print every extracted row description, one per line.
for row_list in response_rows_l.values():
    for row in row_list:
        print(row['row_description'])

ECO-HK 170 mit eCleaner (Pellets), Nennwärmeleistung: 166.0 kW, Kesselwirkungsgrad: 93.4%, CO bei Nennlast: 12.0 mg/m³, Staub bei Nennlast: 13.0 mg/m³, CO-Meßteil: 51.0 mg/m³.
ECO-HK 170 (PK), Nennwärmeleistung: 170.0 kW, Kesselwirkungsgrad: 94.3%, CO bei Nennlast: 15.0 mg/m³, Staub bei Nennlast: 10.0 mg/m³, CO-Meßteil: 61.0 mg/m³.
ECO-HK 199, Nennwärmeleistung: 199.0 kW, Kesselwirkungsgrad: 94.5%, CO bei Nennlast: 12.0 mg/m³, Staub bei Nennlast: 11.0 mg/m³, CO-Meßteil: 73.0 mg/m³.
ECO-HK 200, Nennwärmeleistung: 200.0 kW, Kesselwirkungsgrad: 94.8%, CO bei Nennlast: 13.0 mg/m³, Staub bei Nennlast: 11.0 mg/m³, CO-Meßteil: 70.0 mg/m³.
ECO-HK 220, Nennwärmeleistung: 216.0 kW, Kesselwirkungsgrad: 95.5%, CO bei Nennlast: 13.0 mg/m³, Staub bei Nennlast: 9.0 mg/m³, CO-Meßteil: 43.0 mg/m³.


In [None]:
def get_content_table(elements,db_connector,pdf_file_name,pdf_file_type,db_table_name,directory):
    """
    Replace each table element's content with a description and drop footnotes.

    For every element of type 'Table', a record keyed by PDF type, PDF name and
    table path is looked up through db_connector. If none exists, a description
    is generated with gpt_call_table_description and stored together with the
    footnote text; otherwise the stored description is reused. The description
    is written to element['content'] (the element dict is modified in place).

    Parameters:
    - elements: Sequence of element dicts. Each has a 'type' key; tables also have
      'filePaths', footnotes also have 'content'.
    - db_connector: Object providing find_record_id, insert_record and get_record_by_id.
    - pdf_file_name: Value stored under 'pdf_name' in the lookup/insert record.
    - pdf_file_type: Value stored under 'pdf_type' in the lookup/insert record.
    - db_table_name: Name of the database table passed to the connector calls.
    - directory: Base directory joined with the table path to locate the image.

    Returns:
    - A new list with all non-footnote elements in their original order.
      Elements of type 'Footnote' are omitted.
    """
    new_elements = []

    for i, element in enumerate(elements):
        type_element = element['type']

        if type_element == 'Footnote':
            # Footnotes are never returned; the one directly after a table is
            # picked up while that table is processed.
            continue
        if type_element != 'Table':
            new_elements.append(element)
            continue

        # NOTE(review): the key name is plural, but the value is used as one path - confirm.
        table_path = element['filePaths']

        # Use the footnote text only when the very next element is a footnote.
        content_ft = ''
        if i + 1 < len(elements) and elements[i + 1]['type'] == 'Footnote':
            content_ft = elements[i + 1]['content']

        table_sum_dict = {'pdf_type': pdf_file_type,
                          'pdf_name': pdf_file_name,
                          'table_path': table_path}
        ids = db_connector.find_record_id(db_table_name, table_sum_dict)

        if ids is None:
            # No stored description yet: generate one and store it.
            table_path_comp = os.path.join(directory, table_path)
            summary = gpt_call_table_description(table_path_comp, content_ft)
            table_sum_dict['description'] = summary
            table_sum_dict['footnote'] = content_ft
            db_connector.insert_record(db_table_name, table_sum_dict)
        else:
            # Reuse the stored description.
            record = db_connector.get_record_by_id(db_table_name, ids)
            # assumes index 4 is the description column - TODO confirm against the schema
            summary = record[4]

        element['content'] = summary
        new_elements.append(element)

    # Return the filtered list. The original returned `elements`, which discarded
    # the footnote filtering built up in new_elements.
    return new_elements


def gpt_call_table_description(file_path,footnote):
    """
    Request a German description of a table image via chat_gpt_api_call_image.

    Parameters:
    - file_path: Path of the table image.
    - footnote: Footnote text belonging to the table; '' when there is none.

    Returns:
    - Whatever chat_gpt_api_call_image returns for the image and question.
    """
    question = """Please analyze the provided image of a table and create a concise description in German that includes the following details:

	1.	Overview: Mention the primary manufacturers and the types of models listed in the table.
	2.	Model Variations: Identify any significant variations in the models, such as different series or types, and use the first letters or key differentiators to clarify distinctions.
	3.	Manufacturer Diversity: List the different manufacturers included in the table.
	4.	Value Ranges: Provide the range of key technical values (e.g., heating output, efficiency) that apply to the models listed.
	5.	Certification and Unique Attributes: Mention any certifications (e.g., BAFA) or unique attributes that apply to all or most of the models.

Ensure the description is concise, clear, and in German, reflecting the language of the table data."""

    # Only add the footnote hint when there is footnote text to pass along.
    ft_note = '' if footnote == '' else f'Use the following footnote to interpret the image: {footnote}'

    return chat_gpt_api_call_image(img_path=file_path, question=question, footnote=ft_note)


def chat_gpt_api_call_image(img_path,question = 'Describe the following image',footnote=''):
    """
    Send an image to the chat completions API and return the reply text.

    Parameters:
    - img_path: Path of the image; it is base64-encoded and sent as an image/png data URL.
    - question: Text sent as the system message.
    - footnote: Extra text appended after the fixed user request line.

    Returns:
    - The message content of the first choice in the API response.
    """
    image_b64 = encode_image(img_path)

    system_message = {"role": "system", "content": question}
    user_message = {"role": "user", "content": [
        {"type": "text", "text": f"Summarize the following image.\n{footnote}"},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
    ]}

    completion = openai.chat.completions.create(
        model=MODEL,
        messages=[system_message, user_message],
        temperature=0.0,
    )
    return completion.choices[0].message.content

def encode_image(image_path):
    """Return the bytes of the file at image_path as a base64 text string."""
    # NOTE: this redefines encode_image from earlier in the file with the same behavior.
    with open(image_path, "rb") as image_file:
        payload = image_file.read()
    return base64.b64encode(payload).decode('utf-8')

## Extract Rows with Table Description