In [1]:
import os
import re
import openai
import csv
import fitz  # PyMuPDF

In [2]:
def retrieve_files(root_folder, file_extension, name_filters=[]):
    """
    Retrieves all files of a certain type from folders within a given folder, 
    with optional name filters.

    Parameters:
    root_folder (str): The path to the root folder to search within.
    file_extension (str): The file extension to look for (e.g., '.txt').
    name_filters (list): A list of keywords to filter the file names (default is an empty list).

    Returns:
    list: A list of paths to the files that match the criteria.
    """
    matching_files = []

    for dirpath, dirnames, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith(file_extension):
                if not name_filters or any(filter in filename for filter in name_filters):
                    matching_files.append(os.path.join(dirpath, filename))

    return matching_files

In [5]:
# Example usage
root_folder = 'Daten KI'
file_extension = '.pdf'
name_filters = ['Allgemeines Merkblatt', 'FAQ BEG','Infoblatt f√∂rderf√§hig','Richtlinie BEG','Technische FAQ']

files = retrieve_files(root_folder, file_extension, name_filters)
for file in files:
    print(file)

Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_1 (2021-03-01).pdf
Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_2 (2021-06-01).pdf
Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_4 (2022-08-15).pdf
Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_3 (2021-10-21).pdf
Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_5 (2023-03-13).pdf
Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2022-12-09).pdf
Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2021-09-16).pdf
Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2023-12-21).pdf
Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2022-07-21)_AÃànderung.pdf
Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2020-12-17).pdf
Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2021-05-20).pdf
Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2022-09-15)_AÃànderung.pdf
Daten KI/Allgemeines Merkblatt zur Antragstellung/Allgemeines Merkblatt zur Antragstellung - Zuschuss_1.8 (2023-08-31).pdf
Daten KI/Allgemeines Merkblatt zur Antragstellung/Allgemeines

In [63]:
API_KEY_CGPT = "sk-proj-K6G7rknFnUGcX3yL8DjkT3BlbkFJlnkOGz5AMOOHI8KXlYjf"


def extract_text_from_pdf(pdf_path, page_limit=None):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        if page_limit and page_num >= page_limit:
            break
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

def add_question(conversation,question):
    chat = {"role": "user", "content": question}

    conversation.append(chat)
    return conversation

def ask_openai(question,conversation):
    conv = conversation.copy()
    messages = add_question(conv,question)
    openai.api_key = API_KEY_CGPT  # Replace with your OpenAI API key
    print(f'Context : {len(messages)}')
    response = openai.chat.completions.create(
      model="gpt-4o",  # or 'gpt-3.5-turbo'
      messages=messages,
      max_tokens=150,
    )
    return response.choices[0].message.content


# Function to extract file name from path
def get_file_name(file_path):
    return os.path.basename(file_path)

# Function to parse the answer and extract the date and version number
def parse_answer(answer):
    date_pattern = r"Publication Date: (\d{2}/\d{2}/\d{4})"
    version_pattern = r"Version Number: (\d+\.\d+)"
    
    date_match = re.search(date_pattern, answer)
    version_match = re.search(version_pattern, answer)
    
    publication_date = date_match.group(1) if date_match else "DD/MM/YYYY"
    version_number = version_match.group(1) if version_match else "X.X"
    
    return publication_date, version_number


In [65]:
# Example usage
root_folder = 'Daten KI'
file_extension = '.pdf'
name_filters = ['Allgemeines Merkblatt', 'FAQ BEG','Infoblatt f√∂rderf√§hig','Richtlinie BEG','Technische FAQ']
page_limit = 2
character_limit = 100
conversation_history = [
    {"role": "system",
     "content": "You are a helpful assistant and answer questions of documents. The documents are in german, so date format is usually DD.MM.YYYY."}
]
question = """Extract the effective date and the version number from the following document/text. If the effective date is not explicitly mentioned, determine the date from when the document was published.
Provide both the effective date and the version number. Sometimes the date is given as: Von/Vom Datum, Stand or as Datum des Inkrafttretens. Sometimes past versions are mentioned, so take the latest version and date.
Write the answer in the format: Publication Date: DD/MM/YYYY - Version Number: X.X.
If any of the values is not available, write the default: Publication Date: DD/MM/YYYY or Version Number: X.X."""
file_info_dict = []



files = retrieve_files(root_folder, file_extension, name_filters)
n = 0
for file in files:
    n+=1
    doc_text = extract_text_from_pdf(file,page_limit)
    print(f'file: {file}\nExtracted text:\n{doc_text[:character_limit]}\n---------\n')
    query = f"Answer the following question: {question}\n\nDocument:\n{doc_text}"
    # print(f'Query: {query}')
    answer = ask_openai(query,conversation_history)
    print(f'Answer: {answer}\n\n%---------------------------------%\n\n')

    # Extract file name, date, and version number
    file_name = get_file_name(file)
    publication_date, version_number = parse_answer(answer)
    
    # Store the information in the list
    file_info_dict.append({
        "file": file_name,
        "date": publication_date,
        "version": version_number
    })
print(file_info_dict)

file: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_1 (2021-03-01).pdf
Extracted text:
Bundesf√∂rderung f√ºr effiziente Geb√§ude 
Liste der technischen FAQ - BEG EM  
1 
Bundesf√∂rderung f√ºr 
---------

Context : 2
Answer: Publication Date: 01/03/2021 - Version Number: 1.0

%---------------------------------%


file: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_2 (2021-06-01).pdf
Extracted text:
Bundesf√∂rderung f√ºr effiziente Geb√§ude 
Liste der technischen FAQ - BEG EM  
Version 2.0 (06/2021), 
---------

Context : 2
Answer: Publication Date: 01/06/2021 - Version Number: 2.0

%---------------------------------%


file: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_4 (2022-08-15).pdf
Extracted text:
Bundesf√∂rderung f√ºr effiziente Geb√§ude - Liste der technischen FAQ - BEG EM 
Version 4.0 (08/2022) 

---------

Context : 2
Answer: Publication Date: 15/08/2022 - Version Number: 4.0

%---------------------------------%


file: Daten KI/Technische FAQ BEG EM/Tech

In [66]:
for i in file_info_dict:
    file = i['file']
    date = i['date']
    version = i['version']
    print(f'{file}: Date: {date} - Version: {version}\n')

Technische FAQ BEG EM_1 (2021-03-01).pdf: Date: 01/03/2021 - Version: 1.0

Technische FAQ BEG EM_2 (2021-06-01).pdf: Date: 01/06/2021 - Version: 2.0

Technische FAQ BEG EM_4 (2022-08-15).pdf: Date: 15/08/2022 - Version: 4.0

Technische FAQ BEG EM_3 (2021-10-21).pdf: Date: 21/10/2021 - Version: 3.0

Technische FAQ BEG EM_5 (2023-03-13).pdf: Date: 13/03/2023 - Version: 5.0

Richtlinie BEG EM (2022-12-09).pdf: Date: 30/12/2022 - Version: 1.0

Richtlinie BEG EM (2021-09-16).pdf: Date: 18/10/2021 - Version: 16.09

Richtlinie BEG EM (2023-12-21).pdf: Date: 21/12/2023 - Version: 1.0

Richtlinie BEG EM (2022-07-21)_AÃànderung.pdf: Date: 28/07/2022 - Version: 1.1

Richtlinie BEG EM (2020-12-17).pdf: Date: 17/12/2020 - Version: 1.0

Richtlinie BEG EM (2021-05-20).pdf: Date: 07/06/2021 - Version: X.X

Richtlinie BEG EM (2022-09-15)_AÃànderung.pdf: Date: 21/09/2022 - Version: 2.0

Allgemeines Merkblatt zur Antragstellung - Zuschuss_1.8 (2023-08-31).pdf: Date: 31/08/2023 - Version: 1.8

Allgemeines

In [70]:
# Specify the CSV file name
csv_file = 'Daten KI/Metadata/Files_date_version.csv'

data = file_info_dict
# Determine the fieldnames dynamically
fieldnames = set()
for entry in data:
    fieldnames.update(entry.keys())
fieldnames = list(fieldnames)

# Writing to CSV
with open(csv_file, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # Write the header
    writer.writeheader()

    # Write the data
    for entry in data:
        writer.writerow(entry)

print(f"Data has been written to {csv_file}")

Data has been written to Daten KI/Metadata/Files_date_version.csv


# READ DOCUMENTS

In [12]:
from langchain_community.document_loaders import UnstructuredFileLoader
from collections import Counter
from langchain.docstore.document import Document


In [6]:
files = retrieve_files(root_folder, file_extension, name_filters)
docs_gesammelt = []

for file in files:
    print(f'reading: {file}')
    loader = UnstructuredFileLoader(file,mode='elements',strategy = 'fast',languages=["deu"])
    docs = loader.load()
    docs_gesammelt.append(docs)
    # print(docs[0].page_content[:400])
    # break



reading: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_1 (2021-03-01).pdf
reading: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_2 (2021-06-01).pdf
reading: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_4 (2022-08-15).pdf
reading: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_3 (2021-10-21).pdf
reading: Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_5 (2023-03-13).pdf
reading: Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2022-12-09).pdf
reading: Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2021-09-16).pdf
reading: Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2023-12-21).pdf
reading: Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2022-07-21)_AÃànderung.pdf
reading: Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2020-12-17).pdf
reading: Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2021-05-20).pdf
reading: Daten KI/Richtlinie BEG EM/Richtlinie BEG EM (2022-09-15)_AÃànderung.pdf
reading: Daten KI/Allgemeines Merkblatt zur Antragstellung/Allgemeines Merkb

In [7]:
doc1 = docs_gesammelt[0]
doc1[10].metadata

{'source': 'Daten KI/Technische FAQ BEG EM/Technische FAQ BEG EM_1 (2021-03-01).pdf',
 'coordinates': {'points': ((62.4, 597.04),
   (62.4, 606.04),
   (77.40599999999999, 606.04),
   (77.40599999999999, 597.04)),
  'system': 'PixelSpace',
  'layout_width': 595.32,
  'layout_height': 841.92},
 'file_directory': 'Daten KI/Technische FAQ BEG EM',
 'filename': 'Technische FAQ BEG EM_1 (2021-03-01).pdf',
 'languages': ['deu'],
 'last_modified': '2023-07-18T09:27:14',
 'page_number': 1,
 'parent_id': '5f844955ea8ba0972301b8c96f33c2fb',
 'filetype': 'application/pdf',
 'category': 'UncategorizedText'}

In [11]:
doc1[0].__class__.__name__

'Document'

In [22]:
categories = []
docs_gesammelt2 = []
allowed_types = ['Title','NarrativeText','UncategorizedText','ListItem']

for i in doc1:
    content = i.page_content
    md = i.metadata
    fname = md['filename']
    category = md['category']
    page_number = md['page_number']
    doc =  Document(page_content="text", metadata={"source": "local"})

    if category in allowed_types:
        categories.append(category)
        t_doc = Document(page_content = content,metadata={'page_number':page_number,'category':category,'source':fname})
        docs_gesammelt.append(t_doc)
        # print(t_doc)
        print(f'Text:\n{content}\n\nMetadata:\npage_number: {page_number} - category: {category} - fname: {fname}\n\n\n')

# Use Counter to count the occurrences of each category
category_counts = Counter(categories)

# Print the summary
for category, count in category_counts.items():
    print(f"Category: {category}, Count: {count}")

Text:
Bundesf√∂rderung f√ºr effiziente Geb√§ude ‚Äì Liste der technischen FAQ - Einzelma√ünahmen

Metadata:
page_number: 1 - category: Title - fname: Technische FAQ BEG EM_1 (2021-03-01).pdf



Text:
Die Themen der Technischen FAQ wurden auf Grundlage von h√§ufig gestellten Fragen von Fachkundigen sowie h√§ufig vorkommenden Fehlern in den Nachweisen von Effizienzh√§usern, Effizienzgeb√§uden und Einzelma√ünahmen zusammengestellt.

Metadata:
page_number: 1 - category: NarrativeText - fname: Technische FAQ BEG EM_1 (2021-03-01).pdf



Text:
Mit den Technischen FAQ werden die Mindestanforderungen der ‚ÄûBundesf√∂rderung effiziente Geb√§ude ‚Äì Einzelma√ünahmen‚Äú (BEG EM) sowie Bestimmungen des GEG, der GEG-Normen und sonstiger Regelwerke erl√§utert bzw. in Teilbereichen zusammengefasst. Weiterf√ºhrende Vorgaben k√∂nnen den jeweiligen Regelwerken entnommen werden.

Metadata:
page_number: 1 - category: NarrativeText - fname: Technische FAQ BEG EM_1 (2021-03-01).pdf



Text:
Die Technischen 

In [86]:
tables = [el for el in docs if el.metadata['category'] == "Table"]
if tables:
    print(tables.text)
    print(tables.metadata.text_as_html)

AttributeError: 'list' object has no attribute 'text'