In [133]:
import ollama
import os
from rapidfuzz import fuzz, process

# Text Cleaning

## Functions for cleaning

### Imports

In [1]:
import re
import unicodedata

### Remove URLs

In [None]:
def remove_links(text):
    # Pattern to match URLs in both forms: (<http...>) or <http...>
    pattern = r'\(<(http[^\)]+?)>\)|<(http[^\>]+?)>'
    # Pattern to match any text in the format (< [some text] >)
    bracket_pattern = r'\(<[^>]*?>\)'
    
    urls_found = []
    
    # Define a replacement function
    def replace_link(match):
        url = match.group(1) if match.group(1) else match.group(2)
        urls_found.append(url)
        return ""  # Remove the matched URL from the text

    # Substitute the URLs with an empty string and capture them
    cleaned_text = re.sub(pattern, replace_link, text, flags=re.DOTALL)
    # Remove any text in the format (< [some text] >)
    cleaned_text = re.sub(bracket_pattern, "", cleaned_text)
    
    return cleaned_text, urls_found

### Clean text wrong characters

In [374]:
def map_special_characters(text, convert_greek_to_names=False):
    # Define specific mappings
    char_mapping = {
        "ƞ": "η",    # Map Latin n with long right leg to Greek eta
        "𝐸": "E",
        "𝐴": "A",
        "𝑇": "T",
        "²": "2",
        "³": "3",
        "¹": "1",
        "ł": "l",    # Map Polish ł to l
        "𝑠": "s",    # Map styled s to regular s
        "–": "-",    # Map en dash to hyphen
        "—": "-",     # Keep equals sign as is
        "≤": "<=",   # Map less than or equal to
        "≥": ">=",    # Map greater than or equal to
        "×": "*",
        "₂": "2",
        "Ł": "L"
        }
    
    # Define Greek letters to names mapping (if needed)
    greek_to_name = {
        "η": "eta",
        "λ": "lambda",
        "μ": "mu",
        "θ": "theta",
        "π": "pi"
    }

    # Define characters to delete (e.g., trademark symbol)
    chars_to_delete = ["®","ₛ","™","©"]
    
    # Apply specific character mappings
    for char, replacement in char_mapping.items():
        text = text.replace(char, replacement)

    # Remove characters specified for deletion
    for char in chars_to_delete:
        text = text.replace(char, "")
    
    # Optionally convert Greek letters to names
    if convert_greek_to_names:
        for greek, name in greek_to_name.items():
            text = text.replace(greek, name)
    
    # Normalize accents, excluding German-specific characters
    normalized_text = []
    for char in text:
        if char in "äöüßÄÖÜ":  # Keep German characters as they are
            normalized_text.append(char)
        else:
            # Decompose accented characters, keep only base letter
            normalized_char = unicodedata.normalize("NFD", char)
            normalized_char = "".join([c for c in normalized_char if unicodedata.category(c) != "Mn"])
            normalized_text.append(normalized_char)

    return "".join(normalized_text)

In [349]:
def clean_text_characters(text):
    clean_text = map_special_characters(text,convert_greek_to_names=True)
    clean_text = re.sub(r'\.{2,}', ' Seite ', clean_text)
    clean_text = re.sub(r' {2,}', ' ', clean_text)
    clean_text = re.sub(r'(\.\s)+\.', '', clean_text)
    clean_text = clean_text.replace("", "•")
    clean_text = re.sub(r"(\d+(?:\.\d+)?)\n(\d+)\n\n(.+)", r"\1 \3 Seite \2", clean_text)

    return clean_text

In [448]:

# Dictionary of specific abbreviations and their full forms
abbreviation_dict = {
    "z.B.": "zum Beispiel",
    "z. B.": "zum Beispiel",
    "bzw": "beziehungsweise",
    "bzw.": "beziehungsweise",
    "i. V. m.": "in Verbindung mit",
    "i. S. v.": "im Sinne von",
    "ggf.": "gegebenenfalls",
    "Abs.": "Absatz",
    "Nr.": "Nummer",
    "bspw.": "beispielsweise",
    "d.h.": "das heißt",
    "einschl.": "einschließlich",
    "e.V.": "eingetragener Verein",
    "Ges.m.b.H.": "Gesellschaft mit beschränkter Haftung",
    "inkl.": "inklusive",
    "gem.": "gemäß",
    "o. g.": "oben genannt",
    "S.": "Seite",
    "sog.": "sogenannt",
    "s.u.": "siehe unten",
    "u.a.": "unter anderem"
}

def replace_abbreviations(text, abbreviation_dict):
    sorted_abbreviations = sorted(abbreviation_dict.keys(), key=len, reverse=True)
    
    for abbr in sorted_abbreviations:
        # Modify the regex pattern to match word boundaries or certain punctuation around the abbreviation
        text = re.sub(rf'(?<!\w){re.escape(abbr)}(?!\w)', abbreviation_dict[abbr], text)
    
    return text


# Term extraction and cleaning

## Functions for extracting and cleaning

### Extract dates

In [206]:
def extract_and_remove_german_dates(text):
    # Define regex for "DD.MM.YYYY" format
    date_numeric_pattern = r'\b(?:[0-2]?[0-9]|3[0-1])\.(?:0?[1-9]|1[0-2])\.\d{4}\b'
    
    # Define regex for "D. Monat YYYY" and "Monat YYYY" formats
    date_month_pattern = r'\b(?:(?:([1-2]?[0-9]|3[0-1])\.\s*)?(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))\s+(\d{4})\b'
    
    # Month name to numeric mapping
    month_to_number = {
        "Januar": "01", "Februar": "02", "März": "03", "April": "04", "Mai": "05", "Juni": "06",
        "Juli": "07", "August": "08", "September": "09", "Oktober": "10", "November": "11", "Dezember": "12"
    }
    
    # Find and extract dates in "DD.MM.YYYY" format
    numeric_dates = re.findall(date_numeric_pattern, text)
    
    # Find and extract dates in "D. Monat YYYY" and "Monat YYYY" formats
    month_dates = re.findall(date_month_pattern, text)
    
    # Convert "D. Monat YYYY" and "Monat YYYY" format dates to "DD.MM.YYYY"
    converted_month_dates = [
        f"{int(day):02}.{month_to_number[month]}.{year}" if day else f"01.{month_to_number[month]}.{year}"
        for day, month, year in month_dates
    ]
    
    # Combine all extracted dates
    all_dates = numeric_dates + converted_month_dates
    
    # Create a pattern to match both date formats in the original text
    combined_pattern = f"({date_numeric_pattern})|({date_month_pattern})"
    
    # Remove all matches from the text
    cleaned_text = re.sub(combined_pattern, '', text)
    
    # Clean up any extra whitespace from removed dates
    cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text).strip()
    
    return cleaned_text, all_dates

### Extract norms

In [29]:
def find_norms(text):
    """
    Identifies norms like DIN, ISO, EN, etc., with their associated names, numbers, and variants.
    
    Parameters:
    - text: str, input text to search for norms.
    
    Returns:
    - List of norms found in the text.
    """
    # Regular expression pattern to capture norms
    norm_pattern = r'\b(?:DIN|ISO|EN)(?:\s+[A-Z]{1,3})?\s+\d{3,5}(?:[-‑]\d+)?\b'
    norm_pattern = r'\b(?:DIN|ISO|EN)(?:\s+(?:ISO|IEC))?(?:\s+[A-Z]{1,3})?\s+\d{3,5}(?:[-‑]\d+)?(?:\s+Beiblatt\s+\d+)?\b'
    norm_pattern = r'\b(DIN(?:\s+EN)?(?:\s+ISO(?:/IEC)?)?(?:\s+[A-Z])?\s+\d{3,5}(?:[-‑]\d+)?(?:\s+Beiblatt\s+\d+)?)\b'
    norm_pattern = r'\b(DIN(?:\s+EN)?(?:\s+ISO(?:/IEC)?)?(?:\s+[A-Z])?\s+\d{3,5}(?:[-‑]\d+)?(?:\s+Beiblatt\s+\d+)?(?:\s+\d+)?)\b'
    
    # Find all matches of the pattern in the text
    norms = re.findall(norm_pattern, text, re.IGNORECASE)
    for term in norms:
        text = text.replace(term,'')
    
    return text,norms


### Extract parentheses/bracket terms

In [208]:
def extract_and_clean_text(text):
    # Extract terms within parentheses or brackets
    extracted_terms = re.findall(r'\((.*?)\)|\[(.*?)\]', text)
    # Flatten list of extracted terms, ignoring None values
    extracted_terms = [term for group in extracted_terms for term in group if term]

    # Remove all occurrences of text within parentheses or brackets from the original text
    clean_text = re.sub(r'\(.*?\)|\[.*?\]', '', text).strip()

    return clean_text, extracted_terms

### Extract numbers/letters

In [320]:
def extract_numbers_letters(text):

    initial_pattern = r'(Nummer[n]?)\s+((?:\d+\.)+\d+)'
    extending_pattern = r'^(,\s*|und\s+|oder\s+|sowie\s+|bis\s+|Buchstabe\s+|Nummer\s+)'
    # number_or_letter_pattern = r'^\s*(\d+(?:\.\d+)*|[a-zA-Z])'
    # number_or_letter_pattern = r'^\s*(\d+(?:\.\d+)*|[a-zA-Z](?=\s|$)|Nummer\s+\d+(?:\.\d+)*|Buchstabe\s+[a-zA-Z])'
    number_or_letter_pattern = r'^\s*(\d+(?:\.\d+)*|[a-zA-Z](?=\s|[.,;]|$)|Nummer\s+\d+(?:\.\d+)*|Buchstabe\s+[a-zA-Z])'

    
    # Find all initial occurrences of the pattern
    end = 0
    matches = []
    for match in re.finditer(initial_pattern, text):
        # Start with the first "Nummer/n" pattern found
        if end >= match.end():
            continue
        start = match.start()
        end = match.end()
        extracted = match.group(0)
        while True:
            original_slice = text[end:]
            next_text = original_slice.lstrip()
            # Check if .strip() removed any leading whitespace
            whitespace_removed = len(original_slice) - len(next_text)
            add_whitespace = 0
            if whitespace_removed > 0:
                add_whitespace = whitespace_removed
            extension_match = re.match(extending_pattern, next_text)
            if extension_match:
                extension = extension_match.group(0)
                next_text2 = next_text[extension_match.end():]
                number_letter_match = re.match(number_or_letter_pattern,next_text2)
                if number_letter_match:
                    end += number_letter_match.end() + extension_match.end() + add_whitespace
                else:
                    break
            else:
                break
        text_found = text[start:end].strip()
        matches.append(text_found)

    return matches 

### Extract Manufacturers and Models

In [None]:
def ollama_query(instructions,prompt,chunk):
    
    prompt_send = f"{prompt}:\n{chunk}"
    
    response = ollama.chat(
    model="llama3",
    messages=[
        {
            "role": "system",
            "content": instructions
        },
        {
            "role": "user",
            "content": prompt_send,
        },
    ],
    )
    answer = response["message"]["content"]
    return answer 

table_description_name = 'table_description'

def batch_process(data, batch_size):
    """
    Yield successive batches of specified size from the data list.
    
    Parameters:
    - data (list): List of dictionaries to be batched.
    - batch_size (int): Number of items per batch.
    """
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

schema_table_description = {
  'id': 'int NOT NULL AUTO_INCREMENT PRIMARY KEY',
  'pdf_type': 'varchar(90) NOT NULL',
  'pdf_name': 'varchar(90) NOT NULL',
  'table_path': 'varchar(90) NOT NULL',
  'description': 'longtext NOT NULL',
  'footnote': 'longtext',
  'hersteller': 'longtext',
  'modelle': 'longtext'
}


def build_manufacturers_model_list(sql_con,table_description_name,new_table_description_name,batch_size = 100,schema_new_table_description = schema_table_description):

    sql_con.create_table(table_name=new_table_description_name,
                         schema=schema_new_table_description)
    
    model_list = sql_con.get_all_records_as_dict(new_table_description_name)
    model_list_filtered_repeat = [i['id'] for i in model_list if i['hersteller'] is None]
    model_list_filtered_done = [i['id'] for i in model_list if i['hersteller'] is not None]


    pdf_types_mit_hersteller_modellen = ['Liste förderfähigen Anlagen - Wärmepumpen','Liste förderfähigen Anlagen - Biomasse']
    hersteller_instructions = "List the Manufacturers/Hersteller stated on the text, with their full name. Just answer with the list using the following format: Name1 | Name2 | Name3. Don't add more text to the answer. If none are found, say: None"
    model_instructions = "List the different models stated on the text, with their full name. Don't mention the Hersteller. Just answer with the list using the following format: Name1 | Name2 | Name3. Don;t add more text to the answer. If none are found, say: None"
    prompt = "Analyze the following description"

    tables_descriptions = sql_con.get_all_records_as_dict(table_description_name)

    filtered_table_description = [i for i in tables_descriptions if i['pdf_type'] in pdf_types_mit_hersteller_modellen]
    filtered_table_description = [i for i in filtered_table_description if (i['id'] in model_list_filtered_repeat or i['id'] not in model_list_filtered_done)]

    for batch_number, batch in enumerate(batch_process(filtered_table_description, batch_size), start=1):
        upsert_elements = []
        print(f"Processing Batch {batch_number}:")
        for item in batch:
            text_desc = item['description']
            hersteller_answer = ollama_query(hersteller_instructions,prompt,text_desc)
            item['hersteller'] = hersteller_answer
            model_answer = ollama_query(model_instructions,prompt,text_desc)
            item['modelle'] = model_answer
            upsert_elements.append(item)

        sql_con.insert_many_records(table_name=new_table_description_name,
                            records=upsert_elements,overwrite=True)

        print("\n" + "-" * 50 + "\n") 

### Proces manufacturers/model list

In [126]:
def process_hersteller_model_list(sql_con,new_table_description_name):

    records_her_mod = sql_con.get_all_records_as_dict(new_table_description_name)

    hersteller_list = []
    modelle_list = []

    for record in records_her_mod:
        herstellers = record['hersteller']
        models = record['modelle']
        herstellers = herstellers.strip()
        if herstellers != 'None':
            herstellers_list = [item.strip() for item in herstellers.split("|")]
            models_list = [item.strip() for item in models.split("|")]
            herstellers_list = [item for item in herstellers_list if item not in ['None','']]
            models_list = [item for item in models_list if item not in ['None','']]

            hersteller_list.extend(herstellers_list)
            modelle_list.extend(models_list)
            

    return hersteller_list,modelle_list

def clean_hersteller_model_lists(hersteller_list,model_list,threshold=90):

    cleaned_hersteller = set()
    cleaned_models = set()

    for name in hersteller_list:
        clean_name = name.strip().lower()
        clean_name = " ".join(clean_name.split())
        cleaned_hersteller.add(clean_name)

    grouped_manufacturers = []
    visited = set()

    for name in cleaned_hersteller:
        if name in visited:
            continue

        # Find all names similar to the current name
        similar_names = [name]
        visited.add(name)
        for other_name in cleaned_hersteller:
            if other_name != name and other_name not in visited:
                similarity_score = fuzz.ratio(name, other_name)
                if similarity_score >= threshold:
                    similar_names.append(other_name)
                    visited.add(other_name)

        # Add the group of similar names to the main list
        grouped_manufacturers.append(similar_names)

    for name in model_list:
        clean_name = name.strip().lower()
        clean_name = " ".join(clean_name.split()) 
        cleaned_models.add(clean_name)

    return cleaned_hersteller,cleaned_models,grouped_manufacturers


# Applying cleaning test

## Load text

In [None]:
import os
import pandas as pd
from database_manager import MySQLDB, PineconeDBConnectorHybrid
from chunking_embeding_docs import config as db_config,db_name

root_dir = '/Users/rodolfocacacho/Documents/Documents/MAI/Master Thesis/Code/rag_clean_v2'
original_dir = os.getcwd()
os.chdir(root_dir)

metadata_csv = 'data/documents/metadata/Files_date_version.csv'

# Initialize the connector
pinecone_api_key = "f5bc0357-3863-4072-b3af-d15fae1c500f"
embed_dimension = 768  # Dimension for embeddings
embedding_model_name = "jinaai/jina-embeddings-v2-base-de"
cloud = "aws"
region = "us-east-1"
model_name_only = embedding_model_name.split("/")[1]


# SQL Tables & Indexes
table_pages_clean = 'table_documents_clean'
table_documents_name = 'table_documents'

max_tokens = 500
suffix = 'clean'
sql_embed_table = f'embedding_table_pinecone_sparse_{suffix}_{max_tokens}'
sql_embed_table_embedding = sql_embed_table+'_embedding'
sql_table_vocab = f'vocabulary_bm25_{suffix}_{max_tokens}'
index_name = f'{model_name_only}-{suffix}-{max_tokens}'

# Db Connectors
con = PineconeDBConnectorHybrid(api_key=pinecone_api_key,
                          index_name=index_name,
                          embedding_model_name_dense=embedding_model_name,
                          dimension=embed_dimension,
                          cloud=cloud,
                          region=region)

sql_con = MySQLDB(config=db_config,database_name=db_name)

os.chdir(original_dir)
from chunking_embeding_docs import merge_pages,load_documents_pages

def load_complete_recent_docs(docs,df_codes):
    clean_text_list = []
    for doc in docs:
        pages_list = []
        doc_type = doc[0]['pdf_type']
        pdf_name = doc[0]['pdf_name']
        matched_row = df_codes[df_codes['file'] == pdf_name]
        recent = False
        # Check if any rows were found
        if not matched_row.empty:
            # Extract type_key and file_key from the matched row (assuming there's only one match)
            recent = matched_row["most_recent"].iloc[0]
        if recent:
            for page in doc:
                page['content'] = unicodedata.normalize('NFC',page['content'])
                page = page['content']
                pages_list.append(page)

            merged_doc = merge_pages(pages_list)
            dict_temp = {'name':pdf_name,
                         'original_text':merged_doc}
            clean_text_list.append(dict_temp)
    return clean_text_list

def load_complete(docs):
    clean_text_list = []
    for doc in docs:
        pages_list = []
        doc_type = doc[0]['pdf_type']
        pdf_name = doc[0]['pdf_name']
        for page in doc:
            page['content'] = unicodedata.normalize('NFC',page['content'])
            page = page['content']
            pages_list.append(page)

        merged_doc = merge_pages(pages_list)
        dict_temp = {'name':pdf_name,
                    'original_text':merged_doc,
                    'type':doc_type}
        clean_text_list.append(dict_temp)
    return clean_text_list

os.chdir(root_dir)

df_codes = pd.read_csv(metadata_csv)
df_codes['type_key'] = df_codes['type_key'].astype('int16')
df_codes['file_key'] = df_codes['file_key'].astype('int16')
df_codes['date_c'] = pd.to_datetime(df_codes['date'], format="%d/%m/%Y", errors='coerce', dayfirst=True)
df_codes = df_codes.sort_values(['type_key', 'date_c'], ascending=[True, False])
df_codes['most_recent'] = df_codes.groupby('type_key')['date_c'].transform('max') == df_codes['date_c']



Embedding Dimension: 768
Using existing Pinecone index: jina-embeddings-v2-base-de-clean-500


In [444]:
docs = load_documents_pages(sql_con=sql_con)

recent_docs = load_complete_recent_docs(docs,df_codes)

## Apply Cleaning text

In [449]:
total_urls = 0
for i in recent_docs:
    print(f"{i['name']}\n")
    # print(f"{i['original_text']}")
    clean_text,urls = remove_links(i['original_text'])
    clean_text = clean_text_characters(clean_text)
    clean_text = replace_abbreviations(clean_text,abbreviation_dict)
    print(f"Removed {len(urls)} URLs\n")
    print(f"Clean text:\n{clean_text}\n")
    i['clean_text'] = clean_text
    # print(f"Original text:\n{i['original_text']}\n")

    total_urls+=len(urls)
    for url in urls:
        print(f"Url removed: {url}")
print(f"Removed {total_urls} URLs in total\n\n")

Richtlinie BEG EM (2023-12-21).pdf

Removed 0 URLs

Clean text:
Bundesministerium
für Wirtschaft und Klimaschutz
Richtlinie für die Bundesförderung für effiziente Gebäude - Einzelmaßnahmen (BEG EM)
Vom 21. Dezember 2023
1 Präambel
Diese Richtlinie ersetzt die Richtlinie für die Bundesförderung für effiziente Gebäude - Einzelmaßnahmen (BEG EM) vom 9. Dezember 2022 (BAnz AT 30.12.2022 B1).
Die Bundesförderung für effiziente Gebäude (BEG) unterstützt die Erreichung der Klimaziele, die auf nationaler Ebene im Klimaschutzgesetz dargelegt sind. Sie dient auch der Umsetzung des Klimaschutzprogramms 2023. Mit der BEG wurde die energetische Gebäudeförderung des Bundes daher in Umsetzung des Klimaschutzprogramms 2030 und der Förderstrategie „Energieeffizienz und Wärme aus Erneuerbaren Energien“ des Bundesministeriums für Wirtschaft und Klimaschutz (BMWK) neu aufgesetzt und in 2021 eingeführt. Die BEG ersetzte damit das CO2-Gebäudesanierungsprogramm (EBS-Programme), das Marktanreizprogramm für er

# Extracting and creating Hersteller and Model List

In [176]:
table_description_name = 'table_description'
new_table_description_name = 'table_description_hersteller_model'

build_manufacturers_model_list(sql_con=sql_con,
                               table_description_name=table_description_name,
                               new_table_description_name=new_table_description_name,
                               batch_size=5)

Processing Batch 1:

--------------------------------------------------



In [184]:
hersteller_list,model_list = process_hersteller_model_list(sql_con=sql_con,
                                                           new_table_description_name=new_table_description_name)

cleaned_hersteller,cleaned_models,grouped_manufacturers = clean_hersteller_model_lists(hersteller_list,model_list)

In [None]:
from collections import defaultdict

def normalize_manufacturer_names(manufacturer_list):
    """
    Normalizes a list of manufacturer names by identifying and associating synonyms.
    
    Parameters:
    manufacturer_list (list): List of manufacturer names
    
    Returns:
    dict: Mapping of original manufacturer names to normalized names
    """
    # Convert all names to lower case for case-insensitive matching
    lower_names = [name.lower() for name in manufacturer_list]
    
    # Create a dictionary to store all name variations and their associated normalized name
    name_variations = defaultdict(set)
    
    # Iterate through the unique names and find all variations
    for name in set(lower_names):
        for other_name in set(lower_names):
            if name in other_name or other_name in name:
                name_variations[name].add(other_name)
    
    # Create a mapping of original names to their normalized form
    normalized_names = {name: min(variations, key=len) for name, variations in name_variations.items()}
    
    return normalized_names


{'smartheat deutschland gmbh': 'smartheat deutschland gmbh', 'haas & sohn': 'haas & sohn', 'biotech': 'biotech', 'elco gmbh': 'elco gmbh', 'autark power gmbh': 'autark power gmbh', 'compte r': 'compte r', 'zaklad produkcyjno handlowy stanislaw krzaczek': 'zaklad produkcyjno handlowy stanislaw krzaczek', 'levana aqua': 'levana aqua', 'rennery': 'rennery', 'scanson handels contor gmbh': 'scanson handels contor gmbh', 'swegon': 'swegon', 'regli energy systems ag': 'regli energy systems ag', 'bes building energy solutions gmbh': 'solution', 'qingdao economic and technological development zone haier water heater co., ltd.': 'haier', 'teon s.r.l.': 'teon s.r.l.', 'tongfang germany gmbh': 'tongfang germany gmbh', 'capito': 'capito', 'sunhybrid gmbh': 'sunhybrid gmbh', 'sht heiztechnik gmbh (lohberger)': 'sht', 'ritter energie- und umwelttechnik gmbh & co. kg': 'ritter energie', 'weishaupt': 'weishaupt', 'pelletti': 'pelletti', 'yanmar energy system co., ltd': 'yanmar energy system co., ltd', 

In [200]:
import pandas as pd
from collections import defaultdict

def normalize_manufacturer_names(manufacturer_list):
    """
    Normalizes a list of manufacturer names by identifying and associating synonyms using both name containment and fuzzy string matching.
    
    Parameters:
    manufacturer_list (list): List of manufacturer names
    
    Returns:
    dict: Mapping of original manufacturer names to normalized names
    """
    # Convert all names to lower case for case-insensitive matching
    lower_names = [name.lower() for name in manufacturer_list]
    
    # Create a dictionary to store all name variations and their associated normalized name
    name_variations = defaultdict(set)
    
    # Iterate through the unique names and find all variations using name containment
    for name in set(lower_names):
        for other_name in set(lower_names):
            if name in other_name or other_name in name:
                name_variations[name].add(other_name)
    
    # Add fuzzy matching to find additional variations
    for name in set(lower_names):
        for other_name in set(lower_names):
            if name != other_name and fuzz.ratio(name, other_name) > 90:
                name_variations[name].add(other_name)
                name_variations[other_name].add(name)
    
    # Create a mapping of original names to their normalized form (shortest version)
    normalized_names = {name: min(variations, key=len) for name, variations in name_variations.items()}
    
    return normalized_names

In [201]:
normalized_mapping = normalize_manufacturer_names(cleaned_hersteller)


In [204]:
normalized_mapping

{'smartheat deutschland gmbh': 'smartheat deutschland gmbh',
 'haas & sohn': 'haas & sohn',
 'biotech': 'biotech',
 'elco gmbh': 'elco gmbh',
 'autark power gmbh': 'autark power gmbh',
 'compte r': 'compte r',
 'zaklad produkcyjno handlowy stanislaw krzaczek': 'zaklad produkcyjno handlowy stanislaw krzaczek',
 'levana aqua': 'levana aqua',
 'rennery': 'rennery',
 'scanson handels contor gmbh': 'scanson handels contor gmbh',
 'swegon': 'swegon',
 'regli energy systems ag': 'regli energy systems ag',
 'bes building energy solutions gmbh': 'solution',
 'qingdao economic and technological development zone haier water heater co., ltd.': 'haier',
 'teon s.r.l.': 'teon s.r.l.',
 'tongfang germany gmbh': 'tongfang germany gmbh',
 'capito': 'capito',
 'sunhybrid gmbh': 'sunhybrid gmbh',
 'sht heiztechnik gmbh (lohberger)': 'sht',
 'ritter energie- und umwelttechnik gmbh & co. kg': 'ritter energie',
 'weishaupt': 'weishaupt',
 'pelletti': 'pelletti',
 'yanmar energy system co., ltd': 'yanmar ene

In [202]:
assoc_n = 0
for i in normalized_mapping.items():
    key = i[0]
    name = i[1]
    if key == name:
        print(name)
    else:
        print(f'{key} : {name}')
        assoc_n+=1

print(f"{assoc_n} were associated")

smartheat deutschland gmbh
haas & sohn
biotech
elco gmbh
autark power gmbh
compte r
zaklad produkcyjno handlowy stanislaw krzaczek
levana aqua
rennery
scanson handels contor gmbh
swegon
regli energy systems ag
bes building energy solutions gmbh : solution
qingdao economic and technological development zone haier water heater co., ltd. : haier
teon s.r.l.
tongfang germany gmbh
capito
sunhybrid gmbh
sht heiztechnik gmbh (lohberger) : sht
ritter energie- und umwelttechnik gmbh & co. kg : ritter energie
weishaupt
pelletti
yanmar energy system co., ltd
brager
gipo d.o.o.
blacksmith kienmühle : ksm
d'alessandro termomeccanica s.r.l.
fire bv
macon trading group b.v
home star sp. z o.o.
hitachi - johnson controls hitachi air conditioning europe sas : hitachi
remeha
mcz cadel s.r.l. : mcz
guntamatic heiztechnik : guntamatic
i̇fyil termo i̇klimlendirme san. ve tic. ltd. şti. : iyil termo i̇klimlendirme san. ve tic. ltd. şti.
nextherm industrie
amitime
hantech gmbh / hantech : hantech gmbh
hargas

# Apply term extraction and cleaning

In [338]:
def extract_special_characters(text):
    # Define pattern to exclude standard letters, numbers, whitespace, and German characters
    special_characters = re.findall(r"[^a-zA-Z0-9\säöüßÄÖÜ]", text)
    # Return unique special characters
    return set(special_characters)

In [416]:
from collections import Counter

def extract_abbreviations(corpus):
    # Define patterns for the abbreviations
    pattern_4_letter = r'\b[A-Za-z]{4}\.\b'     # 4-letter abbreviations ending with a dot
    pattern_1_to_3_letter = r'\b[A-Za-z]{1,3}\.?\b'  # 1 to 3-letter abbreviations with optional dot
    
    # Combine patterns with alternation
    combined_pattern = f'({pattern_4_letter})|({pattern_1_to_3_letter})'
    
    # Find all matches in the corpus
    matches = re.findall(combined_pattern, corpus)
    
    # Flatten results and remove empty strings
    matches = [match[0] if match[0] else match[1] for match in matches]
    
    # Count frequencies of each match
    abbreviation_counts = Counter(matches)
    return abbreviation_counts

In [450]:
# 8. August 2020
norms_count = 0
dates_count = 0
terms_count = 0
num_buch_count = 0

for doc in recent_docs:

    clean_text = doc['clean_text']
    nummers_buchstaben = extract_numbers_letters(clean_text)
    cleaner_text,dates = extract_and_remove_german_dates(clean_text)
    cleaner_text,norms = find_norms(cleaner_text)
    cleaner_text, par_brack_terms = extract_and_clean_text(cleaner_text)
    special_chars = extract_special_characters(cleaner_text)
    abbreviations = extract_abbreviations(cleaner_text)
    doc['cleaner_text'] = cleaner_text

    norms_count+= len(norms)
    dates_count+= len(dates)
    terms_count+= len(par_brack_terms)
    num_buch_count+= len(nummers_buchstaben)
    print(f"{doc['name']}:\n\n")
    print(f"Original:\n{clean_text}\n\n")
    print(f"Clean:\n{cleaner_text}\n\n")

    print(f"Extracted {len(dates)} dates")
    for date in dates:
        print(date)
    print(f"Extracted {len(norms)} norms")
    for norm in norms:
        print(norm)
    print(f"Extracted {len(par_brack_terms)} parentheses/brackets")
    for term in par_brack_terms:
        print(term)
    print(f"Extracted {len(nummers_buchstaben)} numbers/letters")
    for term in nummers_buchstaben:
        print(term)
    print("\nChars in doc:\n")
    for char in special_chars:
        print(f"< {char} >")
    print("\nAbbreviations:\n")
    for char in abbreviations.items():
        print(f"< {char[0]} {char[1]} >")



print(f"{norms_count} norms were extracted\n{dates_count} dates were extracted\n{terms_count} terms were extracted\n{num_buch_count} numbers/letters were extracted")

Richtlinie BEG EM (2023-12-21).pdf:


Original:
Bundesministerium
für Wirtschaft und Klimaschutz
Richtlinie für die Bundesförderung für effiziente Gebäude - Einzelmaßnahmen (BEG EM)
Vom 21. Dezember 2023
1 Präambel
Diese Richtlinie ersetzt die Richtlinie für die Bundesförderung für effiziente Gebäude - Einzelmaßnahmen (BEG EM) vom 9. Dezember 2022 (BAnz AT 30.12.2022 B1).
Die Bundesförderung für effiziente Gebäude (BEG) unterstützt die Erreichung der Klimaziele, die auf nationaler Ebene im Klimaschutzgesetz dargelegt sind. Sie dient auch der Umsetzung des Klimaschutzprogramms 2023. Mit der BEG wurde die energetische Gebäudeförderung des Bundes daher in Umsetzung des Klimaschutzprogramms 2030 und der Förderstrategie „Energieeffizienz und Wärme aus Erneuerbaren Energien“ des Bundesministeriums für Wirtschaft und Klimaschutz (BMWK) neu aufgesetzt und in 2021 eingeführt. Die BEG ersetzte damit das CO2-Gebäudesanierungsprogramm (EBS-Programme), das Marktanreizprogramm für erneuerbare Energi

In [433]:
text_test_pdf = recent_docs[0]

tokens_text = word_tokenize(preprocess(text_test_pdf['cleaner_text']))
stop_words = set(stopwords.words("german"))
tokens_text = [t for t in tokens_text if t not in stop_words]
tokens_text = lemmatize(tokens_text)

print(f"{len(tokens_text)} : {tokens_text}")

8852 : ['Bundesministerium', 'Wirtschaft', 'Klimaschutz', 'Richtlinie', 'Bundesförderung', 'effizient', 'Gebäude', 'einzelmaßnahm', 'Präambel', 'Richtlinie', 'ersetzen', 'Richtlinie', 'Bundesförderung', 'effizient', 'Gebäude', 'einzelmaßnahm', 'Bundesförderung', 'effizient', 'Gebäude', 'unterstützen', 'Erreichung', 'klimaziele', 'national', 'Ebene', 'Klimaschutzgesetz', 'darlegen', 'dienen', 'Umsetzung', 'Klimaschutzprogramms', 'beg', 'werden', 'energetisch', 'Gebäudeförderung', 'Bundes', 'daher', 'Umsetzung', 'Klimaschutzprogramms', 'Förderstrategie', 'Energieeffizienz', 'Wärme', 'erneuerbar', 'Energien', 'Bundesministerium', 'Wirtschaft', 'Klimaschutz', 'neu', 'aufgesetzt', 'einführen', 'beg', 'ersetzen', 'Co', 'Gebäudesanierungsprogramm', 'Marktanreizprogramm', 'erneuerbar', 'Energien', 'Wärmemarkt', 'Anreizprogramm', 'Energieeffizienz', 'Heizungsoptimierungsprogramm', 'bewährt', 'element', 'Förderprogramm', 'werden', 'übernehmen', 'weiterentwickeln', 'neu', 'Förderrichtlinie', 'Beg

In [397]:
import re
from collections import Counter
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

# Load German NLP model for lemmatization
nlp = spacy.load("de_core_news_lg")

def preprocess(text):
    # Lowercasing and removing special characters
    text = text.lower()
    text = re.sub(r"[^a-zäöüß\s]", " ", text)
    return text

def lemmatize(tokens):
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

def extract_vocabulary(texts):
    # Initialize stopwords
    stop_words = set(stopwords.words("german"))
    vocab_counter = Counter()

    for text in texts:
        # Preprocessing
        clean_text = preprocess(text)
        
        # Tokenize and filter stopwords
        tokens = word_tokenize(clean_text)
        tokens = [t for t in tokens if t not in stop_words]
        
        # Lemmatize
        tokens = lemmatize(tokens)

        # Count unigrams and n-grams
        vocab_counter.update(ngrams(tokens, 1))
        vocab_counter.update(ngrams(tokens, 2))
        vocab_counter.update(ngrams(tokens, 3))
        vocab_counter.update(ngrams(tokens, 4))
        vocab_counter.update(ngrams(tokens, 5))

    return dict(vocab_counter)

# Example usage
texts = [text_test_pdf['cleaner_text']]
vocab = extract_vocabulary(texts)

In [398]:
for n,word_freq in enumerate(vocab.items()):
    word = word_freq[0]
    freq = word_freq[1]
    if len(word)>1 and freq > 2:
        print(f'{word} : {freq}')
    

('Bundesministerium', 'Wirtschaft') : 3
('Wirtschaft', 'Klimaschutz') : 3
('Richtlinie', 'Bundesförderung') : 4
('Bundesförderung', 'effizient') : 11
('effizient', 'Gebäude') : 11
('Gebäude', 'einzelmaßnahm') : 5
('erneuerbar', 'Energien') : 30
('Energieeffizienz', 'erneuerbar') : 3
('Prozent', 'erneuerbar') : 4
('gefördert', 'Maßnahme') : 14
('mindestens', 'Prozent') : 12
('erneuerbar', 'Energie') : 7
('vorliegend', 'Förderrichtlinie') : 3
('beg', 'em') : 28
('Förderung', 'einzelmaßnahm') : 3
('wohngebäud', 'nichtwohngebäud') : 3
('kfw', 'Bafa') : 4
('erlassen', 'allgemein') : 3
('Anbest', 'P') : 4
('allgemein', 'Verwaltungsvorschrift') : 3
('Verordnung', 'nr') : 9
('Sinn', 'Förderrichtlinie') : 3
('Zeitpunkt', 'Antragstellung') : 5
('Umsetzung', 'Maßnahme') : 9
('energetisch', 'Sanierungsmaßnahmen') : 3
('gebäudehüll', 'Anlagentechnik') : 3
('Gebäude', 'unmittelbar') : 3
('Anlage', 'Heizungsunterstützung') : 3
('technisch', 'Mindestanforderung') : 12
('Energieeffizienz', 'Experte') :

In [403]:
stop_words = set(stopwords.words("german"))
for word in stop_words:
    print(word)

sonst
viel
während
nach
ihnen
als
meines
weiter
dem
durch
soll
von
über
warst
dass
sollte
so
es
auch
einige
daß
einem
bin
meiner
würde
dieses
einigem
welcher
kann
weil
könnte
dort
welche
nur
zwischen
deines
demselben
ihr
jeder
einigen
manchem
jede
deinem
ohne
die
mit
und
unser
einmal
denn
manchen
allem
unseres
in
kein
alle
waren
derselben
wollte
dann
sein
bist
jenes
anderen
anderr
dein
deiner
musste
einiger
ob
hat
dieser
meinem
euren
man
eurer
jedem
hier
anders
selbst
das
da
weg
zum
mein
denselben
euch
dies
auf
jeden
seine
jener
wird
wenn
etwas
wirst
ihre
bei
unserem
was
vor
mancher
muss
nichts
unter
solchen
habe
eine
jenen
ander
allen
mich
ins
ist
keinen
vom
du
oder
hinter
im
eure
einen
des
sehr
solche
sondern
für
hab
dasselbe
hin
manches
zur
seinem
keiner
derselbe
ihn
meine
solches
deine
hatten
wo
mir
aller
also
meinen
sich
ich
um
ihren
sind
war
am
werden
gegen
sie
seiner
anderem
einer
dazu
solcher
keinem
machen
alles
eines
ihres
eurem
haben
anderer
bis
keine
euer
zu
wieder
jedes
and

In [434]:
import spacy
from nltk.corpus import stopwords
from nltk import ngrams
from collections import Counter

# Load German spaCy model
nlp = spacy.load('de_core_news_lg')

def preprocess_text(text):
    # Create spaCy doc
    doc = nlp(text)
    
    # Lemmatization and stop word removal
    german_stops = set(stopwords.words('german'))
    tokens = [token.lemma_.lower() for token in doc 
             if token.lemma_.lower() not in german_stops
             and not token.is_punct
             and len(token.lemma_) > 1]
    
    return tokens

def generate_ngrams(tokens, max_n=5):
    all_ngrams = []
    for n in range(1, max_n + 1):
        n_grams = list(ngrams(tokens, n))
        # Convert tuples to strings
        n_grams = [' '.join(gram) for gram in n_grams]
        all_ngrams.extend(n_grams)
    return all_ngrams

def filter_overlapping_ngrams(ngrams_list, similarity_threshold=0.8):
    # Count frequencies
    ngram_counts = Counter(ngrams_list)
    
    # Sort n-grams by length (longer first) and frequency
    sorted_ngrams = sorted(ngram_counts.keys(), 
                          key=lambda x: (-len(x.split()), -ngram_counts[x]))
    
    filtered_ngrams = []
    
    for ngram in sorted_ngrams:
        should_keep = True
        ngram_parts = set(ngram.split())
        
        # Compare with already filtered n-grams
        for kept_ngram in filtered_ngrams:
            kept_parts = set(kept_ngram.split())
            
            # Calculate overlap ratio
            overlap = len(ngram_parts & kept_parts) / len(ngram_parts)
            
            if overlap >= similarity_threshold:
                should_keep = False
                break
                
        if should_keep:
            filtered_ngrams.append(ngram)
    
    return filtered_ngrams

def create_vocabulary(text, max_ngram=5, similarity_threshold=0.8):
    # Preprocess
    tokens = preprocess_text(text)
    
    # Generate n-grams
    all_ngrams = generate_ngrams(tokens, max_ngram)
    
    # Filter overlapping n-grams
    filtered_ngrams = filter_overlapping_ngrams(all_ngrams, similarity_threshold)
    
    return filtered_ngrams





In [436]:
# Usage
text_test_pdf  = recent_docs[0]
vocabulary = create_vocabulary(text_test_pdf['cleaner_text'])

ZeroDivisionError: division by zero

In [None]:
def remove_ending_punctuation(text):
    # Use regex to match any punctuation (.,;:) at the end of a sentence before whitespace or the end of the string
    cleaned_text = re.sub(r'[.,;:]+(?=\s|$)', '', text)
    return cleaned_text

def clean_text(text):

    text = remove_ending_punctuation(text)
    # Replace all types of linebreaks with space
    text = re.sub(r'[\n\r\f]+', ' ', text)

    # Step 3: Remove unwanted control characters but keep printable characters, including German letters
    text = re.sub(r'[^\x20-\x7EäöüßÄÖÜ]', ' ', text)  # keep German characters and remove control characters
    
    
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)

    # Remove symbols attached to the beginning or end of words
    text = re.sub(r'(?<=\s)[^\wäöüßÄÖÜ]+(?=\w)', '', text)  # Remove symbols before a word
    text = re.sub(r'(?<=\w)[^\wäöüßÄÖÜ]+(?=\s)', '', text)
    
    # Remove isolated special characters
    # This pattern matches:
    # (?<=\s) - positive lookbehind for whitespace
    # [^\w\s]+ - one or more non-word, non-whitespace characters
    # (?=\s) - positive lookahead for whitespace
    text = re.sub(r'(?<=\s)[^\w\s]+(?=\s)', ' ', text)
    
    # Handle special characters at start of string
    text = re.sub(r'^[^\w\s]+(?=\s)', ' ', text)
    
    # Handle special characters at end of string
    text = re.sub(r'(?<=\s)[^\w\s]+$', ' ', text)
    
    # Clean up any resulting multiple spaces again
    text = re.sub(r'\s+', ' ', text)
    
    # Strip leading and trailing whitespace
    text = text.strip()
    
    return text

def preprocess_text(text):
    text = clean_text(text)

    # Create spaCy doc
    doc = nlp(text)
    
    # Define German stop words and unwanted words
    german_stops = set(stopwords.words('german'))
    unwanted_words = {
        "ja", "nein", "doch", "na", "oh", "aha", "hm", "ach", "genau", "klar", 
        "eben", "also", "wirklich", "eigentlich", "sicher", "vielleicht", "natürlich", 
        "schon", "wohl", "okay", "alles klar", "keine Ahnung", "selbstverständlich", 
        "nicht wirklich", "na ja", "auch", "sowie", "beziehungsweise", "sodass", "dazu", 
        "bzw.", "vorher", "danach", "nachdem", "nun", "insbesondere", "jedoch", "bereits", 
        "heute", "aktuell", "trotzdem", "zunächst", "besonders", "eventuell", "eher", 
        "einige", "mehrere", "verschiedene", "meistens", "möglichst", "weniger", "mehr", 
        "etwas", "wenige", "viele", "nur", "dabei", "davon", "ebenfalls", "immer", 
        "beispielsweise", "entsprechend", "daraufhin", "sofern", "bisher", "vorher", 
        "jeweils", "nachfolgend", "jeweilige", "dadurch", "zusätzlich", "ansonsten", 
        "voraussichtlich", "gegebenenfalls", "insgesamt", "deshalb", "daher", 
        "dementsprechend", "insoweit", "indem", "allerdings", "somit", "darunter", 
        "gleichzeitig", "aufgrund","einschließlich","beispiel","ja","nein","fußnote", 
        "werk", "lizenziert", "creative", "commons", "namensnennung", "bearbeitung","international",
        "lizenz","impressum", "presse", "e-mail", "tel", "fax", "initiative", "audit", "gemeinnützigen", "herausgeber",
        "hertie-stiftung", "personalpolitik", "rahmen"
    }
    
    # Lemmatization, stop word, and unwanted word removal
    tokens = [token.lemma_.lower() for token in doc 
              if token.lemma_.lower() not in german_stops
              and token.lemma_.lower() not in unwanted_words
              and not token.is_punct
              and len(token.lemma_) > 1]
    
    return tokens


def count_words(tokens):
    # Count occurrences of each word
    word_counts = Counter(tokens)
    
    # Create a unique set of words
    unique_words = set(word_counts.keys())
    
    return word_counts, unique_words

In [520]:
text_test_pdf  = recent_docs[7]

tokes = preprocess_text(text_test_pdf['cleaner_text'])

word_counts, unique_words = count_words(tokes)

print(f"{text_test_pdf['name']}\nTokens: {len(unique_words)}/{len(tokes)}")
for i in word_counts.items():
    print(f"{i[0]} - {i[1]}")

for i in tokes:
    print(i)

print(text_test_pdf['cleaner_text'])

Technische FAQ BEG EM_5 (2023-03-13).pdf
Tokens: 1733/5567
bundesförderung - 3
effizient - 4
gebäude - 71
liste - 3
technisch - 24
faq - 7
beg - 37
em - 26
version - 6
5.0 - 2
einzelmaßnahm - 26
thema - 80
grundlage - 4
häufig - 3
gestellt - 2
frage - 4
fachkundig - 1
vorkommend - 1
fehler - 1
nachweise - 2
effizienzhäusern - 1
effizienzgebäud - 1
zusammenstellen - 1
technischen - 2
mindestanforderung - 15
bestimmung - 6
geg - 43
geg-norme - 1
sonstig - 4
regelwerk - 2
erläutern - 1
teilbereich - 1
zusammengefasst - 1
weiterführende - 1
vorgabe - 4
jeweilig - 5
entnehmen - 4
sollen - 5
energieeffizienz-expert - 1
expertin - 3
fachunternehmer - 2
unternehmerinn - 1
unterstützen - 1
regelmäßig - 2
weiterentwickeln - 1
bedarf - 2
aktualisieren - 1
verwendung - 5
achten - 3
wichtig - 1
hinweis - 8
geltend - 3
fassung - 4
informationsblatt - 2
überarbeiten - 1
zeitpunkt - 5
antragstellung - 4
gültig - 3
regelung - 4
anforderung - 37
vorangegangen - 2
keinerlei - 1
gültigkeit - 1
begründung 

AttributeError: 'set' object has no attribute 'items'

In [454]:
def remove_ending_punctuation(text):
    # Use regex to match any punctuation (.,;:) at the end of a sentence before whitespace or the end of the string
    cleaned_text = re.sub(r'[.,;:]+(?=\s|$)', '', text)
    return cleaned_text

text = "This is a test sentence. Here is another one; and another: with various punctuation!"
cleaned_text = remove_ending_punctuation(text)
print(cleaned_text)

This is a test sentence Here is another one and another with various punctuation!


In [461]:
print(clean_text(text_test_pdf['cleaner_text']))

Bundesministerium für Wirtschaft und Klimaschutz Richtlinie für die Bundesförderung für effiziente Gebäude Einzelmaßnahmen Vom 1 Präambel Diese Richtlinie ersetzt die Richtlinie für die Bundesförderung für effiziente Gebäude Einzelmaßnahmen vom Die Bundesförderung für effiziente Gebäude unterstützt die Erreichung der Klimaziele die auf nationaler Ebene im Klimaschutzgesetz dargelegt sind Sie dient auch der Umsetzung des Klimaschutzprogramms 2023 Mit der BEG wurde die energetische Gebäudeförderung des Bundes daher in Umsetzung des Klimaschutzprogramms 2030 und der Förderstrategie Energieeffizienz und Wärme aus Erneuerbaren Energien des Bundesministeriums für Wirtschaft und Klimaschutz neu aufgesetzt und in 2021 eingeführt Die BEG ersetzte damit das CO2-Gebäudesanierungsprogramm das Marktanreizprogramm für erneuerbare Energien im Wärmemarkt das Anreizprogramm Energieeffizienz und das Heizungsoptimierungsprogramm Bewährte Elemente aus diesen Förderprogrammen wurden übernommen weiterentwic

In [484]:
tokes = preprocess_text("""8.5.2 Zinssatz
a) Höhe des Zinssatzes; Verbilligung aus Bundesmitteln""")
tokes

['8.5.2', 'zinssatz', '--', 'höhe', 'zinssatz', 'verbilligung', 'bundesmittel']

In [488]:
from nltk.stem.snowball import GermanStemmer
from nltk.corpus import stopwords
stemmer = GermanStemmer()
german_stops = set(stopwords.words('german'))

def preprocess_text_snowball(text):
    # Initial text cleaning
    text = clean_text(text)

    # Create spaCy doc for tokenization
    doc = nlp(text)
    
    # Stem each token, removing stop words and punctuation
    tokens = []
    for token in doc:
        if token.text.lower() not in german_stops and not token.is_punct:
            stemmed_token = stemmer.stem(token.text.lower())
            tokens.append(stemmed_token)
    
    return tokens

In [491]:
from collections import defaultdict

def check_stemming_inconsistencies(text):
    doc = nlp(text)
    inconsistencies = defaultdict(set)
    
    for token in doc:
        if token.text.lower() not in german_stops and not token.is_punct:
            stemmed_token = stemmer.stem(token.text.lower())
            inconsistencies[stemmed_token].add(token.text.lower())
    
    # Find stems with more than one original token mapping to them
    inconsistent_stems = {stem: originals for stem, originals in inconsistencies.items() if len(originals) > 1}
    return inconsistent_stems


def check_lemma_inconsistencies(text):
    doc = nlp(text)  # Tokenize the text
    inconsistencies = defaultdict(set)
    
    # Track lemmas for each unique word
    for token in doc:
        if not token.is_punct and len(token.text) > 1:
            original_word = token.text.lower()
            lemma = token.lemma_.lower()
            inconsistencies[original_word].add(lemma)
    
    # Find words with more than one distinct lemma
    inconsistent_lemmas = {word: lemmas for word, lemmas in inconsistencies.items() if len(lemmas) > 1}
    return inconsistent_lemmas

inconsistencies = check_lemma_inconsistencies(clean_text(text_test_pdf['cleaner_text']))
print("Inconsistent Stems:", inconsistencies)

Inconsistent Stems: {'energien': {'energie', 'energi'}, 'erneuerbare': {'erneuerbare', 'erneuerbar'}, 'erfolgt': {'erfolgt', 'erfolgen'}, 'alle': {'alle', 'aller'}, 'einzelnen': {'einzeln', 'einzelne'}, 'nichtwohngebäude': {'nichtwohngebäud', 'nichtwohngebäude'}, 'wohngebäuden': {'wohngebäude', 'wohngebäud'}, 'nichtwohngebäuden': {'nichtwohngebäude', 'nichtwohngebäuden', 'nichtwohngebaud'}, 'beauftragten': {'beauftragt', 'beauftragter'}, 'einer': {'einer', 'ein'}, 'sechste': {'sechste', 'sechster'}, 'eines': {'einer', 'ein'}, 'stellen': {'stellen', 'stelle'}, 'sanierungsmaßnahmen': {'sanierungsmaßnahm', 'sanierungsmaßnahme'}, 'dachflächen': {'dachfläche', 'dachflächen'}, 'außentüren': {'außentüre', 'außentür'}, 'energieeffizienz-experten': {'energieeffizienz-experten', 'energieeffizienz-expert'}, 'einem': {'einer', 'ein'}, 'betrieben': {'betrieb', 'betreiben'}, 'förderfähige': {'förderfähig', 'förderfähige'}, 'technische': {'technische', 'technisch'}, 'umfeldmaßnahmen': {'umfeldmaßnahm

In [492]:
text = """
Erweiterung durch Anbau und Ausbau von Nichtwohngebäuden, Umwidmung von Wohngebäuden zu Nichtwohngebäuden.
Erweiterung durch Anbau, Ausbau von Wohngebäuden, Umwidmung von Nichtwohngebäuden zu Wohngebäuden.
"""

inconsistencies = check_lemma_inconsistencies(text)
print("Words with Inconsistent Lemmas:", inconsistencies)

Words with Inconsistent Lemmas: {'nichtwohngebäuden': {'nichtwohngebäude', 'nichtwohngebaud', 'nichtwohngebäuden'}, 'wohngebäuden': {'wohngebaud', 'wohngebäude'}}


In [None]:
tokes = preprocess_text_snowball(text_test_pdf['cleaner_text'])

for i in tokes:
    print(i)

print(text_test_pdf['cleaner_text'])

bundesministerium
wirtschaft
klimaschutz
richtlini
bundesforder
effizient
gebaud
einzelmassnahm
1
praambel
richtlini
ersetzt
richtlini
bundesforder
effizient
gebaud
einzelmassnahm
bundesforder
effizient
gebaud
unterstutzt
erreich
klimaziel
national
eben
klimaschutzgesetz
dargelegt
dient
umsetz
klimaschutzprogramm
2023
beg
wurd
energet
gebaudeforder
bund
dah
umsetz
klimaschutzprogramm
2030
forderstrategi
energieeffizienz
warm
erneuerbar
energi
bundesministerium
wirtschaft
klimaschutz
neu
aufgesetzt
2021
eingefuhrt
beg
ersetzt
co2-gebaudesanierungsprogramm
marktanreizprogramm
erneuerbar
energi
warmemarkt
anreizprogramm
energieeffizienz
heizungsoptimierungsprogramm
bewahrt
element
forderprogramm
wurd
ubernomm
weiterentwickelt
neu
forderrichtlini
beg
gebundelt
integration
vier
bisher
bundesforderprogramm
wurd
forder
effizienz
erneuerbar
energi
gebaudesektor
erstmal
zusammengefuhrt
beg
somit
inhalt
komplexitat
bisher
forderprogramm
reduziert
zugang
verstand
burg
unternehm
kommun
gemacht
anr

### Create vocabulary for each type of document

In [526]:
def pre_clean_document(document):
    document_text = document['original_text']
    clean_text,urls = remove_links(document_text)
    clean_text = clean_text_characters(clean_text)
    clean_text = replace_abbreviations(clean_text,abbreviation_dict)
    document['clean_text'] = clean_text
    return document


def post_clean_document(document):
    extracted_dict={}
    clean_text = document['clean_text']
    nummers_buchstaben = extract_numbers_letters(clean_text)
    extracted_dict['numbers_letters'] = nummers_buchstaben
    cleaner_text,dates = extract_and_remove_german_dates(clean_text)
    extracted_dict['dates'] = dates
    cleaner_text,norms = find_norms(cleaner_text)
    extracted_dict['norms'] = norms 
    cleaner_text, par_brack_terms = extract_and_clean_text(cleaner_text)
    extracted_dict['parentheses_terms'] = par_brack_terms 
    document['cleaner_text'] = cleaner_text
    document['extracted_terms'] = extracted_dict
    return document



In [531]:
docs = load_documents_pages(sql_con=sql_con)
complete_docs = load_complete(docs)

# Initialize the vocabulary dictionary
vocabulary_dictionary = {}

# Loop through each document and process it
for document in complete_docs:
    # Get the document type
    doc_type = document['type']
    
    # Pre-process the document
    document = pre_clean_document(document)
    document = post_clean_document(document)
    
    # Tokenize and preprocess text to get tokens
    tokens = preprocess_text(document['cleaner_text'])
    
    # Get unique words in the document (as a set)
    _, unique_words = count_words(tokens)  # Assuming unique_words is a set
    
    # Add unique words to the vocabulary dictionary for the specific doc_type
    if doc_type not in vocabulary_dictionary:
        # If doc_type is not in the dictionary, initialize it with the unique words
        vocabulary_dictionary[doc_type] = unique_words
    else:
        # If doc_type already exists, update the set with new unique words
        vocabulary_dictionary[doc_type].update(unique_words)


In [535]:
total_voc = set()
for i in vocabulary_dictionary.items():
    print(f"{i[0]} - {len(i[1])}")
    total_voc.update(i[1])
print(F"Total vocabulary {len(total_voc)}")


Richtlinie BEG EM - 3304
Allgemeines Merkblatt zur Antragstellung - 805
Infoblätter förderfähigen Kosten - 2654
FAQ BEG (EM und EH-EG) - 4396
Liste förderfähigen Anlagen - Wärmepumpen - 15146
Förderübersicht BEG EM - 141
Liste förderfähigen Anlagen - Biomasse - 5590
Technische FAQ BEG EM - 1948
Total vocabulary 26611
