# Text Cleaning

## Functions for cleaning

### Imports

In [1]:
import re
import unicodedata

### Remove URLs

In [2]:

def remove_links(text):
    """Strip angle-bracketed URLs from ``text`` and report what was removed.

    Two URL shapes are recognised: ``(<http...>)`` and ``<http...>``. After
    the URLs are gone, any remaining ``(<...>)`` group is deleted as well.

    Args:
        text: The text to process.

    Returns:
        A ``(cleaned_text, urls_found)`` tuple, where ``urls_found`` lists the
        removed URLs in order of appearance (without brackets).
    """
    url_regex = re.compile(r'\(<(http[^\)]+?)>\)|<(http[^\>]+?)>', flags=re.DOTALL)
    wrapper_regex = re.compile(r'\(<[^>]*?>\)')

    collected = []

    def _drop_and_record(hit):
        # Group 1 is the parenthesised form, group 2 the bare form.
        parenthesised, bare = hit.group(1), hit.group(2)
        collected.append(parenthesised if parenthesised else bare)
        return ""

    without_urls = url_regex.sub(_drop_and_record, text)
    # Whatever is still wrapped as (<...>) is not a URL; drop it too.
    without_wrappers = wrapper_regex.sub("", without_urls)

    return without_wrappers, collected

### Clean wrong characters in the text

In [9]:
def clean_text_characters(text):
    """Clean up dot runs, repeated spaces and bullet glyphs in ``text``.

    The steps run in this order:
      1. Runs of two or more dots become ``' Seite '``.
      2. Runs of two or more spaces collapse to a single space.
      3. Spaced-out dot sequences (``'. . .'``) are removed.
      4. The private-use bullet glyph U+F0B7 becomes ``'•'``.
      5. A ``number, newline, number, blank line, title`` group is rewritten
         as ``'<number> <title> Seite <number>'``.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    clean_text = re.sub(r'\.{2,}', ' Seite ', text)
    clean_text = re.sub(r' {2,}', ' ', clean_text)
    clean_text = re.sub(r'(\.\s)+\.', '', clean_text)
    # The search string must not be empty: str.replace("", x) inserts x between
    # every character. An explicit escape keeps the glyph visible in the source.
    # NOTE(review): U+F0B7 is assumed to be the intended glyph — confirm against
    # the extracted documents.
    clean_text = clean_text.replace("\uf0b7", "•")
    clean_text = re.sub(r"(\d+(?:\.\d+)?)\n(\d+)\n\n(.+)", r"\1 \3 Seite \2", clean_text)

    return clean_text

# Applying the cleaning functions (test)

## Load text

In [4]:
import os
import pandas as pd
from database_manager import MySQLDB, PineconeDBConnectorHybrid
from chunking_embeding_docs import config as db_config,db_name

# Secrets stay out of the notebook: the Pinecone key is read from the environment.
# NOTE(review): the key that used to be committed in this cell should be revoked.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this cell."
    )

# Project root. RAG_ROOT_DIR overrides the machine-specific default path.
root_dir = os.environ.get(
    'RAG_ROOT_DIR',
    '/Users/rodolfocacacho/Documents/Documents/MAI/Master Thesis/Code/rag_clean_v2',
)
original_dir = os.getcwd()

# Relative path; it is read after changing into root_dir.
metadata_csv = 'data/documents/metadata/Files_date_version.csv'

# Embedding model and Pinecone index settings
embed_dimension = 768  # Dimension for embeddings
embedding_model_name = "jinaai/jina-embeddings-v2-base-de"
cloud = "aws"
region = "us-east-1"
model_name_only = embedding_model_name.split("/")[1]


# SQL Tables & Indexes
table_pages_clean = 'table_documents_clean'
table_documents_name = 'table_documents'

max_tokens = 500
suffix = 'clean'
sql_embed_table = f'embedding_table_pinecone_sparse_{suffix}_{max_tokens}'
sql_embed_table_embedding = sql_embed_table+'_embedding'
sql_table_vocab = f'vocabulary_bm25_{suffix}_{max_tokens}'
index_name = f'{model_name_only}-{suffix}-{max_tokens}'

# Db Connectors
# The connectors are created with the project root as working directory.
# try/finally restores the previous directory even when a connector fails, so
# re-running the cell does not capture root_dir as original_dir.
os.chdir(root_dir)
try:
    con = PineconeDBConnectorHybrid(api_key=pinecone_api_key,
                              index_name=index_name,
                              embedding_model_name_dense=embedding_model_name,
                              dimension=embed_dimension,
                              cloud=cloud,
                              region=region)

    sql_con = MySQLDB(config=db_config,database_name=db_name)
finally:
    os.chdir(original_dir)
from chunking_embeding_docs import merge_pages,load_documents_pages

def load_complete_recent_docs(docs,df_codes):
    """Merge the pages of every document flagged as most recent.

    Args:
        docs: Iterable of documents. Each document is a sequence of page
            dicts that are read through the 'pdf_name' and 'content' keys.
        df_codes: DataFrame with a 'file' column (compared against
            'pdf_name') and a boolean-like 'most_recent' column.

    Returns:
        A list of ``{'name': pdf_name, 'original_text': merged_text}`` dicts,
        one per document whose first matching row in ``df_codes`` has a
        truthy 'most_recent'. ``merged_text`` is whatever ``merge_pages``
        returns for the document's page contents. Documents without pages or
        without a matching row are skipped.

    Note:
        The 'content' of every page of a selected document is overwritten in
        place with its NFC-normalized form.
    """
    clean_text_list = []
    for doc in docs:
        # A document without pages has no name to look up; skip it rather
        # than fail on doc[0].
        if len(doc) == 0:
            continue

        pdf_name = doc[0]['pdf_name']
        matched_row = df_codes[df_codes['file'] == pdf_name]
        # Only the first matching row decides whether the file is recent.
        if matched_row.empty or not matched_row["most_recent"].iloc[0]:
            continue

        pages_list = []
        for page in doc:
            page['content'] = unicodedata.normalize('NFC',page['content'])
            pages_list.append(page['content'])

        merged_doc = merge_pages(pages_list)
        clean_text_list.append({'name':pdf_name,
                                'original_text':merged_doc})
    return clean_text_list

# The metadata path is relative to the project root.
os.chdir(root_dir)

# Build the file metadata table in one chain: cast the keys, parse the dates,
# order newest-first within each type, then flag the rows whose date equals
# the latest date of their type.
df_codes = (
    pd.read_csv(metadata_csv)
    .astype({'type_key': 'int16', 'file_key': 'int16'})
    .assign(
        date_c=lambda frame: pd.to_datetime(
            frame['date'], format="%d/%m/%Y", errors='coerce', dayfirst=True
        )
    )
    .sort_values(['type_key', 'date_c'], ascending=[True, False])
    .assign(
        most_recent=lambda frame: (
            frame.groupby('type_key')['date_c'].transform('max') == frame['date_c']
        )
    )
)



  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rodolfocacacho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Embedding Dimension: 768
Using existing Pinecone index: jina-embeddings-v2-base-de-clean-500


In [5]:
# Load the documents through the MySQL connector; load_complete_recent_docs
# below treats each entry as a sequence of page dicts.
docs = load_documents_pages(sql_con=sql_con)

# Keep only the documents flagged as most recent in df_codes, pages merged.
recent_docs = load_complete_recent_docs(docs,df_codes)

## Apply text cleaning

In [10]:
# Run both cleaning steps on every recent document and report the result.
total_urls = 0
for doc_entry in recent_docs:
    print(f"{doc_entry['name']}\n")

    # Links are stripped first, then the remaining characters are cleaned.
    clean_text, urls = remove_links(doc_entry['original_text'])
    clean_text = clean_text_characters(clean_text)
    total_urls += len(urls)

    print(f"Removed {len(urls)} URLs\n")
    print(f"Clean text:\n{clean_text}\n")
    for url in urls:
        print(f"Url removed: {url}")
print(f"Removed {total_urls} URLs in total\n\n")

Richtlinie BEG EM (2023-12-21).pdf

Removed 0 URLs

Clean text:
Bundesministerium
für Wirtschaft und Klimaschutz
Richtlinie für die Bundesförderung für effiziente Gebäude – Einzelmaßnahmen (BEG EM)
Vom 21. Dezember 2023
1 Präambel
Diese Richtlinie ersetzt die Richtlinie für die Bundesförderung für effiziente Gebäude – Einzelmaßnahmen (BEG EM) vom 9. Dezember 2022 (BAnz AT 30.12.2022 B1).
Die Bundesförderung für effiziente Gebäude (BEG) unterstützt die Erreichung der Klimaziele, die auf nationaler Ebene im Klimaschutzgesetz dargelegt sind. Sie dient auch der Umsetzung des Klimaschutzprogramms 2023. Mit der BEG wurde die energetische Gebäudeförderung des Bundes daher in Umsetzung des Klimaschutzprogramms 2030 und der Förderstrategie „Energieeffizienz und Wärme aus Erneuerbaren Energien“ des Bundesministeriums für Wirtschaft und Klimaschutz (BMWK) neu aufgesetzt und in 2021 eingeführt. Die BEG ersetzte damit das CO2-Gebäudesanierungsprogramm (EBS-Programme), das Marktanreizprogramm für er