In [1]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

from markdownify import markdownify as md
from cleantext import clean

from toolz import *

# Experiment 1: try to use a parsing lib

In [2]:
from pathlib import Path

In [3]:
# Local copy of the PDF that both experiments below read.
path_jun14act = Path('../data/Jun14_2023.pdf')
# Presumably the download location of that PDF (verify):
# https://artificialintelligenceact.eu/wp-content/uploads/2023/06/AIA-%E2%80%93-IMCO-LIBE-Draft-Compromise-Amendments-14-June-2023.pdf

In [34]:
def extract_text_wrt_thresholds(file_path, x_threshold, y_threshold):
    """Return the text of every text block positioned beyond both thresholds.

    Walks every page yielded by ``extract_pages(file_path)`` and keeps the
    elements that are ``LTTextContainer`` instances whose ``x0`` is strictly
    greater than ``x_threshold`` and whose ``y0`` is strictly greater than
    ``y_threshold``.

    Args:
        file_path: PDF to read; handed unchanged to ``extract_pages``.
        x_threshold: Exclusive lower bound for an element's ``x0``. With a
            two-column page this is what selects the second column.
        y_threshold: Exclusive lower bound for an element's ``y0``.

    Returns:
        list: ``get_text()`` of each kept element, in page order and then in
        the order the elements appear within each page layout.
    """
    return [
        block.get_text()
        for page in extract_pages(file_path)
        for block in page
        # Non-text layout elements are skipped before any coordinate is read.
        if isinstance(block, LTTextContainer)
        and block.x0 > x_threshold
        and block.y0 > y_threshold
    ]

def our_clean(input_txt):
    """Normalise ``input_txt`` with ``clean`` using this notebook's fixed options.

    Only ``fix_unicode`` is switched on; every lowercasing, stripping and
    token-replacement switch is explicitly off.

    Args:
        input_txt: Text to clean.

    Returns:
        Whatever ``clean`` returns for ``input_txt`` with the options below.
    """
    options = dict(
        # The single transformation that is enabled: fix various unicode errors.
        fix_unicode=True,
        # Everything else is disabled so the extracted wording is left alone.
        to_ascii=False,             # no transliteration to closest ASCII
        lower=False,                # keep original casing
        no_line_breaks=False,       # normalise line breaks rather than strip them
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False,
        # Replacement tokens; presumably only consulted when the matching
        # no_* switch above is turned on.
        replace_with_punct="",
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                  # set to 'de' for German special handling
    )
    return clean(input_txt, **options)

def discard_unwanted_strs(str):
    """Predicate for ``filter``: True when the text block should be kept.

    A block is rejected when, after lowercasing and stripping surrounding
    whitespace, it is exactly ``"deleted"`` or ``"amendment"``.

    Args:
        str: Text of one block (the name shadows the builtin inside this
            function only).

    Returns:
        bool: False for the two unwanted marker strings, True otherwise.
    """
    normalized = str.lower().strip()
    return normalized not in ("deleted", "amendment")



In [35]:
# Coordinate cut-offs handed to extract_text_wrt_thresholds: a block is kept only when
# its x0 exceeds 300 and its y0 exceeds 50 (presumably chosen to isolate the right-hand
# column and skip the page footer — TODO confirm against the PDF layout).
x_threshold, y_threshold = 300, 50
text_content = pipe(
                    extract_text_wrt_thresholds(path_jun14act, x_threshold, y_threshold),
                    # Drop blocks that are just the "deleted" / "amendment" markers.
                    partial(filter, discard_unwanted_strs),
                    # filter() is lazy and can be consumed only once; materialise it so
                    # text_content really is a list and later cells can be re-run safely.
                    list,
                    )


In [36]:
# Join the filtered text blocks with newlines and normalise the result with our_clean.
# NOTE: if `text_content` is a lazy iterator (e.g. the direct result of `filter`), the join
# below exhausts it, so re-running this cell without re-running the previous one joins nothing.
text_content_str = our_clean("\n".join(text_content))

# Each item joined above is the text of one block from the second column.

markdown_content = md(text_content_str) # this prob isn't useful for this particular pdf

In [38]:
# Persist the cleaned Experiment 1 text.
# The output directory is created first so a fresh checkout can run this cell.
Path('output').mkdir(parents=True, exist_ok=True)
# UTF-8 is set explicitly: our_clean keeps non-ASCII characters (to_ascii=False), and the
# platform default encoding used by a bare open() may not be able to write them.
with open('output/exp_1_extracted_txt.txt', 'w', encoding='utf-8') as file:
    file.write(text_content_str)

The issue with this approach: the left column contains headers, and the x-threshold filter above discards them along with everything else in that column.

# Experiment 2: Try to leverage LLMs to do (even) more of the work

In [40]:
from pdfminer.high_level import extract_text

In [41]:
# Dump the PDF's text in one call via pdfminer's extract_text; unlike Experiment 1,
# no coordinate thresholds are applied here.
text_dump = extract_text(path_jun14act)

In [43]:
# Persist the raw text dump.
# The output directory is created first so a fresh checkout can run this cell.
Path('output').mkdir(parents=True, exist_ok=True)
# UTF-8 is set explicitly: a bare open() uses the platform default encoding, which may
# not be able to write every character extracted from the PDF.
with open('output/simple_text_dump.txt', 'w', encoding='utf-8') as file:
    file.write(text_dump)

In [44]:
# OK the results of my initial prompting weren't good -- will need to provide a more concrete spec

In [45]:
# NOTE(review): `markdown_output` is not assigned in any cell visible in this notebook
# (Experiment 1 defines `markdown_content`, a different variable). Presumably it held the
# LLM response from a cell that is no longer here — confirm, otherwise this line raises
# NameError on Restart & Run All.
print(markdown_output)


Having regard to the opinion of the 
European Central Bank, 



Having regard to the joint opinion of the 
European Data Protection Board and the 
European Data Protection Supervisor, 



## Recital 1

(1) The purpose of this Regulation is to 
improve the functioning of the internal 
market by laying down a uniform legal 
framework in particular for the 
development, marketing and use of 
artificial intelligence in conformity with 
Union values. This Regulation pursues a 
number of overriding reasons of public 
interest, such as a high level of protection 
of health, safety and fundamental rights, 
and it ensures the free movement of AI-
based goods and services cross-border, 
thus preventing Member States from 
imposing restrictions on the development, 
marketing and use of AI systems, unless 
explicitly authorised by this Regulation.

(1) The purpose of this Regulation is to 
promote the uptake of human centric and 
trustworthy artificial intelligence and to 
ensure a high level of p