File full of testing functions or old functions or scripts used to find out about the html structures of the documents

In [1]:
import pandas as pd

serachResults_df = pd.read_csv('Search results 20240102.csv')

celex_numbers = serachResults_df["CELEX number"].tolist()
celex_numbers = [s.replace("(", "%28").replace(")", "%29") for s in celex_numbers]
download_urls = set()

for number in celex_numbers:
    download_urls.add('https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:' + number)

# remove urls there the File does not exist
download_urls = list(download_urls.difference(set([
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:22006A1216%2804%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:22003A0624%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:22002A1127%2802%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31971G0055",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41967A0228%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41971X0056",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31972Y1011%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21959A1006%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41964A0430%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21959A1006%2802%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21961A0126%2801%29",
])))

download_urls[:5], len(download_urls)

(['https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32015A0228%2801%29',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32020D2255',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32012R0547',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:42009D0913',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32017R0460'],
 537)

In [2]:
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from collections import Counter

MIN_PARAGRAPH_LEN = 20

def getHtmlText(url):
    # Fetch HTML content from the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch HTML content. Status code: {response.status_code}")

def getParagraphs(text):
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Extract paragraphs using the find_all method
    paragraphs = soup.find_all('p')

    return [(p.get_text().replace("\xa0", " "), p.sourceline, p.sourcepos) for p in paragraphs]

def getParagraphsFilteredByTables(text, tables):
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Extract paragraphs using the find_all method
    paragraphs = soup.find_all('p')
    res = []

    nextTable = 0

    for p in paragraphs:
        p_text = p.get_text().replace("\xa0", " ").replace("\n", " ").strip()
        if len(p_text) >= MIN_PARAGRAPH_LEN:
            if nextTable == len(tables):
                res.append((p_text, p.sourceline, p.sourcepos))
            elif p.sourceline < tables[nextTable][0].sourceline:
                res.append((p_text, p.sourceline, p.sourcepos))
            elif p.find_parents('table', class_=lambda c: classFunc(c)):
                continue
            else:
                res.append((p_text, p.sourceline, p.sourcepos))
                nextTable += 1
    return res

def classFunc(c):
    # tables without a class are just numerations in normal text, so we have to exclude them here
    if c != None:
        return len(c) > 0
    else:
        return False

def getTables(text):
    soup = BeautifulSoup(text, 'html.parser')

    # Extract real tables using the find_all method
    tables = soup.find_all('table', class_=lambda c: classFunc(c))
    return [(table, table.sourceline, table.sourcepos) for table in tables]

def processHtml(url):
    htmlText = getHtmlText(url)
    soup = BeautifulSoup(htmlText, 'html.parser')
    tables = getTables(htmlText)
    filteredPs = getParagraphsFilteredByTables(htmlText, tables)

    tableSentences = []
    for j, table in enumerate(tables):
        tableClass = table[0].get('class')[0]
        rows = table[0].find_all('tr')
        rowInputs = []
        hasHeaderLine = False
        for i, row in enumerate(rows):
            if i == 0:
                if tableClass == 'oj-table':
                    row_classes = [element.get('class') for element in row.descendants if isinstance(element, type(soup.new_tag('')))]
                    if 'oj-tbl-hdr' in row_classes:
                        hasHeaderLine = True
                if tableClass == 'table':
                    row_classes = [element.get('class') for element in row.descendants if isinstance(element, type(soup.new_tag('')))]
                    if 'tbl-hdr' in row_classes:
                        hasHeaderLine = True
            cells = row.find_all('td')
            rowInputs.append(cells)
        if hasHeaderLine:
            for row in range(1, len(rowInputs)):
                sentence = "The " + str(rowInputs[0])
        else:
            print("error")

    return filteredPs, tables


def saveToFile(url,title):
    # Fetch HTML content from the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        f = open(title + ".html", "w")
        f.write(response.text)
    else:
        print(f"Failed to fetch HTML content. Status code: {response.status_code}")

In [3]:
saveToFile("https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32011R0626", "file8")

In [None]:
from tqdm import tqdm
import csv

documents = []

for url in tqdm(download_urls[:5]):
    documents.append(getParagraphs(getHtmlText(url)))


with open('paragraphs_file.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(documents)

In [None]:
# findAllTableTypes
tableTypes = set()
for url in tqdm(download_urls):
    htmlText = getHtmlText(url)
    soup = BeautifulSoup(htmlText, 'html.parser')
    tables = getTables(htmlText)
    for table in tables:
        tableTypes.add(str(table[0].get('class')))
tableTypes

In [None]:
# tables = getTables(getHtmlText("https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32009R0663"))

# print(len(tables))
# print("")
# print(tables[0])
# print("")
# print(tables[1])

saveToFile("https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32023L1791","test")

for url in download_urls[:10]:
    print(url + "    " + str(len(processHtml(url)[1])))

In [None]:
hasTheId = True
for url in tqdm(download_urls):
    htmlText = getHtmlText(url)
    if htmlText == None:
        print("error on URL: " + url)
    soup = BeautifulSoup(htmlText, 'html.parser')

    withId = soup.findAll('div', id='docHtml')
    if withId == None:
        hasTheId = False
    elif len(withId) > 1:
        hasTheId = False
hasTheId

In [None]:
hrSet = set()
for url in download_urls:
    htmlText = getHtmlText(url)
    soup = BeautifulSoup(htmlText, 'html.parser')
    sepCount = 0
    docSepCount = 0
    noteCount = 0
    endCount = 0
    hrs = soup.findAll('hr')
    for hr in hrs:
        # if hr in ['separator', 'oj-separator']:
        #     sepCount += 1
        # elif hr in ['doc-sep', 'oj-doc-sep']:
        #     docSepCount += 1
        # elif hr in ['note', 'oj-note']:
        #     noteCount += 1
        # elif hr in ['doc-end', 'oj-doc-end']:
        #     endCount += 1
        if hr.get('class') != None:
            hrSet.update(hr.get('class'))
        else:
            print(url)
            hrSet.add('None')
hrSet

In [None]:
import pandas as pd
# read df from file

# make sure everything is in the right order
df = pd.read_csv('lines.csv')
for celexNumber, group  in df.groupby('CELEX number'):
    i = 0
    for index, row in group.iterrows():
        assert(row['lineID'] == i)
        i += 1