In [2]:
import os
import re
import random
import spacy

from tqdm import tqdm
from bs4 import BeautifulSoup

from elqm.data import get_raw_data

In [3]:
spacy.prefer_gpu()
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
data = get_raw_data()

In [5]:
random_keys = random.sample(list(data.keys()), min(3, len(data)))

# Print the random elements
for key in random_keys:
    print(f"{key}: {data[key]}")

31997Y0711(01): {'Dates': {'Date of document': '27/06/1997', 'Date of effect': '27/06/1997', 'Date of end of validity': 'No end date'}, 'Misc': {'Author': 'Council of the European Union', 'Form': 'Resolution'}, 'Classification': {'EUROVOC descriptor': ['energy supply', 'environmental protection', 'EU publication', 'European standard', 'economic development', 'renewable energy'], 'Subject matter': ['Energy'], 'Directory code': {'code': '12.10.10.00', 'level 1': 'Energy', 'level 2': 'General principles and programmes', 'level 3': 'General'}}, 'html': '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html lang="EN">\n<head><meta name="format-detection" content="telephone=no"/>\n<meta name="DC.language" content="EN">\n<meta name="DC.title" content="EUR-Lex - 31997Y0711(01) - EN">\n<meta name="DC.subject" content="Energy, renewable energy, Community publication, environmental protection, economic development, energy supply, European standard, ">\n<meta name="DC.description" c

## Manual inspection

In [6]:
# Select 20 random documents and write them to files for manual inspection
sampleNumber = 5

random.seed(42)
# keys of the data dict. Each key accesses a dict insice the data dict that has the following attributes: Dates, Misc, Classification, html
specific_keys = list(data.keys())[:sampleNumber]

# Useless documents
uselessDocKey = []

for i, key in enumerate(tqdm(specific_keys)):
    # Data processing
    doc = data[key]
    html = doc["html"]
    soupHTML = BeautifulSoup(html, 'html.parser')
    text = soupHTML.get_text(separator='\n')
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\n\s*\n', '\n', text)
    # Count invalid documents and save their key
    if (text.find("The requested document does not exist.") != -1):
        uselessDocKey.append(key)
        print(i)

    # Printing progress
    #print(key)
    #print(i)

    # Save extracted text to files
    subdirectory = 'filtered'
    file_name1 = f"{i}.html"
    file_name2 = f"{i}.txt"
    file_name3 = f"{i}original.html"
    file_path1 = os.path.join(subdirectory, file_name1)
    file_path2 = os.path.join(subdirectory, file_name2)
    file_path3 = os.path.join(subdirectory, file_name3)

    print(file_path3)
    
    os.makedirs(subdirectory, exist_ok=True)


    with open(file_path1, 'w', encoding='utf-8') as file:
        file.write(str(soupHTML))
    with open(file_path2, 'w', encoding='utf-8') as file:
        file.write(text)
    with open(file_path3, 'w', encoding='utf-8') as file:
        file.write(html)

100%|██████████| 5/5 [00:00<00:00, 25.97it/s]

0
filtered\0original.html
1
filtered\1original.html
2
filtered\2original.html
filtered\3original.html
filtered\4original.html





## Text only documents percentage

In [31]:
# Parse the HTML of each document in data and find out if there is a div with id "TexteOnly"

docs_text_only = []
docs_mixed = []
for id, doc in tqdm(data.items(), total=len(data)):
    soup = BeautifulSoup(doc['html'], 'html.parser')
    div = soup.find('div', id='TexteOnly')
    
    if div is None:
        docs_mixed.append(id)
    else:
        docs_text_only.append(id)

  0%|          | 0/508 [00:00<?, ?it/s]

100%|██████████| 508/508 [00:14<00:00, 36.20it/s] 


In [32]:
print(f'Found {len(docs_text_only)} ({len(docs_text_only)/len(data)*100:.2f}%) documents with only text')

Found 233 (45.87%) documents with only text


## Text only parsing

In [None]:
example_text_only = data[docs_text_only[0]]

# Remove all tags from the text
soup = BeautifulSoup(example_text_only['html'], 'html.parser')
text = soup.get_text()

# Reduce multiple line breaks to one with regex
text = re.sub(r'\n+', '\n', text)

print(text)


EUR-Lex - 32001Y0123(02) - EN
Avis juridique important
|
32001Y0123(02)
Commission opinion of 20 December 2000 concerning the plan for the disposal of radioactive waste from the commissioning of the liquid metal disposal plant (LMDP) and the waste receipt, assay, characterisation and supercompaction facility (WRACS) located on the Dounreay nuclear site in Scotland (United Kingdom), in accordance with Article 37 of the Euratom Treaty  
Official Journal C 020 , 23/01/2001 P. 0004 - 0004 
Commission opinionof 20 December 2000concerning the plan for the disposal of radioactive waste from the commissioning of the liquid metal disposal plant (LMDP) and the waste receipt, assay, characterisation and supercompaction facility (WRACS) located on the Dounreay nuclear site in Scotland (United Kingdom), in accordance with Article 37 of the Euratom Treaty(2001/C 20/03)(Only the English text is authentic)On 8 June 2000 the European Commission received from the United Kingdom Government, in accordanc

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]

In [None]:
sentences

['\nEUR-Lex - 32001Y0123(02) - EN\nAvis juridique important\n|\n32001Y0123(02)\nCommission opinion of 20 December 2000 concerning the plan for the disposal of radioactive waste from the commissioning of the liquid metal disposal plant (LMDP) and the waste receipt, assay, characterisation and supercompaction facility (WRACS) located on the Dounreay nuclear site in Scotland (United Kingdom), in accordance with Article 37 of the Euratom Treaty  \nOfficial Journal C 020 , 23/01/2001 P. 0004 - 0004 \nCommission opinionof 20 December 2000concerning the plan for the disposal of radioactive waste from the commissioning of the liquid metal disposal plant (LMDP) and the waste receipt, assay, characterisation and supercompaction facility (WRACS) located on the Dounreay nuclear site in Scotland (United Kingdom), in accordance with Article 37 of the Euratom Treaty(2001/C',
 '20/03)(Only the English text is authentic)On 8 June 2000 the European Commission received from the United Kingdom Government,

## Mixed text parsing

In [None]:
example_mixed = data[docs_mixed[0]]

# Remove all tags from the text
soup = BeautifulSoup(example_mixed['html'], 'html.parser')
text = soup.get_text()

# Reduce multiple line breaks to one with regex
text = re.sub(r'\n+', '\n', text)

print(text)


L_2009191EN.01003501.xml
23.7.2009   
EN
Official Journal of the European Union
L 191/35
COMMISSION REGULATION (EC) No 641/2009
of 22 July 2009
implementing Directive 2005/32/EC of the European Parliament and of the Council with regard to ecodesign requirements for glandless standalone circulators and glandless circulators integrated in products
(Text with EEA relevance)
THE COMMISSION OF THE EUROPEAN COMMUNITIES,
Having regard to the Treaty establishing the European Community,
Having regard to Directive 2005/32/EC of the European Parliament and of the Council of 6 July 2005 establishing a framework for the setting of ecodesign requirements for energy-using products and amending Council Directive 92/42/EEC and Directives 96/57/EC and 2000/55/EC of the European Parliament and of the Council (1), and in particular Article 15(1) thereof,
After consulting the Ecodesign Consultation Forum,
Whereas:
(1)
Under Directive 2005/32/EC ecodesign requirements should be set by the Commission for en

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]

In [None]:
sentences

['\nL_2009191EN.01003501.xml\n23.7.2009\xa0\xa0\xa0\nEN\nOfficial Journal of the European Union\nL 191/35\nCOMMISSION REGULATION (EC)',
 'No 641/2009\nof 22 July 2009\nimplementing Directive 2005/32/EC of the European Parliament and of the Council with regard to ecodesign requirements for glandless standalone circulators and glandless circulators integrated in products\n(Text with EEA relevance)\n',
 'THE COMMISSION OF THE EUROPEAN COMMUNITIES,\nHaving regard to the Treaty establishing the European Community,\nHaving regard to Directive 2005/32/EC of the European Parliament and of the Council of 6 July 2005 establishing a framework for the setting of ecodesign requirements for energy-using products and amending Council Directive 92/42/EEC and Directives 96/57/EC and 2000/55/EC of the European Parliament and of the Council\xa0(1), and in particular Article 15(1) thereof,\n',
 'After consulting the Ecodesign Consultation Forum,\nWhereas:\n(1)\nUnder Directive 2005/32/EC ecodesign require