For Search Results: Fields that contain helpful information

- CELEX number
- EUROVOC descriptor
- Subject matter
- Directory code
- Dates (multile fields possible)

In [1]:
import pandas as pd

serachResults_df = pd.read_csv('Search_results_20240102.csv')

celex_numbers = serachResults_df["CELEX number"].tolist()
celex_numbers = [s.replace("(", "%28").replace(")", "%29") for s in celex_numbers]
download_urls = set()

for number in celex_numbers:
    download_urls.add('https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:' + number)

# remove urls there the File does not exist
download_urls = list(download_urls.difference(set([
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:22006A1216%2804%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:22003A0624%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:22002A1127%2802%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31971G0055",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41967A0228%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41971X0056",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31972Y1011%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21959A1006%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41964A0430%2801%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21959A1006%2802%29",
    "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21961A0126%2801%29",
])))

download_urls[:5], len(download_urls)

(['https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31966S0022',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31984Y0620%2802%29',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32022R2576',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32015R1188',
  'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31974Y0614%2801%29'],
 537)

In [2]:
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from collections import Counter

MIN_PARAGRAPH_LEN = 20

def getHtmlText(url):
    # Fetch HTML content from the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch HTML content. Status code: {response.status_code}")

def getParagraphs(text):
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Extract paragraphs using the find_all method
    paragraphs = soup.find_all('p')

    return [(p.get_text().replace("\xa0", " "), p.sourceline, p.sourcepos) for p in paragraphs]

def getParagraphsFilteredByTables(text, tables):
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')

    # Extract paragraphs using the find_all method
    paragraphs = soup.find_all('p')
    res = []

    nextTable = 0

    for p in paragraphs:
        p_text = p.get_text().replace("\xa0", " ").replace("\n", " ").strip()
        if len(p_text) >= MIN_PARAGRAPH_LEN:
            if nextTable == len(tables):
                res.append((p_text, p.sourceline, p.sourcepos))
            elif p.sourceline < tables[nextTable][0].sourceline:
                res.append((p_text, p.sourceline, p.sourcepos))
            elif p.find_parents('table', class_=lambda c: classFunc(c)):
                continue
            else:
                res.append((p_text, p.sourceline, p.sourcepos))
                nextTable += 1
    return res

def classFunc(c):
    # tables without a class are just numerations in normal text, so we have to exclude them here
    if c != None:
        return len(c) > 0
    else:
        return False

def getTables(text):
    soup = BeautifulSoup(text, 'html.parser')

    # Extract real tables using the find_all method
    tables = soup.find_all('table', class_=lambda c: classFunc(c))
    return [(table, table.sourceline, table.sourcepos) for table in tables]

def processHtml(url):
    htmlText = getHtmlText(url)
    soup = BeautifulSoup(htmlText, 'html.parser')
    tables = getTables(htmlText)
    filteredPs = getParagraphsFilteredByTables(htmlText, tables)

    tableSentences = []
    for j, table in enumerate(tables):
        tableClass = table[0].get('class')[0]
        rows = table[0].find_all('tr')
        rowInputs = []
        hasHeaderLine = False
        for i, row in enumerate(rows):
            if i == 0:
                if tableClass == 'oj-table':
                    row_classes = [element.get('class') for element in row.descendants if isinstance(element, type(soup.new_tag('')))]
                    if 'oj-tbl-hdr' in row_classes:
                        hasHeaderLine = True
                if tableClass == 'table':
                    row_classes = [element.get('class') for element in row.descendants if isinstance(element, type(soup.new_tag('')))]
                    if 'tbl-hdr' in row_classes:
                        hasHeaderLine = True
            cells = row.find_all('td')
            rowInputs.append(cells)
        if hasHeaderLine:
            for row in range(1, len(rowInputs)):
                sentence = "The " + str(rowInputs[0])
        else:
            print("error")

    return filteredPs, tables


def saveToFile(url,title):
    # Fetch HTML content from the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        f = open(title + ".html", "w")
        f.write(response.text)
    else:
        print(f"Failed to fetch HTML content. Status code: {response.status_code}")

In [None]:
hrSet = set()
for url in download_urls:
    htmlText = getHtmlText(url)
    soup = BeautifulSoup(htmlText, 'html.parser')
    sepCount = 0
    docSepCount = 0
    noteCount = 0
    endCount = 0
    hrs = soup.findAll('hr')
    for hr in hrs:
        # if hr in ['separator', 'oj-separator']:
        #     sepCount += 1
        # elif hr in ['doc-sep', 'oj-doc-sep']:
        #     docSepCount += 1
        # elif hr in ['note', 'oj-note']:
        #     noteCount += 1
        # elif hr in ['doc-end', 'oj-doc-end']:
        #     endCount += 1
        if hr.get('class') != None:
            hrSet.update(hr.get('class'))
        else:
            print(url)
            hrSet.add('None')
hrSet

https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31971G0055
https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41967A0228%2801%29
https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41971X0056
https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31972Y1011%2801%29
https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21959A1006%2801%29
https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:41964A0430%2801%29
https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21959A1006%2802%29
https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:21961A0126%2801%29


{'None',
 'doc-end',
 'doc-sep',
 'note',
 'oj-doc-end',
 'oj-doc-sep',
 'oj-note',
 'oj-separator',
 'separator'}

In [23]:
%pip install lxml



In [24]:
import pandas as pd

df = pd.DataFrame(columns=['text', 'section', 'article', 'number1', 'number2', 'number3', 'number4', 'number5', 'number6', 'CELEX number'])

In [41]:
import re

def processTreeLike(url):
    index_of_colon = url.rfind(':')
    substring_after_colon = url[index_of_colon + 1:].strip()
    celexNumber = substring_after_colon.replace("%28", "(").replace("%29", ")")
    current = {
        'section': None,
        'article': None,
        'number1': None,
        'number2': None,
        'number3': None,
        'number4': None,
        'number5': None,
        'number6': None,
        'searchingSectionName': False,
        'status': 'Waiting',
        'enumerationTableCount': 0,
    }

    htmlText = getHtmlText(url)
    soup = BeautifulSoup(htmlText, 'html.parser')

    root = soup.find('body')


    def remove_empty_tr(html_table):
        # Remove empty <tr> tags with or without whitespace or newline characters
        cleaned_html = re.sub(r'<tr>\s*</tr>', '', html_table, flags=re.DOTALL)
        return cleaned_html

    def handleTable(htmlTable):
        global df
        hasHeader = False
        htmlTable = BeautifulSoup(remove_empty_tr(str(htmlTable)), 'html.parser')
        first_row = htmlTable.find('tr')
        ths = first_row.find_all('th')
        if first_row and ths != None and len(ths) > 0:
            hasHeader = True
        else:
            def isHeaderP(c):
                return c in ['tbl-hdr', 'oj-tbl-hdr']
            hdrPs = htmlTable.find('p', class_=lambda c: isHeaderP(c))
            if hdrPs:
                hasHeader = True
        try:
            pd.read_html(str(htmlTable), header=0)[0].iterrows()
        except Exception as e:
            # There is a table in 32014R0065 with only images that will crash it
            return
        if hasHeader:
            for _, row in pd.read_html(str(htmlTable), header=0)[0].iterrows():
                sentence = f"There is a correlation between: "
                sentence += ", ".join([f"{col}: '{value}'" for col, value in row.items() if not pd.isna(value)])
                new_row = {
                    'text': sentence,
                    'section': current['section'],
                    'article': current['article'],
                    'number1': current['number1'],
                    'number2': current['number2'],
                    'number3': current['number3'],
                    'number4': current['number4'],
                    'number5': current['number5'],
                    'number6': current['number6'],
                    'CELEX number': celexNumber,
                }
                new_df = pd.DataFrame([new_row], columns=df.columns)
                df = pd.concat([df, new_df], ignore_index=True)
        else:
            for _, row in pd.read_html(str(htmlTable))[0].iterrows():
                sentence = f"There is a correlation between: "
                sentence += ", ".join([f"'{value}'" for _, value in row.items() if not pd.isna(value)])
                new_row = {
                    'text': sentence,
                    'section': current['section'],
                    'article': current['article'],
                    'number1': current['number1'],
                    'number2': current['number2'],
                    'number3': current['number3'],
                    'number4': current['number4'],
                    'number5': current['number5'],
                    'number6': current['number6'],
                    'CELEX number': celexNumber,
                }
                new_df = pd.DataFrame([new_row], columns=df.columns)
                df = pd.concat([df, new_df], ignore_index=True)


    def extract_numerization_info(s):
        # Define a regular expression pattern for numerization
        numerization_pattern = re.compile(r'^(\d+)\.\s')

        # Use the pattern to check and extract the number
        match = re.match(numerization_pattern, s)
        if match:
            number = int(match.group(1))
            return True, number
        else:
            return False, None

    def isEnumberationInBrackets(s):
        # Define a regular expression pattern for the specified format
        pattern = re.compile(r'^\([a-zA-Z]+\)$')
        if s in ['(IRENA)','(recast)', '(signed)', '(watts)']:
            return False
        if len(s) > 0 and s[0] == "‘" and bool(re.match(pattern, s[1:])):
            return True
        # Use the pattern to check if the string matches the desired format
        return bool(re.match(pattern, s))

    def extract_article_number(text):
        # Define a regular expression pattern for the specified format
        pattern = re.compile(r'^Article (\d+)$')

        # Use the pattern to check and extract the number
        match = re.match(pattern, text)
        if match:
            number = int(match.group(1))
            return True, number
        else:
            return False, None

    def recursivePass(htmlElement):
        for child in htmlElement.children:
            if child.name:
                if child.name == 'hr' and child.get('class') != None:
                    hrClass = child.get('class')[0]
                    if hrClass in ['separator', 'oj-separator']:
                        current['section'] = None
                        current['article'] = None
                        current['number1'] = None
                        current['number2'] = None
                        current['number3'] = None
                        current['number4'] = None
                        current['number5'] = None
                        current['number6'] = None
                        current['status'] = 'Normal'
                        current['section'] = "Document"
                    elif hrClass in ['doc-sep', 'oj-doc-sep']:
                        current['section'] = None
                        current['article'] = None
                        current['number1'] = None
                        current['number2'] = None
                        current['number3'] = None
                        current['number4'] = None
                        current['number5'] = None
                        current['number6'] = None
                        current['status'] = 'Normal'
                        current['searchingSectionName'] = True
                    elif hrClass in ['note', 'oj-note']:
                        current['status'] = 'Footnote'
                    elif hrClass in ['doc-end', 'oj-doc-end']:
                        current['status'] = 'Waiting'

                elif current['status'] == 'Normal':
                    if child.name == 'p':
                        text = child.get_text().replace("\xa0", " ").replace("\n", " ").strip()
                        isNewNumber, number = extract_numerization_info(text)
                        isArticleNumber, articleNumber = extract_article_number(text)
                        if current['searchingSectionName']:
                            current['section'] = text
                            current['searchingSectionName'] = False
                        elif isNewNumber:
                            current['number1'] = number
                            current['number2'] = None
                            current['number3'] = None
                            current['number4'] = None
                            current['number5'] = None
                            current['number6'] = None
                        elif isEnumberationInBrackets(text):
                            if current['enumerationTableCount'] == 1:
                                current['number2'] = text
                                current['number3'] = None
                                current['number4'] = None
                                current['number5'] = None
                                current['number6'] = None
                            elif current['enumerationTableCount'] == 2:
                                current['number3'] = text
                                current['number4'] = None
                                current['number5'] = None
                                current['number6'] = None
                            elif current['enumerationTableCount'] == 3:
                                current['number4'] = text
                                current['number5'] = None
                                current['number6'] = None
                            elif current['enumerationTableCount'] == 4:
                                current['number5'] = text
                                current['number6'] = None
                            elif current['enumerationTableCount'] == 5:
                                current['number6'] = text
                            else:
                                print('error: Unexpected enumerationTableCount')
                        elif isArticleNumber:
                            current['article'] = articleNumber
                            current['number1'] = None
                            current['number2'] = None
                            current['number3'] = None
                            current['number4'] = None
                            current['number5'] = None
                            current['number6'] = None
                        else:
                            new_row = {
                                'text': text,
                                'section': current['section'],
                                'article': current['article'],
                                'number1': current['number1'],
                                'number2': current['number2'],
                                'number3': current['number3'],
                                'number4': current['number4'],
                                'number5': current['number5'],
                                'number6': current['number6'],
                                'CELEX number': celexNumber,
                            }
                            global df
                            new_df = pd.DataFrame([new_row], columns=df.columns)

                            # Concatenate the existing DataFrame and the new DataFrame
                            df = pd.concat([df, new_df], ignore_index=True)
                    elif child.name == 'table' and child.get('class') != None:
                        handleTable(child)
                    else:
                        if child.name == 'table':
                            current['enumerationTableCount'] += 1
                        recursivePass(child)
                        if child.name == 'table':
                            current['enumerationTableCount'] -= 1
                elif current['status'] == 'Footnote':
                    continue #TODO for now just ignoring footnotes
    recursivePass(root)

    return celexNumber

def processAllPsInside(url, celexNumber):
    htmlText = getHtmlText(url)
    soup = BeautifulSoup(htmlText, 'html.parser')

    for p in soup.findAll('p'):
        new_row = {
            'text': p.get_text().replace("\xa0", " ").replace("\n", " ").strip(),
            'section': None,
            'article': None,
            'number1': None,
            'number2': None,
            'number3': None,
            'number4': None,
            'number5': None,
            'number6': None,
            'CELEX number': celexNumber,
        }
        global df
        new_df = pd.DataFrame([new_row], columns=df.columns)

        # Concatenate the existing DataFrame and the new DataFrame
        df = pd.concat([df, new_df], ignore_index=True)

for url in tqdm(download_urls):
    celexNumber = processTreeLike(url)
    if celexNumber not in df['CELEX number'].values:
        processAllPsInside(url, celexNumber)
    if celexNumber not in df['CELEX number'].values:
        print("error: Unprocessed " + url)


print('files processed:' + str(df['CELEX number'].nunique()) + " of " + str(len(download_urls)))
df

100%|██████████| 537/537 [24:35<00:00,  2.75s/it]

files processed:537 of 537





Unnamed: 0,text,section,article,number1,number2,number3,number4,number5,number6,CELEX number,embedding
0,Avis juridique important,,,,,,,,,31966S0022,"[-0.58367527, -0.6265276, -0.39014927, 0.05694..."
1,ECSC High Authority: Decision No 22/66 of 16 N...,,,,,,,,,31966S0022,"[-0.43895453, -0.36127993, -0.17728768, 0.0759..."
2,DECISION No 22-66 of 16 November 1966 on infor...,,,,,,,,,31966S0022,"[0.17323147, -0.20722093, -0.06914423, 0.02820..."
3,DECISION No 22-66 of 16 November 1966 on infor...,,,,,,,,,31966S0022,"[-0.111640394, -0.23657109, 0.033515215, 0.345..."
4,"THE HIGH AUTHORITY,",,,,,,,,,31966S0022,"[0.29226238, -0.35313487, 0.0027079135, 0.2969..."
...,...,...,...,...,...,...,...,...,...,...,...
107811,There is a correlation between: Regulation No ...,ANNEX II,,,,,,,,32006R0066,
107812,There is a correlation between: Regulation No ...,ANNEX II,,,,,,,,32006R0066,
107813,There is a correlation between: Regulation No ...,ANNEX II,,,,,,,,32006R0066,
107814,There is a correlation between: Regulation No ...,ANNEX II,,,,,,,,32006R0066,


In [27]:
df

Unnamed: 0,text,section,article,number1,number2,number3,number4,number5,number6,CELEX number
0,Avis juridique important,,,,,,,,,31966S0022
1,ECSC High Authority: Decision No 22/66 of 16 N...,,,,,,,,,31966S0022
2,DECISION No 22-66 of 16 November 1966 on infor...,,,,,,,,,31966S0022
3,DECISION No 22-66 of 16 November 1966 on infor...,,,,,,,,,31966S0022
4,"THE HIGH AUTHORITY,",,,,,,,,,31966S0022
...,...,...,...,...,...,...,...,...,...,...
9321,when considered necessary by the executive off...,ANNEX,14,1,(a),,,,,32008D0114
9322,"at the request of the Director General, in par...",ANNEX,14,1,(b),,,,,32008D0114
9323,at the request in writing of not less than one...,ANNEX,14,1,(c),,,,,32008D0114
9324,The agenda shall be prepared by the Agency in ...,ANNEX,14,1,(c),,,,,32008D0114


In [28]:
import pandas as pd
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to get spaCy embedding for a text string
def get_embedding(text):
    doc = nlp(text)
    # Use the vector for the entire document
    return doc.vector

# Apply the function to the 'text' column and create a new 'embedding' column
df['embedding'] = df['text'].apply(get_embedding)

# Display the DataFrame with embeddings
print(df[['text', 'embedding']])


                                                   text  \
0                              Avis juridique important   
1     ECSC High Authority: Decision No 22/66 of 16 N...   
2     DECISION No 22-66 of 16 November 1966 on infor...   
3     DECISION No 22-66 of 16 November 1966 on infor...   
4                                   THE HIGH AUTHORITY,   
...                                                 ...   
9321  when considered necessary by the executive off...   
9322  at the request of the Director General, in par...   
9323  at the request in writing of not less than one...   
9324  The agenda shall be prepared by the Agency in ...   
9325  The Agency shall send documents relating to th...   

                                              embedding  
0     [-0.58367527, -0.6265276, -0.39014927, 0.05694...  
1     [-0.43895453, -0.36127993, -0.17728768, 0.0759...  
2     [0.17323147, -0.20722093, -0.06914423, 0.02820...  
3     [-0.111640394, -0.23657109, 0.033515215, 0.345...  
4

In [29]:
!pip install faiss-cpu



In [40]:
import numpy as np
import faiss

# Find the maximum length of embeddings
max_length = df['embedding'].apply(len).max()
print(max_length)

# Pad or truncate embeddings to a fixed size
df['embedding'] = df['embedding'].apply(lambda emb: np.pad(emb, (0, max_length - len(emb)), 'constant'))

# Convert the list of embeddings to a 2D numpy array
embeddings_array = np.vstack(df['embedding'].to_numpy())

# Check if there are valid embeddings
if embeddings_array.shape[0] == 0:
    print("No valid embeddings found.")
else:
    # Print the embeddings
    print(embeddings_array)
    print(embeddings_array.shape)

    # Initialize Faiss index
    index = faiss.IndexFlatL2(embeddings_array.shape[1])

    # Add embeddings to the index
    index.add(embeddings_array)



# Save Faiss index and embeddings_array for future use
#faiss.write_index(index, '/content/drive/MyDrive/faiss_index.index')
#np.save('/content/drive/MyDrive/faiss_embeddings.npy', embeddings_array)


300
[[-0.58367527 -0.6265276  -0.39014927 ...  0.          0.
   0.        ]
 [-0.43895453 -0.36127993 -0.17728768 ...  0.          0.
   0.        ]
 [ 0.17323147 -0.20722093 -0.06914423 ...  0.          0.
   0.        ]
 ...
 [ 0.1837628   0.1001529   0.06646074 ...  0.          0.
   0.        ]
 [ 0.15063582 -0.31780526 -0.02039889 ...  0.          0.
   0.        ]
 [ 0.09893484 -0.0559872  -0.07965943 ...  0.          0.
   0.        ]]
(9326, 300)


In [38]:
import numpy as np
import faiss
import spacy

# Load pre-trained spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to get the spaCy embedding for a given text
def get_embedding_from_model(text):
    doc = nlp(text)
    # Use the vector for the entire document
    if len(doc) > 0:
        return doc.vector
    else:
        return None

# Provide a query text
query_text = "According to FINAL TEST METHOD FOR DISPLAYS, what is the Maximum Total Harmonic Distortion for the market of North America and Taiwan?"

# Get the embedding for the query text
query_embedding = get_embedding_from_model(query_text)

# Check if the dimensions match
if query_embedding is not None and len(query_embedding) == index.d:
    # Convert the query embedding to a numpy array
    query_embedding_array = np.array([query_embedding])

    # Perform similarity search for the query
    distances, result_indices = index.search(query_embedding_array, k=5)

    # Display the results
    print("Query Text:")
    print(query_text)
    print("\nSimilar Documents:")

    # 'result_indices' is the list of indices obtained from the similarity search
    for distance, idx in zip(distances[0], result_indices[0]):
        print(f"\nDocument Index: {idx}")
        print("Similarity Score:", 1 - distance)
        print("\n")
        # print(texts[idx].page_content)
else:
    print("Error: Query embedding dimensions do not match Faiss index dimensions.")


Error: Query embedding dimensions do not match Faiss index dimensions.
