In [148]:
import requests
from bs4 import BeautifulSoup

def extract_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

url='https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679#d1e2182-1-1'
text = extract_content(url)

file_path = "./original/gdrp.txt"
with open(file_path, 'w', encoding = "utf-8") as file:
        file.write(text)

In [2]:
#make manual adjustments

file_path = "./original/gdpr.txt"
with open(file_path, 'r', encoding = "utf-8") as file:
    text = file.read()
    
# 1) Text in Article 50 jumps directly to minor indexes without major indexes
text_in_regs = "In relation to third countries and international organisations, the Commission and supervisory authorities shall take appropriate steps to:"
replacement_text = "1.   In relation to third countries and international organisations, the Commission and supervisory authorities shall take appropriate steps to:"
text = text.replace('text_in_regs', 'replacement_text')

with open(file_path, 'w', encoding = "utf-8") as file:
    file.write(text)



In [1]:
import re

def replace_newlines(text):
    '''
    Replaces multiple newline characters with a single newline character
    '''
    return re.sub(r'\n+', '\n', text)

def replace_nonbreaking_space(text):
    return text.replace('\xa0', ' ')

def reformat_bullets(text):
    return text.replace('—\n', '- ')

    
def reformat_lines(text):
    '''
    This function takes a string as input and searches for patterns where a line starts with "(",
    followed by a letter or a 1, 2 or 3 digit number, then ")", and finally a newline character "\n".
    It replaces these newline characters with a space " ", effectively joining such lines with
    the following text. This reformatting is applied throughout the entire input string.
    '''
    return re.sub(r'\((\d{1,3}|[A-Za-z])\)\n', r'(\1) ', text)

In [4]:
file_path = "../original/gdpr.txt"
with open(file_path, 'r', encoding = "utf-8") as file:
    text = file.read()

In [5]:
regs = reformat_lines(reformat_bullets(replace_nonbreaking_space(replace_newlines(text))))

In [6]:
def parse_recitals(text, recitals_start, recital_end):
    lines = text.split('\n')
    start_parsing = False
    recitals = []
    current_recital = {}

    for line in lines:
        if not start_parsing:
            if line == recitals_start:
                start_parsing = True
            continue
        
        if line == recital_end:
            break

        match = re.match(r'\((\d{1,3})\)(.*)', line)
        if match:
            if current_recital:
                recitals.append(current_recital)
            current_recital = {'recital_number': int(match.group(1)), 'recital_text': match.group(2).strip()}
        else:
            if current_recital:
                current_recital['recital_text'] += '\n' + line.strip()

    if current_recital:
        recitals.append(current_recital)
    
    return recitals



In [7]:
recitals_start = "Whereas:"
recitals_end = "HAVE ADOPTED THIS REGULATION:"
recitals = parse_recitals(regs, recitals_start, recitals_end)

In [8]:
import pandas as pd
recitals_df = pd.DataFrame(recitals)
recitals_df.rename(columns={'recital_text': 'text'}, inplace=True)
recitals_file = "./tmp/recitals_gdpr_with_embedding.parquet"
recitals_df.to_parquet(recitals_file, engine = 'pyarrow')
recitals_df

Unnamed: 0,recital_number,text
0,1,The protection of natural persons in relation ...
1,2,"The principles of, and rules on the protection..."
2,3,Directive 95/46/EC of the European Parliament ...
3,4,The processing of personal data should be desi...
4,5,The economic and social integration resulting ...
...,...,...
168,169,The Commission should adopt immediately applic...
169,170,"Since the objective of this Regulation, namely..."
170,171,Directive 95/46/EC should be repealed by this ...
171,172,The European Data Protection Supervisor was co...


In [38]:
# Note: this is very bespoke. I need to ensure I do not impact the formatting of the Article and Chapters so that 
#       these can be parsed in line with the others
def extract_definitions(regs_with_definitions, definitions_start, definitions_end):
    lines = regs_with_definitions.split('\n')
    start_index = lines.index(definitions_start) + 1
    end_index = lines.index(definitions_end)

    definitions = lines[start_index:end_index]
    definitions = definitions.replace('‘', "'")
    definitions = definitions.replace('’', "'")



    remaining_text = lines[:start_index] + ["definitions"] + lines[end_index:] # the line "Definitions" will appear twice - once for the heading and once for the content
    #remaining_text = lines[:start_index] + definitions + lines[end_index:] # the line "Definitions" will appear twice - once for the heading and once for the content
    regs = "\n".join(remaining_text)

    definitions = "\n".join(definitions)

    return definitions, regs



In [39]:
import re
import pandas as pd

definitions_start = "Definitions"
definitions_end = "CHAPTER II"

definitions, regs_without_definitions = extract_definitions(regs, definitions_start, definitions_end)
definitions = definitions.replace('‘', "'")
definitions = definitions.replace('’', "'")
definitions = re.split(r'\n\(\d{1,2}\) ', definitions)

definitions = definitions[1:]

df_definitions = pd.DataFrame(definitions, columns = ["text"])
df_definitions['term'] = df_definitions['text'].apply(lambda x: re.search(r"'(.*?)'", x).group(1))

definitions_file = "./tmp/definitions_gdpr_with_embedding.parquet"
df_definitions.to_parquet(definitions_file, engine = 'pyarrow')


df_definitions


OSError: Cannot save file into a non-existent directory: 'tmp'

In [30]:
chapter_number, chapter_heading, section_number, section_heading, article_number, article_heading = "I", "General provisions", "", "", "4", "Definitions"

major_reference = 0	
minor_reference = ""
content = ""
section_reference = ""

formatted = []
counter = 1
for dfn in definitions:
    major_reference = counter
    minor_reference = ""
    content = dfn
    section_reference = f"4({counter})"

    formatted.append([chapter_number, chapter_heading, section_number, section_heading, article_number, article_heading, major_reference, minor_reference, content, section_reference])
    counter = counter + 1

df_definitions = pd.DataFrame(formatted, columns = ["chapter_number", "chapter_heading", "section_number", "section_heading", "article_number", "article_heading", "major_reference", "minor_reference", "content", "section_reference"])
df_definitions
#print(regs_without_definitions)

Unnamed: 0,chapter_number,chapter_heading,section_number,section_heading,article_number,article_heading,major_reference,minor_reference,content,section_reference
0,I,General provisions,,,4,Definitions,1,,'personal data' means any information relating...,4(1)
1,I,General provisions,,,4,Definitions,2,,'processing' means any operation or set of ope...,4(2)
2,I,General provisions,,,4,Definitions,3,,'restriction of processing' means the marking ...,4(3)
3,I,General provisions,,,4,Definitions,4,,'profiling' means any form of automated proces...,4(4)
4,I,General provisions,,,4,Definitions,5,,'pseudonymisation' means the processing of per...,4(5)
5,I,General provisions,,,4,Definitions,6,,'filing system' means any structured set of pe...,4(6)
6,I,General provisions,,,4,Definitions,7,,'controller' means the natural or legal person...,4(7)
7,I,General provisions,,,4,Definitions,8,,"'processor' means a natural or legal person, p...",4(8)
8,I,General provisions,,,4,Definitions,9,,"'recipient' means a natural or legal person, p...",4(9)
9,I,General provisions,,,4,Definitions,10,,"'third party' means a natural or legal person,...",4(10)


In [6]:
from regulations_rag.regulation_reader import load_csv_data

df = load_csv_data("../inputs/documents/gdpr.csv")

In [11]:
df
df[df["article_number"] == 4]

Unnamed: 0,chapter_number,chapter_heading,section_number,section_heading,article_number,article_heading,major_reference,minor_reference,content,section_reference
0,I,General provisions,,,4,Definitions,1,,'personal data' means any information relating...,4(1)
1,I,General provisions,,,4,Definitions,2,,'processing' means any operation or set of ope...,4(2)
2,I,General provisions,,,4,Definitions,3,,'restriction of processing' means the marking ...,4(3)
3,I,General provisions,,,4,Definitions,4,,'profiling' means any form of automated proces...,4(4)
4,I,General provisions,,,4,Definitions,5,,'pseudonymisation' means the processing of per...,4(5)
5,I,General provisions,,,4,Definitions,6,,'filing system' means any structured set of pe...,4(6)
6,I,General provisions,,,4,Definitions,7,,'controller' means the natural or legal person...,4(7)
7,I,General provisions,,,4,Definitions,8,,"'processor' means a natural or legal person, p...",4(8)
8,I,General provisions,,,4,Definitions,9,,"'recipient' means a natural or legal person, p...",4(9)
9,I,General provisions,,,4,Definitions,10,,"'third party' means a natural or legal person,...",4(10)


In [12]:
df.to_csv("../inputs/documents/gdpr.csv", sep="|", encoding = "utf-8", index = False)

In [10]:
df = df.drop(index=42)
df = df.reset_index(drop=True)

In [31]:
combined = pd.concat([df_definitions, df], ignore_index=False)

In [33]:
combined.to_csv("../inputs/documents/gdpr.csv", sep="|", encoding = "utf-8", index = False)

In [40]:

def process_regulations(regs, content_start, content_ends):
    lines = regs.split('\n')
    processing = False
    skip_next_line = False  # Flag to skip the next line after a heading
    entries = []

    # Note Chapter and Section details do not form part of the full references used in GDPR. References start at the article 
    chapter_number, chapter_heading, section_number, section_title, article_number, article_heading = "", "", "", "", "", ""
    major_reference, minor_reference = "", ""

    for i, line in enumerate(lines):
        if line == content_ends:
            break
        if processing:
            if skip_next_line:
                skip_next_line = False
                continue  # Skip this line as it's a heading continuation
            if line.startswith(('CHAPTER', 'Section', 'Article')):
                parts = line.split(' ', 1)
                number = parts[1].strip() if len(parts) > 1 else ""
                skip_next_line = True  # Next line will be the heading, so skip it

                if line.startswith('CHAPTER'):
                    chapter_number = number
                    chapter_heading = lines[i + 1]
                    section_number, section_title, article_number, article_heading = "", "", "", ""
                    major_reference, minor_reference = "", ""
                elif line.startswith('Section'):
                    section_number = number
                    section_title = lines[i + 1]
                    article_number, article_heading = "", ""
                    major_reference, minor_reference = "", ""
                elif line.startswith('Article'):
                    article_number = number
                    article_heading = lines[i + 1]
                    major_reference, minor_reference = "", ""

                continue

            match_index = re.match(r'^(\d+)\.', line)
            match_letter = re.match(r'^\(([a-z])\)', line)
            #if not (match_index or match_letter):
                #raise Exception(f"Line {i+1} does not start with a valid index: {line}")

            if match_index:
                major_reference = match_index.group(1)
                minor_reference = ""  # Reset minor reference on major change
                content = line[match_index.end():].strip()
            elif match_letter:
                minor_reference = match_letter.group(1)
                content = line[match_letter.end():].strip()
            else:
                minor_reference = ""
                content = line.strip()

            entries.append({
                "chapter_number": chapter_number, "chapter_heading": chapter_heading,
                "section_number": section_number, "section_heading": section_title,
                "article_number": article_number, "article_heading": article_heading,
                "major_reference": major_reference, "minor_reference": minor_reference, 
                "text": content
            })

        elif line == content_start:
            processing = True

    return entries




In [41]:
content_start = "HAVE ADOPTED THIS REGULATION:" 
content_end = "This Regulation shall be binding in its entirety and directly applicable in all Member States."
processed_regs = process_regulations(regs_without_definitions, content_start, content_end)


In [42]:
import pandas as pd
df = pd.DataFrame(processed_regs)

In [43]:
df[['article_number',
 'article_heading',
 'major_reference',
 'minor_reference',
 'text']]

Unnamed: 0,article_number,article_heading,major_reference,minor_reference,text
0,1,Subject-matter and objectives,1,,This Regulation lays down rules relating to th...
1,1,Subject-matter and objectives,2,,This Regulation protects fundamental rights an...
2,1,Subject-matter and objectives,3,,The free movement of personal data within the ...
3,2,Material scope,1,,This Regulation applies to the processing of p...
4,2,Material scope,2,,This Regulation does not apply to the processi...
...,...,...,...,...,...
781,97,Commission reports,4,,In carrying out the evaluations and reviews re...
782,97,Commission reports,5,,"The Commission shall, if necessary, submit app..."
783,98,Review of other Union legal acts on data prote...,,,"The Commission shall, if appropriate, submit l..."
784,99,Entry into force and application,1,,This Regulation shall enter into force on the ...


In [30]:
import importlib
import gdpr_rag.gdpr_reference_checker
importlib.reload(gdpr_rag.gdpr_reference_checker)
from gdpr_rag.gdpr_reference_checker import GDPRReferenceChecker

#def add_section_reference(df):
df["section_reference"] =  df["article_number"].astype(str) + \
                    df["major_reference"].apply(lambda x: "(" + x + ")" if x != "" else "") + \
                    df["minor_reference"].apply(lambda x: "(" + x + ")" if x != "" else "")


reference_checker = GDPRReferenceChecker()
for index, row in df.iterrows():
    if not index_checker.is_valid(reference=row["section_reference"]):
        print(f"Row {index} does not have a valid index")


In [31]:
from regulations_rag.data_in_dataframes import required_columns_regulation
required_columns_regulation

['indent', 'reference', 'text', 'heading', 'section_reference', 'word_count']

In [10]:
processed_regs_file = "./regs/gdpr.csv"
df.to_csv(processed_regs_file, sep = "|", encoding = "utf-8", index = False)

In [16]:
import csv

# Open and read the first CSV file
with open("./regs/gdpr.csv", 'r', encoding='utf-8') as file1:
    #reader1 = csv.reader(file1, sep = "|", encoding = "utf-8")
    reader1 = csv.reader(file1, delimiter='|')
    data1 = list(reader1)

# Open and read the second CSV file
with open("./regs/gdpr copy.csv", encoding='utf-8') as file2:
    reader2 = csv.reader(file2, delimiter='|')
    data2 = list(reader2)

# Compare line by line
differences = []
for line1, line2 in zip(data1, data2):
    if line1 != line2:
        differences.append((line1, line2))

# differences now contains tuples of lines that differ between the two files

In [17]:
differences

[]