In [1]:
from gdpr_rag.gdpr_reference_checker import GDPRReferenceChecker
from gdpr_rag.gdpr_reader import GDPRReader

from regulations_rag.regulation_table_of_content import StandardTableOfContent, split_tree

# Reference checker for GDPR citations; it is passed to StandardTableOfContent
# when the table of content is built later in this notebook.
reference_checker = GDPRReferenceChecker()
# Reader for the regulation; later cells pass it to split_tree and call
# get_regulation_detail on it.
reader = GDPRReader()
# Tabular form of the regulation exposed by the reader (iterated row by row
# below, so presumably a pandas DataFrame -- TODO confirm). Columns read later:
# article_heading, article_number, content, section_reference.
df = reader.regulation_df


## Create a DataFrame that can be used to build the ToC
Note that while the GDPR contains chapters, sections and articles, references only use the article numbers.

In [2]:
import pandas as pd

# Flatten the regulation into [text, heading, section_reference] records:
#   * one heading record each time the article heading changes, keyed by the
#     article number (as a string);
#   * one content record for every source row, keyed by its own reference.
df_for_toc_list = []
previous_heading = ""
for _, entry in df.iterrows():
    article_heading = entry["article_heading"]
    if article_heading != previous_heading:
        # First row of a new article: emit its heading before the content.
        previous_heading = article_heading
        df_for_toc_list.append([article_heading, True, str(entry["article_number"])])
    # The content record is emitted unconditionally.
    df_for_toc_list.append([entry["content"], False, entry["section_reference"]])

df_for_toc = pd.DataFrame(
    df_for_toc_list,
    columns=["text", "heading", "section_reference"],
)


In [3]:
# Build the table of content rooted at "GDPR" from the heading/content records
# prepared above, then print the resulting tree for a visual check.
toc = StandardTableOfContent(
    root_node_name="GDPR",
    reference_checker=reference_checker,
    regulation_df=df_for_toc,
)
toc.print_tree()

GDPR []
|-- 1 [Subject-matter and objectives]
|   |-- (1) []
|   |-- (2) []
|   +-- (3) []
|-- 2 [Material scope]
|   |-- (1) []
|   |-- (2) []
|   |   |-- (a) []
|   |   |-- (b) []
|   |   |-- (c) []
|   |   +-- (d) []
|   |-- (3) []
|   +-- (4) []
|-- 3 [Territorial scope]
|   |-- (1) []
|   |-- (2) []
|   |   |-- (a) []
|   |   +-- (b) []
|   +-- (3) []
|-- 4 [Definitions]
|-- 5 [Principles relating to processing of personal data]
|   |-- (1) []
|   |   |-- (a) []
|   |   |-- (b) []
|   |   |-- (c) []
|   |   |-- (d) []
|   |   |-- (e) []
|   |   +-- (f) []
|   +-- (2) []
|-- 6 [Lawfulness of processing]
|   |-- (1) []
|   |   |-- (a) []
|   |   |-- (b) []
|   |   |-- (c) []
|   |   |-- (d) []
|   |   |-- (e) []
|   |   +-- (f) []
|   |-- (2) []
|   |-- (3) []
|   |   |-- (a) []
|   |   +-- (b) []
|   +-- (4) []
|       |-- (a) []
|       |-- (b) []
|       |-- (c) []
|       |-- (d) []
|       +-- (e) []
|-- 7 [Conditions for consent]
|   |-- (1) []
|   |-- (2) []
|   |-- (3) []
| 

In [20]:
# Token budget handed to split_tree (presumably the maximum size of one
# indexed section -- TODO confirm against split_tree's documentation).
SECTION_TOKEN_LIMIT = 1000

# Split the tree starting from the root node, then report how many sections
# came out of the split.
split_df = split_tree(
    node=toc.root,
    regulation_reader=reader,
    table_of_content=toc,
    token_limit=SECTION_TOKEN_LIMIT,
)
print(f"Number of sections to index: {len(split_df)}")

Number of sections to index: 143


In [21]:
# Show only the sections whose reference contains an opening parenthesis,
# e.g. "28(1)" as opposed to a bare article number.
has_parenthesis = split_df["section_reference"].str.contains(r"\(")
split_df.loc[has_parenthesis]

Unnamed: 0,section_reference,text,token_count
27,28(1),28 Processor\n 1. Where processing is to be...,62
28,28(2),28 Processor\n 2. The processor shall not e...,66
29,28(3),28 Processor\n 3. Processing by a processor...,476
30,28(4),28 Processor\n 4. Where a processor engages...,136
31,28(5),28 Processor\n 5. Adherence of a processor ...,63
32,28(6),28 Processor\n 6. Without prejudice to an i...,89
33,28(7),28 Processor\n 7. The Commission may lay do...,46
34,28(8),28 Processor\n 8. A supervisory authority m...,45
35,28(9),28 Processor\n 9. The contract or the other...,33
36,28(10),28 Processor\n 10. Without prejudice to Art...,51


In [25]:
# Spot-check: fetch the text the reader returns for reference "83" and print it.
article_83_text = reader.get_regulation_detail("83")
print(article_83_text)

83 General conditions for imposing administrative fines
    1. Each supervisory authority shall ensure that the imposition of administrative fines pursuant to this Article in respect of infringements of this Regulation referred to in paragraphs 4, 5 and 6 shall in each individual case be effective, proportionate and dissuasive.
    2. Administrative fines shall, depending on the circumstances of each individual case, be imposed in addition to, or instead of, measures referred to in points (a) to (h) and (j) of Article 58(2). When deciding whether to impose an administrative fine and deciding on the amount of the administrative fine in each individual case due regard shall be given to the following:
        (a) the nature, gravity and duration of the infringement taking into account the nature scope or purpose of the processing concerned as well as the number of data subjects affected and the level of damage suffered by them;
        (b) the intentional or negligent character of the inf