In [1]:
# Notebook-wide setup: logging threshold, common imports and the project path.
import logging
# 25 sits between logging.INFO (20) and logging.WARNING (30): INFO and below are
# suppressed, WARNING and above are shown.
log_level = 25
logging.basicConfig(level=log_level) # configures the root logger

import re
import pandas as pd

import sys
# NOTE(review): machine-specific absolute path; presumably the project root that
# makes the `src` / `gdpr_rag` packages imported in later cells resolvable - confirm.
sys.path.append('E:/Code/chat/gdpr')


In [3]:
# Open the source PDF of the recommendations. `fitz` is the import name of PyMuPDF.
import fitz

import os
# NOTE(review): machine-specific absolute path to the folder holding the PDF.
path = 'E:/Code/chat/gdpr/pdf/guidelines'
file_name = 'edpb_recommendations_20221_bcr-c_v2_en.pdf'
# Full title of the document; not referenced again in the cells visible here.
paper_name = 'Recommendations 1/2022 on the Application for Approval and on the elements and principles to be found in Controller Binding Corporate Rules (Art. 47 GDPR)'
full_path = os.path.join(path, file_name)
# `doc` is the open PDF handle passed to output_doc_as_text in the extraction cell.
doc = fitz.open(full_path)

In [4]:
# Dump the PDF to a plain-text file using the project's extraction helper.
# The module is reloaded first so that edits to src/extract_from_pdf.py are
# picked up without restarting the kernel.
import importlib
import src.extract_from_pdf
importlib.reload(src.extract_from_pdf)
from src.extract_from_pdf import output_doc_as_text

# No lines are removed; typographic quotes are normalised to their ASCII forms.
lines_to_delete = []
characters_to_replace = [
    ['“', '"'],
    ['”', '"'],
    ['‘', "'"],
    ['’', "'"],
]
document_text = output_doc_as_text(
    doc,
    start_page = 3,
    end_page = 0,
    header_size = 80,
    footer_size = 60,
    lines_to_delete = lines_to_delete,
    characters_to_replace = characters_to_replace,
)

# Persist the raw extraction as UTF-8 text.
file = "../tmp/article_47_bcr.txt"
with open(file, 'w', encoding='utf-8') as f:
    f.write(document_text)

In [2]:
# Load the markdown version of the document and separate the footnote
# definitions ("[^12]: some text") from the body lines.
#   doc_as_array   -> body lines, in document order
#   notes_as_array -> [note_number, note_text] pairs (note_number is a string)
#   df_notes       -> the same pairs as a DataFrame
file_path = "../../original/article_47_bcr.md"
with open(file_path, 'r', encoding = "utf-8") as file:
    text = file.read()

# Get rid of empty lines. Whitespace-only lines are dropped as well, otherwise
# they would end up as rows that contain nothing but blanks.
lines = [line for line in text.split('\n') if line.strip()]

doc_as_array = []
notes_as_array = []
# A footnote definition is "[^<number>]:" at the start of a line. The number is
# matched with \d+ so that it accepts everything the in-text reference pattern
# (r'\[\^(\d+)\]', used when the footnotes are re-attached) can find.
footnote_pattern = re.compile(r'^\[\^(\d+)\]:(.*)$')
for entry in lines:
    footnote_match = footnote_pattern.match(entry)
    if footnote_match:
        notes_as_array.append([footnote_match.group(1), footnote_match.group(2).strip()])
    else:
        doc_as_array.append(entry)

columns = ["note_number", "text"]
df_notes = pd.DataFrame(notes_as_array, columns = columns)

In [3]:
# Parse the body lines (footnote definitions already removed) into:
#   data       -> rows of [section, point, heading, text, section_reference]
#   table_data -> rows taken from the markdown table of approval criteria, either
#                 [number, title] for a table section heading or
#                 [number, criterion, <the remaining four table columns>]
columns = ["section", "point", "heading", "text", "section_reference"]

section_pattern = re.compile(r'^(\d+)\s+(.+)$')    # number, space, text
point_pattern = re.compile(r'^(\d+)\.\s+(.+)$')    # number, full stop, space, text
part_pattern = re.compile(r'^PART (\d+):(.+)$')    # PART, number, colon, text
annex_pattern = re.compile(r'^ANNEX (\d+):(.+)$')  # ANNEX, number, colon, text

table_pattern = re.compile(r'\|')  # used with .match(), so: line starts with a pipe
table_entry_number_pattern = re.compile(r'(\d+(\.\d+)?(\.\d+)?)\s+(.+)')
table_section = re.compile(r'<td colspan=6>(\d+) - (.+)$')
table_subsection = re.compile(r'<td colspan=6>\s*(.+)\s*')

# Patterns that open a new section, paired with the prefix placed in front of
# the captured number to form the section label.
section_heading_patterns = [
    (section_pattern, ""),
    (part_pattern, "Application. Part "),
    (annex_pattern, "Application. Annex "),
]


def match_section_heading(entry):
    """Return (section_label, heading_text) if entry opens a new section, else None."""
    for pattern, prefix in section_heading_patterns:
        heading_match = pattern.match(entry)
        if heading_match:
            return prefix + heading_match.group(1), heading_match.group(2)
    return None


def parse_table_row(entry):
    """Convert one markdown table line into a table_data row.

    Returns the row as a list, or None when the line is deliberately dropped
    (column header, separator, un-numbered subsection); dropped lines are
    reported with print. Raises AttributeError when the line has an unexpected
    shape.
    """
    components = [component.strip() for component in entry.strip().strip('|').split('|')]
    if len(components) == 5:
        # Regular row: the first cell is "<number> <criterion text>".
        number_match = table_entry_number_pattern.match(components[0])
        if number_match:
            return [number_match.group(1), number_match.group(4)] + components[1:]
        print(f"Row removed from table_data: {components}")
        return None
    if len(components) == 1:
        # Row spanning the whole table: a numbered section or a plain subsection.
        table_section_match = table_section.match(components[0])
        if table_section_match:
            return [table_section_match.group(1), table_section_match.group(2)]
        if table_subsection.match(components[0]):
            print(f"Row removed from table_data: {components}")
            return None
        raise AttributeError(f"A row with one element did not match the required patterns: {entry}")
    raise AttributeError(f"A table entry did not have the correct number of elements: {entry}")


# `section` and `point` carry over from line to line so that plain text lines
# can be tagged with the reference of the heading/point they follow.
section = ""
point = ""

data = []
table_data = []
for entry in doc_as_array:
    section_heading = match_section_heading(entry)
    point_match = point_pattern.match(entry)
    if section_heading is not None:
        section, heading_text = section_heading
        point = ""  # a new section resets the current point
        data.append([section, point, True, heading_text, section])
    elif point_match:
        point = point_match.group(1)
        # Numbered lines inside the "Application ..." parts/annexes are flagged as headings.
        heading = "Application" in section
        data.append([section, point, heading, point_match.group(2), section + "." + point])
    elif table_pattern.match(entry):
        table_row = parse_table_row(entry)
        if table_row is not None:
            table_data.append(table_row)
    else:
        # Plain text: section/point columns stay empty, only the reference is kept.
        section_reference = section + "." + point if point else section
        data.append(["", "", False, entry, section_reference])


df = pd.DataFrame(data, columns = columns)
# Remove my note about the table
df = df[df["text"] != 'Note this table contains a column "References to BCR-C, application form BCR-C, and / or supporting documents[^14]" which is empty in the document because it is supposed to be filled out by the controller']

Row removed from table_data: ['Criteria for BCR-C approval', 'In BCR-C', 'In application form', 'Reference', 'Comments']
Row removed from table_data: ['---', '---', '---', '---', '---']
Row removed from table_data: ['<td colspan=6>Internally']
Row removed from table_data: ['<td colspan=6>Externally']


In [7]:
# Turn the parsed table rows into document rows under section 3, and build one
# "Analysis" entry that lists every criterion together with its reference.
# Row layouts produced by the parsing cell:
#   [number, title]                                   -> table section heading
#   [number, criterion, In BCR-C, In application form,
#    Reference, Comments]                             -> regular table row
columns = ["section", "point", "heading", "text", "section_reference"]
section = '3'

bcr_requirement_comments = []
requirement_lines = []
for row in table_data:
    point = row[0]
    section_reference = section + "." + point
    heading = len(row) == 2
    # Heading rows contribute their title; regular rows contribute the Comments column.
    row_text = row[1] if heading else row[5]
    bcr_requirement_comments.append([section, point, heading, row_text, section_reference])
    if not heading:
        # Criterion text followed by its Reference column in brackets.
        requirement_lines.append(f"{section_reference}: {row[1]} ({row[4]}).\n")

# BCR stands for Binding Corporate Rules (see the document title), not "resolution".
bcr_requirements = (
    "Points that need to be addressed in Binding Corporate Rules for controllers (BCR-C) are:\n"
    + "".join(requirement_lines)
)

# NOTE: Add the table sections as a piece of analysis with its own numbering (not sure how clever this is?)
bcr_requirement_comments.append(["Analysis", "1", False, bcr_requirements, "Analysis 1"])

df_table = pd.DataFrame(bcr_requirement_comments, columns = columns)

combined = pd.concat([df, df_table], ignore_index=True)


In [8]:
# Add footnotes: append the definition of every footnote a row refers to, so
# each row of text is self-contained.
import re

def find_footnote_references(text):
    """Return the footnote numbers referenced in ``text`` as strings.

    A reference looks like ``[^24]``. Each number is returned once, in order of
    first appearance, so a footnote cited twice in a row is expanded only once.
    """
    pattern = r'\[\^(\d+)\]'
    return list(dict.fromkeys(re.findall(pattern, text)))

# Build the note_number -> text lookup once instead of filtering the DataFrame
# for every reference. If a number is defined twice the first definition wins.
unique_notes = df_notes.drop_duplicates(subset="note_number", keep="first")
notes_by_number = dict(zip(unique_notes["note_number"], unique_notes["text"]))

augmented_texts = []
for index, row_text in combined["text"].items():
    augmented_note = row_text
    for note in find_footnote_references(row_text):
        if note not in notes_by_number:
            raise ValueError(f"Row {index} references footnote [^{note}] but no definition was found")
        definition = f"\n[^{note}]: {notes_by_number[note]}"
        # Skip definitions that are already attached, so re-running this cell
        # does not append the same footnote again.
        if definition not in augmented_note:
            augmented_note += definition
    if augmented_note != row_text:
        print(f"Row {index} augmented with footnotes")
    augmented_texts.append(augmented_note)

combined["text"] = augmented_texts

Row 7 augmented with footnotes
Row 9 augmented with footnotes
Row 12 augmented with footnotes
Row 13 augmented with footnotes
Row 14 augmented with footnotes
Row 15 augmented with footnotes
Row 32 augmented with footnotes
Row 60 augmented with footnotes
Row 66 augmented with footnotes
Row 114 augmented with footnotes
Row 120 augmented with footnotes
Row 126 augmented with footnotes
Row 128 augmented with footnotes
Row 139 augmented with footnotes
Row 141 augmented with footnotes
Row 144 augmented with footnotes
Row 145 augmented with footnotes
Row 154 augmented with footnotes


In [9]:

# Disabled manual override, kept for reference:
# doc.document_as_df.at[154, "section_reference"] = "Analysis"

# Write the combined rows as a pipe-delimited UTF-8 file without the index
# column; missing values are written as empty strings.
file = "../../inputs/documents/article_47_bcr.csv"
combined.to_csv(file, encoding = "utf-8", sep="|", index = False, na_rep="")


In [13]:
# Load the generated CSV through the project's document class.
# The project path has to be on sys.path BEFORE the import below; the guard keeps
# re-runs of this cell from adding the same entry again.
# NOTE(review): machine-specific absolute path; presumably the folder that
# contains the gdpr_rag package - confirm.
import sys
project_root = 'E:/Code/chat/gdpr'
if project_root not in sys.path:
    sys.path.append(project_root)

# Reload so that edits to the module are picked up without restarting the kernel.
import importlib
import gdpr_rag.documents.article_47_bcr
importlib.reload(gdpr_rag.documents.article_47_bcr)
from gdpr_rag.documents.article_47_bcr import Article_47_BCR

path_to_manual_as_csv_file = "../../inputs/documents/article_47_bcr.csv"

# Note: this rebinds `doc`, which earlier in the notebook held the open PDF handle.
doc = Article_47_BCR(path_to_manual_as_csv_file)


In [18]:
# Spot-check one section of the loaded document: print its heading chain and
# render its text as markdown.
from IPython.display import Markdown, display

# Section reference to inspect. Other references tried here:
# "1", "1.3", "1.10", "3.3.1", "Application. Annex 1", "Analysis"
section = "3.5.4.1"
print(doc.get_heading(section))
display(Markdown(doc.get_text(section)))


3 ELEMENTS AND PRINCIPLES TO BE FOUND IN BCR-C
3.5 DATA PROTECTION SAFEGUARDS


# 3 ELEMENTS AND PRINCIPLES TO BE FOUND IN BCR-C

## 3.5 DATA PROTECTION SAFEGUARDS

The BCR C shall contain a clear commitment that BCR members will use the BCR-C as a tool for transfers only where they have assessed that the law and practices in the third country of destination applicable to the processing of the personal data by the BCR member acting as data importer, including any requirements to disclose personal data or measures authorising access by public authorities, do not prevent it from fulfilling its obligations under these BCR-C. <br>The BCR-C should further specify that this is based on the understanding that laws and practices that respect the essence of the fundamental rights and freedoms, and do not exceed what is necessary and proportionate in a democratic society[^24] to safeguard one of the objectives listed in Article 23(1) GDPR, are not in contradiction with the BCR-C.<br> The BCR-C should also contain a commitment that, in assessing the laws and practices of the third country which may affect the respect of the commitments contained in the BCR-C, the BCR members have taken due account, in particular, of the following elements: <ol type="i"> <li>The specific circumstances of the transfers or set of transfers, and of any envisaged onward transfers within the same third country or to another third country, including: <ul><li> purposes for which the data are transferred and processed (e.g. marketing, HR, storage, IT support, clinical trials); </li><li> types of entities involved in the processing (the data importer and any further recipient of any onward transfer); </li><li> economic sector in which the transfer or set of transfers occur; </li><li> categories and format of the personal data transferred; </li><li> location of the processing, including storage; and </li><li> transmission channels used. 
</ul> </li><li> The laws and practices of the third country of destination relevant in light of the circumstances of the transfer[^25], including those requiring to disclose data to public authorities or authorising access by such authorities and those providing for access to these data during the transit between the country of the data exporter and the country of the data importer, as well as the applicable limitations and safeguards[^26]. </li><li> Any relevant contractual, technical or organisational safeguards put in place to supplement the safeguards under the BCR-C, including measures applied during the transmission and to the processing of the personal data in the country of destination. </li></ol> The BCR-C should also contain a commitment that where any safeguards in addition to those envisaged under the BCR-C should be put in place, the Liable BCR member(s), and the relevant Privacy officer or Function will be informed and involved in such assessment. <br>The BCR-C should contain also an obligation for the BCR members to document appropriately such assessment, as well as the supplementary measures selected and implemented. They should make such documentation available to the competent SAs upon request. <br>The BCR-C should oblige any BCR member acting as data importer to promptly notify the data exporter if, when using these BCR-C as a tool for transfers, and for the duration of the BCR membership, it has reasons to believe that it is or has become subject to laws or practices that would prevent it from fulfilling its obligations under the BCR-C, including following a change in the laws in the third country or a measure (such as a disclosure request). This information should also be provided to the Liable BCR member(s). <br>Upon verification of such notification, the BCR member acting as data exporter, along with the Liable BCR member(s) and the relevant Privacy officer or Function, should commit to promptly identify supplementary measures (e.g. 
technical or organisational measures to ensure security and confidentiality) to be adopted by the BCR member acting as data exporter and/or data importer, in order to enable them to fulfil their obligations under the BCR-C. The same applies if a BCR member acting as data exporter has reasons to believe that a BCR member acting as its data importer can no longer fulfil its obligations under this BCR-C. <br>Where the BCR member acting as data exporter, along with the Liable BCR member(s) and the relevant Privacy officer or Function, assesses that the BCR-C – even if accompanied by supplementary measures – cannot be complied with for a transfer or set of transfers, or if instructed by the Competent SAs, it commits to suspend the transfer or set of transfers at stake, as well as all transfers for which the same assessment and reasoning would lead to a similar result, until compliance is again ensured or the transfer is ended. <br>The BCR-C should contain a commitment that following such a suspension, the BCR member acting as data exporter has to end the transfer or set of transfers if the BCR C cannot be complied with and compliance with the BCR is not restored within one month of suspension. In this case, personal data that have been transferred prior to the suspension, and any copies thereof, should, at the choice of the BCR member acting as data exporter, be returned to it or destroyed in their entirety. <br>The BCR-C should contain a commitment that the Liable BCR member(s) and the relevant Privacy officer or Function will inform all other BCR members of the assessment carried out and of its results, so that the identified supplementary measures will be applied in case the same type of transfers is carried out by any other BCR member or, where effective supplementary measures could not be put in place, the transfers at stake are suspended or ended. 
<br>The BCR-C needs to include a duty for data exporters to monitor, on an ongoing basis, and where appropriate in collaboration with data importers, developments in the third countries to which the data exporters have transferred personal data that could affect the initial assessment of the level of protection and the decisions taken accordingly on such transfers.

  
[^24]: See EDPB Recommendations 02/2020 on the European Essential Guarantees for surveillance measures.  
[^25]: As regards the assessment of the impact of the laws and practices of the third countries, please see EDPB Recommendations 01/2020 on measures that supplement transfer tools to ensure compliance with the EU level of protection of personal data.  
[^26]: As regards the impact of such laws and practices on compliance with the BCR, different elements may be considered as part of an overall assessment. Such elements may include relevant and documented practical experience with prior instances of requests for disclosure from public authorities, or the absence of such requests, covering a sufficiently representative time-frame. This refers in particular to internal records or other documentation, drawn up on a continuous basis in accordance with due diligence and certified at senior management level, provided that this information can be lawfully shared with third parties. Where this practical experience is relied upon to conclude that the data importer will not be prevented from complying with the BCR, it needs to be supported by other relevant, objective elements, and it is for the BCR members to consider carefully whether these elements together carry sufficient weight, in terms of their reliability and representativeness, to support this conclusion. In particular, the BCR members have to take into account whether their practical experience is corroborated and not contradicted by publicly available or otherwise accessible, reliable information on the existence or absence of requests within the same sector and/or the application of the law in practice, such as case law and reports by independent oversight bodies.

In [126]:
# Spot-check that a section reference from the application form resolves to a heading.
print(doc.get_heading("Application. Part 1"))


Application. Part 1  APPLICANT INFORMATION

