In [1]:
import logging
# Level 25 lies between logging.INFO (20) and logging.WARNING (30), so the root
# logger drops DEBUG and INFO records.
log_level = 25
logging.basicConfig(level=log_level) # root logger

import re
import pandas as pd

import sys
# Extend the import path; presumably so the project-local packages imported in
# later cells (src.*, gdpr_rag.*) resolve.
# NOTE(review): absolute, machine-specific path.
sys.path.append('E:/Code/chat/gdpr')


In [2]:
import fitz

import os
# Location of the source PDF.
# NOTE(review): absolute, machine-specific path.
path = 'E:/Code/chat/gdpr/pdf/guidelines'
file_name = 'wp242_rev_01_en_D8A6FCF6-9039-846A-0C8040819826D818_44099.pdf'
paper_name = "Guidelines on the right to data portability"  # not referenced again in the visible cells
full_path = os.path.join(path, file_name)
# Opened document object; the extraction cell passes it to output_doc_as_text.
doc = fitz.open(full_path)

In [4]:
import importlib
import src.extract_from_pdf
importlib.reload(src.extract_from_pdf)
from src.extract_from_pdf import output_doc_as_text

# No lines are removed for this document.
lines_to_delete = []

# [source, replacement] pairs: typographic quotation marks are mapped to their
# plain ASCII equivalents.
characters_to_replace = [
    ['“', '"'],
    ['”', '"'],
    ['‘', "'"],
    ['’', "'"],
]
# NOTE(review): in the append that follows, the character being replaced does not
# render in this view - confirm it is a real character and not an empty string.
characters_to_replace.append(['', "-"])
characters_to_replace.append(['–', "-"])




# Convert the opened PDF to text with the project helper, passing the deletion
# and replacement rules configured above.
# NOTE(review): the meaning of start_page=2 / end_page=0 and the units of
# header_size / footer_size are defined by output_doc_as_text, which is not
# visible here - confirm before changing them.
document_text = output_doc_as_text(doc, start_page = 2, end_page = 0, header_size=70, footer_size=90, lines_to_delete = lines_to_delete, characters_to_replace = characters_to_replace)

# Save the raw extraction as UTF-8 markdown.
# NOTE(review): the parsing cell further down reads a different path
# ("../../original/data_portability.md"); presumably this file is edited by
# hand and moved there first.
file = "../tmp/data_portability.md"
with open(file, 'w', encoding='utf-8') as f:
    f.write(document_text)

## NOTE: I have manually added subsections


## Once you have the markdown file: split out the footnotes and parse the body into a DataFrame

In [6]:
import re
import pandas as pd

# Read the markdown version of the document.
file_path = "../../original/data_portability.md"
with open(file_path, 'r', encoding = "utf-8") as file:
    text = file.read()

# Split into lines and drop empty strings (whitespace-only lines are kept).
lines = [line for line in text.split('\n') if line]

# A footnote definition is a whole line of the form "[^12]: footnote text".
# Any number of digits is accepted so that this pattern agrees with the
# reference pattern (\d+) used when footnotes are attached to paragraphs;
# the previous {1,2} limit would have left a definition numbered 100 or
# higher in the document body.
footnote_pattern = re.compile(r'^\[\^(\d+)\]:(.*)$')

doc_as_array = []    # body lines, in document order
notes_as_array = []  # [note_number, note_text] pairs; note_number is a string
for entry in lines:
    footnote_match = footnote_pattern.match(entry)
    if footnote_match:
        notes_as_array.append([footnote_match.group(1), footnote_match.group(2).strip()])
    else:
        doc_as_array.append(entry)

columns = ["note_number", "text"]
df_notes = pd.DataFrame(notes_as_array, columns = columns)
# Discard footnote definitions that have no text.
df_notes = df_notes[df_notes["text"].str.strip() != '']

In [7]:
columns = ["section", "subsection", "point", "heading", "text", "section_reference"]

# Current position in the document hierarchy. These are updated whenever a
# heading line is met; body lines inherit the most recent section_reference.
section = ""
subsection = ""
point = ""
section_reference = ""

# All four patterns are applied with .match(), i.e. anchored at the start of the line.
section_pattern = re.compile(r'\b(I|II|III|IV|V|VI|VII)\.\s*(.*)')
subsection_pattern = re.compile(r'([a-z])\.\s*(.*)')
point_pattern = re.compile(r'^(\d+)\.\s+(.+)$')
annex_pattern = re.compile(r'(?i)^ANNEX (\d+) - (.+)$') # ignore capitalization

annex_started = False
data = []  # one row per non-blank line, in the order given by `columns`
for entry in doc_as_array:
    if entry.strip() == '':
        continue

    section_match = section_pattern.match(entry)
    subsection_match = subsection_pattern.match(entry)
    point_match = point_pattern.match(entry)
    annex_match = annex_pattern.match(entry)

    if section_match and annex_started:
        # Once the annex has begun, a line that looks like a roman-numeral
        # section is kept as plain annex text and the hierarchy is not touched.
        data.append(["", "", "", False, entry, "Annex"])
    elif section_match:
        section = section_match.group(1)
        subsection = ""
        point = ""
        section_reference = section
        data.append([section, subsection, point, True, section_match.group(2), section_reference])
    elif subsection_match:
        subsection = subsection_match.group(1)
        point = ""
        section_reference = section + "." + subsection
        data.append([section, subsection, point, True, subsection_match.group(2), section_reference])
    elif point_match:
        point = point_match.group(1)
        # NOTE(review): if a numbered point appears before any subsection, the
        # reference has an empty middle part (e.g. "III..1") - confirm this
        # cannot occur in the input.
        section_reference = section + "." + subsection + "." + point
        data.append([section, subsection, point, True, point_match.group(2), section_reference])
    elif annex_match:
        annex_started = True
        section = "Annex"
        subsection = ""
        point = ""
        section_reference = section
        # NOTE(review): group(1) is the annex number, so the heading text is the
        # number and the title in group(2) is discarded - confirm this is intended.
        data.append([section, subsection, point, True, annex_match.group(1), section_reference])
    elif annex_started:
        # Plain line inside the annex.
        data.append(["", "", "", False, entry, "Annex"])
    else:
        # Plain body line: belongs to whatever heading was seen last.
        data.append(["", "", "", False, entry, section_reference])


df = pd.DataFrame(data, columns = columns)

df.at[0, "heading"] = True # make Executive summary a heading
# Lines seen before the first heading have no reference yet.
df.loc[df["section_reference"] == "",  "section_reference"] = "Executive summary"

In [4]:
# Display a slice of the parsed frame (positions 60 up to, not including, 80) for a visual spot check.
df[60:80]
# Left-over ad-hoc inspection snippets:
#df.iloc[192]["text"]
#df[df["text"] == " "]

Unnamed: 0,section,subsection,point,heading,text,section_reference
60,,,,False,- Observed data provided by the data subject b...,III.b.2
61,,,,False,"In contrast, inferred data and derived data ar...",III.b.2
62,,,,False,"In general, given the policy objectives of the...",III.b.2
63,,,,False,"Thus, the term ""provided by"" includes personal...",III.b.2
64,III,b,3.0,True,Third condition: the right to data portability...,III.b.3
65,,,,False,With respect to personal data concerning other...,III.b.3
66,,,,False,The third condition is intended to avoid the r...,III.b.3
67,,,,False,"Such an adverse effect would occur, for instan...",III.b.3
68,,,,False,The data subject initiating the transmission o...,III.b.3
69,,,,False,"For example, a webmail service may allow the c...",III.b.3


In [8]:
# Add footnotes
import re

def find_footnote_references(text):
    """Return the footnote numbers referenced in ``text``.

    A reference is a marker such as ``[^12]``. A marker that starts a line and
    is immediately followed by a colon (``[^12]: ...``) is a footnote
    definition, not a reference, and is skipped; without this, text that
    already has its footnotes appended would report every footnote again.

    Args:
        text: the paragraph text to scan.

    Returns:
        The referenced footnote numbers as strings, each listed once, in order
        of first appearance.
    """
    # (?m) lets ^ match at every line start; the negative lookahead rejects a
    # marker only when it is a line-initial definition.
    pattern = r'(?m)(?!^\[\^\d+\]:)\[\^(\d+)\]'
    # dict.fromkeys removes repeats while preserving first-seen order.
    return list(dict.fromkeys(re.findall(pattern, text)))

# Build the number -> text lookup once instead of filtering df_notes for every
# reference. setdefault keeps the first definition of a number, which is the
# row the previous `.iloc[0]` lookup selected.
footnote_text_by_number = {}
for note_number, note_text in zip(df_notes["note_number"], df_notes["text"]):
    footnote_text_by_number.setdefault(note_number, note_text)

for index, row in df.iterrows():
    augmented_note = row['text']
    appended = False
    for note in find_footnote_references(row['text']):
        definition_prefix = f"\n[^{note}]: "
        if definition_prefix in augmented_note:
            # Already attached (repeated reference, or the cell was re-run).
            continue
        if note not in footnote_text_by_number:
            # Previously this raised IndexError; report it and carry on.
            print(f"Row {index}: no footnote text found for [^{note}]")
            continue
        augmented_note += f"{definition_prefix}{footnote_text_by_number[note]}"
        appended = True
    if appended:
        print(f"Row {index} augmented with footnotes")
        df.at[index, "text"] = augmented_note

Row 9 augmented with footnotes
Row 10 augmented with footnotes
Row 11 augmented with footnotes
Row 18 augmented with footnotes
Row 20 augmented with footnotes
Row 21 augmented with footnotes
Row 23 augmented with footnotes
Row 25 augmented with footnotes
Row 27 augmented with footnotes
Row 28 augmented with footnotes
Row 31 augmented with footnotes
Row 35 augmented with footnotes
Row 43 augmented with footnotes
Row 44 augmented with footnotes
Row 52 augmented with footnotes
Row 56 augmented with footnotes
Row 61 augmented with footnotes
Row 62 augmented with footnotes
Row 66 augmented with footnotes
Row 72 augmented with footnotes
Row 82 augmented with footnotes
Row 88 augmented with footnotes
Row 91 augmented with footnotes
Row 98 augmented with footnotes
Row 103 augmented with footnotes
Row 106 augmented with footnotes
Row 110 augmented with footnotes
Row 111 augmented with footnotes
Row 113 augmented with footnotes
Row 116 augmented with footnotes
Row 119 augmented with footnotes
Ro

In [9]:
# Persist the parsed document with the pyarrow engine.
file = "../../inputs/documents/data_portability.parquet" # use parquet to deal with the complex text so I don't need to worry about escape characters
df.to_parquet(file, engine = 'pyarrow')

# Earlier CSV export kept for reference; `df_no_table` is not defined in the visible cells.
#df_no_table.to_csv(file, encoding = "utf-8", sep="|", index = False, na_rep="", quotechar='"')


## Check that the document class works as expected

In [13]:
import sys
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.data_portability
importlib.reload(gdpr_rag.documents.data_portability)
from gdpr_rag.documents.data_portability import DataPortability

# NOTE(review): the variable name says "csv" but the path points at the parquet file written above.
path_to_manual_as_csv_file = "../../inputs/documents/data_portability.parquet"

# NOTE(review): this rebinds `doc`, which earlier held the opened PDF; re-running
# the extraction cell after this one would pass the wrong object.
doc = DataPortability(path_to_manual_as_csv_file)


In [14]:
from IPython.display import Markdown, display

# Only the last uncommented assignment takes effect; the earlier ones are
# alternative references left in place for quick switching.
section = "I"
section = "II"
section = "II.d"
section = "III.b.3"

#section = "Executive summary"


# Print what get_heading returns for the reference, then render get_text as markdown.
print(doc.get_heading(section))
display(Markdown(doc.get_text(section)))


III When does data portability apply?
III.b What personal data must be included?
III.b.3 Third condition: the right to data portability shall not adversely affect the rights and freedoms of others


# III When does data portability apply?

## III.b What personal data must be included?

### III.b.3 Third condition: the right to data portability shall not adversely affect the rights and freedoms of others

With respect to personal data concerning other data subjects:

The third condition is intended to avoid the retrieval and transmission of data containing the personal data of other (non-consenting) data subjects to a new data controller in cases where these data are likely to be processed in a way that would adversely affect the rights and freedoms of the other data subjects (Article 20(4) of the GDPR)[^22].

Such an adverse effect would occur, for instance, if the transmission of data from one data controller to another, would prevent third parties from exercising their rights as data subjects under the GDPR (such as the rights to information, access, etc.).

The data subject initiating the transmission of his or her data to another data controller, either gives consent to the new data controller for processing or enters into a contract with that controller. Where personal data of third parties are included in the data set another legal basis for the processing must be identified. For example, a legitimate interest may be pursued by the data controller under Article 6(1)(f), in particular when the purpose of the data controller is to provide a service to the data subject that allows the latter to process personal data for a purely personal or household activity. The processing operations initiated by the data subject in the context of personal activity that concern and potentially impact third parties remain under his or her responsibility, to the extent that such processing is not, in any manner, decided by the data controller.

For example, a webmail service may allow the creation of a directory of a data subject's contacts, friends, relatives, family and broader environment. Since these data relate to (and are created by) the identifiable individual that wishes to exercise his right to data portability, data controllers should transmit the entire directory of incoming and outgoing e-mails to that data subject.

Similarly, a data subject's bank account can contain personal data relating to the transactions not just of the account holder but also those of other individuals (e.g., if they have transferred money to the account holder). The rights and freedoms of those third parties are unlikely to be adversely affected by the transmission of the bank account information to the account holder once a portability request is made—provided that in both examples the data are used for the same purpose (i.e., a contact address only used by the data subject or a history of the data subject's bank account.

Conversely, the rights and freedoms of third parties will not be respected if the new data controller uses the personal data for other purposes, e.g. if the receiving data controller uses personal data of other individuals within the data subject's contact directory for marketing purposes.

Therefore, to prevent adverse effects on the third parties involved, the processing of such personal data by another controller is allowed only to the extent that the data are kept under the sole control of the requesting user and is only managed for purely personal or household needs. A receiving 'new' data controller (to whom the data can be transmitted at the request of the user) may not use the transmitted third party data for his own purposes e.g. to propose marketing products and services to those other third party data subjects. For example, this information should not be used to enrich the profile of the third party data subject and rebuild his social environment, without his knowledge and consent[^23]. Neither can it be used to retrieve information about such third parties and create specific profiles, even if their personal data are already held by the data controller. Otherwise, such processing is likely to be unlawful and unfair, especially if the third parties concerned are not informed and cannot exercise their rights as data subjects.

Furthermore, it is a leading practice for all data controllers (both the "sending" and "receiving" parties) to implement tools to enable data subjects to select the relevant data they wish to receive and transmit and exclude, where relevant, data of other individuals. This will further assist in reducing the risks for third parties whose personal data may be ported.

Additionally, the data controllers should implement consent mechanisms for other data subjects involved, to ease data transmission for those cases where such parties are willing to consent, e.g. if they also want to move their data to some other data controller. Such a situation might arise, for example, with social networks, but it is up to data controllers to decide on the leading practice to follow.

With respect to data covered by intellectual property and trade secrets:

The rights and freedoms of others are mentioned in Article 20(4). While not directly related to portability, this can be understood as "including trade secrets or intellectual property and in particular the copyright protecting the software. However, even though these rights should be considered before answering a data portability request, "the result of those considerations should not be a refusal to provide all information to the data subject". Furthermore, the data controller should not reject a data portability request on the basis of the infringement of another contractual right (for example, an outstanding debt, or a trade conflict with the data subject).

The right to data portability is not a right for an individual to misuse the information in a way that could be qualified as an unfair practice or that would constitute a violation of intellectual property rights.

A potential business risk cannot, however, in and of itself serve as the basis for a refusal to answer the portability request and data controllers can transmit the personal data provided by data subjects in a form that does not release information covered by trade secrets or intellectual property rights.

  
[^22]: Recital 68 provides that "where, in a certain set of personal data, more than one data subject is concerned, the right to receive the personal data should be without prejudice to the rights and freedoms of other data subjects in accordance with this Regulation "  
[^23]: A social networking service should not enrich the profile of its members by using personal data transmitted by a data subject as part of his right to data portability, without respecting the principle of transparency and also making sure they rely on an appropriate legal basis regarding this specific processing

In [75]:
# Ad-hoc probe of the document's reference checker with a sample reference string.
rc = doc.reference_checker
rc.is_valid("VII.B")

True