In [2]:
import logging
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [9]:
import re
import pandas as pd

import sys
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.dpia
importlib.reload(gdpr_rag.documents.dpia)
from gdpr_rag.documents.dpia import DPIA

path_to_manual_as_csv_file = "../inputs/documents/dpia.parquet"

doc = DPIA(path_to_manual_as_csv_file)


In [10]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
reference_checker = doc.reference_checker
df = pd.read_parquet("../inputs/documents/dpia.parquet", engine='pyarrow')
toc = StandardTableOfContent(root_node_name = "decisions", reference_checker = reference_checker, regulation_df = df)

In [11]:
# Remove the annexes from the tree for indexing
annex_1_node = toc.get_node("Annex 1")
annex_1_node.parent = None

toc.print_tree()

decisions []
|-- I [Introduction  ]
|-- II [Scope of the Guidelines  ]
|-- III [DPIA: the Regulation explained  ]
|   |-- .A [What does a DPIA address? A single processing operation or a set of similar processing operations.   ]
|   |-- .B [Which processing operations are subject to a DPIA? Apart from exceptions, where they are "likely to result in a high risk".   ]
|   |   |-- .a [When is a DPIA mandatory? When processing is "likely to result in a high risk".   ]
|   |   +-- .b [When isn't a DPIA required? When the processing is not "likely to result in a high risk", or a similar DPIA exists, or it has been authorized prior to May 2018, or it has a legal basis, or it is in the list of processing operations for which a DPIA is not required.   ]
|   |-- .C [What about already existing processing operations? DPIAs are required in some circumstances.  ]
|   |-- .D [How to carry out a DPIA?   ]
|   |   |-- .a [At what moment should a DPIA be carried out? Prior to the processing. The DPIA s

In [35]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
for index, row in split_df.iterrows():
    text = doc.get_text(row['section_reference'])
    split_df.at[index, 'text'] = text
    split_df.at[index, 'token_count'] = num_tokens_from_string(text)
split_df["token_count"] = split_df["token_count"].astype(int)
# from regulations_rag.regulation_table_of_content import split_tree

# split_df = split_tree(node = toc.root, document = doc, table_of_content = toc, token_limit = 1300)

In [36]:
split_df

Unnamed: 0,section_reference,text,token_count
0,I,I. Introduction \nRegulation 2016/679[^1] (GD...,831
1,II,II. Scope of the Guidelines \nThese Guideline...,682
2,III,III. DPIA: the Regulation explained \nThe GDP...,475
3,III.A,III. DPIA: the Regulation explained \n\nA. Wh...,579
4,III.B,III. DPIA: the Regulation explained \n\nB. Wh...,101
5,III.B.a,III. DPIA: the Regulation explained \nB. Whic...,2589
6,III.B.b,III. DPIA: the Regulation explained \nB. Whic...,656
7,III.C,III. DPIA: the Regulation explained \n\nC. Wh...,597
8,III.D,III. DPIA: the Regulation explained \n\nD. Ho...,19
9,III.D.a,III. DPIA: the Regulation explained \nD. How ...,324


In [37]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [44]:
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [40]:
summary_file = "../tmp/dpia_summary.csv"
if os.path.exists(summary_file):
    df_summary = pd.read_csv(summary_file, sep="|", encoding = "utf-8")
else:
    df_summary = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_summary["section_reference"] = split_df["section_reference"]
    # df_summary["text"] = split_df["text"]
    df_summary["source"] = "summary"
    df_summary["document"] = "DPIA"

first_empty_index = df_summary[df_summary['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0
index = first_empty_index

question_file = "../tmp/dpia_question.csv"
if os.path.exists(question_file):
    df_questions = pd.read_csv(question_file, sep="|", encoding = "utf-8")
else:
    df_questions = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_questions["section_reference"] = split_df["section_reference"]
    # df_questions["text"] = split_df["text"]
    df_questions["source"] = "question"
    df_questions["document"] = "DPIA"

first_empty_index = df_questions[df_questions['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    [print(f"Picking up at index {index}")]


Picking up at index 0


In [95]:
df_summary = df_summary[df_summary["section_reference"] != "II"]
df_questions = df_questions[df_questions["section_reference"] != "II"]

df_summary = df_summary[df_summary["section_reference"] != "III.B"]
df_questions = df_questions[df_questions["section_reference"] != "III.B"]

df_summary = df_summary[df_summary["section_reference"] != "III.D"]
df_questions = df_questions[df_questions["section_reference"] != "III.D"]

df_summary = df_summary[df_summary["section_reference"] != "IV"]
df_questions = df_questions[df_questions["section_reference"] != "IV"]




df_summary.reset_index(drop=True, inplace=True)
df_questions.reset_index(drop=True, inplace=True)


In [97]:
model = "gpt-4o"

reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

#format output
print(f'# index: {index}, section_reference: {df_summary.iloc[index]["section_reference"]}')
print()
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
print(f'df_questions.loc[index, "text"] = "{model_questions}"')

##############
Annex 2. Criteria for an acceptable DPIA   
The WP29 proposes the following criteria which data controllers can use to assess whether or not a DPIA, or a methodology to carry out a DPIA, is sufficiently comprehensive to comply with the GDPR: 
- a systematic description of the processing is provided (Article 35(7)(a)): 
    - nature, scope, context and purposes of the processing are taken into account (recital 90); 
    - personal data, recipients and period for which the personal data will be stored are recorded; 
    - a functional description of the processing operation is provided; 
    - the assets on which personal data rely (hardware, software, networks, people, paper or paper transmission channels) are identified; 
    - compliance with approved codes of conduct is taken into account (Article 35(8)); 
- necessity and proportionality are assessed (Article 35(7)(b)): 
    - measures envisaged to comply with the Regulation are determined (Article 35(7)(d) and recital

In [98]:
df_summary.loc[index, "text"] = "Criteria for an acceptable DPIA"

df_questions.loc[index, "text"] = "What is needed for a comprehensive Data Protection Impact Assessment (DPIA)?|What risks should be evaluated in a DPIA?"





index = index + 1
print(f"You have completed {(index / len(df_summary) * 100):.2f}% of the work")
if index < len(df_summary):
    print(f"The next section is:\n{doc.get_text(df_summary.iloc[index]['section_reference'])}")
else:
    print("All done")

You have completed 100.00% of the work
All done


In [99]:
#df_questions.at[9, "section_reference"] ='III.D.d'
df_questions

Unnamed: 0,section_reference,text,source,embedding,document
0,I,What is a Data Protection Impact Assessment (D...,question,,DPIA
1,III,What are the steps involved in managing risks ...,question,,DPIA
2,III.A,What does a Data Protection Impact Assessment ...,question,,DPIA
3,III.B.a,When is it necessary to conduct a Data Protect...,question,,DPIA
4,III.B.b,What are the exceptions to the requirement for...,question,,DPIA
5,III.C,When is a Data Protection Impact Assessment (D...,question,,DPIA
6,III.D.a,When should a Data Protection Impact Assessmen...,question,,DPIA
7,III.D.b,Who is responsible for conducting a Data Prote...,question,,DPIA
8,III.D.c,What are the essential components of a Data Pr...,question,,DPIA
9,III.D.d,Is it mandatory to publish the Data Protection...,question,,DPIA


In [100]:
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,I,Perform a Data Protection Impact Assessment (D...,summary,,DPIA
1,III,You must implement measures to ensure and demo...,summary,,DPIA
2,III.A,A Data Protection Impact Assessment (DPIA) can...,summary,,DPIA
3,III.B.a,A Data Protection Impact Assessment (DPIA) is ...,summary,,DPIA
4,III.B.b,A Data Protection Impact Assessment (DPIA) is ...,summary,,DPIA
5,III.C,Carry out a Data Protection Impact Assessment ...,summary,,DPIA
6,III.D.a,You must conduct a Data Protection Impact Asse...,summary,,DPIA
7,III.D.b,The controller is responsible for ensuring the...,summary,,DPIA
8,III.D.c,Describe the processing operations and their p...,summary,,DPIA
9,III.D.d,"No obligation exists to publish the DPIA, thou...",summary,,DPIA


In [101]:
summary_file = "../tmp/dpia_summary.csv"
df_summary.to_csv(summary_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

question_file = "../tmp/dpia_question.csv"
df_questions.to_csv(question_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [102]:
import pandas as pd
summary_file = "../tmp/dpia_summary.csv"
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8")

question_file = "../tmp/dpia_question.csv"
df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8")

In [103]:
df_questions = df_questions.drop("text", axis=1).join(df_questions["text"].str.split("|", expand=True).stack().reset_index(level=1, drop=True).rename("text"))
df_questions.reset_index(drop=True, inplace=True)


In [104]:
df_questions = df_questions[df_questions["text"] != ""]
df_questions.reset_index(drop=True, inplace=True)


In [106]:
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,I,Perform a Data Protection Impact Assessment (D...,summary,,DPIA
1,III,You must implement measures to ensure and demo...,summary,,DPIA
2,III.A,A Data Protection Impact Assessment (DPIA) can...,summary,,DPIA
3,III.B.a,A Data Protection Impact Assessment (DPIA) is ...,summary,,DPIA
4,III.B.b,A Data Protection Impact Assessment (DPIA) is ...,summary,,DPIA
...,...,...,...,...,...
59,III.E,What should you do if residual risks are high ...,question,,DPIA
60,III.E,What is considered an example of a high residu...,question,,DPIA
61,III.E,When must you consult the supervisory authorit...,question,,DPIA
62,Annex 2,What is needed for a comprehensive Data Protec...,question,,DPIA


In [107]:
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

from regulations_rag.embeddings import get_ada_embedding
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    print(f"Completed {i+increment} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines


In [108]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

file = "../inputs/index/dpia.parquet"
df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(df_index, file, key)


In [109]:
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,I,Perform a Data Protection Impact Assessment (D...,summary,"[-0.0651446282863617, -0.014580177143216133, -...",DPIA
1,III,You must implement measures to ensure and demo...,summary,"[-0.06786536425352097, -0.012024881318211555, ...",DPIA
2,III.A,A Data Protection Impact Assessment (DPIA) can...,summary,"[-0.04701923951506615, -0.02683098055422306, -...",DPIA
3,III.B.a,A Data Protection Impact Assessment (DPIA) is ...,summary,"[-0.04772201552987099, 0.006425258703529835, -...",DPIA
4,III.B.b,A Data Protection Impact Assessment (DPIA) is ...,summary,"[-0.02549058012664318, 0.019770581275224686, -...",DPIA
...,...,...,...,...,...
59,III.E,What should you do if residual risks are high ...,question,"[-0.04703465476632118, -0.05478291213512421, -...",DPIA
60,III.E,What is considered an example of a high residu...,question,"[-0.05717877298593521, -0.00804865825921297, -...",DPIA
61,III.E,When must you consult the supervisory authorit...,question,"[-0.026660023257136345, -0.040645912289619446,...",DPIA
62,Annex 2,What is needed for a comprehensive Data Protec...,question,"[-0.017761580646038055, -0.021252399310469627,...",DPIA


In [31]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
dpia_index = load_parquet_data("../inputs/index/dpia.parquet", key)



In [33]:
questions_to_fix = gdpr_index[
    (~gdpr_index["text"].str.contains('DPIA')) & 
    (gdpr_index["text"].str.lower().str.contains('data protection impact assessment'))
]

In [34]:
len(questions_to_fix)
#questions_to_fix.iloc[0]["text"]

0

In [32]:
import re

import sys
sys.path.append('E:/Code/chat/gdpr')
from src.index_tools import update_text_in_index, add_to_index

from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)


model = "text-embedding-3-large"
dimensions = 1024

for index, row in questions_to_fix.iterrows():
    s = row['text']
    text_to_change = s
    #changed_text = s.replace("data protection impact assessment", "data protection impact assessment (DPIA)", 1)
    changed_text = re.sub(r"data protection impact assessment", "data protection impact assessment (DPIA)", s, flags=re.IGNORECASE, count=1)

    gdpr_index = update_text_in_index(openai_client = openai_client, index_df = gdpr_index, text_to_change = text_to_change, changed_text = changed_text, embedding_model = model, embedding_dimensions = dimensions)


In [4]:
section = gdpr_index[gdpr_index["section_reference"] == "35"]

#section
for index, row in section.iterrows():
    print(f"-- {row['text']}")

-- When must you conduct a data protection impact assessment?
-- Who should you consult when carrying out a data protection impact assessment?
-- What triggers the requirement for a data protection impact assessment?
-- How do supervisory authorities contribute to the data protection impact assessment process?
-- What content is required in a data protection impact assessment?
-- Should individuals' views be considered in the assessment process?
-- What happens when processing activities are already regulated by Union or Member State law?
-- How often should you review the data protection impact assessment?
-- Before starting any processing activities that use new technologies and are likely to pose a high risk to individuals' rights and freedoms, you must conduct a data protection impact assessment. This is particularly mandatory for:
- Automated processing including profiling that significantly affects individuals.
- Large-scale processing of special categories of data or data concer

In [50]:
tmp = decision_index[(decision_index["section_reference"]== "I")]
for index, row in tmp.iterrows():
    print(f"-- {row['text']}")

-- Profiling and automated decision-making are increasingly utilised across various sectors such as banking, healthcare, and marketing, driven by technological advancements in big data, AI, and machine learning. These techniques can greatly benefit efficiency and personalisation but also pose risks to individual rights and freedoms due to their potential invasiveness and the issues around transparency and discrimination they can create.

The GDPR seeks to mitigate these risks by implementing specific provisions to protect privacy and other related rights. You must ensure appropriate safeguards when employing profiling and automated decision-making technologies, keeping in mind their potential to impact individuals based on inaccurate data, perpetuate stereotypes, or restrict individual choices.

Particularly under Article 22, you have obligations regarding solely automated decisions that have legal or similarly significant effects on individuals, including the requirement to provide me

In [35]:
#file = "../inputs/index/dpia.parquet"
file = "../inputs/index/gdpr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(gdpr_index, file, key)

In [41]:
from src.index_tools import update_text_in_index, add_to_index

index_df = decision_index
#index_df = gdpr_index

text_to_change = "How do automated decision-making and profiling differ?"
changed_text = "What is automated decision-making?"
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [51]:
from src.index_tools import remove_from_index

text_to_delete = "What safeguards does the General Data Protection Regulation (GDPR) introduce for profiling and automated decision-making?"
decision_index = remove_from_index(decision_index, text_to_delete)

In [88]:
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
save_parquet_data(index, path_to_file, key)