In [1]:
import logging
# Threshold 25 sits between logging.INFO (20) and logging.WARNING (30), so
# INFO records are filtered out while WARNING and above still get through.
log_level = logging.INFO + 5
logging.basicConfig(level=log_level)  # applies to the root logger


In [2]:
# Per-document settings: every path below is derived from `prefix`.
class_name = "Protection"
prefix = "protection"

# Work-in-progress tables written and re-read by the cells further down.
summary_file = "../tmp/" + prefix + "_summary.csv"
question_file = "../tmp/" + prefix + "_services_question.csv"

# Despite the "_csv_" in its name, this path points at a parquet file.
path_to_manual_as_csv_file = "../inputs/documents/" + prefix + ".parquet"
# Destination of the finished index for this document.
index_file = "../inputs/index/" + prefix + ".parquet"



In [3]:
import re
import pandas as pd

import sys
# Make the local project checkout importable. Guarded so that re-running this
# cell does not keep appending the same entry to sys.path.
if 'E:/Code/chat/gdpr' not in sys.path:
    sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.protection
# Re-execute the module so edits made since the kernel started take effect.
importlib.reload(gdpr_rag.documents.protection)
from gdpr_rag.documents.protection import Protection

# Document object built from the file at `path_to_manual_as_csv_file`.
doc = Protection(path_to_manual_as_csv_file)


In [4]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent

# Build the table of contents from the raw document frame, reusing the
# reference checker that belongs to the document object.
reference_checker = doc.reference_checker
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
# NOTE(review): the root label "forgotten" looks carried over from another
# document's notebook -- confirm it is intended for this document.
toc = StandardTableOfContent(
    root_node_name="forgotten",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [5]:
# Dump the table-of-contents tree built in the previous cell for a visual check.
toc.print_tree()

forgotten []
|-- Summary [Executive summary ]
|-- 1 [SCOPE ]
|-- 2 [ANALYSIS OF ARTICLE 25(1) AND (2) DATA PROTECTION BY DESIGN AND BY DEFAULT ]
|   |-- .1 [Article 25(1): Data protection by design ]
|   |   |-- .1 [Controller's obligation to implement appropriate technical and organisational measures and necessary safeguards into the processing ]
|   |   |-- .2 [Designed to implement the data protection principles in an effective manner and protecting data subjects' rights and freedoms ]
|   |   |-- .3 [Elements to take into account ]
|   |   |   |-- .1 ["state of the art" ]
|   |   |   |-- .2 ["cost of implementation" ]
|   |   |   |-- .3 ["nature, scope, context and purpose of processing" ]
|   |   |   +-- .4 ["risks of varying likelihood and severity for rights and freedoms of natural persons posed by the processing" ]
|   |   +-- .4 [Time aspect ]
|   |       |-- .1 [At the time of the determination of the means for processing ]
|   |       +-- .2 [At the time of the processing it

In [12]:
from IPython.display import display
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Guarded so that re-running the cell does not add a duplicate sys.path entry.
if 'E:/Code/chat/gdpr' not in sys.path:
    sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# One row per named node of the table of contents, visited in pre-order.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])

# Filled column-wise rather than with an `iterrows` loop: same values, and no
# loop variable named `index` is left behind to overwrite the resume counter
# that the summarising cells rely on.
split_df["text"] = split_df["section_reference"].apply(doc.get_text)
split_df["token_count"] = split_df["text"].apply(num_tokens_from_string).astype(int)

split_df = split_df.drop([0, 1]) # summary, scope
# Keep only sections longer than 178 tokens.
# NOTE(review): the origin of the 178 threshold is not visible here -- confirm.
split_df = split_df[split_df["token_count"] > 178]
# Returns a fresh frame instead of resetting a filtered slice in place.
split_df = split_df.reset_index(drop=True)

display(split_df)

Unnamed: 0,section_reference,text,token_count
0,2.1.1,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,443
1,2.1.2,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,802
2,2.1.3.1,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,693
3,2.1.3.2,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,239
4,2.1.3.3,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,239
5,2.1.3.4,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,666
6,2.1.4.1,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,338
7,2.1.4.2,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,307
8,2.2.1,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,836
9,2.2.2.1,# 2 ANALYSIS OF ARTICLE 25(1) AND (2) DATA PRO...,250


In [8]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data; None when the
# environment variable is not set.
key = os.environ.get('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [9]:
from openai import OpenAI
# The API key is read from the environment (None if OPENAI_API_KEY is unset).
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

import sys
# Guarded so that re-running the cell does not add a duplicate sys.path entry.
if 'E:/Code/chat/gdpr' not in sys.path:
    sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Re-execute the helper module so edits made since the kernel started take effect.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [13]:
def _load_or_init_progress(path, source):
    """Return the progress table saved at `path`, or a fresh one if none exists.

    A fresh table has one row per section in `split_df`, an empty `text`
    column, `source` set to the given label and `document` set to `class_name`.
    """
    if os.path.exists(path):
        return pd.read_csv(path, sep="|", encoding = "utf-8")
    progress = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    progress["section_reference"] = split_df["section_reference"]
    progress["source"] = source
    progress["document"] = class_name
    return progress


def _first_unfilled_index(progress):
    """Return the smallest index label whose `text` is missing, or 0 if none is.

    Note that a fully completed table therefore also yields 0, i.e. the
    generation cell would start again from the first section.
    """
    first_empty = progress[progress['text'].isna()].index.min()
    return 0 if pd.isna(first_empty) else first_empty


df_summary = _load_or_init_progress(summary_file, "summary")
df_questions = _load_or_init_progress(question_file, "question")

# `index` is the resume position used by the generation cells below; the two
# tables are filled in lock-step, so their first gaps must coincide.
index = _first_unfilled_index(df_summary)
first_empty_index = _first_unfilled_index(df_questions)

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [98]:
model = "gpt-4o"

# Section currently being worked on, selected by the resume counter `index`.
reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Emit ready-to-paste assignment statements for the next cell. `!r` renders
# each string as a valid Python literal (quotes, backslashes and newlines are
# all escaped), whereas escaping only '"' by hand produced an unterminated
# literal whenever the text contained a newline. The strings themselves are
# no longer modified.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
# 6 RECOMMENDATIONS

94. Although not directly addressed in Article 25, processors and producers are also recognized as key enablers for DPbDD, they should be aware that controllers are required to only process personal data with systems and technologies that have built-in data protection.

95. When processing on behalf of controllers, or providing solutions to controllers, processors and producers should use their expertise to build trust and guide their customers, including SMEs, in designing /procuring solutions that embed data protection into the processing. This means in turn that the design of products and services should facilitate controllers' needs.

96. It should be kept in mind when implementing Article 25 that the main design objective is the effective implementation of the principles and protection of the rights of data subjects into the appropriate measures of the processing. In order to facilitate and enhance the adoption of DPbDD, we make the following re

In [100]:
# Show the generated summary as a raw string (newlines appear as \n in the cell output).
model_summary

"Processors and producers should be aware that controllers must use systems and technologies with built-in data protection. They should help design or procure solutions that embed data protection, facilitating controllers' needs.\n\nControllers should plan for data protection from the beginning of a processing operation. Involvement of the Data Protection Officer (DPO) is encouraged throughout the processing life-cycle.\n\nControllers may pursue certification of processing operations for added value. Producers should strive for certification to demonstrate DPbDD. If certification isn't available, controllers should seek other guarantees of compliance.\n\nSpecific protection must be provided for children under 18 and vulnerable groups in DPbDD compliance.\n\nProducers and processors must support Article 25 obligations, and controllers should avoid those who don't offer compliant systems, as they will be accountable.\n\nProducers and processors must meet “state of the art” criteria and i

In [101]:
# index: 25, section_reference: 6

# The summary is written as adjacent string literals, one paragraph per line,
# which Python concatenates into a single string. As a single literal it was
# broken across two physical lines, which does not parse.
df_summary.loc[index, "text"] = (
    "Processors and producers should be aware that controllers must use systems and technologies with built-in data protection. They should help design or procure solutions that embed data protection, facilitating controllers' needs.\n\n"
    "Controllers should plan for data protection from the beginning of a processing operation. Involvement of the Data Protection Officer (DPO) is encouraged throughout the processing life-cycle.\n\n"
    "Controllers may pursue certification of processing operations for added value. Producers should strive for certification to demonstrate DPbDD. If certification isn't available, controllers should seek other guarantees of compliance.\n\n"
    "Specific protection must be provided for children under 18 and vulnerable groups in DPbDD compliance.\n\n"
    "Producers and processors must support Article 25 obligations, and controllers should avoid those who don't offer compliant systems, as they will be accountable.\n\n"
    "Producers and processors must meet “state of the art” criteria and inform controllers of any changes. Controllers should include this requirement in contracts.\n\n"
    "Controllers should require producers and processors to demonstrate how their systems enable compliance with DPbDD using key performance indicators.\n\n"
    "A harmonised approach to implementing principles and rights is recommended. Sector-specific guidance on DPbDD should be included in codes of conduct.\n\n"
    "Controllers must be fair and transparent about how they assess and demonstrate effective DPbDD implementation, showing compliance under the accountability principle.\n\n"
    "Privacy-enhancing technologies (PETs) may be used if they meet state-of-the-art standards and are appropriate in a risk-based approach.\n\n"
    "Legacy systems must comply with DPbDD. If they can't be updated to comply, they must not be used for processing personal data.\n\n"
    "SMEs have the same DPbDD obligations. They can facilitate compliance by conducting early risk assessments, starting small and scaling up, seeking guarantees, partnering with reputable entities, consulting DPAs, reading guidance, adhering to codes of conduct, and obtaining professional advice."
)

# Questions stay "|"-separated in one cell; a later cell splits them into rows.
df_questions.loc[index, "text"] = "What should processors and producers consider when helping you design or procure solutions that embed data protection by design and by default (DPbDD)?|Why is it important to consider data protection from the initial stages of planning a processing operation?"

from IPython.display import Markdown, display

# Advance the resume counter. This makes the cell non-idempotent: running it
# twice skips a section.
index = index + 1
print(f"You have completed {(index / len(df_summary) * 100):.2f}% of the work")
if index < len(df_summary):
    print("The next section is:")
    display(Markdown(doc.get_text(df_summary.iloc[index]['section_reference'])))
else:
    print("All done")

You have completed 100.00% of the work
All done


In [104]:
from IPython.display import display

# Widen pandas' text rendering and lift the column cap before showing the table.
for _option_name, _option_value in (('display.width', 1000), ('display.max_columns', None)):
    pd.set_option(_option_name, _option_value)

df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1.1,### Article 25(1): Data Protection by Design\n...,summary,,Protection
1,2.1.2,Article 25(1): Data Protection by Design\n\n D...,summary,,Protection
2,2.1.3.1,Stay informed on technological progress and us...,summary,,Protection
3,2.1.3.2,You may consider the cost of implementation wh...,summary,,Protection
4,2.1.3.3,"Take into consideration the nature, scope, con...",summary,,Protection
5,2.1.3.4,Determine risks to individuals' rights from da...,summary,,Protection
6,2.1.4.1,Implement data protection measures at the time...,summary,,Protection
7,2.1.4.2,"Once processing has started, you must continua...",summary,,Protection
8,2.2.1,"By default, only essential personal data for e...",summary,,Protection
9,2.2.2.1,Ensure that both the volume and categories of ...,summary,,Protection


In [105]:
# Persist both progress tables as "|"-separated UTF-8 CSVs, without the index
# column and with missing values written as empty strings.
for _frame, _csv_path in ((df_summary, summary_file), (df_questions, question_file)):
    _frame.to_csv(_csv_path, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [106]:
import pandas as pd
# Read both progress tables back from disk with the same "|" separator and
# UTF-8 encoding they were written with.
_csv_format = {"sep": "|", "encoding": "utf-8"}
df_summary = pd.read_csv(summary_file, **_csv_format)
df_questions = pd.read_csv(question_file, **_csv_format)

In [107]:
# Each row's `text` holds several questions joined by "|": fan them out so
# that every question gets its own row, keeping the other columns of its
# source row.
_split_questions = (
    df_questions["text"]
    .str.split("|", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename("text")
)
df_questions = df_questions.drop("text", axis=1).join(_split_questions)

# Discard empty fragments (e.g. from a trailing "|") and renumber the rows.
_has_text = df_questions["text"] != ""
df_questions = df_questions[_has_text].reset_index(drop=True)


In [108]:
# Stack summaries and questions into one index table, renumbering rows from 0.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1.1,### Article 25(1): Data Protection by Design\n...,summary,,Protection
1,2.1.2,Article 25(1): Data Protection by Design\n\n D...,summary,,Protection
2,2.1.3.1,Stay informed on technological progress and us...,summary,,Protection
3,2.1.3.2,You may consider the cost of implementation wh...,summary,,Protection
4,2.1.3.3,"Take into consideration the nature, scope, con...",summary,,Protection
...,...,...,...,...,...
84,5,How can compliance with Article 25 be assessed?,question,,Protection
85,5,What procedures can be used to enforce Article...,question,,Protection
86,5,What corrective actions can be taken if there'...,question,,Protection
87,6,What should processors and producers consider ...,question,,Protection


In [109]:
from regulations_rag.embeddings import get_ada_embedding
# NOTE: `model` is rebound here from the chat model used earlier to the embedding model.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
# Embed in chunks of `increment` rows so progress is visible while the API calls run.
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # Cap at the table length: the last chunk may be shorter than `increment`,
    # and the uncapped message reported more rows than exist.
    print(f"Completed {min(i + increment, len(df_index))} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines


In [110]:
# Persist the finished index through save_parquet_data only. The plain
# `df_index.to_parquet(index_file)` that used to precede this call wrote to the
# very same path and was overwritten one statement later, so it was redundant.
# NOTE(review): `key` comes from the 'encryption_key_gdpr' variable, so
# save_parquet_data presumably encrypts -- in which case the removed call also
# briefly left an unencrypted copy on disk. Confirm against the helper.
save_parquet_data(df_index, index_file, key)


In [111]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')


gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
# NOTE(review): `protection_index` is loaded from forgotten.parquet, not from a
# protection file -- confirm this is not a copy-paste leftover.
protection_index = load_parquet_data("../inputs/index/forgotten.parquet", key)

# Print every GDPR index entry whose section reference starts with "25".
# Iterating the `text` column directly avoids a loop variable named `index`,
# which would overwrite the resume counter used by the generation cells.
_starts_with_25 = gdpr_index['section_reference'].str.startswith('25')
for _entry_text in gdpr_index.loc[_starts_with_25, 'text']:
    print(f"-- {_entry_text}")


# for index, row in gdpr_index.iterrows():
#     if 'location' in row['text'].lower():
#         print(f"* {row['text']}")

-- What is required from you to ensure data protection by design and by default?
-- How should you integrate data protection principles when processing personal data?
-- What measures should you take to limit the processing of personal data to only what is necessary?
-- How can you show that you comply with data protection by design and by default requirements?
-- What role does pseudonymisation play in ensuring data protection?
-- How can you ensure that personal data are not unnecessarily accessible?
-- You must incorporate data protection principles such as data minimisation into the processing system from the start, considering the technology available, implementation costs, nature, scope, context, purposes of processing, and the potential risks to individuals' rights and freedoms. This applies when deciding how data will be processed and during the processing itself. Measures like pseudonymisation should be used to effectively embed these principles and protections into the proces

In [7]:
# Store section references as strings.
# NOTE(review): `covid_index` is not assigned in any cell visible here -- this
# relies on leftover kernel state and will fail on a fresh kernel.
covid_index['section_reference'] = covid_index['section_reference'].astype(str)

In [14]:
# Store section references as strings, then write the index back to disk.
# NOTE(review): `online_index` is not assigned in any cell visible here, and
# `index_file` is derived from `prefix` in the config cell -- confirm it points
# at the online index before running, or another document's index is overwritten.
online_index['section_reference'] = online_index['section_reference'].astype(str)
# A bare name that is not the last line of the cell is not displayed by default.
online_index
save_parquet_data(online_index, index_file, key)

In [89]:
# Write the current `index_df` to the Article 47 BCR index file.
# NOTE(review): `index_df` is whatever an earlier-run cell last bound it to;
# confirm it holds the Article 47 index before saving.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# Operate on the Article 47 index. This is a plain rebinding, not a copy.
index_df = article_47_index
#index_df = gdpr_index

# Index entry to replace, and its replacement wording.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"

# Reuse the embedding settings from the embedding cell (`model`, `dimensions`).
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(
    openai_client=openai_client,
    index_df=index_df,
    text_to_change=text_to_change,
    changed_text=changed_text,
    embedding_model=embedding_model,
    embedding_dimensions=embedding_dimensions,
)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Re-execute the helper module so edits made since the kernel started take effect.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Description of the new index entry.
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"
text = "What happens if local laws conflict with the standards of binding corporate rules?"

# `embedding_model` / `embedding_dimensions` are expected to exist already
# (they are assigned in the update_text_in_index cell).
index_df = article_47_index
index_df = add_to_index(
    openai_client,
    index_df,
    text,
    section_reference,
    source,
    document,
    embedding_model,
    embedding_dimensions,
)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this exact text from the Article 47 index and rebind
# the name to the returned table.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Adopt the edited working table as the Article 47 index, then show every
# entry for section 3.5.4.1 to check the result of the edits above.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# Write the current `index_df` to the main GDPR index file.
# NOTE(review): other cells bind `index_df` to the Article 47 index; confirm it
# holds the GDPR index before running, or gdpr.parquet is overwritten with the
# wrong table.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows of the working index that belong to document "article_30_5".
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to the parquet load/save helpers; None if the variable is unset.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): `index` is also the name of the integer resume counter used by
# the summarising cells; binding a table to it here breaks those cells if they
# are run afterwards in the same kernel.
index = load_parquet_data(path_to_file, key)

In [10]:
# Display the loaded Article 30(5) index.
# NOTE(review): the rows shown carry source "questions", whereas the question
# table built earlier in this notebook uses "question" -- confirm which spelling
# consumers expect.
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the table bound to `index` back to `path_to_file`.
# NOTE(review): relies on `index`, `path_to_file` and `key` from the loading
# cell; confirm `index` is still the table and not the integer resume counter.
save_parquet_data(index, path_to_file, key)