In [7]:
import logging
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [8]:
import re
import pandas as pd

import sys
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.dpo
importlib.reload(gdpr_rag.documents.dpo)
from gdpr_rag.documents.dpo import DPO

path_to_manual_as_csv_file = "../inputs/documents/dpo.parquet"

doc = DPO(path_to_manual_as_csv_file)


In [5]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
reference_checker = doc.reference_checker
df = pd.read_parquet("../inputs/documents/dpo.parquet", engine='pyarrow')
toc = StandardTableOfContent(root_node_name = "decisions", reference_checker = reference_checker, regulation_df = df)

In [6]:
# Remove the annexes from the tree for indexing

toc.print_tree()

decisions []
|-- 1 [Introduction  ]
|-- 2 [Designation of a DPO ]
|   |-- .1 [Mandatory designation  ]
|   |   |-- .1 ['PUBLIC AUTHORITY OR BODY' ]
|   |   |-- .2 ['CORE ACTIVITIES' ]
|   |   |-- .3 ['LARGE SCALE' ]
|   |   |-- .4 ['REGULAR AND SYSTEMATIC MONITORING'  ]
|   |   +-- .5 [SPECIAL CATEGORIES OF DATA AND DATA RELATING TO CRIMINAL CONVICTIONS AND OFFENCES ]
|   |-- .2 [DPO of the processor   ]
|   |-- .3 [Designation of a single DPO for several organisations   ]
|   |-- .4 [Accessibility and localisation of the DPO   ]
|   |-- .5 [Expertise and skills of the DPO   ]
|   +-- .6 [Publication and communication of the DPO's contact details  ]
|-- 3 [Position of the DPO ]
|   |-- .1 [Involvement of the DPO in all issues relating to the protection of personal data  ]
|   |-- .2 [Necessary resources  ]
|   |-- .3 [Instructions and 'performing their duties and tasks in an independent manner' ]
|   |-- .4 [Dismissal or penalty for performing DPO tasks ]
|   +-- .5 [Conflict of intere

In [7]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
for index, row in split_df.iterrows():
    text = doc.get_text(row['section_reference'])
    split_df.at[index, 'text'] = text
    split_df.at[index, 'token_count'] = num_tokens_from_string(text)
split_df["token_count"] = split_df["token_count"].astype(int)
# from regulations_rag.regulation_table_of_content import split_tree

# split_df = split_tree(node = toc.root, document = doc, table_of_content = toc, token_limit = 1300)

In [10]:
split_df = split_df[split_df["token_count"] > 67] # exclude "sections" that don't contain data
split_df.reset_index(drop=True, inplace=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,1,1 Introduction \nThe General Data Protection ...,955
1,2.1,2 Designation of a DPO \n\n2.1 Mandatory desig...,725
2,2.1.1,2 Designation of a DPO \n2.1 Mandatory designa...,439
3,2.1.2,2 Designation of a DPO \n2.1 Mandatory designa...,330
4,2.1.3,2 Designation of a DPO \n2.1 Mandatory designa...,658
5,2.1.4,2 Designation of a DPO \n2.1 Mandatory designa...,511
6,2.1.5,2 Designation of a DPO \n2.1 Mandatory designa...,113
7,2.2,2 Designation of a DPO \n\n2.2 DPO of the proc...,437
8,2.3,2 Designation of a DPO \n\n2.3 Designation of ...,591
9,2.4,2 Designation of a DPO \n\n2.4 Accessibility a...,144


In [1]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [4]:
import pandas as pd
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [5]:
summary_file = "../tmp/dpo_summary.csv"
if os.path.exists(summary_file):
    df_summary = pd.read_csv(summary_file, sep="|", encoding = "utf-8")
else:
    df_summary = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_summary["section_reference"] = split_df["section_reference"]
    # df_summary["text"] = split_df["text"]
    df_summary["source"] = "summary"
    df_summary["document"] = "DPO"

first_empty_index = df_summary[df_summary['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0
index = first_empty_index

question_file = "../tmp/dpo_question.csv"
if os.path.exists(question_file):
    df_questions = pd.read_csv(question_file, sep="|", encoding = "utf-8")
else:
    df_questions = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_questions["section_reference"] = split_df["section_reference"]
    # df_questions["text"] = split_df["text"]
    df_questions["source"] = "question"
    df_questions["document"] = "DPO"

first_empty_index = df_questions[df_questions['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    [print(f"Picking up at index {index}")]


Picking up at index 2


In [95]:
# Delete sections which do not need to be indexed

# df_summary = df_summary[df_summary["section_reference"] != "II"]
# df_questions = df_questions[df_questions["section_reference"] != "II"]

# df_summary = df_summary[df_summary["section_reference"] != "III.B"]
# df_questions = df_questions[df_questions["section_reference"] != "III.B"]

# df_summary = df_summary[df_summary["section_reference"] != "III.D"]
# df_questions = df_questions[df_questions["section_reference"] != "III.D"]

# df_summary = df_summary[df_summary["section_reference"] != "IV"]
# df_questions = df_questions[df_questions["section_reference"] != "IV"]

# df_summary.reset_index(drop=True, inplace=True)
# df_questions.reset_index(drop=True, inplace=True)


In [99]:
model = "gpt-4o"

reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

#format output
print(f'# index: {index}, section_reference: {df_summary.iloc[index]["section_reference"]}')
print()
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
print(f'df_questions.loc[index, "text"] = "{model_questions}"')

##############
5 DPO GUIDELINES: WHAT YOU NEED TO KNOW 

5.13 What is the role of the DPO with respect to data protection impact assessments and 
records of processing activities? 
As far as the data protection impact assessment is concerned, the controller or the processor should seek the advice of the DPO, on the following issues, amongst others: 
- whether or not to carry out a DPIA 
- what methodology to follow when carrying out a DPIA 
- whether to carry out the DPIA in-house or whether to outsource it 
- what safeguards (including technical and organisational measures) to apply to mitigate any risks to the rights and interests of the data subjects 
- whether or not the data protection impact assessment has been correctly carried out and whether its conclusions (whether or not to go ahead with the processing and what safeguards to apply) are in compliance with data protection requirements 
As far as the records of processing activities are concerned, it is the controller or the pr

In [101]:
model_summary

"Seek the DPO's advice on:\n\n- Deciding whether to carry out a Data Protection Impact Assessment (DPIA).\n- The methodology to follow for a DPIA.\n- Whether to conduct the DPIA internally or outsource it.\n- Safeguards to apply to mitigate risks to individuals' rights and interests.\n- Ensuring the DPIA is correctly carried out and its conclusions comply with data protection requirements.\n\nYou are responsible for maintaining records of processing activities. You may assign this task to the DPO, who can use these records to monitor compliance and provide advice."

In [102]:
# index: 34, section_reference: 5.13

df_summary.loc[index, "text"] = "Seek the DPO's advice on:\n\n- Deciding whether to carry out a Data Protection Impact Assessment (DPIA).\n- The methodology to follow for a DPIA.\n- Whether to conduct the DPIA internally or outsource it.\n- Safeguards to apply to mitigate risks to individuals' rights and interests.\n- Ensuring the DPIA is correctly carried out and its conclusions comply with data protection requirements.\n\nYou are responsible for maintaining records of processing activities. You may assign this task to the DPO, who can use these records to monitor compliance and provide advice."

df_questions.loc[index, "text"] = "What is the role of the Data Protection Officer (DPO) in data protection impact assessments (DPIAs)?"


index = index + 1
print(f"You have completed {(index / len(df_summary) * 100):.2f}% of the work")
if index < len(df_summary):
    print(f"The next section is:\n{doc.get_text(df_summary.iloc[index]['section_reference'])}")
else:
    print("All done")

You have completed 100.00% of the work
All done


In [107]:
#df_questions.at[9, "section_reference"] ='III.D.d'
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,1,question,,DPO,Who needs to appoint a Data Protection Officer...
1,1,question,,DPO,What is the function of a Data Protection Offi...
2,1,question,,DPO,What responsibilities do controllers and proce...
3,1,question,,DPO,Can a Data Protection Officer (DPO) be held pe...
4,2.1,question,,DPO,When is it mandatory to designate a Data Prote...
...,...,...,...,...,...
77,5.9,question,,DPO,What resources should be provided to the DPO b...
78,5.10,question,,DPO,What safeguards ensure the Data Protection Off...
79,5.11,question,,DPO,What does it mean to monitor compliance?
80,5.12,question,,DPO,Can a DPO be held personally liable for data p...


In [104]:
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,1,The General Data Protection Regulation (GDPR) ...,summary,,DPO
1,2.1,A Data Protection Officer (DPO) must be design...,summary,,DPO
2,2.1.1,Designation of a Data Protection Officer (DPO)...,summary,,DPO
3,2.1.2,Designate a Data Protection Officer (DPO) when...,summary,,DPO
4,2.1.3,You must appoint a Data Protection Officer (DP...,summary,,DPO
5,2.1.4,If your activities involve regular and systema...,summary,,DPO
6,2.1.5,Designate a Data Protection Officer (DPO) if y...,summary,,DPO
7,2.2,When appointing a Data Protection Officer (DPO...,summary,,DPO
8,2.3,A group of organisations can appoint a single ...,summary,,DPO
9,2.4,The Data Protection Officer (DPO) should be ea...,summary,,DPO


In [138]:
summary_file = "../tmp/dpo_summary.csv"
df_summary.to_csv(summary_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

question_file = "../tmp/dpo_question.csv"
df_questions.to_csv(question_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [142]:
import pandas as pd
summary_file = "../tmp/dpo_summary.csv"
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8")

question_file = "../tmp/dpo_question.csv"
df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8")

In [106]:
df_questions = df_questions.drop("text", axis=1).join(df_questions["text"].str.split("|", expand=True).stack().reset_index(level=1, drop=True).rename("text"))
df_questions = df_questions[df_questions["text"] != ""]

df_questions.reset_index(drop=True, inplace=True)


In [143]:
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,1,The General Data Protection Regulation (GDPR) ...,summary,,DPO
1,2.1,A Data Protection Officer (DPO) must be design...,summary,,DPO
2,2.1.1,Designation of a Data Protection Officer (DPO)...,summary,,DPO
3,2.1.2,Designate a Data Protection Officer (DPO) when...,summary,,DPO
4,2.1.3,You must appoint a Data Protection Officer (DP...,summary,,DPO
...,...,...,...,...,...
112,5.9,What resources should be provided to the DPO b...,question,,DPO
113,5.10,What safeguards ensure the Data Protection Off...,question,,DPO
114,5.11,What does it mean to monitor compliance?,question,,DPO
115,5.12,Can a DPO be held personally liable for data p...,question,,DPO


In [147]:
df_index.iloc[112]["text"]

'What resources should be provided to the data protection officer (DPO) by the controller or the processor? '

In [145]:
questions_to_fix = df_index[
    (df_index["text"].str.contains('DPO')) & 
    ~(df_index["text"].str.lower().str.contains('data protection officer'))
]

In [146]:
import re

import sys
sys.path.append('E:/Code/chat/gdpr')
from src.index_tools import update_text_in_index, add_to_index

from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)


model = "text-embedding-3-large"
dimensions = 1024

#MB: Do this before creating the embedding or use the update_text_in_index method
for index, row in questions_to_fix.iterrows():
    s = row['text']
    changed_text = s.replace("DPO", "data protection officer (DPO)", 1)
    df_index.at[index, "text"] = changed_text

    # s = row['text']
    # text_to_change = s
    # changed_text = re.sub(r"data protection impact assessment", "data protection impact assessment (dpo)", s, flags=re.IGNORECASE, count=1)
    # df_index = update_text_in_index(openai_client = openai_client, index_df = df_index, text_to_change = text_to_change, changed_text = changed_text, embedding_model = model, embedding_dimensions = dimensions)


In [151]:
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

from regulations_rag.embeddings import get_ada_embedding
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    print(f"Completed {i+increment} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines
Completed 110 lines
Completed 120 lines


In [152]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

file = "../inputs/index/dpo.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(df_index, file, key)


In [153]:
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,1,The General Data Protection Regulation (GDPR) ...,summary,"[-0.09357940405607224, -0.015285134315490723, ...",DPO
1,2.1,A Data Protection Officer (DPO) must be design...,summary,"[-0.043087348341941833, -0.008121910504996777,...",DPO
2,2.1.1,Designation of a Data Protection Officer (DPO)...,summary,"[-0.06720229238271713, 0.002132399007678032, -...",DPO
3,2.1.2,Designate a Data Protection Officer (DPO) when...,summary,"[-0.047003354877233505, 1.2145385881012771e-05...",DPO
4,2.1.3,You must appoint a Data Protection Officer (DP...,summary,"[-0.06781543791294098, 0.0029785060323774815, ...",DPO
...,...,...,...,...,...
112,5.9,What resources should be provided to the data ...,question,"[-0.06573357433080673, -0.03166726976633072, -...",DPO
113,5.10,What safeguards ensure the Data Protection Off...,question,"[-0.07683565467596054, -0.059310588985681534, ...",DPO
114,5.11,What does it mean to monitor compliance?,question,"[-0.046405281871557236, -0.0008119646809063852...",DPO
115,5.12,Can a data protection officer (DPO) be held pe...,question,"[-0.09526393562555313, -0.049128782004117966, ...",DPO


In [154]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
dpo_index = load_parquet_data("../inputs/index/dpo.parquet", key)



In [155]:
section = gdpr_index[gdpr_index["section_reference"] == "37"]

#section
for index, row in section.iterrows():
    print(f"-- {row['text']}")

-- When must you designate a data protection officer?
-- Can multiple organisations share a data protection officer?
-- What criteria must a data protection officer meet?
-- Can a data protection officer be employed directly or work under a service contract?
-- How should the contact details of the data protection officer be shared?
-- What types of processing activities require you to appoint a data protection officer?
-- Is it mandatory for public authorities or bodies to have a data protection officer?
-- How does the scale of data processing affect the requirement to appoint a data protection officer?
-- You must appoint a data protection officer (DPO) if you're a public authority or body (excluding courts acting in their judicial capacity), if your core activities require regular and systematic monitoring of individuals on a large scale, or if you process special categories of data or personal data relating to criminal convictions and offences on a large scale. A group of undertak

In [50]:
tmp = decision_index[(decision_index["section_reference"]== "I")]
for index, row in tmp.iterrows():
    print(f"-- {row['text']}")

-- Profiling and automated decision-making are increasingly utilised across various sectors such as banking, healthcare, and marketing, driven by technological advancements in big data, AI, and machine learning. These techniques can greatly benefit efficiency and personalisation but also pose risks to individual rights and freedoms due to their potential invasiveness and the issues around transparency and discrimination they can create.

The GDPR seeks to mitigate these risks by implementing specific provisions to protect privacy and other related rights. You must ensure appropriate safeguards when employing profiling and automated decision-making technologies, keeping in mind their potential to impact individuals based on inaccurate data, perpetuate stereotypes, or restrict individual choices.

Particularly under Article 22, you have obligations regarding solely automated decisions that have legal or similarly significant effects on individuals, including the requirement to provide me

In [35]:
#file = "../inputs/index/dpo.parquet"
file = "../inputs/index/gdpr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(gdpr_index, file, key)

In [41]:
from src.index_tools import update_text_in_index, add_to_index

index_df = decision_index
#index_df = gdpr_index

text_to_change = "How do automated decision-making and profiling differ?"
changed_text = "What is automated decision-making?"
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [51]:
from src.index_tools import remove_from_index

text_to_delete = "What safeguards does the General Data Protection Regulation (GDPR) introduce for profiling and automated decision-making?"
decision_index = remove_from_index(decision_index, text_to_delete)

In [88]:
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
save_parquet_data(index, path_to_file, key)