In [1]:
import logging
# Threshold for the root logger. 25 is not one of the named stdlib levels;
# numerically it sits between logging.INFO (20) and logging.WARNING (30),
# so INFO records are suppressed while WARNING and above are emitted.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [2]:
# Label written into the "document" column of the summary/question tables.
class_name = "Transparency"
# Checkpoint files for the generated summaries and questions; they are read
# and written as pipe-separated CSV further down.
summary_file = "../tmp/transparency_summary.csv"
question_file = "../tmp/transparency_question.csv"

# NOTE: despite the "_as_csv_" in the name, this path points at a parquet file
# (it is loaded with pd.read_parquet below).
path_to_manual_as_csv_file = "../inputs/documents/transparency.parquet"
# Destination of the finished index (summaries + questions + embeddings).
index_file = "../inputs/index/transparency.parquet"



In [3]:
import re
import pandas as pd

import sys
# NOTE(review): hard-coded absolute path to a local checkout; the project
# imports below only resolve on a machine with this exact layout.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.transparency
# Reload the module so edits made to it are picked up without a kernel restart.
importlib.reload(gdpr_rag.documents.transparency)
from gdpr_rag.documents.transparency import Transparency


# Document object used throughout the notebook for its reference checker and
# for looking up the text of a section (doc.get_text).
doc = Transparency(path_to_manual_as_csv_file)


In [4]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Load the raw document table and build its table of content, validating
# section references with the document's own reference checker.
df = pd.read_parquet(path_to_manual_as_csv_file, engine="pyarrow")
reference_checker = doc.reference_checker
toc = StandardTableOfContent(
    root_node_name="transparency",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [5]:
toc.print_tree()

transparency []
|-- A [Introduction  ]
|-- B [The meaning of transparency  ]
|-- C [Elements of transparency under the GDPR  ]
|   |-- .a ["Concise, transparent, intelligible and easily accessible" ]
|   |-- .b ["Clear and plain language"    ]
|   |-- .c [Providing information to children and other vulnerable people ]
|   |-- .d ["In writing or by other means" ]
|   |-- .e ["..the information may be provided orally" ]
|   +-- .f ["Free of charge"  ]
|-- D [Information to be provided to the data subject - Articles 13 & 14 ]
|   |-- .a [Content ]
|   |-- .b ["Appropriate measures" ]
|   |-- .c [Timing for provision of information  ]
|   |-- .d [Changes to Article 13 and Article 14 information  ]
|   |-- .e [Timing of notification of changes to Article 13 and Article 14 information ]
|   |-- .f [Modalities - format of information provision ]
|   |-- .g [Layered approach in a digital environment and layered privacy statements/ notices  ]
|   |-- .h [Layered approach in a non-digital enviro

In [7]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Walk the table of content in pre-order and keep every node that has a
# non-empty full name; each of these becomes one candidate section.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# One row per section: its reference, its text and the token count of that text.
# Built column-wise (no iterrows loop) so that no loop variable named `index`
# leaks into the notebook namespace, where `index` is used as the row cursor.
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = split_df["section_reference"].map(doc.get_text)
split_df["token_count"] = split_df["text"].map(num_tokens_from_string).astype(int)

# Sections of 20 tokens or fewer are not worth summarising.
split_df = split_df[split_df["token_count"] > 20]
# Row label 0 is the first section (Introduction). errors="ignore" keeps the
# cell from raising KeyError when that row was already removed by the token
# filter above.
split_df = split_df.drop([0], errors="ignore")
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,B,B The meaning of transparency \n6. Transparen...,171
1,C,C Elements of transparency under the GDPR \n7...,252
2,C.a,C Elements of transparency under the GDPR \n\...,1002
3,C.b,C Elements of transparency under the GDPR \n\...,913
4,C.c,C Elements of transparency under the GDPR \n\...,1003
5,C.d,C Elements of transparency under the GDPR \n\...,876
6,C.e,C Elements of transparency under the GDPR \n\...,487
7,C.f,C Elements of transparency under the GDPR \n\...,394
8,D.a,D Information to be provided to the data subje...,170
9,D.b,D Information to be provided to the data subje...,392


In [8]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data in later cells.
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [9]:
from openai import OpenAI
# The API key is read from the environment rather than being hard-coded;
# os.environ.get yields None if OPENAI_API_KEY is unset.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

# NOTE(review): `sys` / `importlib` and this sys.path entry were already set up
# in an earlier cell; repeated here so the cell can be run on its own.
import sys
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so edits to the helper module are picked up without a kernel restart.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [10]:
def _load_or_start_table(csv_path, source_label):
    """Return the checkpoint stored at ``csv_path`` or a fresh table without text.

    If the pipe-separated CSV exists it is read back as-is. Otherwise a new
    table is created with one row per entry of ``split_df``: the section
    reference is copied over, ``source`` is set to ``source_label``,
    ``document`` to ``class_name``, and ``text`` / ``embedding`` are left empty.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, sep="|", encoding = "utf-8")
    table = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    table["section_reference"] = split_df["section_reference"]
    table["source"] = source_label
    table["document"] = class_name
    return table


def _first_row_missing_text(table):
    """Return the smallest index label whose ``text`` is missing, or 0 if none is.

    NOTE(review): a table with no missing text also yields 0, so a fully
    completed checkpoint is indistinguishable from a brand-new one here.
    """
    first_missing = table[table["text"].isna()].index.min()
    return 0 if pd.isna(first_missing) else first_missing


df_summary = _load_or_start_table(summary_file, "summary")
df_questions = _load_or_start_table(question_file, "question")

# `index` is the row cursor used by the generation cells below; it must point
# at the same row in both tables.
index = _first_row_missing_text(df_summary)
first_empty_index = _first_row_missing_text(df_questions)

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [129]:
model = "gpt-4o"

# Section the row cursor currently points at.
reference = df_summary.iloc[index]["section_reference"]

# Show the source text so the generated summary can be checked against it.
reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Emit ready-to-paste assignments for the next cell. The values are rendered
# with !r so they are always valid Python string literals, even when the
# generated text contains apostrophes, double quotes or newlines.
print(f"# index: {index}, section_reference: {reference}")
print()
print(f"df_summary.loc[index, 'text'] = {model_summary!r}")
print()
print(f"df_questions.loc[index, 'text'] = {model_questions!r}")

##############
J Transparency and data breaches  
70. WP29 has produced separate Guidelines on Data Breaches[^57] but for the purposes of these guidelines, a data controller's obligations in relation to communication of data breaches to a data subject must take full account of the transparency requirements set out in Article 12.[^58] The communication of a data breach must satisfy the same requirements, detailed above (in particular for the use of clear and plain language), that apply to any other communication with a data subject in relation to their rights or in connection with conveying information under Articles 13 and 14.   

[^57] Guidelines on Personal data breach notification under Regulation 2016/679, WP 250
[^58] This is made clear by Article 12.1 which specifically refers to "…any communication under Articles 15 to 22 and 34 relating to processing to the data subject..." [emphasis added].
##############
# index: 35, section_reference: J

df_summary.loc[index, 'text'] = 'When

In [126]:
model_summary

"Member States or the EU can legislate to restrict individual rights related to data protection when necessary and proportionate to safeguard objectives like national security or public interest. If relying on such provisions, you must demonstrate how the national provision applies and inform individuals about these restrictions unless it undermines the restriction's purpose. Provide upfront information about rights and potential caveats to avoid surprising individuals when they attempt to exercise their rights.\n\nMember States must also reconcile data protection with freedom of expression and information. This includes exempting journalistic, academic, artistic, or literary processing from certain GDPR provisions when necessary."

In [131]:
# index: 35, section_reference: J
# Reviewed text pasted from the generation cell's output for the current row.
df_summary.loc[index, 'text'] = 'When communicating data breaches to individuals, ensure the same transparency requirements as outlined in Article 12 are met. This communication must use clear and plain language, similar to other information shared with individuals about their rights under Articles 13 and 14.'

df_questions.loc[index, 'text'] = "What guidelines should be followed for data breaches?|What are a controller's obligations regarding communicating data breaches to individuals?|What transparency requirements must be met when informing individuals about data breaches?"

# Move the cursor to the next row and report progress. Running this cell again
# advances the cursor a second time.
index += 1
progress_pct = index / len(df_summary) * 100
print(f"You have completed {progress_pct:.2f}% of the work")
if index >= len(df_summary):
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 97.30% of the work
The next section is:
K Annex
Information that must be provided to a data subject under Article 13 or Article 14**
| Required Information Type | Relevant article (if personal data collected directly from data subject) | Relevant article (if personal data not obtained from the data subject) | WP29 comments on information requirement|
|---|---|---|---|
| The identity and contact details of the controller and, where applicable, their representative [^59]| Article 13.1(a) | Article 14.1(a) | This information should allow for easy identification of the controller and preferably allow for different forms of communications with the data controller (e.g. phone number, email, postal address, etc.)|
| Contact details for the data protection officer, where applicable | Article 13.1(b) | Article 14.1(b) | See WP29 Guidelines on Data Protection Officers [^60]|
| The purposes and legal basis for the processing | Article 13.1(c) | Article 14.1(c) | In addition to 

In [140]:
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,B,question,,Transparency,What is the principle of transparency?
1,B,question,,Transparency,What specific information should be provided t...
2,C,question,,Transparency,Where does GDPR address transparency?
3,C.a,question,,Transparency,"What does it mean for information to be ""conci..."
4,C.a,question,,Transparency,What is a layered privacy statement or notice?
...,...,...,...,...,...
106,I,question,,Transparency,How should you handle transparency regarding ...
107,I,question,,Transparency,What is the role of pseudonymisation and data...
108,J,question,,Transparency,What guidelines should be followed for data br...
109,J,question,,Transparency,What are a controller's obligations regarding ...


In [137]:
# Checkpoint both tables as pipe-separated CSV; missing values are written as
# empty fields and the row index is not stored.
for _table, _target in ((df_summary, summary_file), (df_questions, question_file)):
    _table.to_csv(_target, sep="|", encoding="utf-8", index=False, na_rep="")

In [135]:
# Leave section "K" out of both tables.
# Note the execution counts: this cell ran before the save cell shown above it.
df_summary = df_summary.loc[df_summary["section_reference"].ne("K")]
df_questions = df_questions.loc[df_questions["section_reference"].ne("K")]

In [138]:
import pandas as pd
# Read both checkpoints back from disk (pipe-separated, UTF-8).
df_summary, df_questions = (
    pd.read_csv(csv_path, sep="|", encoding="utf-8")
    for csv_path in (summary_file, question_file)
)

In [139]:
# Each row's "text" holds several questions joined by "|". Split them so that
# every question gets its own row, repeating the other columns of the row.
question_parts = (
    df_questions["text"]
    .str.split("|", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename("text")
)
df_questions = df_questions.drop(columns="text").join(question_parts)

# Discard empty fragments (e.g. from a trailing "|") and renumber the rows.
df_questions = df_questions.loc[df_questions["text"] != ""].reset_index(drop=True)


In [141]:
# Stack summaries on top of the per-question rows into one index table;
# ignore_index=True renumbers the rows from 0.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,B,Ensure individuals know that their personal da...,summary,,Transparency
1,C,Key articles related to transparency are in Ch...,summary,,Transparency
2,C.a,Information provided to individuals should be ...,summary,,Transparency
3,C.b,Use clear and plain language for all written i...,summary,,Transparency
4,C.c,When targeting or making services available to...,summary,,Transparency
...,...,...,...,...,...
142,I,How should you handle transparency regarding ...,question,,Transparency
143,I,What is the role of pseudonymisation and data...,question,,Transparency
144,J,What guidelines should be followed for data br...,question,,Transparency
145,J,What are a controller's obligations regarding ...,question,,Transparency


In [142]:
from regulations_rag.embeddings import get_ada_embedding
# NOTE: `model` is rebound here from the chat model used for summarisation to
# the embedding model; later cells read `model` / `dimensions` for embeddings.
model = "text-embedding-3-large"
dimensions = 1024

# The column must be object-typed before list-valued embeddings are stored in it.
df_index['embedding'] = pd.NA
df_index['embedding'] = df_index['embedding'].astype(object)

# Embed in chunks of `increment` rows so progress is visible while the
# per-row embedding calls run.
increment = 10
for start in range(0, len(df_index), increment):
    chunk_text = df_index["text"].iloc[start:start + increment]
    chunk_embeddings = chunk_text.apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk_text.index, "embedding"] = chunk_embeddings
    # Cap at the table length so the final, partial chunk is not over-reported.
    completed = min(start + increment, len(df_index))
    print(f"Completed {completed} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines
Completed 110 lines
Completed 120 lines
Completed 130 lines
Completed 140 lines
Completed 150 lines


In [143]:
# Persist the finished index through the project's save helper only. The
# previous plain df_index.to_parquet(index_file) call wrote to the very same
# path and was immediately overwritten by this call, so it has been removed.
save_parquet_data(df_index, index_file, key)


In [144]:
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,B,Ensure individuals know that their personal da...,summary,"[-0.04707790166139603, 0.00023617436818312854,...",Transparency
1,C,Key articles related to transparency are in Ch...,summary,"[-0.05879134684801102, -0.01688995771110058, -...",Transparency
2,C.a,Information provided to individuals should be ...,summary,"[-0.06416782736778259, 0.011052381247282028, -...",Transparency
3,C.b,Use clear and plain language for all written i...,summary,"[-0.0384405292570591, 0.014206735417246819, -0...",Transparency
4,C.c,When targeting or making services available to...,summary,"[-0.0598132498562336, 0.02376491017639637, -0....",Transparency
...,...,...,...,...,...
142,I,How should you handle transparency regarding ...,question,"[-0.03133254125714302, -0.07001733034849167, -...",Transparency
143,I,What is the role of pseudonymisation and data...,question,"[-0.05166357010602951, -0.009405859746038914, ...",Transparency
144,J,What guidelines should be followed for data br...,question,"[-0.0193608608096838, -0.035326067358255386, -...",Transparency
145,J,What are a controller's obligations regarding ...,question,"[-0.03810231015086174, -0.04537346959114075, -...",Transparency


In [148]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
# NOTE(review): the variable is named trans_index but the file loaded is the
# data_breach index, and it is not used afterwards in this cell — confirm
# which index was intended.
trans_index = load_parquet_data("../inputs/index/data_breach.parquet", key)

# Print every index entry whose section reference starts with "14".
# Iterating over the text column directly avoids a loop variable named
# `index`, which would overwrite the notebook's row cursor.
article_14_rows = gdpr_index['section_reference'].str.startswith('14')
for entry_text in gdpr_index.loc[article_14_rows, 'text']:
    print(f"-- {entry_text}")

-- What information must be provided to individuals when their data is collected indirectly?
-- Within what timeframe must information be provided to the data subject when their data is collected indirectly?
-- What actions must be taken if the purpose of data processing changes after initial collection?
-- Are there any circumstances under which the obligation to provide information to data subjects does not apply?
-- How are data subjects informed about their rights concerning their personal data?
-- When personal data is not obtained from the data subject directly, the controller must inform the data subject about the controller's identity, contact details, the contact of the data protection officer if applicable, the purposes and legal basis for processing, the categories of personal data, the recipients of the data, and any potential international transfers along with safeguards. Furthermore, the controller must inform the data subject about the storage period, the legitimate inte

In [89]:
# Scratch cell for the Article 47 BCR index.
# NOTE(review): index_df is only assigned in cells that appear further down
# the notebook; this cell relies on them having been run first.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): article_47_index is not created by any cell visible in this
# notebook — it must already exist in the kernel for this cell to run.
index_df = article_47_index
#index_df = gdpr_index

# Replace one entry's text in the index.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# `model` / `dimensions` are the embedding settings assigned in the embedding cell.
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Add a single question row to the Article 47 index.
# NOTE(review): article_47_index, embedding_model and embedding_dimensions are
# not defined in this cell; they come from other cells / kernel state.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this exact text from the Article 47 index.
# Per the execution counts this cell ran before the add_to_index cell shown above it.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Adopt the edited working copy as the Article 47 index, then display the
# rows for section 3.5.4.1 to check the result.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): writes whatever index_df currently holds to the GDPR index
# path. Other cells rebind index_df to the Article 47 index, so confirm its
# contents before re-running this cell.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): this rebinds `index` to a loaded table; other cells use `index`
# as the integer row cursor (e.g. df_summary.iloc[index]).
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the Article 30(5) index back to the path it was loaded from.
# NOTE(review): the execution count ([6]) is lower than that of the cells
# above that define `index` and `path_to_file`, so this was run in a
# different order than the notebook shows.
save_parquet_data(index, path_to_file, key)