In [25]:
import logging
# Numeric threshold for the root logger; 25 sits between the stdlib INFO (20) and WARNING (30) levels.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [26]:
import re
import pandas as pd

import sys
# NOTE(review): absolute, machine-specific path (presumably the project root that contains
# gdpr_rag); it has to be adjusted on any other machine.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.data_breach
# Reload so that edits made to the module are picked up without restarting the kernel.
importlib.reload(gdpr_rag.documents.data_breach)
from gdpr_rag.documents.data_breach import DataBreach

# NOTE(review): the variable name says "csv" but the path points at a parquet file.
path_to_manual_as_csv_file = "../inputs/documents/data_breach.parquet"

# Build the document object from that path; `doc` is used by the cells below
# (doc.reference_checker, doc.get_text).
doc = DataBreach(path_to_manual_as_csv_file)


In [5]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Reuse the reference checker exposed by the loaded document.
reference_checker = doc.reference_checker
# Read the same parquet file that was handed to DataBreach, this time as a plain DataFrame.
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
# NOTE(review): the root node is named "intl_transfer" although this notebook processes the
# data-breach document -- looks like a copy-paste leftover; confirm nothing relies on the name.
toc = StandardTableOfContent(root_node_name = "intl_transfer", reference_checker = reference_checker, regulation_df = df)

In [6]:
# Print the table-of-contents tree so the parsed section hierarchy can be checked by eye.
toc.print_tree()

intl_transfer []
|-- INTRODUCTION []
|-- I [Personal data breach notification under the GDPR]
|   |-- .A [Basic security considerations ]
|   +-- .B [What is a personal data breach? ]
|       |-- .1 [Definition ]
|       |-- .2 [Types of personal data breaches ]
|       +-- .3 [The possible consequences of a personal data breach ]
|-- II [Article 33 - Notification to the supervisory authority ]
|   |-- .A [When to notify ]
|   |   |-- .1 [Article 33 requirements ]
|   |   |-- .2 [When does a controller become "aware"? ]
|   |   |-- .3 [Joint controllers ]
|   |   +-- .4 [Processor obligations ]
|   |-- .B [Providing information to the supervisory authority ]
|   |   |-- .1 [Information to be provided ]
|   |   |-- .2 [Notification in phases ]
|   |   +-- .3 [Delayed notifications ]
|   |-- .C [Cross-border breaches and breaches at non-EU establishments  ]
|   |   |-- .1 [Cross-border breaches ]
|   |   +-- .2 [Breaches at non-EU establishments ]
|   +-- .D [Conditions where notificatio

In [10]:
# Make the project path importable, but only once so that re-running the cell
# does not keep appending the same entry to sys.path.
# NOTE(review): absolute, machine-specific path.
if 'E:/Code/chat/gdpr' not in sys.path:
    sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Walk the table of contents in pre-order and keep every node whose
# full_node_name is not the empty string.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# One row per section: reference, text returned by the document, token count.
# The columns are built in one pass each instead of cell-by-cell inside an
# iterrows() loop; this also avoids rebinding the notebook-global `index`,
# which later cells use as the resume position.
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = [doc.get_text(section_ref) for section_ref in split_df["section_reference"]]
split_df["token_count"] = [num_tokens_from_string(section_text) for section_text in split_df["text"]]
split_df["token_count"] = split_df["token_count"].astype(int)

# Keep only sections with more than 25 tokens.
split_df = split_df[split_df["token_count"] > 25]
# Row label 0 is the introduction. errors="ignore" keeps the cell from raising
# KeyError when that row has already been removed by the token filter above.
split_df = split_df.drop(index=[0], errors="ignore") # introduction
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,I.A,I Personal data breach notification under the ...,354
1,I.B.1,I Personal data breach notification under the ...,575
2,I.B.2,I Personal data breach notification under the ...,1603
3,I.B.3,I Personal data breach notification under the ...,863
4,II.A.1,II Article 33 - Notification to the supervisor...,287
5,II.A.2,II Article 33 - Notification to the supervisor...,2067
6,II.A.3,II Article 33 - Notification to the supervisor...,138
7,II.A.4,II Article 33 - Notification to the supervisor...,659
8,II.B.1,II Article 33 - Notification to the supervisor...,639
9,II.B.2,II Article 33 - Notification to the supervisor...,807


In [11]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data in later cells (presumably an encryption
# key, going by the variable name). os.getenv returns None when the variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [12]:
from openai import OpenAI
# The API key is read from the environment; os.environ.get yields None when it is missing.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): same absolute path that earlier cells already append to sys.path.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so that edits made to the module are picked up without restarting the kernel.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [53]:
def _load_or_init_progress(csv_path, source, document):
    """Return the progress frame for one output file.

    If ``csv_path`` exists it is read back (pipe-separated, UTF-8). Otherwise a
    fresh frame is created with one row per entry of
    ``split_df["section_reference"]``, an empty ``text`` column and the given
    ``source`` / ``document`` labels.

    ``split_df`` is looked up lazily, only when the file is missing, so the
    cell still works after a restart as long as the CSV files exist.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, sep="|", encoding = "utf-8")
    progress = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    progress["section_reference"] = split_df["section_reference"]
    progress["source"] = source
    progress["document"] = document
    return progress


def _first_empty_text_index(progress):
    """Return the smallest index label whose ``text`` is missing, or 0 if none is.

    NOTE(review): when every row already has text this falls back to 0, so the
    generation cell would start again at the first row.
    """
    first_empty = progress[progress['text'].isna()].index.min()
    if pd.isna(first_empty):
        return 0
    return first_empty


summary_file = "../tmp/data_breach_summary.csv"
question_file = "../tmp/data_breach_question.csv"
class_name = "DataBreach"

# Both files are handled by the same helper instead of two copy-pasted branches.
df_summary = _load_or_init_progress(summary_file, "summary", class_name)
df_questions = _load_or_init_progress(question_file, "question", class_name)

# `index` is the resume position used by the generation cells below.
index = _first_empty_text_index(df_summary)
first_empty_index = _first_empty_text_index(df_questions)

# The two frames are filled in lock-step, so their resume positions must agree.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 11


In [93]:
# NOTE(review): `model` is rebound to the embedding model name in a later cell;
# re-run this cell before generating if that cell has been executed since.
model = "gpt-4o"

# Section to process: the row at the current resume position.
reference = df_summary.iloc[index]["section_reference"]

# Show the source text between marker lines so it can be compared with the
# generated summary.
reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Emit paste-ready assignment statements for the next cell. The values are
# printed with !r so that newlines and quote characters inside the model output
# are escaped and the printed line is always a valid Python string literal.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
VII Annex 

VII.B Examples of personal data breaches and who to notify  
The following non-exhaustive examples will assist controllers in determining whether they need to notify in different personal data breach scenarios. These examples may also help to distinguish between risk and high risk to the rights and freedoms of individuals. 
|Example | Notify the supervisory authority? | Notify the data subject? | Notes/recommendations | 
|---|---|---|---|
|i. A controller stored a backup of an archive of personal data encrypted on a USB key. The key is stolen during a break-in. |No. | No. | As long as the data are encrypted with a state of the art algorithm, backups of the data exist the unique key is not compromised, and the data can be restored in good time, this may not be a reportable breach. However if it is later compromised, notification is required. |
|ii. A controller maintains an online service. As a result of a cyber attack on that service, personal data of individ

In [94]:
# Display the summary as the cell result, i.e. in its repr form with newlines shown as \n escapes.
model_summary

"### VII.B Examples of Personal Data Breaches and Who to Notify\n\n|Example | Notify the supervisory authority? | Notify the individual? | Notes/recommendations | \n|---|---|---|---|\n|A controller stored a backup of an archive of personal data encrypted on a USB key. The key is stolen during a break-in. |No | No | Data is encrypted with a state-of-the-art algorithm, backups exist, the unique key isn't compromised, and data can be restored in good time. If later compromised, notify. |\n|A controller maintains an online service. A cyber attack exfiltrates personal data of individuals in a single Member State. |Yes | Yes, depending on the nature of personal data and severity of likely consequences. ||\n|A brief power outage at a controller's call centre means customers cannot call and access their records. |No | No | Not notifiable, but record the incident under Article 33(5). Maintain appropriate records. |\n|A ransomware attack encrypts all data with no backups available, making data i

In [95]:
df_summary.loc[index, "text"] = "### VII.B Examples of Personal Data Breaches and Who to Notify\n\n|Example | Notify the supervisory authority? | Notify the individual? | Notes/recommendations | \n|---|---|---|---|\n|A controller stored a backup of an archive of personal data encrypted on a USB key. The key is stolen during a break-in. |No | No | Data is encrypted with a state-of-the-art algorithm, backups exist, the unique key isn't compromised, and data can be restored in good time. If later compromised, notify. |\n|A controller maintains an online service. A cyber attack exfiltrates personal data of individuals in a single Member State. |Yes | Yes, depending on the nature of personal data and severity of likely consequences. ||\n|A brief power outage at a controller's call centre means customers cannot call and access their records. |No | No | Not notifiable, but record the incident under Article 33(5). Maintain appropriate records. |\n|A ransomware attack encrypts all data with no backups available, making data irretrievable. The ransomware's only functionality was encryption. | Yes | Yes, depending on the nature of personal data affected and the possible effect of lack of data availability. | If backups existed and data could be restored in good time, no need to report. Supervisory authority might investigate compliance with Article 32 if they become aware. |\n|An individual reports receiving another person's monthly bank statement. The controller investigates and confirms a personal data breach within 24 hours. | Yes | Notify affected individuals if high risk exists; if more individuals are later found to be affected, update the supervisory authority and notify other individuals. | Notify the supervisory authority and affected individuals initially if high risk is evident. |\n|A controller's online marketplace suffers a cyber-attack, publishing usernames, passwords, and purchase history of customers in multiple Member States. 
|Yes, to lead supervisory authority if cross-border processing occurs. | Yes, as it could lead to high risk. | Take actions such as forcing password resets and other steps to mitigate risk. Consider other obligations, e.g., under the NIS Directive. |\n|A data processor identifies a flaw that lets any user access any account details. | The processor must notify affected controllers without undue delay. Controllers must then notify the supervisory authority. | If there's likely no high risk, individuals do not need to be notified. | The processor must consider other obligations (e.g., NIS Directive"
df_questions.loc[index, "text"] = "Are there examples of who to notify in various breach scenarios?"




# Move the resume position to the next section and report progress.
index += 1
total_sections = len(df_summary)
percent_done = index / total_sections * 100
print(f"You have completed {percent_done:.2f}% of the work")
if index >= total_sections:
    print("All done")
else:
    # Preview the text of the section that will be processed next.
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [100]:
# Display the questions frame.
# NOTE(review): this cell carries execution count 100, i.e. its saved output was produced after
# the split cell (count 98) that appears further down.
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,I.A,question,,DataBreach,What does securing personal data involve?
1,I.A,question,,DataBreach,What should be considered when implementing se...
2,I.A,question,,DataBreach,What action should be taken to identify a data...
3,I.B.1,question,,DataBreach,What is considered a personal data breach?
4,I.B.1,question,,DataBreach,How is damage to personal data defined?
...,...,...,...,...,...
90,VI,question,,DataBreach,Which directive requires providers of publicly...
91,VI,question,,DataBreach,"What additional legal, medical, or professiona..."
92,VI,question,,DataBreach,What is the role of competent national authori...
93,VII.A,question,,DataBreach,What steps should you take when you detect a s...


In [96]:
# Persist both progress frames as pipe-separated UTF-8 CSV files, without the
# index column and with missing values written as empty strings.
_csv_write_options = {"sep": "|", "encoding": "utf-8", "index": False, "na_rep": ""}

summary_file = "../tmp/data_breach_summary.csv"
df_summary.to_csv(summary_file, **_csv_write_options)

question_file = "../tmp/data_breach_question.csv"
df_questions.to_csv(question_file, **_csv_write_options)

In [97]:
import pandas as pd

# Read the two progress files back using the same separator and encoding they
# were written with.
_csv_read_options = {"sep": "|", "encoding": "utf-8"}

summary_file = "../tmp/data_breach_summary.csv"
df_summary = pd.read_csv(summary_file, **_csv_read_options)

question_file = "../tmp/data_breach_question.csv"
df_questions = pd.read_csv(question_file, **_csv_read_options)

In [98]:
# Split each text cell on "|" and fan the pieces out to one row per piece,
# keeping the other columns of the originating row.
question_texts = df_questions["text"].str.split("|").explode().rename("text")
df_questions = df_questions.drop(columns="text").join(question_texts)
# Discard pieces that are empty strings, then renumber the rows from 0.
df_questions = df_questions.loc[df_questions["text"] != ""]
df_questions = df_questions.reset_index(drop=True)


In [101]:
# Stack the summary rows and the question rows into one frame with a fresh 0..n-1 index.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,I.A,Ensure personal data is processed securely wit...,summary,,DataBreach
1,I.B.1,A personal data breach is a security incident ...,summary,,DataBreach
2,I.B.2,A personal data breach can occur in various fo...,summary,,DataBreach
3,I.B.3,A personal data breach can lead to various neg...,summary,,DataBreach
4,II.A.1,"In the case of a personal data breach, notify ...",summary,,DataBreach
...,...,...,...,...,...
115,VI,Which directive requires providers of publicly...,question,,DataBreach
116,VI,"What additional legal, medical, or professiona...",question,,DataBreach
117,VI,What is the role of competent national authori...,question,,DataBreach
118,VII.A,What steps should you take when you detect a s...,question,,DataBreach


In [102]:
from regulations_rag.embeddings import get_ada_embedding
# NOTE(review): this rebinds `model`, which the summary-generation cell sets to
# the chat model name; later cells read whichever value was assigned last.
model = "text-embedding-3-large"
dimensions = 1024

# Start from an empty object-typed column so that each cell can hold the value
# returned by get_ada_embedding.
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Embed the text column in chunks of `increment` rows and report progress
# after each chunk.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # The last chunk may be shorter than `increment`; report the real number of
    # rows processed instead of rounding up to the next multiple.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines
Completed 110 lines
Completed 120 lines


In [103]:
file = "../inputs/index/data_breach.parquet"
# NOTE(review): the frame is written to the same path twice -- first directly with pandas, then
# through save_parquet_data with the key. Presumably the second call replaces the first file;
# confirm whether the plain to_parquet write is still needed.
df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(df_index, file, key)


In [104]:
# Display the index frame (the saved output shows the embedding column filled in).
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,I.A,Ensure personal data is processed securely wit...,summary,"[-0.05888289958238602, -0.029703151434659958, ...",DataBreach
1,I.B.1,A personal data breach is a security incident ...,summary,"[-0.06669788807630539, 0.02424558252096176, -0...",DataBreach
2,I.B.2,A personal data breach can occur in various fo...,summary,"[-0.08075638860464096, -0.02675008401274681, -...",DataBreach
3,I.B.3,A personal data breach can lead to various neg...,summary,"[-0.045855563133955, -0.02036408707499504, -0....",DataBreach
4,II.A.1,"In the case of a personal data breach, notify ...",summary,"[-0.03721925616264343, -0.038920916616916656, ...",DataBreach
...,...,...,...,...,...
115,VI,Which directive requires providers of publicly...,question,"[-0.05106492340564728, -0.014771945774555206, ...",DataBreach
116,VI,"What additional legal, medical, or professiona...",question,"[-0.030308131128549576, -0.06112400442361832, ...",DataBreach
117,VI,What is the role of competent national authori...,question,"[-0.03222626447677612, -0.027398772537708282, ...",DataBreach
118,VII.A,What steps should you take when you detect a s...,question,"[0.0070754364132881165, -0.028580065816640854,...",DataBreach


In [109]:
# Load the GDPR index and the data-breach index with the same key.
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
breach_index = load_parquet_data("../inputs/index/data_breach.parquet", key)

# Print the text of every GDPR index row whose section_reference starts with
# '34'. The text column is iterated directly so that the notebook-global
# `index` (the resume position of the summary/question workflow) is not
# overwritten by a loop variable.
section_34_rows = gdpr_index[gdpr_index['section_reference'].str.startswith('34')]
for entry_text in section_34_rows['text']:
    print(f"-- {entry_text}")

-- When should you inform individuals about a personal data breach?
-- What information should be included when notifying an individual about a personal data breach?
-- Under what conditions can you avoid informing individuals about a personal data breach?
-- What happens if you don't voluntarily notify individuals about a personal data breach?
-- When a personal data breach might significantly affect individuals' rights and freedoms, you must notify those affected without unnecessary delay. The notification must be straightforward and contain specifics on the breach's nature. However, you don't need to notify individuals if:
1. You've applied adequate security measures that make the affected personal data inaccessible or unreadable to unauthorised persons, such as encryption.
2. You've taken steps after the breach to ensure that the high risk to individuals' rights and freedoms is no longer likely.
3. Informing each individual would require an unreasonably high effort, in which case a

In [89]:
# NOTE(review): this rebinds `file`, which an earlier cell pointed at the data_breach index.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
# NOTE(review): index_df is only assigned in cells further down, so on a fresh top-to-bottom
# run this line raises NameError.
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): article_47_index is not assigned in any cell visible above this one, so a fresh
# top-to-bottom run fails here with NameError.
index_df = article_47_index
#index_df = gdpr_index

# Presumably the existing index text to look up and the wording that should replace it.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# NOTE(review): `model` is set to "gpt-4o" in one earlier cell and to "text-embedding-3-large" in
# another; the value picked up here depends on which of them ran last.
embedding_model = model
embedding_dimensions = dimensions

# Rebind index_df to whatever update_text_in_index returns.
index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload so that edits made to the module are picked up without restarting the kernel.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# NOTE(review): relies on article_47_index, embedding_model and embedding_dimensions already
# existing in the kernel from other cells.
index_df = article_47_index
# Values passed to add_to_index for the new entry.
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# Rebind index_df to whatever add_to_index returns.
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Text handed to remove_from_index; it is the same string that the add_to_index cell uses.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
# Rebind article_47_index to whatever remove_from_index returns.
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Point article_47_index at the working frame, then show its rows for section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): in top-to-bottom order index_df was last bound from article_47_index in the
# cells above, so running this cell would presumably write that frame over the GDPR index
# file. Confirm which frame is meant before re-running.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows of index_df whose document label is exactly "article_30_5" (lower-case spelling).
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Same environment lookup as the earlier key cell; None when the variable is not set.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): `index` is the integer resume position in the summary/question cells above;
# here the name is rebound to whatever load_parquet_data returns.
index = load_parquet_data(path_to_file, key)

In [10]:
# Display the object loaded from path_to_file; the line below is a disabled relabelling of its
# document column.
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Hand `index`, the path it was loaded from and the key to save_parquet_data.
save_parquet_data(index, path_to_file, key)