In [1]:
import logging
# Custom threshold between INFO (20) and WARNING (30): INFO chatter is hidden,
# while records logged at level 25 or above are still emitted.
log_level = 25
logging.basicConfig(level=log_level)  # applies to the root logger


In [72]:
# Label written into the "document" column of every summary/question row.
class_name = "DataPortability"

# Pipe-separated working files that hold the summaries and questions between sessions.
summary_file, question_file = (
    "../tmp/data_portability_summary.csv",
    "../tmp/data_portability_question.csv",
)

# Destination of the finished index (text plus embeddings).
index_file = "../inputs/index/data_portability.parquet"



In [8]:
import re
import pandas as pd

import sys
# Make the local project checkout importable.
# NOTE(review): absolute, machine-specific path; the notebook will not import
# the project modules on another machine without editing this line.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.data_portability
# Reload before the from-import so edits made to the module during this kernel
# session are reflected in the DataPortability class bound below.
importlib.reload(gdpr_rag.documents.data_portability)
from gdpr_rag.documents.data_portability import DataPortability

# NOTE: despite the "csv" in the variable name, this points at a parquet file
# (it is read with pd.read_parquet in a later cell).
path_to_manual_as_csv_file = "../inputs/documents/data_portability.parquet"

# Document object used below for its reference_checker and get_text().
doc = DataPortability(path_to_manual_as_csv_file)


In [9]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Reuse the document's own reference checker when building the tree.
reference_checker = doc.reference_checker
# Raw document table, read from the same parquet file the document object was built from.
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
# NOTE(review): the root label "intl_transfer" looks carried over from a
# different document's notebook; it is what print_tree() shows as the root.
# Confirm whether anything depends on it before renaming.
toc = StandardTableOfContent(root_node_name = "intl_transfer", reference_checker = reference_checker, regulation_df = df)

In [10]:
# Visual sanity check of the parsed section hierarchy before splitting the text.
toc.print_tree()

intl_transfer []
|-- Executive summary [Executive summary ]
|-- I [Introduction ]
|-- II [What are the main elements of data portability? ]
|   |-- .a [A right to receive personal data  ]
|   |-- .b [A right to transmit personal data from one data controller to another data controller ]
|   |-- .c [Controllership  ]
|   +-- .d [Data portability vs. other rights of data subjects   ]
|-- III [When does data portability apply?  ]
|   |-- .a [Which processing operations are covered by the right to data portability?  ]
|   +-- .b [What personal data must be included?  ]
|       |-- .1 [First condition: personal data concerning the data subject   ]
|       |-- .2 [Second condition: data provided by the data subject  ]
|       +-- .3 [Third condition: the right to data portability shall not adversely affect the rights and freedoms of others  ]
|-- IV [How do the general rules governing the exercise of data subject rights apply to data portability? ]
|   |-- .a [What prior information should b

In [11]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Walk the table of contents in pre-order and keep every node that has a
# non-empty full name; each becomes one candidate section to summarise.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# Build the columns in one pass each instead of filling cells row by row.
# This also works when there are no sections (the row-wise version never
# created the "token_count" column in that case and then failed on astype).
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = split_df["section_reference"].map(doc.get_text)
split_df["token_count"] = split_df["text"].map(num_tokens_from_string).astype(int)

split_df = split_df[split_df["token_count"] > 100] # excludes a definition which we already have in GDPR
# Row label 0 is the Executive summary. errors="ignore" keeps this cell from
# raising KeyError if that row was already removed by the token filter above.
split_df = split_df.drop(index=[0], errors="ignore") # Executive summary
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,I,I Introduction \nArticle 20 of the General Dat...,519
1,II.a,II What are the main elements of data portabil...,346
2,II.b,II What are the main elements of data portabil...,442
3,II.c,II What are the main elements of data portabil...,1092
4,II.d,II What are the main elements of data portabil...,511
5,III.a,III When does data portability apply? \n\nIII...,747
6,III.b.1,III When does data portability apply? \nIII.b...,308
7,III.b.2,III When does data portability apply? \nIII.b...,1003
8,III.b.3,III When does data portability apply? \nIII.b...,1263
9,IV.a,IV How do the general rules governing the exer...,436


In [12]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key handed to load_parquet_data / save_parquet_data in later cells.
# It is None when the environment variable is not set.
key = os.environ.get('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [13]:
from openai import OpenAI
# API key is taken from the environment; nothing secret is stored in the notebook.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): same machine-specific path as in the earlier import cell.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload first so the function imported below reflects the latest edits to the module.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [14]:
def _load_or_init_progress(path, source_label):
    """Return the saved progress table at ``path``, or a fresh one if it does not exist.

    A fresh table has one row per entry of ``split_df["section_reference"]``,
    an empty "text" column, ``source_label`` in "source" and ``class_name`` in
    "document".
    """
    if os.path.exists(path):
        return pd.read_csv(path, sep="|", encoding = "utf-8")
    frame = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    frame["section_reference"] = split_df["section_reference"]
    # frame["text"] = split_df["text"]
    frame["source"] = source_label
    frame["document"] = class_name
    return frame


def _first_row_without_text(frame):
    """Return the smallest index label whose "text" is missing.

    Returns 0 when no row is missing text, so a fully completed table restarts
    from the first row rather than signalling completion.
    """
    first_missing = frame[frame["text"].isna()].index.min()
    if pd.isna(first_missing):
        #print("No empty embeddings found.")
        return 0
    return first_missing


# Summaries: load or create, then locate where to resume.
df_summary = _load_or_init_progress(summary_file, "summary")
index = _first_row_without_text(df_summary)

# Questions: same procedure; the two cursors must agree.
df_questions = _load_or_init_progress(question_file, "question")
first_empty_index = _first_row_without_text(df_questions)

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [58]:
# Chat model used for summarising. Note that a later cell rebinds `model` to
# the embedding model name.
model = "gpt-4o"

# Section currently pointed at by the resume cursor.
reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

#format output
# The lines printed below are meant to be pasted into the next cell. Using !r
# emits a valid Python literal even when the model output contains newlines or
# quote characters; wrapping the raw text in "..." produced broken code then.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
V How must the portable data be provided?  

V.d How can portable data be secured?  
In general, data controllers should guarantee the "appropriate security of the personal data, including protection against unauthorised or unlawful processing and against accidental loss, destruction or damage, using appropriate technical or organisational measures" according to Article 5(1)(f) of the GDPR. 
However, the transmission of personal data to the data subject may also raise some security issues: 
How can data controllers ensure that personal data are securely delivered to the right person? 
As data portability aims to get personal data out of the information system of the data controller, the transmission may become a possible source of risk regarding those data (in particular of data breaches during the transmission). The data controller is responsible for taking all the security measures needed to ensure not only that personal data is securely transmitted (by the use of end-

In [59]:
# Show the repr of the summary (escaped newlines and quotes), handy for pasting.
model_summary

"When providing portable data, ensure appropriate security measures to protect against unauthorised access, accidental loss, or damage. Use end-to-end encryption and strong authentication methods to secure transmission to the correct recipient. Assess risks specific to data portability and implement mitigation measures, such as additional authentication factors or halting transmission if account compromise is suspected. Direct transmissions between controllers should utilise token-based authentication. Security measures must not obstruct the individual's rights or impose additional costs.\n\nFor individuals storing personal data, inform them of the need to secure their systems. Suggest appropriate formats, encryption tools, and security measures."

In [60]:
# index: 16, section_reference: V.d
# Reviewed text pasted from the previous cell's output; it is stored at the
# current cursor position in both working tables.

df_summary.loc[index, "text"] = "When providing portable data, ensure appropriate security measures to protect against unauthorised access, accidental loss, or damage. Use end-to-end encryption and strong authentication methods to secure transmission to the correct recipient. Assess risks specific to data portability and implement mitigation measures, such as additional authentication factors or halting transmission if account compromise is suspected. Direct transmissions between controllers should utilise token-based authentication. Security measures must not obstruct the individual's rights or impose additional costs.\n\nFor individuals storing personal data, inform them of the need to secure their systems. Suggest appropriate formats, encryption tools, and security measures."

# Questions are kept as one pipe-separated string here and split into rows later.
df_questions.loc[index, "text"] = "How can portable data be secured during transmission?|How should issues of suspected compromised accounts be handled during data transmission?|What authentication methods can be used for direct transmission between organisations?"



# Move the cursor to the next section and report how far along we are.
index += 1
progress_pct = index / len(df_summary) * 100
print(f"You have completed {progress_pct:.2f}% of the work")
if index >= len(df_summary):
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [69]:
# Inspect the questions table (shown here after it was split to one question per row).
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,I,question,,DataPortability,What is data portability?
1,I,question,,DataPortability,How do individuals benefit from data portabil...
2,I,question,,DataPortability,Can data portability facilitate switching bet...
3,I,question,,DataPortability,What responsibilities do you have when handli...
4,I,question,,DataPortability,How does data portability differ from the rig...
...,...,...,...,...,...
65,V.c,question,,DataPortability,How can large or complex personal data collect...
66,V.c,question,,DataPortability,What role do dashboards play in aiding the und...
67,V.d,question,,DataPortability,How can portable data be secured during transm...
68,V.d,question,,DataPortability,How should issues of suspected compromised acc...


In [66]:
# Persist both working tables as pipe-separated CSV, writing missing values as empty fields.
for frame, path in ((df_summary, summary_file), (df_questions, question_file)):
    frame.to_csv(path, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [67]:
import pandas as pd

# Read both working tables back from disk with the settings they were written with.
df_summary, df_questions = (
    pd.read_csv(path, sep = "|", encoding = "utf-8")
    for path in (summary_file, question_file)
)

In [68]:
# Turn each pipe-separated "text" cell into one row per question. The other
# columns are repeated for every question, and "text" ends up as the last column.
question_parts = (
    df_questions["text"]
    .str.split("|", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename("text")
)
df_questions = df_questions.drop(columns="text").join(question_parts)
# Discard empty fragments (e.g. from a trailing separator) and renumber the rows.
df_questions = df_questions.loc[df_questions["text"] != ""].reset_index(drop=True)


In [70]:
# Stack summaries and questions into one table with a fresh 0..n-1 index;
# columns are aligned by name, so the differing column order does not matter.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,I,Individuals have the right to receive their pe...,summary,,DataPortability
1,II.a,Individuals have the right to receive a copy o...,summary,,DataPortability
2,II.b,Individuals have the right to transmit their p...,summary,,DataPortability
3,II.c,Data portability allows individuals to receive...,summary,,DataPortability
4,II.d,When an individual exercises their right to da...,summary,,DataPortability
...,...,...,...,...,...
82,V.c,How can large or complex personal data collect...,question,,DataPortability
83,V.c,What role do dashboards play in aiding the und...,question,,DataPortability
84,V.d,How can portable data be secured during transm...,question,,DataPortability
85,V.d,How should issues of suspected compromised acc...,question,,DataPortability


In [71]:
from regulations_rag.embeddings import get_ada_embedding
# NOTE: rebinds `model` (previously the chat model name) to the embedding
# model; later cells read `model` and `dimensions` for re-embedding.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
# Rows are embedded in chunks only so that progress can be reported;
# get_ada_embedding is still called once per row.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # Cap at total_rows so the final, possibly partial, chunk is not over-reported.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines


In [74]:
# NOTE(review): both calls target index_file. If save_parquet_data writes to
# that same path (presumably an encrypted copy, given the key argument), the
# plain to_parquet write is redundant and briefly leaves an unencrypted file
# on disk. A later cell has the equivalent to_parquet call commented out.
# Confirm and drop the first write.
df_index.to_parquet(index_file, engine = "pyarrow")
save_parquet_data(df_index, index_file, key)


In [75]:
# Inspect the finished index, now with the embedding column populated.
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,I,Individuals have the right to receive their pe...,summary,"[-0.06711183488368988, -0.008369826711714268, ...",DataPortability
1,II.a,Individuals have the right to receive a copy o...,summary,"[-0.0680936649441719, -0.00960722379386425, -0...",DataPortability
2,II.b,Individuals have the right to transmit their p...,summary,"[-0.06925682723522186, -0.012536535039544106, ...",DataPortability
3,II.c,Data portability allows individuals to receive...,summary,"[-0.04807808995246887, -0.0008425539126619697,...",DataPortability
4,II.d,When an individual exercises their right to da...,summary,"[-0.045747801661491394, -0.028073307126760483,...",DataPortability
...,...,...,...,...,...
82,V.c,How can large or complex personal data collect...,question,"[-0.04162163287401199, -0.009414666332304478, ...",DataPortability
83,V.c,What role do dashboards play in aiding the und...,question,"[-0.053713295608758926, 0.0035466146655380726,...",DataPortability
84,V.d,How can portable data be secured during transm...,question,"[-0.038466013967990875, 0.006264044903218746, ...",DataPortability
85,V.d,How should issues of suspected compromised acc...,question,"[-0.04439269378781319, -0.055500928312540054, ...",DataPortability


In [76]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
# NOTE(review): the name suggests the data-portability index, but the file
# loaded is data_breach.parquet, and port_index is not used in the visible
# cells. Confirm the intended file.
port_index = load_parquet_data("../inputs/index/data_breach.parquet", key)

# List the existing GDPR index entries whose section reference starts with
# "20" (presumably Article 20, the data-portability article) for comparison.
# Iterating over the text column directly avoids rebinding the global `index`,
# which the summarisation cells use as their resume cursor.
rows_starting_with_20 = gdpr_index[gdpr_index['section_reference'].str.startswith('20')]
for entry_text in rows_starting_with_20['text']:
    print(f"-- {entry_text}")


# for index, row in index_df[index_df["section_reference"] == '30'].iterrows():
#     print(f"* {row['text']}")

-- What is the right to data portability?
-- What does data portability mean for the transfer of data between controllers?
-- Is the right to data portability unlimited?
-- The data subject has the right to receive their personal data in a structured, commonly used, and machine-readable format and transmit it to another controller without hindrance, provided the processing: (a) is based on consent or a contract, and (b) is carried out by automated means. They also have the right to have the data transmitted directly from one controller to another, where technically feasible. This right does not apply to processing necessary for public interest tasks or exercises of official authority, nor shall it adversely affect the rights and freedoms of others.


In [89]:
# NOTE(review): index_df is only assigned in cells that appear further down
# the notebook; on a fresh kernel run top to bottom this raises NameError.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): article_47_index is not defined in any visible cell; it must
# already exist in the kernel (presumably loaded from article_47_bcr.parquet).
index_df = article_47_index
#index_df = gdpr_index

# Replace one index entry's text with a reworded version.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# Relies on `model` / `dimensions` still holding the embedding settings from the embedding cell.
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload so the add_to_index imported below reflects the latest edits to the module.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# NOTE(review): depends on article_47_index, embedding_model and
# embedding_dimensions already existing in the kernel from other cells.
index_df = article_47_index
# New entry to append to the index.
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this exact text from the Article 47 index.
# NOTE(review): only article_47_index is rebound here; index_df keeps
# referring to whatever frame it held before.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Adopt the edited frame as the Article 47 index, then list every entry for
# section 3.5.4.1 to verify the change.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): writes whatever index_df currently holds to gdpr.parquet. In
# the cells above index_df is the Article 47 index, so running this in file
# order would overwrite the GDPR index with it. The execution count suggests
# it was run at a different point; confirm index_df before running.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the entries labelled with the lower-case document name "article_30_5".
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data; None when the variable is unset.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE: `index` is a DataFrame here, whereas the summarisation cells earlier
# in the notebook use the same name for an integer row cursor.
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
# Display the loaded Article 30(5) index for inspection.
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the (possibly edited) Article 30(5) index back to the file it was loaded from.
save_parquet_data(index, path_to_file, key)