In [1]:
import logging
# 25 sits between logging.INFO (20) and logging.WARNING (30), so INFO and
# DEBUG records are filtered out while WARNING and above still get through.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [2]:
import re
import pandas as pd

import sys
# NOTE(review): machine-specific absolute path added to the import search
# path; presumably this is where the gdpr_rag / src packages live.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.lead_sa
# Reload so the module is re-executed in this kernel before Lead_SA is
# imported from it.
importlib.reload(gdpr_rag.documents.lead_sa)
from gdpr_rag.documents.lead_sa import Lead_SA

# Despite the "_csv_" in the variable name, the path points at a parquet file.
path_to_manual_as_csv_file = "../inputs/documents/lead_sa.parquet"

# Document object used below for its reference checker and get_text().
doc = Lead_SA(path_to_manual_as_csv_file)


In [3]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent

# Build the table of contents from the same parquet file the document was
# created from, handing it the document's own reference checker.
# NOTE(review): the root node is named "intl_transfer" although this is the
# lead supervisory authority document - confirm the name is intentional.
reference_checker = doc.reference_checker
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
toc = StandardTableOfContent(
    root_node_name="intl_transfer",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [4]:
# Print the section hierarchy held by the table of contents.
toc.print_tree()

intl_transfer []
|-- 1 [Identifying a lead supervisory authority: the key concepts.  ]
|   |-- .1 ['Cross-border processing of personal data'.   ]
|   |   +-- .1 ['Substantially affects'.  ]
|   |-- .2 [Lead supervisory authority.  ]
|   +-- .3 [Main establishment.   ]
|-- 2 [Steps to identify the lead supervisory authority  ]
|   |-- .1 [Identify the 'main establishment' for controllers   ]
|   |   |-- .1 [Criteria for identifying a controller's main establishment in cases where it is not the place of its central administration in the EU.  ]
|   |   |-- .2 [Groups of undertakings  ]
|   |   +-- .3 [Joint data controllers  ]
|   |-- .2 [Borderline cases  ]
|   +-- .3 [Processor ]
|-- 3 [Other relevant issues ]
|   |-- .1 [The role of the 'supervisory authority concerned' ]
|   |-- .2 [Local processing.  ]
|   +-- .3 [Companies not established within the EU.   ]
+-- Annex [Questions to guide the identification of the lead supervisory authority   ]


In [12]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Sections whose text has this many tokens or fewer are filtered out below.
MIN_TOKEN_COUNT = 20

# Walk the table of contents in pre-order and collect the reference of every
# node that has one; nodes with an empty full_node_name (presumably just the
# root) are skipped.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# Build the frame one whole column at a time. This replaces an iterrows()
# loop that enlarged the frame cell-by-cell with .at, which left `index`,
# `row` and `text` behind in the notebook namespace and raised a KeyError on
# "token_count" when there were no sections at all.
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = [doc.get_text(reference) for reference in all_node_names]
split_df["token_count"] = [num_tokens_from_string(section_text) for section_text in split_df["text"]]
split_df["token_count"] = split_df["token_count"].astype(int)

# Keep only the sections that are long enough, renumbering rows from 0.
split_df = split_df[split_df["token_count"] > MIN_TOKEN_COUNT].reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,1.1,1 Identifying a lead supervisory authority: th...,261
1,1.1.1,1 Identifying a lead supervisory authority: th...,784
2,1.2,1 Identifying a lead supervisory authority: th...,187
3,1.3,1 Identifying a lead supervisory authority: th...,200
4,2.1,2 Steps to identify the lead supervisory autho...,914
5,2.1.1,2 Steps to identify the lead supervisory autho...,410
6,2.1.2,2 Steps to identify the lead supervisory autho...,280
7,2.1.3,2 Steps to identify the lead supervisory autho...,210
8,2.2,2 Steps to identify the lead supervisory autho...,569
9,2.3,2 Steps to identify the lead supervisory autho...,281


In [13]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# os.getenv returns None when the variable is unset; `key` is what later
# cells pass to load_parquet_data / save_parquet_data.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [14]:
from openai import OpenAI
# The API key is read from the OPENAI_API_KEY environment variable
# (os.environ.get yields None when it is not set).
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): same machine-specific path as in the earlier cells.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so the module is re-executed in this kernel before the helper is
# imported from it.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [15]:
def _load_or_init_progress(csv_path, source):
    """Return the progress frame saved at csv_path, or a fresh one.

    A saved frame is read back as a "|"-separated UTF-8 CSV. A fresh frame
    has one row per row of split_df, with "source" set to `source`,
    "document" set to "Lead_SA", and "text" / "embedding" left empty.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, sep="|", encoding = "utf-8")
    progress_df = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    # Assign the Series first: it gives the empty frame its row index, so the
    # scalar assignments below fill every row.
    progress_df["section_reference"] = split_df["section_reference"]
    progress_df["source"] = source
    progress_df["document"] = "Lead_SA"
    return progress_df


def _first_index_without_text(progress_df):
    """Return the smallest index label whose "text" is missing, or 0 if none is.

    NOTE: the 0 fallback means a fully completed file also starts again at
    the first row.
    """
    first_missing = progress_df[progress_df['text'].isna()].index.min()
    if pd.isna(first_missing):
        return 0
    return first_missing


summary_file = "../tmp/lead_sa_summary.csv"
df_summary = _load_or_init_progress(summary_file, "summary")
index = _first_index_without_text(df_summary)

question_file = "../tmp/lead_sa_question.csv"
df_questions = _load_or_init_progress(question_file, "question")
first_empty_index = _first_index_without_text(df_questions)

# Both files are filled in lock-step, so they must resume at the same row.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [56]:
model = "gpt-4o"

# Section currently being worked on, taken from the progress counter `index`.
reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

#format output
# The lines below are printed as ready-made assignments (the next cell has
# exactly this shape). !r renders the values as Python literals, so text
# containing newlines or quote characters still yields valid code.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
Annex Questions to guide the identification of the lead supervisory authority   
1. Is the controller or processor carrying out the cross-border processing of personal data? 
a. Yes, if:  
- the controller or processor is established in more than one Member State and  - the processing of personal data takes place in the context of the activities of 
establishments in more than one Member State. 
In this case, go to section 2.  
b. Yes, if:  
- the processing of personal data takes place in the context of the activities of a data 
controller or processor's single establishment in the Union, but: 
- substantially affects or is likely to substantially affect individuals in more than one 
Member State.   
In this case, the lead authority is the authority for the controller or processor's single 
establishment in a single Member State. This must - by logic - be the controller or processor's main establishment because it is its only establishment.  
2. How to identify the 'lea

In [48]:
# Show the summary returned by the most recent get_summary_and_questions_for call.
model_summary

"The 'supervisory authority concerned' is a supervisory authority involved in personal data processing due to:\n\n(a) The establishment of the controller or processor in its Member State.\n(b) Significant impact on individuals residing in its Member State.\n(c) Complaints received by that authority.\n\nThis ensures other authorities can influence decisions despite the 'lead authority' model. Factors like residency, not citizenship, suffice for involvement. \n\nWhen a lead supervisory authority opts not to handle a case, the concerned one that informed the lead should take over, following the procedures in Article 61 (Mutual assistance) and Article 62 (Joint operations of supervisory authorities). For example, if a French company launches a product affecting only Portuguese individuals, both authorities may agree that the Portuguese authority should lead the matter.\n\nSupervisory authorities may request controllers provide clarifications on corporate arrangements. These authorities sho

In [57]:
# index: 13, section_reference: Annex

# Store the summary and the question text for the row at `index`.
df_summary.loc[index, "text"] = "Workflow to determine the lead supervisory authority"
df_questions.loc[index, "text"] = "How do I determine my lead supervisory authority?"

# Step to the next row and report how far through the sections we are.
# Re-running this cell advances `index` again.
index += 1
total_sections = len(df_summary)
percent_done = index / total_sections * 100
print(f"You have completed {percent_done:.2f}% of the work")
if index >= total_sections:
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [70]:
# Display the questions frame.
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,1.1,question,,Lead_SA,What is the significance of identifying a lead...
1,1.1,question,,Lead_SA,When is identifying a lead supervisory author...
2,1.1,question,,Lead_SA,How does an organisation determine if its act...
3,1.1.1,question,,Lead_SA,What constitutes 'substantially affecting' ind...
4,1.1.1,question,,Lead_SA,How do we determine if data processing has a s...
5,1.1.1,question,,Lead_SA,What are the criteria for considering an effec...
6,1.1.1,question,,Lead_SA,What examples would Supervisory Authorities co...
7,1.2,question,,Lead_SA,What is a lead supervisory authority?
8,1.2,question,,Lead_SA,What responsibility does the lead supervisory ...
9,1.2,question,,Lead_SA,How is the lead supervisory authority identified?


In [65]:
# Persist both progress frames as "|"-separated UTF-8 CSV files, without the
# row index and with missing values written as empty fields.
summary_file = "../tmp/lead_sa_summary.csv"
question_file = "../tmp/lead_sa_question.csv"

for progress_frame, csv_path in ((df_summary, summary_file), (df_questions, question_file)):
    progress_frame.to_csv(csv_path, sep="|", encoding="utf-8", index=False, na_rep="")

In [67]:
import pandas as pd

# Read both progress files back with the same separator and encoding they
# were written with.
csv_read_options = {"sep": "|", "encoding": "utf-8"}

summary_file = "../tmp/lead_sa_summary.csv"
question_file = "../tmp/lead_sa_question.csv"

df_summary = pd.read_csv(summary_file, **csv_read_options)
df_questions = pd.read_csv(question_file, **csv_read_options)

In [68]:
# Each "text" cell holds several questions separated by "|"; give every
# question a row of its own. explode() produces exactly one row per piece,
# so nothing depends on stack() discarding the padding that
# split(expand=True) adds to shorter rows. Dropping "text" and re-adding it
# keeps it as the last column, as before.
question_lists = df_questions["text"].str.split("|")
df_questions = (
    df_questions.drop("text", axis=1)
    .assign(text=question_lists)
    .explode("text")
)
# Remove the empty pieces left by leading, trailing or doubled separators,
# then renumber the rows from 0.
df_questions = df_questions[df_questions["text"] != ""].reset_index(drop=True)


In [71]:
# Stack the summary rows on top of the question rows into a single index
# frame with a fresh 0..n-1 row index.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,Identifying a lead supervisory authority is re...,summary,,Lead_SA
1,1.1.1,'Cross-border processing of personal data':\n\...,summary,,Lead_SA
2,1.2,The lead supervisory authority is primarily re...,summary,,Lead_SA
3,1.3,"Main establishment refers to:\n\n- For you, if...",summary,,Lead_SA
4,2.1,To identify the lead supervisory authority:\n\...,summary,,Lead_SA
5,2.1.1,"To identify the lead supervisory authority, fi...",summary,,Lead_SA
6,2.1.2,When processing is handled by a group with hea...,summary,,Lead_SA
7,2.1.3,To identify the lead supervisory authority for...,summary,,Lead_SA
8,2.2,In situations where identifying the main estab...,summary,,Lead_SA
9,2.3,The main establishment of a processor will be ...,summary,,Lead_SA


In [72]:
from regulations_rag.embeddings import get_ada_embedding
# NOTE: `model` is rebound here; the summarising cell sets the same name to
# the chat model.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
# Embed `increment` rows at a time so progress is reported as the work goes.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # The last chunk may be shorter than `increment`; report the true count.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines


In [73]:
file = "../inputs/index/lead_sa.parquet"
# NOTE(review): the frame is written twice to the same path - first as a
# plain parquet file, then through save_parquet_data with `key`. Presumably
# the second call overwrites the first, making it redundant; confirm.
df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(df_index, file, key)


In [74]:
# Display the index frame.
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,Identifying a lead supervisory authority is re...,summary,"[-0.006422937847673893, 0.00971098430454731, -...",Lead_SA
1,1.1.1,'Cross-border processing of personal data':\n\...,summary,"[-0.01792445220053196, 0.00020537340606097132,...",Lead_SA
2,1.2,The lead supervisory authority is primarily re...,summary,"[-0.001868635998107493, 0.002838078886270523, ...",Lead_SA
3,1.3,"Main establishment refers to:\n\n- For you, if...",summary,"[0.0033447048626840115, 0.035395193845033646, ...",Lead_SA
4,2.1,To identify the lead supervisory authority:\n\...,summary,"[-0.00579441711306572, 0.016650624573230743, -...",Lead_SA
5,2.1.1,"To identify the lead supervisory authority, fi...",summary,"[0.014092209748923779, 0.028814133256673813, -...",Lead_SA
6,2.1.2,When processing is handled by a group with hea...,summary,"[0.011525212787091732, 0.020529285073280334, -...",Lead_SA
7,2.1.3,To identify the lead supervisory authority for...,summary,"[0.010616075247526169, -0.009036075323820114, ...",Lead_SA
8,2.2,In situations where identifying the main estab...,summary,"[0.005761819891631603, -0.002088783774524927, ...",Lead_SA
9,2.3,The main establishment of a processor will be ...,summary,"[0.0022017881274223328, 0.014683345332741737, ...",Lead_SA


In [76]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
lead_sa_index = load_parquet_data("../inputs/index/lead_sa.parquet", key)

# Print the text of every GDPR index entry whose section reference starts
# with "60". The text column is iterated directly instead of using
# `for index, row in ...iterrows()`, which would rebind the notebook-level
# `index` that the summarising cells use as their progress counter.
# na=False treats a missing section reference as "no match".
starts_with_60 = gdpr_index['section_reference'].str.startswith('60', na=False)
for entry_text in gdpr_index.loc[starts_with_60, 'text']:
    print(f"-- {entry_text}")


# for index, row in index_df[index_df["section_reference"] == '30'].iterrows():
#     print(f"* {row['text']}")

-- How do supervisory authorities collaborate on GDPR enforcement?
-- What role does the lead supervisory authority play in GDPR investigations?
-- How is consensus reached between supervisory authorities under GDPR?
-- What happens if there is disagreement among supervisory authorities on a draft decision?
-- What is the process for revising a draft decision when objections are raised?
-- How are decisions communicated to you and enforced across the European Union?
-- What options are available if urgent action is needed to protect an individual's interests?
-- How do supervisory authorities exchange information under GDPR?
-- The lead supervisory authority must cooperate and share all relevant information with other supervisory authorities to reach a consensus on matters pertaining to you. They may also request assistance or conduct joint operations for investigative or monitoring purposes. 
When drafting decisions involving you, the lead supervisory authority will communicate this t

In [89]:
# Write index_df to the Article 47 BCR index file using `key`.
# NOTE(review): index_df is only assigned in the cells further down, so this
# cell depends on those having been run first.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): article_47_index is not assigned anywhere above this cell in
# the notebook; it must already exist in the kernel for this to run.
index_df = article_47_index
#index_df = gdpr_index

# Replace one entry's text; the embedding model and dimensions are passed
# along to update_text_in_index.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# NOTE(review): `model` is set to "gpt-4o" in the summarising cell and to
# "text-embedding-3-large" in the embedding cell; which value is used here
# depends on which of those cells ran last.
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload so the module is re-executed in this kernel before add_to_index is
# imported from it.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Add one new question entry for section 3.5.4.1 to the Article 47 index.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# embedding_model / embedding_dimensions are the names assigned in the
# update_text_in_index cell.
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this text from article_47_index; the result is bound
# back to article_47_index (index_df is not updated by this cell).
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Rebind article_47_index to the current index_df, then show its rows for
# section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): this writes whatever index_df currently holds to the GDPR
# index file. The cells above bind index_df to article_47_index, so running
# the notebook in file order would put that frame into gdpr.parquet - confirm
# index_df is the GDPR index before running.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows of index_df whose "document" value is exactly "article_30_5".
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# os.getenv returns None when the variable is unset.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): `index` is also the integer progress counter in the
# summarising cells; binding the loaded frame to it here replaces that value.
index = load_parquet_data(path_to_file, key)

In [10]:
# Display the loaded Article 30(5) index frame.
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write `index` back to path_to_file using `key`; both names are assigned in
# the loading cell above.
save_parquet_data(index, path_to_file, key)