In [9]:
import logging
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [1]:
import re
import pandas as pd

import sys
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.article_49_intl_transfer
importlib.reload(gdpr_rag.documents.article_49_intl_transfer)
from gdpr_rag.documents.article_49_intl_transfer import Article_49_Intl_Transfer

path_to_manual_as_csv_file = "../inputs/documents/article_49_intl_transfer.parquet"

doc = Article_49_Intl_Transfer(path_to_manual_as_csv_file)


In [5]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
reference_checker = doc.reference_checker
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
toc = StandardTableOfContent(root_node_name = "intl_transfer", reference_checker = reference_checker, regulation_df = df)

In [6]:
# Remove the annexes from the tree for indexing

toc.print_tree()

decisions []
|-- 1 [GENERAL   ]
+-- 2 [SPECIFIC INTERPRETATION OF THE PROVISIONS OF ARTICLE 49  ]
    |-- .1 [The data subject has explicitly consented to the proposed transfer, after having been informed of the possible risks of such transfers for the data subject due to the absence of an adequacy decision and appropriate safeguards - Article (49 (1) (a))  ]
    |   |-- .1 [Consent must be explicit ]
    |   |-- .2 [Consent must be specific for the particular data transfer/set of transfers   ]
    |   +-- .3 [Consent must be informed[^15]  particularly as to the possible risks of the transfer 
[^15] The general transparency requirements of Articles 13 and 14 of the GDPR should also be complied with. For more information see Guidelines on transparency under Regulation 2016/679 (WP 260)]
    |-- .2 [Transfer necessary for the performance of a contract between the data subject and the controller or for the implementation of precontractual measures taken at the data subject's request - (4

In [7]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
for index, row in split_df.iterrows():
    text = doc.get_text(row['section_reference'])
    split_df.at[index, 'text'] = text
    split_df.at[index, 'token_count'] = num_tokens_from_string(text)
split_df["token_count"] = split_df["token_count"].astype(int)
# from regulations_rag.regulation_table_of_content import split_tree

# split_df = split_tree(node = toc.root, document = doc, table_of_content = toc, token_limit = 1300)

In [11]:
split_df = split_df.drop([0, 1]) # Intro and heading
split_df.reset_index(drop=True, inplace=True)
split_df



Unnamed: 0,section_reference,text,token_count
0,2.1.1,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,297
1,2.1.2,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,543
2,2.1.3,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,764
3,2.2,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,954
4,2.3,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,528
5,2.4,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,907
6,2.5,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,865
7,2.6,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,757
8,2.7,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,571
9,2.8,2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF...,2167


In [11]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [12]:
import pandas as pd
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [15]:
summary_file = "../tmp/intl_transfer_summary.csv"
if os.path.exists(summary_file):
    df_summary = pd.read_csv(summary_file, sep="|", encoding = "utf-8")
else:
    df_summary = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_summary["section_reference"] = split_df["section_reference"]
    # df_summary["text"] = split_df["text"]
    df_summary["source"] = "summary"
    df_summary["document"] = "Article_49_Intl_Transfer"

first_empty_index = df_summary[df_summary['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0
index = first_empty_index

question_file = "../tmp/47_intl_transfer_question.csv"
if os.path.exists(question_file):
    df_questions = pd.read_csv(question_file, sep="|", encoding = "utf-8")
else:
    df_questions = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_questions["section_reference"] = split_df["section_reference"]
    # df_questions["text"] = split_df["text"]
    df_questions["source"] = "question"
    df_questions["document"] = "Article_49_Intl_Transfer"

first_empty_index = df_questions[df_questions['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    [print(f"Picking up at index {index}")]


Picking up at index 0


In [43]:
model = "gpt-4o"

reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

#format output
print(f'# index: {index}, section_reference: {df_summary.iloc[index]["section_reference"]}')
print()
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
print(f'df_questions.loc[index, "text"] = "{model_questions}"')

##############
2 SPECIFIC INTERPRETATION OF THE PROVISIONS OF ARTICLE 49  

2.8 Compelling legitimate interests - (49 (1) § 2) 
Article 49 (1) § 2 introduces a new derogation which was not previously included in the Directive. Under a number of specific, expressly enumerated conditions, personal data can be transferred if it is necessary for the purposes of compelling legitimate interests pursued by the data exporter.  
This derogation is envisaged by the law as a last resort, as it will only apply where "a transfer could not be based on a provision in Article 45 or 46, including the provisions on binding corporate rules, and none of the derogations for a specific situation is applicable".[^38]
This layered approach to considering the use of derogations as a basis for transfers requires consideration of whether it is possible to use a transfer tool provided in Article 45 or 46 or one of the specific derogations set out in Article 49 (1) § 1, before resorting to the derogation of Articl

In [44]:
model_summary

'Personal data can be transferred if it is necessary for compelling legitimate interests pursued by the data exporter, provided certain specific conditions are met. This is a last-resort option when no other provisions or derogations apply.\n\n1. **Conditions for Use**:\n   - Cannot rely on appropriate safeguards in Articles 45 or 46.\n   - Cannot apply any specific derogations in Article 49(1) § 1.\n   - Must demonstrate serious attempts to use other means.\n\n2. **Compelling Legitimate Interests**:\n   - Must be essential for the data exporter and not overridden by the interests or rights and freedoms of the individual.\n   - Interests of data processors or importers are not relevant.\n   - Must have a higher threshold than general legitimate interests.\n   - Example: Protecting the organisation from immediate harm.\n\n3. **Non-repetitive and Limited Scope**:\n   - Transfers must not be repetitive.\n   - Should concern a limited number of individuals, context-dependent.\n   - Example

In [45]:
df_summary.loc[index, "text"] = 'Personal data can be transferred if it is necessary for compelling legitimate interests pursued by the data exporter, provided certain specific conditions are met. This is a last-resort option when no other provisions or derogations apply.\n\n1. Conditions for Use:\n   - Cannot rely on appropriate safeguards in Articles 45 or 46.\n   - Cannot apply any specific derogations in Article 49(1) § 1.\n   - Must demonstrate serious attempts to use other means.\n\n2. Compelling Legitimate Interests:\n   - Must be essential for the data exporter and not overridden by the interests or rights and freedoms of the individual.\n   - Interests of data processors or importers are not relevant.\n   - Must have a higher threshold than general legitimate interests.\n   - Example: Protecting the organisation from immediate harm.\n\n3. Non-repetitive and Limited Scope:\n   - Transfers must not be repetitive.\n   - Should concern a limited number of individuals, context-dependent.\n   - Example: Data transfer for detecting a specific security incident impacting a few employees.\n\n4. Balancing Test:\n   - Assess all circumstances of the transfer.\n   - Provide suitable safeguards reducing the impact on individuals.\n   - Consider possible negative effects on individuals.\n   - Factors include nature of data, purpose and duration of processing, and situation in relevant countries.\n   - Apply additional safeguards to minimise risks, such as data deletion post-transfer or purpose limitation.\n\n5. Inform Supervisory Authority:\n   - No need for authorisation, but must inform the authority as part of accountability.\n   - Record all relevant aspects of the transfer.\n\n6. Inform Individuals:\n   - Notify of the transfer and the compelling legitimate interests pursued.\n   - Provide this information in addition to requirements under Articles 13 and 14.'

df_questions.loc[index, "text"] = "What is a compelling legitimate interest?|When can personal data be transferred based on a compelling legitimate interest?|What is the process to determine if a data transfer is necessary for compelling legitimate interests?|What conditions need to be met to transfer data under the compelling legitimate interest derogation?|What information must be provided to individuals about a data transfer based on compelling legitimate interests?"




index = index + 1
print(f"You have completed {(index / len(df_summary) * 100):.2f}% of the work")
if index < len(df_summary):
    print(f"The next section is:\n{doc.get_text(df_summary.iloc[index]['section_reference'])}")
else:
    print("All done")

You have completed 100.00% of the work
All done


In [27]:
#df_questions.at[9, "section_reference"] ='III.D.d'
df_questions

Unnamed: 0,section_reference,source,embedding,document,text
0,2.1.1,question,,Article_49_Intl_Transfer,When is explicit consent necessary?
1,2.1.1,question,,Article_49_Intl_Transfer,Why is explicit consent required for internati...
2,2.1.1,question,,Article_49_Intl_Transfer,Where can I find more guidance on the requirem...
3,2.1.2,question,,Article_49_Intl_Transfer,Why is specific consent important for data tra...
4,2.1.2,question,,Article_49_Intl_Transfer,Is it possible to obtain consent for future d...
5,2.1.2,question,,Article_49_Intl_Transfer,What happens if the data transfer circumstanc...
6,2.1.2,question,,Article_49_Intl_Transfer,Can general consent given during data collect...
7,2.1.3,question,,Article_49_Intl_Transfer,What does it mean to give informed consent for...
8,2.1.3,question,,Article_49_Intl_Transfer,What information must individuals be given bef...
9,2.1.3,question,,Article_49_Intl_Transfer,Why is it essential to inform individuals abou...


In [22]:
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1.1,Individuals must explicitly consent to propose...,summary,,Article_49_Intl_Transfer
1,2.1.2,Individuals must explicitly consent to propose...,summary,,Article_49_Intl_Transfer
2,2.1.3,Individuals must explicitly consent to the pro...,summary,,Article_49_Intl_Transfer
3,2.2,Transfers based on Article 49(1)(b) can occur ...,summary,,Article_49_Intl_Transfer
4,2.3,Transfers of personal data to a third country ...,summary,,Article_49_Intl_Transfer
5,2.4,Transfers based on important public interest m...,summary,,Article_49_Intl_Transfer
6,2.5,Transfers can occur if they are necessary for ...,summary,,Article_49_Intl_Transfer
7,2.6,Article 49(1)(f) permits data transfers outsid...,summary,,Article_49_Intl_Transfer
8,2.7,Transfer of personal data from registers is al...,summary,,Article_49_Intl_Transfer
9,2.8,Personal data can be transferred if it is nece...,summary,,Article_49_Intl_Transfer


In [24]:
summary_file = "../tmp/intl_transfer_summary.csv"
df_summary.to_csv(summary_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

question_file = "../tmp/47_intl_transfer_question.csv"
df_questions.to_csv(question_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [25]:
import pandas as pd
summary_file = "../tmp/intl_transfer_summary.csv"
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8")

question_file = "../tmp/47_intl_transfer_question.csv"
df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8")

In [26]:
df_questions = df_questions.drop("text", axis=1).join(df_questions["text"].str.split("|", expand=True).stack().reset_index(level=1, drop=True).rename("text"))
df_questions = df_questions[df_questions["text"] != ""]

df_questions.reset_index(drop=True, inplace=True)


In [28]:
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
#df_index["document"] = 'Article_49_Intl_Transfer'
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1.1,Individuals must explicitly consent to propose...,summary,,Article_49_Intl_Transfer
1,2.1.2,Individuals must explicitly consent to propose...,summary,,Article_49_Intl_Transfer
2,2.1.3,Individuals must explicitly consent to the pro...,summary,,Article_49_Intl_Transfer
3,2.2,Transfers based on Article 49(1)(b) can occur ...,summary,,Article_49_Intl_Transfer
4,2.3,Transfers of personal data to a third country ...,summary,,Article_49_Intl_Transfer
5,2.4,Transfers based on important public interest m...,summary,,Article_49_Intl_Transfer
6,2.5,Transfers can occur if they are necessary for ...,summary,,Article_49_Intl_Transfer
7,2.6,Article 49(1)(f) permits data transfers outsid...,summary,,Article_49_Intl_Transfer
8,2.7,Transfer of personal data from registers is al...,summary,,Article_49_Intl_Transfer
9,2.8,Personal data can be transferred if it is nece...,summary,,Article_49_Intl_Transfer


In [29]:
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

from regulations_rag.embeddings import get_ada_embedding
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    print(f"Completed {i+increment} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines


In [30]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

file = "../inputs/index/article_49_intl_transfer.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(df_index, file, key)


In [7]:
#df_index["document"] = "Article_49_Intl_Transfer"
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1.1,Individuals must explicitly consent to propose...,summary,"[-0.04320913925766945, -0.05655830353498459, -...",Article_49_Intl_Transfer
1,2.1.2,Individuals must explicitly consent to propose...,summary,"[-0.025641605257987976, -0.025930145755410194,...",Article_49_Intl_Transfer
2,2.1.3,Individuals must explicitly consent to the pro...,summary,"[-0.03767729178071022, -0.0662284716963768, -0...",Article_49_Intl_Transfer
3,2.2,Transfers based on Article 49(1)(b) can occur ...,summary,"[-0.028453944250941277, 0.0008164968458004296,...",Article_49_Intl_Transfer
4,2.3,Transfers of personal data to a third country ...,summary,"[-0.02664925530552864, -0.024371206760406494, ...",Article_49_Intl_Transfer
5,2.4,Transfers based on important public interest m...,summary,"[-0.015191680751740932, -0.013405660167336464,...",Article_49_Intl_Transfer
6,2.5,Transfers can occur if they are necessary for ...,summary,"[0.006158918142318726, -0.005650562699884176, ...",Article_49_Intl_Transfer
7,2.6,Article 49(1)(f) permits data transfers outsid...,summary,"[-0.005364328622817993, 0.019495228305459023, ...",Article_49_Intl_Transfer
8,2.7,Transfer of personal data from registers is al...,summary,"[-0.043016571551561356, 0.03480661287903786, -...",Article_49_Intl_Transfer
9,2.8,Personal data can be transferred if it is nece...,summary,"[-0.020666183903813362, -0.04024985060095787, ...",Article_49_Intl_Transfer


In [31]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
intl_index = load_parquet_data("../inputs/index/article_49_intl_transfer.parquet", key)



In [32]:
section = gdpr_index[gdpr_index["section_reference"] == "49"]

#section
for index, row in section.iterrows():
    print(f"-- {row['text']}")

-- What are the conditions for transferring personal data to another country or international organisation?
-- When is an individual's consent required for data transfer?
-- How does a contract affect the transfer of personal data?
-- What constitutes an important reason of public interest for transferring personal data?
-- When can data be transferred for the establishment, exercise, or defence of legal claims?
-- Under what circumstances can personal data be transferred to protect someone's vital interests?
-- What are the rules for transferring data from public registers?
-- What are the requirements for a non-repetitive transfer based on compelling legitimate interests?
-- When are public authorities exempt from certain transfer conditions?
-- How does the law recognise public interest for data transfers?
-- What documentation is required for assessing data transfers?
-- In the absence of specific international data transfer agreements or safeguards:
- You may transfer personal dat

In [50]:
tmp = decision_index[(decision_index["section_reference"]== "I")]
for index, row in tmp.iterrows():
    print(f"-- {row['text']}")

-- Profiling and automated decision-making are increasingly utilised across various sectors such as banking, healthcare, and marketing, driven by technological advancements in big data, AI, and machine learning. These techniques can greatly benefit efficiency and personalisation but also pose risks to individual rights and freedoms due to their potential invasiveness and the issues around transparency and discrimination they can create.

The GDPR seeks to mitigate these risks by implementing specific provisions to protect privacy and other related rights. You must ensure appropriate safeguards when employing profiling and automated decision-making technologies, keeping in mind their potential to impact individuals based on inaccurate data, perpetuate stereotypes, or restrict individual choices.

Particularly under Article 22, you have obligations regarding solely automated decisions that have legal or similarly significant effects on individuals, including the requirement to provide me

In [8]:
file = "../inputs/index/article_49_intl_transfer.parquet"
#file = "../inputs/index/gdpr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(gdpr_index, file, key)

In [41]:
from src.index_tools import update_text_in_index, add_to_index

index_df = decision_index
#index_df = gdpr_index

text_to_change = "How do automated decision-making and profiling differ?"
changed_text = "What is automated decision-making?"
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

index_df = article_49_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_49_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [51]:
from src.index_tools import remove_from_index

text_to_delete = "What safeguards does the General Data Protection Regulation (GDPR) introduce for profiling and automated decision-making?"
decision_index = remove_from_index(decision_index, text_to_delete)

In [88]:
article_49_index = index_df
article_47_index[article_49_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
save_parquet_data(index, path_to_file, key)