In [1]:
import logging
# Custom threshold halfway between logging.INFO (20) and logging.WARNING (30):
# plain INFO records fall below it, records logged at level 25 or higher do not.
log_level = logging.INFO + 5
logging.basicConfig(level=log_level)  # applies to the root logger


In [3]:
import re
import pandas as pd

import sys
# NOTE(review): machine-specific absolute path; gdpr_rag is only importable where the
# project is checked out at this exact location.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.article_47_bcr
# Reload after the import so edits made to the module on disk are picked up
# without restarting the kernel; the from-import below must come after the reload.
importlib.reload(gdpr_rag.documents.article_47_bcr)
from gdpr_rag.documents.article_47_bcr import Article_47_BCR

# NOTE(review): this path starts with "../../inputs" while a later cell reads the same
# file name from "../inputs" - confirm which one matches the notebook's working directory.
path_to_manual_as_csv_file = "../../inputs/documents/article_47_bcr.csv"
doc = Article_47_BCR(path_to_manual_as_csv_file)


In [5]:
# Manual correction: set the section_reference of the row labelled 154 to "Analysis".
# NOTE(review): the row label is hard-coded; verify it still addresses the intended row
# whenever the source CSV changes.
doc.document_as_df.at[154, "section_reference"] = "Analysis"

In [7]:
# Write the document frame back to the pipe-delimited CSV it was loaded from
# (no index column, missing values written as empty strings).
doc.document_as_df.to_csv(
    path_to_manual_as_csv_file,
    sep="|",
    encoding="utf-8",
    index=False,
    na_rep="",
)


In [121]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent

# Build the table of content from the pipe-delimited CSV on disk, validated with
# the document's own reference checker.
df = pd.read_csv("../inputs/documents/article_47_bcr.csv", encoding="utf-8", sep="|")
reference_checker = doc.reference_checker
toc = StandardTableOfContent(
    root_node_name="article_47_bcr",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [4]:
# Print the table-of-content tree so the parsed section hierarchy can be checked by eye.
toc.print_tree()

article_47_bcr []
|-- 1 [INTRODUCTION ]
|   |-- .1 []
|   |-- .2 []
|   |-- .3 []
|   |-- .4 []
|   |-- .5 []
|   |-- .6 []
|   |-- .7 []
|   |-- .8 []
|   |-- .9 []
|   |-- .10 []
|   |-- .11 []
|   |-- .12 []
|   |-- .13 []
|   |-- .14 []
|   |-- .15 []
|   +-- .16 []
|-- 2 [APPLICATION FORM  ]
|-- Application []
|   |-- . Part 1 [ APPLICANT INFORMATION]
|   |   |-- .1 [STRUCTURE AND CONTACT DETAILS OF THE GROUP OF UNDERTAKINGS OR GROUP OF ENTERPRISES ENGAGED IN A JOINT ECONOMIC ACTIVITY (THE GROUP)  ]
|   |   |-- .2 [SHORT DESCRIPTION OF PROCESSING AND DATA FLOWS [^11]  
[^11] See Article 47(2)(a) and (b) GDPR.]
|   |   |-- .3 [DETERMINATION OF THE LEAD SUPERVISORY AUTHORITY ('BCR LEAD') [^12]  
[^12] See Part 1, WP 263.]
|   |   +-- .4 [ACKNOWLEDGEMENT  ]
|   |-- . Part 2 [ BACKGROUND PAPER  ]
|   |   |-- .5 [BINDING NATURE OF THE BCR-C  ]
|   |   +-- .6 [EFFECTIVENESS  ]
|   |-- . Annex 1 [ COPY OF THE  BCR-C  ]
|   +-- . Annex 2 [ COPY OF THE FILLED-OUT TABLE "ELEMENTS AND PRINCI

In [122]:
# NOTE(review): machine-specific absolute path, appended again here (it is also appended
# in the document-loading cell).
sys.path.append('E:/Code/chat/gdpr')

import gdpr_rag.split_tree
# Reload before the from-import so edits to split_tree on disk take effect in this kernel.
importlib.reload(gdpr_rag.split_tree)
from gdpr_rag.split_tree import split_tree

# Split the document starting from the root of the table of content, passing a limit of
# 1600 tokens. The result is used below as a frame with a 'section_reference' column.
split_df = split_tree(node = toc.root, regulation_reader = doc, table_of_content = toc, token_limit = 1600)

In [123]:
# Discard every chunk whose section reference mentions "Application"
# (rows with a missing reference are kept), then renumber the rows from 0.
is_application = split_df['section_reference'].str.contains("Application", na=False)
indices = split_df.loc[is_application].index.to_list()
split_df = split_df.drop(index=indices).reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,1.1,1 INTRODUCTION \n\n1. The GDPR expressly provi...,65
1,1.2,"1 INTRODUCTION \n\n2. On 6 February 2018, the ...",109
2,1.3,"1 INTRODUCTION \n\n3. On 11 April 2018, the Ar...",98
3,1.4,1 INTRODUCTION \n\n4. These recommendations ar...,195
4,1.5,1 INTRODUCTION \n\n5. BCR-C are suitable for f...,494
5,1.6,1 INTRODUCTION \n\n6. EU data protection legis...,49
6,1.7,1 INTRODUCTION \n\n7. Pursuant to Article 46(2...,110
7,1.8,1 INTRODUCTION \n\n8. BCR are subject to appro...,123
8,1.9,1 INTRODUCTION \n\n9. The draft approval decis...,100
9,1.10,"1 INTRODUCTION \n\n10. However, the approval d...",276


In [26]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key handed to load_parquet_data / save_parquet_data in later cells.
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [24]:
from openai import OpenAI
# The API key is read from the environment, never hard-coded in the notebook.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): machine-specific absolute path; required for the `src` package import below.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload before the from-import so edits to the module on disk are picked up.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [137]:
def _load_or_init_progress(csv_path, source):
    """Load a work-in-progress file, or start a fresh one from ``split_df``.

    Parameters
    ----------
    csv_path : str
        Pipe-delimited CSV holding the partially completed work.
    source : str
        Value written to the "source" column of a freshly created frame
        ("summary" or "question").

    Returns
    -------
    (frame, first_empty)
        ``frame`` is the loaded or newly created DataFrame. ``first_empty`` is the
        smallest index label whose "text" is missing, or 0 when no row has
        missing text.
    """
    if os.path.exists(csv_path):
        frame = pd.read_csv(csv_path, sep="|", encoding="utf-8")
    else:
        frame = pd.DataFrame([], columns=["section_reference", "text", "source", "embedding", "document"])
        # One row per chunk; "text" is deliberately left empty so it marks outstanding work.
        frame["section_reference"] = split_df["section_reference"]
        frame["source"] = source
        frame["document"] = "article_47_bcr"

    first_empty = frame[frame["text"].isna()].index.min()
    if pd.isna(first_empty):
        # NOTE(review): 0 is also used when every row already has text, so
        # "nothing left to do" and "start from the top" are indistinguishable here.
        first_empty = 0
    return frame, first_empty


summary_file = "../tmp/article_47_bcr_summary.csv"
df_summary, index = _load_or_init_progress(summary_file, "summary")

question_file = "../tmp/article_47_bcr_question.csv"
df_questions, first_empty_index = _load_or_init_progress(question_file, "question")

# Both files are filled in lock-step, so they must resume at the same row.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 2


In [234]:
# Chat model used for summarising.
# Previously tried: "gpt-3.5-turbo", "gpt-4", "gpt-4-0125-preview".
model = "gpt-4-turbo-2024-04-09"

reference = df_summary.iloc[index]["section_reference"]
reg_text = doc.get_text(reference)

# Show the source text between two marker lines for manual review.
print("##############", reg_text, "##############", sep="\n")

model_summary, model_questions = get_summary_and_questions_for(
    openai_client=openai_client,
    text=reg_text,
    model=model,
)

# Emit ready-to-paste assignment statements; the reviewed text is pasted into the next cell by hand.
print(f'index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
print(f'df_questions.loc[index, "text"] = "{model_questions}"')

##############
1. Points that need to be addressed in a binding corporate resolution for controllers (BCR-C) are:
3.1.1: Duty to respect the BCR-C (Article 47(1)(a) and (2)(c) GDPR [^15]).
3.1.2: Explanation of how the BCR-C are internally[^16] made binding on the BCR members, and on their employees (Article 47(1)(a) and (2)(c) GDPR).
3.1.3.1: Creation of third-party beneficiary rights that are enforceable by data subjects (Article 47(1)(b), (2)(c) and (e) GDPR).
3.1.3.2: Right to judicial remedies, redress and compensation for data subjects (Article 47(2)(e) and Articles 77 to 82 GDPR).
3.1.4: One or more BCR member(s) in the EEA with delegated data protection responsibility accept liability for paying compensation to data subjects and remedying breaches of the BCR-C (hereinafter "Liable BCR Member(s)") (Article 47(2)(f) GDPR).
3.1.5: The Liable BCR member(s) has sufficient assets (Article 70(1)(i) GDPR).
3.1.6: The burden of proof lies with the Liable BCR member(s) (Article 47(2)(f) 

In [235]:

# Store the summary and question text for the row at the current `index`.
# These two statements have the shape of the paste-ready lines printed by the summarising cell.
df_summary.loc[index, "text"] = "You must respect the binding corporate rules for controllers (BCR-C) and ensure they are binding on all BCR members and their employees. These rules must provide explicit rights for individuals to enforce certain protections and to seek judicial remedies, including redress and compensation for breaches. At least one BCR member within the EEA is responsible for accepting liability and paying compensation arising from breaches. This member must have sufficient assets to cover these liabilities, and the burden of proof is on them in the event of a dispute.\n\nIndividuals must have easy access to the BCR-C. The scope of these rules, both material and geographical, and the list of BCR members should be clearly defined. You must implement a training programme on data protection, establish a compliant handling process, and conduct regular audits as outlined in the BCR-C. It’s also important to have a network of data protection officers or suitable staff to ensure ongoing compliance.\n\nCooperation with competent supervisory authorities is mandatory. The BCR-C should outline adherence to data protection principles such as lawfulness of processing, maintaining data security, and personal data breach response. Restrictions on onward data transfers and measures on handling government access requests to data must be specified. Clear guidelines on the rights of individuals consistent with Articles 12-22 of the GDPR should be provided. \n\nYour accountability measures, including local laws and practices affecting BCR-C compliance, need to be detailed. Finally, processes for termination, addressing non-compliance, and updating the BCR-C must be explicitly laid out, along with a comprehensive list of related definitions."

df_questions.loc[index, "text"] = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"



# Move the resume pointer to the next row and report how far along we are.
index += 1
progress_pct = index / len(df_summary) * 100
print(f"You have completed {progress_pct:.2f}% of the work")
if index >= len(df_summary):
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [218]:
# Display the summary work frame for inspection.
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,,article_47_bcr
1,1.4,You must create a standard form for applying f...,summary,,article_47_bcr
2,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,,article_47_bcr
3,1.6,You must comply with EU data protection legisl...,summary,,article_47_bcr
4,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,,article_47_bcr
5,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,,article_47_bcr
6,1.9,If you adopt Binding Corporate Rules (BCRs) fo...,summary,,article_47_bcr
7,1.10,You must ensure that each data transfer compli...,summary,,article_47_bcr
8,1.11,You can use Binding Corporate Rules (BCRs) to ...,summary,,article_47_bcr
9,1.12,"Once approved, Binding Corporate Rules (BCRs) ...",summary,,article_47_bcr


In [236]:
# Persist both work-in-progress frames to their pipe-delimited CSV files
# (no index column, missing values written as empty strings).
summary_file = "../tmp/article_47_bcr_summary.csv"
question_file = "../tmp/article_47_bcr_question.csv"

for frame, target in ((df_summary, summary_file), (df_questions, question_file)):
    frame.to_csv(target, sep="|", encoding="utf-8", index=False, na_rep="")

In [178]:
# -----------------------------------------------------------------------
# DESTRUCTIVE CELL - only run this on purpose.
# Removes the row at the current `index` from BOTH work frames (a section
# that will not be used) and renumbers the remaining rows from 0.
# -----------------------------------------------------------------------
df_summary = df_summary.drop(index=[index]).reset_index(drop=True)
df_questions = df_questions.drop(index=[index]).reset_index(drop=True)

progress_pct = index / len(df_summary) * 100
print(f"You have completed {progress_pct:.2f}% of the work")
if index >= len(df_summary):
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")


You have completed 30.30% of the work
The next section is:
3 ELEMENTS AND PRINCIPLES TO BE FOUND IN BCR-C
1. BINDING NATURE

1.1. The BCR-C must be legally binding and should contain a clear duty for each BCR member, including their employees, to respect the BCR-C.


In [3]:
import pandas as pd

# Read both work files back from disk; they share the same pipe-delimited UTF-8 format.
csv_options = {"sep": "|", "encoding": "utf-8"}

summary_file = "../tmp/article_47_bcr_summary.csv"
df_summary = pd.read_csv(summary_file, **csv_options)

question_file = "../tmp/article_47_bcr_question.csv"
df_questions = pd.read_csv(question_file, **csv_options)

In [5]:
# A single "text" cell can hold several questions separated by "|".
# Split them, stack the pieces into one long series keyed by the original row label,
# and join that back so every question gets its own row with the row's other columns.
question_parts = df_questions["text"].str.split("|", expand=True).stack()
question_parts = question_parts.reset_index(level=1, drop=True).rename("text")
df_questions = df_questions.drop(columns="text").join(question_parts)
df_questions = df_questions.reset_index(drop=True)


In [13]:
# Check the Python type of the first stored "embedding" value
# (the recorded output was numpy.float64, i.e. not a list of numbers).
type(df_questions.iloc[0]["embedding"])

numpy.float64

In [21]:
# Stack summaries and questions into a single frame with a fresh 0..n-1 index.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",article_47_bcr
1,1.4,You must create a standard form for applying f...,summary,"[-0.013666943646967411, -0.09392475336790085, ...",article_47_bcr
2,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",article_47_bcr
3,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",article_47_bcr
4,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",article_47_bcr
...,...,...,...,...,...
87,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",article_47_bcr
88,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",article_47_bcr
89,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",article_47_bcr
90,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",article_47_bcr


In [19]:
from regulations_rag.embeddings import get_ada_embedding
# Embedding settings for the index.
# NOTE(review): `model` is also the name used for the chat model in the summarising cell;
# whichever cell ran last wins.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
# Reset the column to an object-typed, all-missing state so each cell can hold a vector.
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Embed in chunks of `increment` rows so progress is visible while the API calls run.
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # The final chunk may be shorter than `increment`; report the true number of rows done.
    print(f"Completed {min(i + increment, len(df_index))} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines


In [37]:
# Save the combined summary/question index via save_parquet_data, passing the key loaded earlier.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(df_index, file, key)


In [36]:
# Display the combined index.
# NOTE(review): the recorded output shows document == "Article_47_BCR", but the only
# statement that would set it is commented out below, and the work files are created
# with "article_47_bcr" - confirm which spelling the saved index should carry.
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.4,You must create a standard form for applying f...,summary,"[-0.013666943646967411, -0.09392475336790085, ...",Article_47_BCR
2,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
3,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
4,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
...,...,...,...,...,...
87,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
88,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
89,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
90,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [90]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
article_47_index = load_parquet_data("../inputs/index/article_47_bcr.parquet", key)

# List every entry of the GDPR index whose section reference starts with "47".
# This reads the frame loaded just above rather than `index_df`, which is not assigned
# anywhere before this cell. The loop variable is not called `index`, because `index`
# is the resume pointer used by the summarising cells.
is_article_47 = gdpr_index['section_reference'].str.startswith('47', na=False)
for text in gdpr_index.loc[is_article_47, 'text']:
    print(f"-- {text}")

In [92]:


# All GDPR index entries filed under section reference "47" exactly.
section = gdpr_index[gdpr_index["section_reference"] == "47"]

# Iterate the text column directly: no loop variable named `index`, which would
# overwrite the resume pointer used by the summarising cells.
for text in section["text"]:
    print(f"-- {text}")

-- What are binding corporate rules?
-- How do binding corporate rules become approved?
-- What requirements must binding corporate rules meet?
-- What details must be spelled out in binding corporate rules?
-- What rights do individuals have under binding corporate rules?
-- Who is liable if binding corporate rules are breached?
-- How are individuals informed about their rights under binding corporate rules?
-- What is the role of a data protection officer within a group adopting binding corporate rules?
-- What mechanisms ensure compliance with binding corporate rules?
-- How is the effectiveness of binding corporate rules verified?
-- What is the process for reporting changes in binding corporate rules?
-- How do groups cooperate with supervisory authorities under binding corporate rules?
-- What happens if local laws conflict with the standards of binding corporate rules?
-- What kind of training is required for employees with access to personal data under binding corporate rules?

In [89]:
# Save `index_df` as the Article 47 index file.
# NOTE(review): `index_df` is not assigned anywhere above this cell in file order (it is
# first bound in a later cell) - confirm which frame is meant before running.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

index_df = article_47_index
#index_df = gdpr_index

# Replace one stored question with a reworded version.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"

# Pin the embedding settings explicitly instead of reading the shared `model` / `dimensions`
# names: `model` is also assigned a chat model in the summarising cell, so its value
# depends on which cell ran last. These match the settings used in the embedding cell.
embedding_model = "text-embedding-3-large"
embedding_dimensions = 1024

index_df = update_text_in_index(
    openai_client = openai_client,
    index_df = index_df,
    text_to_change = text_to_change,
    changed_text = changed_text,
    embedding_model = embedding_model,
    embedding_dimensions = embedding_dimensions,
)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload before the from-import so edits to index_tools on disk are picked up.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Add one extra question for section 3.5.4.1 to the Article 47 index.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# embedding_model / embedding_dimensions are not set in this cell; they must already
# exist in the kernel (they are assigned in the text-update cell).
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this exact text from the Article 47 index.
# NOTE(review): the execution counts show this cell ran before the add_to_index cell
# although it appears after it in the file; running top to bottom would remove the
# question that was just added - confirm the intended order.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Rebind article_47_index to the working frame, then show every entry for section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# WARNING: overwrites the GDPR index file with whatever `index_df` currently holds.
# NOTE(review): in top-to-bottom order `index_df` was last assigned from the Article 47
# index (add_to_index cell), so running this as-is would replace gdpr.parquet with
# Article 47 data - confirm `index_df` holds the GDPR index before running.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows of the working index whose document is "article_30_5".
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): this rebinds `index`, which earlier cells use as the integer resume
# position into df_summary / df_questions; those cells cannot be re-run after this one
# without re-running the resume cell first.
index = load_parquet_data(path_to_file, key)

In [10]:
# Display the Article 30(5) index. The commented-out line is a one-off rename of the
# document label that is not executed.
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the Article 30(5) index back to the file it was loaded from.
save_parquet_data(index, path_to_file, key)