In [4]:
import logging
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [5]:
class_name = "Concent"
prefix = "consent"
summary_file = f"../tmp/{prefix}_summary.csv"
question_file = f"../tmp/{prefix}_services_question.csv"

path_to_manual_as_csv_file = f"../inputs/documents/{prefix}.parquet"
index_file = f"../inputs/index/{prefix}.parquet"



In [6]:
import re
import pandas as pd

import sys
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.consent
importlib.reload(gdpr_rag.documents.consent)

from gdpr_rag.documents.consent import Consent
path_to_manual_as_csv_file = "../inputs/documents/consent.parquet"

doc = Consent(path_to_manual_as_csv_file)



In [7]:
# from regulations_rag.regulation_table_of_content import StandardTableOfContent
# reference_checker = doc.reference_checker
# df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
# toc = StandardTableOfContent(root_node_name = "transparency", reference_checker = reference_checker, regulation_df = df)
toc = doc.get_toc()

In [8]:
toc.print_tree()

Guidelines 05/2020 on consent under Regulation 2016/679 []
|-- 0 [PREFACE]
|-- 1 [INTRODUCTION]
|-- 2 [CONSENT IN ARTICLE 4(11) OF THE GDPR]
|-- 3 [ELEMENTS OF VALID CONSENT]
|   |-- .1 [Free / freely given]
|   |   |-- .1 [Imbalance of power]
|   |   |-- .2 [Conditionality]
|   |   |-- .3 [Granularity]
|   |   +-- .4 [Detriment]
|   |-- .2 [Specific]
|   |-- .3 [Informed]
|   |   |-- .1 [Minimum content requirements for consent to be 'informed']
|   |   +-- .2 [How to provide information]
|   +-- .4 [Unambiguous indication of wishes]
|-- 4 [OBTAINING EXPLICIT CONSENT]
|-- 5 [ADDITIONAL CONDITIONS FOR OBTAINING VALID CONSENT]
|   |-- .1 [Demonstrate consent]
|   +-- .2 [Withdrawal of consent]
|-- 6 [INTERACTION BETWEEN CONSENT AND OTHER LAWFUL GROUNDS IN ARTICLE 6 GDPR]
|-- 7 [SPECIFIC AREAS OF CONCERN IN THE GDPR]
|   |-- .1 [Children (Article 8)]
|   |   |-- .1 [Information society service]
|   |   |-- .2 [Offered directly to a child]
|   |   |-- .3 [Age]
|   |   +-- .4 [Children's c

In [14]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
for index, row in split_df.iterrows():
    text = doc.get_text(row['section_reference'])
    split_df.at[index, 'text'] = text
    split_df.at[index, 'token_count'] = num_tokens_from_string(text)
split_df["token_count"] = split_df["token_count"].astype(int)

split_df = split_df.drop([0, 1]) # preface and Introduction
split_df = split_df[split_df["token_count"] > 115] 
split_df.reset_index(drop=True, inplace=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,2,2 CONSENT IN ARTICLE 4(11) OF THE GDPR \n8. Ar...,432
1,3,3 ELEMENTS OF VALID CONSENT \n11. Article 4(11...,166
2,3.1,3 ELEMENTS OF VALID CONSENT \n\n3.1 Free / fre...,658
3,3.1.1,3 ELEMENTS OF VALID CONSENT \n3.1 Free / freel...,1496
4,3.1.2,3 ELEMENTS OF VALID CONSENT \n3.1 Free / freel...,2094
5,3.1.3,3 ELEMENTS OF VALID CONSENT \n3.1 Free / freel...,609
6,3.1.4,3 ELEMENTS OF VALID CONSENT \n3.1 Free / freel...,784
7,3.2,3 ELEMENTS OF VALID CONSENT \n\n3.2 Specific \...,954
8,3.3,3 ELEMENTS OF VALID CONSENT \n\n3.3 Informed \...,158
9,3.3.1,3 ELEMENTS OF VALID CONSENT \n3.3 Informed \n\...,566


In [15]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [16]:
from openai import OpenAI
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [17]:
if os.path.exists(summary_file):
    df_summary = pd.read_csv(summary_file, sep="|", encoding = "utf-8")
else:
    df_summary = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_summary["section_reference"] = split_df["section_reference"]
    # df_summary["text"] = split_df["text"]
    df_summary["source"] = "summary"
    df_summary["document"] = class_name

first_empty_index = df_summary[df_summary['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0
index = first_empty_index

if os.path.exists(question_file):
    df_questions = pd.read_csv(question_file, sep="|", encoding = "utf-8")
else:
    df_questions = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    df_questions["section_reference"] = split_df["section_reference"]
    # df_questions["text"] = split_df["text"]
    df_questions["source"] = "question"
    df_questions["document"] = class_name

first_empty_index = df_questions[df_questions['text'].isna()].index.min()

if pd.isna(first_empty_index):
    #print("No empty embeddings found.")
    first_empty_index = 0

if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    [print(f"Picking up at index {index}")]


Picking up at index 0


In [95]:
model = "gpt-4o"

reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

#format output
print(f'# index: {index}, section_reference: {df_summary.iloc[index]["section_reference"]}')
print()
model_summary = model_summary.replace('"', '\\"')
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
model_questions = model_questions.replace('"', '\\"')
print(f'df_questions.loc[index, "text"] = "{model_questions}"')

##############
8 CONSENT OBTAINED UNDER DIRECTIVE 95/46/EC 
166. Controllers that currently process data on the basis of consent in compliance with national data protection law are not automatically required to completely refresh all existing consent relations with data subjects in preparation for the GDPR. Consent, which has been obtained, to date continues to be valid in so far as it is in line with the conditions laid down in the GDPR. 
167. It is important for controllers to review current work processes and records in detail, before 25 May 2018, to be sure existing consents meet the GDPR standard (see Recital 171 of the GDPR[^78]). In practice, the GDPR raises the bar with regard to implementing consent mechanisms and introduces several new requirements that require controllers to alter consent mechanisms, rather than rewriting privacy policies alone.[^79]
168. For example, as the GDPR requires that a controller must be able to demonstrate that valid consent was obtained, all pres

In [96]:
model_summary

"Current data processing based on previously obtained consent under national law remains valid if it aligns with GDPR conditions. Review consents before 25 May 2018 to ensure they meet GDPR standards, which include stricter requirements for consent mechanisms beyond merely updating privacy policies.\n\nDemonstrating valid consent is necessary; consents without records or those obtained through implied actions, like pre-ticked boxes, need renewing. Update operations and IT systems to facilitate granular indications of consent and easy withdrawal mechanisms. Provide information on how individuals can withdraw consent. If existing procedures don't comply, obtain new consent.\n\nSome requirements under Articles 13 and 14 of GDPR do not need to be present for informed consent, meaning the extended information obligations do not necessarily invalidate previously granted consent.\n\nIf previous consent does not meet GDPR standards, take action to comply, potentially by refreshing consent. GDP

In [97]:
# index: 23, section_reference: 8

df_summary.loc[index, "text"] = "Current data processing based on previously obtained consent under national law remains valid if it aligns with GDPR conditions. Review consents before 25 May 2018 to ensure they meet GDPR standards, which include stricter requirements for consent mechanisms beyond merely updating privacy policies.\n\nDemonstrating valid consent is necessary; consents without records or those obtained through implied actions, like pre-ticked boxes, need renewing. Update operations and IT systems to facilitate granular indications of consent and easy withdrawal mechanisms. Provide information on how individuals can withdraw consent. If existing procedures don't comply, obtain new consent.\n\nSome requirements under Articles 13 and 14 of GDPR do not need to be present for informed consent, meaning the extended information obligations do not necessarily invalidate previously granted consent.\n\nIf previous consent does not meet GDPR standards, take action to comply, potentially by refreshing consent. GDPR prohibits switching from one lawful basis to another. If compliant consent cannot be renewed, or another lawful basis cannot be justified, stop the processing activities. Ensure all processing follows principles of lawful, fair, and transparent processing."

df_questions.loc[index, "text"] = "How should existing consents be reviewed to ensure they meet the new standards?|What happens if the current consent mechanisms do not align with GDPR requirements?"


index = index + 1
print(f"You have completed {(index / len(df_summary) * 100):.2f}% of the work")
if index < len(df_summary):
    print(f"The next section is:\n{doc.get_text(df_summary.iloc[index]['section_reference'])}")
else:
    print("All done")

You have completed 100.00% of the work
All done


In [102]:

df_questions


Unnamed: 0,section_reference,source,embedding,document,text
0,2,question,,Concent,What is meant by 'consent'?
1,2,question,,Concent,What is required for consent to be valid?
2,2,question,,Concent,What provisions explain the requirements for g...
3,3,question,,Concent,What elements are necessary for consent to be ...
4,3.1,question,,Concent,What constitutes freely given consent?
...,...,...,...,...,...
68,7.3,question,,Concent,What rights does an individual have when proce...
69,7.3,question,,Concent,What rights to data does an individual have if...
70,7.3,question,,Concent,How does withdrawing consent affect an individ...
71,8,question,,Concent,How should existing consents be reviewed to en...


In [98]:
df_summary.to_csv(summary_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

df_questions.to_csv(question_file, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [100]:
import pandas as pd
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8")

df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8")

In [101]:
df_questions = df_questions.drop("text", axis=1).join(df_questions["text"].str.split("|", expand=True).stack().reset_index(level=1, drop=True).rename("text"))
df_questions = df_questions[df_questions["text"] != ""]
df_questions.reset_index(drop=True, inplace=True)


In [103]:
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2,"Consent must be freely given, specific, inform...",summary,,Concent
1,3,"Consent must meet four criteria: freely given,...",summary,,Concent
2,3.1,"To be valid, consent must be given freely. Thi...",summary,,Concent
3,3.1.1,Freely given consent requires a genuine choice...,summary,,Concent
4,3.1.2,Consent must be freely given. Bundling consent...,summary,,Concent
...,...,...,...,...,...
92,7.3,What rights does an individual have when proce...,question,,Concent
93,7.3,What rights to data does an individual have if...,question,,Concent
94,7.3,How does withdrawing consent affect an individ...,question,,Concent
95,8,How should existing consents be reviewed to en...,question,,Concent


In [104]:
from regulations_rag.embeddings import get_ada_embedding
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    print(f"Completed {i+increment} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines


In [8]:
concent_index['document'] = 'Consent'

In [13]:
concent_index
save_parquet_data(concent_index, index_file, key)

In [9]:
df_index.to_parquet(index_file, engine = "pyarrow")
save_parquet_data(df_index, index_file, key)


NameError: name 'df_index' is not defined

In [5]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')


gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
concent_index = load_parquet_data("../inputs/index/consent.parquet", key)

for index, row in gdpr_index[gdpr_index['section_reference'].str.startswith('4')].iterrows():
    print(f"-- {row['text']}")


# for index, row in gdpr_index.iterrows():
#     if 'concent' in row['text'].lower():
#         print(f"* {row['text']}")

-- What is personal data?
-- What is processing?
-- What is restriction of processing?
-- What is profiling?
-- What is pseudonymisation?
-- What is a filing system?
-- What is a controller?
-- What is a processor?
-- What is a recipient?
-- What is a third party?
-- What is consent?
-- What is a personal data breach?
-- What is genetic data?
-- What is biometric data?
-- What is data concerning health?
-- What is a main establishment?
-- What is a representative?
-- What is an enterprise?
-- What is a group of undertakings?
-- What are binding corporate rules?
-- What is supervisory authority?
-- What is supervisory authority concerned?
-- What is cross-border processing?
-- What is relevant and reasoned objection?
-- What is information society service?
-- What is international organisation?
-- Who is encouraged to create codes of conduct for GDPR compliance?
-- What purposes do codes of conduct serve?
-- Can codes of conduct address the protection of children's data?
-- How can code

In [6]:
concent_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2,"Consent must be freely given, specific, inform...",summary,"[-0.03871389850974083, -0.028905309736728668, ...",Concent
1,3,"Consent must meet four criteria: freely given,...",summary,"[-0.015930110588669777, -0.01131503377109766, ...",Concent
2,3.1,"To be valid, consent must be given freely. Thi...",summary,"[-0.031091932207345963, -0.00736756669357419, ...",Concent
3,3.1.1,Freely given consent requires a genuine choice...,summary,"[-0.03908756002783775, -0.04293598607182503, -...",Concent
4,3.1.2,Consent must be freely given. Bundling consent...,summary,"[-0.02875448577105999, -0.024869825690984726, ...",Concent
...,...,...,...,...,...
92,7.3,What rights does an individual have when proce...,question,"[-0.02233157679438591, -0.050518155097961426, ...",Concent
93,7.3,What rights to data does an individual have if...,question,"[-0.06421481817960739, -0.0555371418595314, -0...",Concent
94,7.3,How does withdrawing consent affect an individ...,question,"[-0.049304455518722534, -0.04337126389145851, ...",Concent
95,8,How should existing consents be reviewed to en...,question,"[-0.02116905152797699, -0.0059234024956822395,...",Concent


In [7]:
covid_index['section_reference'] = covid_index['section_reference'].astype(str)

In [14]:
online_index['section_reference'] = online_index['section_reference'].astype(str)
online_index
save_parquet_data(online_index, index_file, key)

In [89]:
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

index_df = article_47_index
#index_df = gdpr_index

text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
save_parquet_data(index, path_to_file, key)