In [1]:
import logging
# Level 25 sits between the standard INFO (20) and WARNING (30) levels, so INFO
# records are filtered out while WARNING and above still reach the root logger.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [2]:
import re
import pandas as pd

import sys
# NOTE(review): machine-specific absolute path. Presumably it makes the
# project packages importable from this notebook - confirm, and prefer a
# path that is relative to the repository or read from configuration.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.decision_making
# Reload so that edits to the module are picked up without restarting the kernel.
importlib.reload(gdpr_rag.documents.decision_making)
from gdpr_rag.documents.decision_making import DecisionMaking

# `doc` is the document object the rest of the notebook relies on: later cells
# read `doc.reference_checker` and call `doc.get_text(section_reference)`.
path_to_manual_as_file = "../inputs/documents/decision_making.parquet"
doc = DecisionMaking(path_to_manual_as_file)


In [3]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Read the raw document table and build the table-of-content tree from it,
# using the document's own reference checker.
df = pd.read_parquet(
    "../inputs/documents/decision_making.parquet",
    engine="pyarrow",
)
reference_checker = doc.reference_checker
toc = StandardTableOfContent(
    root_node_name="decisions",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [4]:
# Remove the annexes from the tree for indexing.
# Each annex node is looked up and then detached by clearing its parent, one
# annex at a time, so the lookup of the next annex happens after the previous
# one has already been removed.
for annex_reference in ("Annex 1", "Annex 2", "Annex 3"):
    toc.get_node(annex_reference).parent = None

# Print the remaining tree so the result can be checked by eye.
toc.print_tree()

decisions []
|-- I [Introduction  ]
|-- II [Definitions ]
|   |-- .A [Profiling  ]
|   |-- .B [Automated decision-making]
|   +-- .C [How the GDPR addresses the concepts]
|-- III [General provisions on profiling and automated decision-making ]
|   |-- .A [Data protection principles ]
|   |   |-- .1 [Article 5(1) (a) - Lawful, fair and transparent   ]
|   |   |-- .2 [Article 5(1) (b) Further processing and purpose limitation  ]
|   |   |-- .3 [Article 5(1) (c) Data minimisation   ]
|   |   |-- .4 [Article 5(1) (d) Accuracy   ]
|   |   +-- .5 [Article 5(1) (e) Storage limitation   ]
|   |-- .B [Lawful bases for processing ]
|   |   |-- .1 [Article 6(1) (a) consent   ]
|   |   |-- .2 [Article 6(1) (b) - necessary for the performance of a contract  ]
|   |   |-- .3 [Article 6(1) (c) - necessary for compliance with a legal obligation  ]
|   |   |-- .4 [Article 6(1) (d) - necessary to protect vital interests  ]
|   |   |-- .5 [Article 6(1) (e) - necessary for the performance of a task carrie

In [29]:
sys.path.append('E:/Code/chat/gdpr')

from regulations_rag.regulation_table_of_content import split_tree

# Break the document into chunks, walking the table of content from its root.
# token_limit is handed to split_tree unchanged; presumably it caps the number
# of tokens per chunk - confirm against split_tree's documentation.
split_df = split_tree(
    node=toc.root,
    document=doc,
    table_of_content=toc,
    token_limit=1300,
)

In [30]:
# Set aside every chunk whose reference starts with "V" and remove those rows
# from split_df. They are kept in rows_after_IV so they can be appended again
# after the hand-picked section IV rows.
# na=False treats a missing reference as "does not start with V" instead of
# producing a mask that contains NaN.
rows_after_IV = split_df[split_df["section_reference"].str.startswith("V", na=False)]
split_df = split_df.drop(index=rows_after_IV.index)

In [31]:
# Manually add the section IV sub-sections, one row per listed reference.
from regulations_rag.embeddings import num_tokens_from_string

sections_to_add = ['IV.A', 'IV.B', 'IV.C', 'IV.D', 'IV.E.1', 'IV.E.2', 'IV.F']
manual_df = pd.DataFrame({"section_reference": sections_to_add})

# Build the columns in one pass each instead of iterrows() with cell-wise .at
# writes: same values, no float-then-cast detour for token_count, and no loop
# variables (`index`, `row`, `t`) left behind in the notebook namespace.
manual_df["text"] = manual_df["section_reference"].map(doc.get_text)
manual_df["token_count"] = manual_df["text"].map(num_tokens_from_string).astype(int)


In [33]:
# Stitch the pieces back together in order: the automatically split rows, the
# manually built section IV rows, then the rows that were set aside.
# NOTE: split_df is rebuilt from itself, so running this cell a second time
# appends manual_df and rows_after_IV again.
split_df = pd.concat(
    [split_df, manual_df, rows_after_IV],
    ignore_index=True,
)
split_df


Unnamed: 0,section_reference,text,token_count
0,I,I. Introduction \nThe General Data Protection...,835
1,II.A,II. Definitions \nThe GDPR introduces provisio...,896
2,II.B,II. Definitions \nThe GDPR introduces provisio...,461
3,II.C,II. Definitions \nThe GDPR introduces provisio...,439
4,III.A.1,III. General provisions on profiling and autom...,1081
5,III.A.2,III. General provisions on profiling and autom...,523
6,III.A.3,III. General provisions on profiling and autom...,248
7,III.A.4,III. General provisions on profiling and autom...,346
8,III.A.5,III. General provisions on profiling and autom...,412
9,III.B.1,III. General provisions on profiling and autom...,334


In [26]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key taken from the environment; os.getenv returns None when the variable is
# not set. Later cells pass it to load_parquet_data / save_parquet_data.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [34]:
from openai import OpenAI
# The API key comes from the environment; os.environ.get returns None when
# OPENAI_API_KEY is not set.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): machine-specific absolute path, appended again here; the
# `src` import below presumably depends on it - confirm.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so that edits to the module are picked up without restarting the kernel.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [35]:
def _load_or_init_progress(csv_path, source):
    """Return the progress table saved at csv_path, or a fresh one.

    When the file exists it is read back as-is. Otherwise a new table is
    created with one row per section in split_df, an empty "text" column and
    the given value in the "source" column.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, sep="|", encoding="utf-8")

    progress_df = pd.DataFrame([], columns=["section_reference", "text", "source", "embedding", "document"])
    progress_df["section_reference"] = split_df["section_reference"]
    progress_df["source"] = source
    progress_df["document"] = "DecisionMaking"
    return progress_df


def _first_row_without_text(progress_df):
    """Return the index label of the first row with missing text, or 0 if there is none."""
    first_missing = progress_df[progress_df["text"].isna()].index.min()
    # The minimum of an empty selection is NaN; in that case start from row 0.
    # NOTE(review): this means a fully completed table also resumes at row 0.
    return 0 if pd.isna(first_missing) else first_missing


summary_file = "../tmp/decision_making_summary.csv"
df_summary = _load_or_init_progress(summary_file, "summary")
index = _first_row_without_text(df_summary)

question_file = "../tmp/decision_making_question.csv"
df_questions = _load_or_init_progress(question_file, "question")
first_empty_index = _first_row_without_text(df_questions)

# Summaries and questions are filled in together, so both tables must resume
# at the same row.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [118]:
# Chat model used for summarising. Alternatives tried earlier:
# "gpt-3.5-turbo", "gpt-4", "gpt-4-0125-preview".
# NOTE(review): the name `model` is reused for the embedding model in the
# embedding cell of this notebook, so its value depends on which cell ran last.
model = "gpt-4-turbo-2024-04-09"

reference = df_summary.iloc[index]["section_reference"]

# Show the source text so the generated summary can be checked against it.
reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Print assignment statements that can be pasted into the next cell and edited
# by hand before they are executed.
# NOTE(review): the generated text is placed between double quotes verbatim,
# so a double quote inside it would break the pasted statement - check before running.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
print(f'df_questions.loc[index, "text"] = "{model_questions}"')

##############
VI. Data protection impact assessments (DPIA) and Data Protection Officer (DPO)   
Accountability is an important area and an explicit requirement under the GDPR.[^48]   
As a key accountability tool, a DPIA enables the controller to assess the risks involved in automated decision-making, including profiling. It is a way of showing that suitable measures have been put in place to address those risks and demonstrate compliance with the GDPR.  
Article 35(3) (a) highlights the need for the controller to carry out a DPIA in the case of:  
"a systematic and extensive evaluation of personal aspects relating to natural persons which is based on automated processing, including profiling, and on which decisions are based that produce legal effects concerning the natural person or similarly significantly affect the natural person;"  
Article 35(3)(a) refers to evaluations including profiling and decisions that are 'based' on automated processing, rather than  'solely' automated p

In [119]:
# index: 29, section_reference: VI
# Stores the summary and the "|"-separated questions for the row that `index`
# currently points at. The statements follow the format printed by the
# summarising cell.

df_summary.loc[index, "text"] = "You must conduct a Data Protection Impact Assessment (DPIA) when employing automated decision-making, including profiling, that results in legal implications or similar significant impacts on individuals. This includes decision-making processes that are not entirely automated but still have a substantial influence based on profiles created about individuals. If using solely automated decisions that significantly impact individuals, without their consent, contractual obligation, or legal authority, you should not proceed.\n\nYou may opt for a decision-making approach with less automation by increasing human involvement, although this may still involve risks to individuals’ rights. Thus, a DPIA remains crucial for identifying and mitigating these risks according to established guidelines.\n\nIn processing activities where automated decision-making or profiling forms a core part, especially if conducted extensively, hiring a Data Protection Officer (DPO) is mandatory. Key functions of a DPIA include informing individuals about the automated processes, explaining the processing's impact, offering options to contest decisions, and allowing individuals to present their views."

df_questions.loc[index, "text"] = "When is a Data Protection Impact Assessment (DPIA) required?|What is the purpose of a Data Protection Impact Assessment (DPIA)?|When is it mandatory to appoint a Data Protection Officer (DPO)?"


# Move the resume pointer to the next row and report how far along we are.
index += 1
print(f"You have completed {(index / len(df_summary) * 100):.2f}% of the work")
if index >= len(df_summary):
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [115]:
# Persist both progress tables as "|"-separated CSV files; missing values are
# written as empty fields and the row index is not stored.
summary_file = "../tmp/decision_making_summary.csv"
question_file = "../tmp/decision_making_question.csv"

for frame, destination in ((df_summary, summary_file), (df_questions, question_file)):
    frame.to_csv(destination, sep="|", encoding="utf-8", index=False, na_rep="")

In [3]:
import pandas as pd
# Read both progress tables back from their "|"-separated CSV files.
summary_file = "../tmp/decision_making_summary.csv"
question_file = "../tmp/decision_making_question.csv"

df_summary, df_questions = (
    pd.read_csv(csv_path, sep="|", encoding="utf-8")
    for csv_path in (summary_file, question_file)
)

In [7]:
# Turn each "|"-separated question string into one row per question. The
# other columns are repeated for every question taken from the same row.
question_parts = (
    df_questions["text"]
    .str.split("|", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename("text")
)
df_questions = (
    df_questions
    .drop(columns="text")
    .join(question_parts)
    .reset_index(drop=True)
)


In [11]:
# Discard rows whose question text is an empty string, then renumber the rows.
has_text = df_questions["text"] != ""
df_questions = df_questions.loc[has_text].reset_index(drop=True)


In [13]:
# The index is the summaries followed by the individual questions, renumbered
# from zero.
df_index = pd.concat(
    objs=[df_summary, df_questions],
    ignore_index=True,
)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,I,Profiling and automated decision-making are in...,summary,,DecisionMaking
1,II.A,Profiling involves using personal data through...,summary,,DecisionMaking
2,II.B,While automated decision-making can function i...,summary,,DecisionMaking
3,II.C,You can perform profiling and automated decisi...,summary,,DecisionMaking
4,III.A.1,You must operate profiling and automated decis...,summary,,DecisionMaking
...,...,...,...,...,...
124,V,Under what circumstances can you process child...,question,,DecisionMaking
125,V,What safeguards must be in place if you proces...,question,,DecisionMaking
126,VI,When is a Data Protection Impact Assessment (D...,question,,DecisionMaking
127,VI,What is the purpose of a Data Protection Impac...,question,,DecisionMaking


In [15]:
# `os` is imported here because this cell reads the API key from the
# environment and must not rely on an import made by some other cell.
import os

from openai import OpenAI
from regulations_rag.embeddings import get_ada_embedding

openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Embedding settings used for every row of the index.
model = "text-embedding-3-large"
dimensions = 1024

# Start from an empty object-dtype column so that each cell can hold whatever
# get_ada_embedding returns for a row.
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Work through the rows in small batches so progress is visible.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # The last batch may be shorter than `increment`; report the real row count.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines
Completed 110 lines
Completed 120 lines
Completed 130 lines


In [18]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key taken from the environment; os.getenv returns None when the variable is not set.
key = os.getenv('encryption_key_gdpr')

# Write the finished index through save_parquet_data, which also receives the
# key, instead of the plain to_parquet call that is commented out below.
file = "../inputs/index/decision_making.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(df_index, file, key)


In [19]:
# Display the index after the embedding column has been filled in.
df_index
# NOTE(review): the commented-out line below names a different document; it
# looks like a leftover from another notebook - confirm and remove.
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,I,Profiling and automated decision-making are in...,summary,"[-0.03931915760040283, -0.07119743525981903, -...",DecisionMaking
1,II.A,Profiling involves using personal data through...,summary,"[-0.07373262196779251, -0.05352300405502319, -...",DecisionMaking
2,II.B,While automated decision-making can function i...,summary,"[-0.03185207024216652, -0.02333715930581093, -...",DecisionMaking
3,II.C,You can perform profiling and automated decisi...,summary,"[-0.018590476363897324, -0.10302142053842545, ...",DecisionMaking
4,III.A.1,You must operate profiling and automated decis...,summary,"[-0.024095172062516212, -0.06239460036158562, ...",DecisionMaking
...,...,...,...,...,...
124,V,Under what circumstances can you process child...,question,"[-0.04889069125056267, -0.051598481833934784, ...",DecisionMaking
125,V,What safeguards must be in place if you proces...,question,"[-0.025248035788536072, -0.055343691259622574,...",DecisionMaking
126,VI,When is a Data Protection Impact Assessment (D...,question,"[-0.04924626275897026, -0.007886707782745361, ...",DecisionMaking
127,VI,What is the purpose of a Data Protection Impac...,question,"[-0.07883070409297943, -0.019308313727378845, ...",DecisionMaking


In [49]:
# Load the GDPR index and the decision-making index with the same key.
gdpr_index, decision_index = (
    load_parquet_data(parquet_path, key)
    for parquet_path in (
        "../inputs/index/gdpr.parquet",
        "../inputs/index/decision_making.parquet",
    )
)



In [23]:
# Print every index entry (questions and summary) stored for GDPR section "22".
section = gdpr_index[gdpr_index["section_reference"] == "22"]

# Iterate over the text column only. Using `index` as a loop variable here
# would overwrite the notebook-wide resume pointer of the same name.
for entry_text in section["text"]:
    print(f"-- {entry_text}")

-- What rights do individuals have regarding automated decision-making and profiling?
-- Under what conditions can an individual be subject to automated decision-making?
-- What safeguards must be in place when automated decisions are necessary for contracts or based on explicit consent?
-- Can special categories of personal data be used in automated decision-making?
-- What measures are required to protect individuals when special categories of data are used in automated decisions?
-- Individuals have the right not to be subject to decisions made solely on automated processing, including profiling, if it significantly affects them legally or in a similarly significant way. However, this does not apply if the decision is necessary for a contract, authorised by law which protects the individual's rights and interests, or based on the individual's explicit consent. When a decision is necessary for a contract or based on consent, you must ensure there are measures in place allowing the in

In [50]:
# Print every index entry stored for section "I" of the decision-making document.
tmp = decision_index[decision_index["section_reference"] == "I"]

# Iterate over the text column only. Using `index` as a loop variable here
# would overwrite the notebook-wide resume pointer of the same name.
for entry_text in tmp["text"]:
    print(f"-- {entry_text}")

-- Profiling and automated decision-making are increasingly utilised across various sectors such as banking, healthcare, and marketing, driven by technological advancements in big data, AI, and machine learning. These techniques can greatly benefit efficiency and personalisation but also pose risks to individual rights and freedoms due to their potential invasiveness and the issues around transparency and discrimination they can create.

The GDPR seeks to mitigate these risks by implementing specific provisions to protect privacy and other related rights. You must ensure appropriate safeguards when employing profiling and automated decision-making technologies, keeping in mind their potential to impact individuals based on inaccurate data, perpetuate stereotypes, or restrict individual choices.

Particularly under Article 22, you have obligations regarding solely automated decisions that have legal or similarly significant effects on individuals, including the requirement to provide me

In [53]:
# Write decision_index back to the file it was loaded from, through
# save_parquet_data with the same key.
file = "../inputs/index/decision_making.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(decision_index, file, key)

In [41]:
from src.index_tools import update_text_in_index, add_to_index

# Index to edit; switch to gdpr_index to edit the GDPR index instead.
index_df = decision_index
#index_df = gdpr_index

text_to_change = "How do automated decision-making and profiling differ?"
changed_text = "What is automated decision-making?"

# Pin the embedding settings to the ones the index was built with. The global
# `model` is also used for the chat model elsewhere in this notebook, so
# aliasing it here would make the result depend on which cell ran last.
embedding_model = "text-embedding-3-large"
embedding_dimensions = 1024

# NOTE(review): the result is bound to index_df only; whether decision_index
# also sees the change depends on update_text_in_index - confirm before saving.
index_df = update_text_in_index(
    openai_client=openai_client,
    index_df=index_df,
    text_to_change=text_to_change,
    changed_text=changed_text,
    embedding_model=embedding_model,
    embedding_dimensions=embedding_dimensions,
)

In [86]:
import importlib
import src.index_tools
# Reload so that edits to the module are picked up without restarting the kernel.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# NOTE(review): article_47_index is not defined by any earlier cell in this
# notebook, and the values below refer to the "Article_47_BCR" document. This
# cell looks like a leftover from another notebook - confirm before running.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# embedding_model and embedding_dimensions are expected to exist already; they
# are assigned in the cell that calls update_text_in_index.
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [51]:
from src.index_tools import remove_from_index

# Remove the entry with exactly this text from decision_index; the name is
# rebound to whatever remove_from_index returns.
text_to_delete = "What safeguards does the General Data Protection Regulation (GDPR) introduce for profiling and automated decision-making?"
decision_index = remove_from_index(decision_index, text_to_delete)

In [88]:
# Keep the current index_df under the article_47_index name, then display all
# of its rows for section 3.5.4.1.
# NOTE(review): this relies on index_df holding the Article 47 index at this
# point, which depends on which cell ran last - confirm.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): this writes whatever index_df currently refers to into the
# GDPR index file. Other cells in this notebook bind index_df to
# decision_index or article_47_index, so running this cell after them would
# overwrite gdpr.parquet with a different document's index - confirm that
# index_df is the GDPR index before running.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)