In [3]:
import logging
# Numeric threshold passed to the root logger; 25 sits between logging.INFO (20) and logging.WARNING (30).
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [4]:
# Name stored in the "document" column of the generated index rows.
class_name = "TerritorialScope"

# All working and output files share the same stem.
document_stem = "territorial_scope"

# Pipe-separated progress files for the generated summaries and questions.
summary_file = f"../tmp/{document_stem}_summary.csv"
question_file = f"../tmp/{document_stem}_question.csv"

# Source document table (a parquet file, despite the variable name) and the index written at the end.
path_to_manual_as_csv_file = f"../inputs/documents/{document_stem}.parquet"
index_file = f"../inputs/index/{document_stem}.parquet"



In [11]:
import re
import pandas as pd

import os
import sys

# Folder that makes the `gdpr_rag` package importable.
# The original hard-coded location stays the default; set GDPR_PROJECT_ROOT to run on another machine.
project_root = os.environ.get("GDPR_PROJECT_ROOT", 'E:/Code/chat/gdpr')
if project_root not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(project_root)

import importlib
import gdpr_rag.documents.territorial_scope
# Reload before the from-import so the class below comes from the freshly loaded module.
importlib.reload(gdpr_rag.documents.territorial_scope)
from gdpr_rag.documents.territorial_scope import TerritorialScope


# Document object; later cells read its `reference_checker` and call `get_text` on it.
doc = TerritorialScope(path_to_manual_as_csv_file)


In [12]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Section table of the document, read from the same parquet file the document object was built from.
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
reference_checker = doc.reference_checker

# Table-of-contents tree over the section table, using the document's own reference checker.
toc = StandardTableOfContent(
    root_node_name = "territorial_scope",
    reference_checker = reference_checker,
    regulation_df = df,
)

In [7]:
# Print the table-of-contents tree built from the document.
toc.print_tree()

territorial_scope []
|-- INTRODUCTION [INTRODUCTION    ]
|-- 1 [APPLICATION OF THE ESTABLISHMENT CRITERION - ART 3(1)    ]
|   |-- .a ["An establishment in the Union"  ]
|   |-- .b [Processing of personal data carried out "in the context of the activities of" an  establishment   ]
|   |-- .c [Application of the GDPR to the establishment of a controller or a processor in the Union, regardless of whether the processing takes place in the Union or not  ]
|   +-- .d [Application of the establishment criterion to controller and processor  ]
|-- 2 [APPLICATION OF THE TARGETING CRITERION - ART 3(2)  ]
|   |-- .a [Data subjects in the Union  ]
|   |-- .b [Offering of goods or services, irrespective of whether a payment of the data subject  ]
|   |-- .c [Monitoring of data subjects' behaviour   ]
|   |-- .d [Processor not established in the Union  ]
|   +-- .e [Interaction with other GDPR provisions and other legislations  ]
|-- 3 [PROCESSING IN A PLACE WHERE MEMBER STATE LAW APPLIES BY  VIRTUE

In [13]:
# Make the local project importable; skip the append when the path is already registered.
if 'E:/Code/chat/gdpr' not in sys.path:
    sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Walk the table of contents in pre-order; nodes whose full_node_name is "" are left out.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# One row per section reference, with its text and token count.
# Column-wise mapping replaces the row loop, so the notebook-level names
# `index`, `row` and `text` are no longer rebound as a side effect of this cell.
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = split_df["section_reference"].map(doc.get_text)
split_df["token_count"] = split_df["text"].map(num_tokens_from_string).astype(int)

split_df = split_df.drop([0]) # Introduction
#split_df = split_df[split_df["token_count"] > 25]
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,1,1 APPLICATION OF THE ESTABLISHMENT CRITERION -...,321
1,1.a,1 APPLICATION OF THE ESTABLISHMENT CRITERION -...,1462
2,1.b,1 APPLICATION OF THE ESTABLISHMENT CRITERION -...,1034
3,1.c,1 APPLICATION OF THE ESTABLISHMENT CRITERION -...,493
4,1.d,1 APPLICATION OF THE ESTABLISHMENT CRITERION -...,2204
5,2,2 APPLICATION OF THE TARGETING CRITERION - ART...,849
6,2.a,2 APPLICATION OF THE TARGETING CRITERION - ART...,790
7,2.b,2 APPLICATION OF THE TARGETING CRITERION - ART...,1691
8,2.c,2 APPLICATION OF THE TARGETING CRITERION - ART...,769
9,2.d,2 APPLICATION OF THE TARGETING CRITERION - ART...,370


In [14]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key handed to load_parquet_data / save_parquet_data; None when the variable is not set.
key = os.environ.get('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [15]:
from openai import OpenAI
# Client used for both the summary/question generation and the embedding calls below.
# The API key is read from the OPENAI_API_KEY environment variable (None when unset).
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# Make the local `src` package importable (same path an earlier cell already appends).
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload before the from-import so the function below comes from the freshly loaded module.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [16]:
def _load_or_init_progress(csv_path, source):
    """Load a saved progress table, or start an empty one for every section.

    Args:
        csv_path: pipe-separated CSV written by an earlier run of this notebook.
        source: value for the "source" column of a new table ("summary" or "question").

    Returns:
        (frame, first_empty_index): the table, and the index label of the first row
        whose "text" is still missing. The label is 0 when no row is missing text;
        this keeps the original fallback, so a fully completed table also reports 0.
    """
    if os.path.exists(csv_path):
        # Read references as text so purely numeric ones (e.g. "1.10") are not parsed as floats.
        frame = pd.read_csv(csv_path, sep="|", encoding = "utf-8", dtype = {"section_reference": str})
    else:
        frame = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
        frame["section_reference"] = split_df["section_reference"]
        frame["source"] = source
        frame["document"] = class_name

    first_empty = frame[frame["text"].isna()].index.min()
    if pd.isna(first_empty):
        first_empty = 0
    return frame, first_empty


# `index` is the resume point used by the generation cells that follow.
df_summary, index = _load_or_init_progress(summary_file, "summary")
df_questions, first_empty_index = _load_or_init_progress(question_file, "question")

# Both tables are filled in lock-step, so their resume points must agree.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [69]:
# Chat model used to generate the summary and questions for one section.
model = "gpt-4o"

# Section currently being processed (`index` is the resume pointer).
reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Emit ready-to-paste assignment statements for manual review.
# `!r` prints each value as a Python literal, so embedded newlines and quotes are escaped
# and the printed line is valid code when pasted into the next cell.
# NOTE(review): the pasted example suggests both values are strings - confirm for model_questions.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
4 REPRESENTATIVE OF CONTROLLERS OR PROCESSORS NOT  ESTABLISHED IN THE UNION  

4.d Obligations and responsibilities of the representative  
The representative in the Union acts on behalf of the controller or processor it represents with regard  to the controller or processor's obligations under the GDPR. This implies notably the obligations  relating to the exercise of data subject rights, and in this regard and as already stated, the identity and  contact details of the representative must be provided to data subjects in accordance with articles 13  and 14. While not itself responsible for complying with data subject rights, the representative must  facilitate the communication between data subjects and the controller or processor represented, in  order to make the exercise of data subjects' rights are effective.  
As per Article 30, the controller or processor's representative shall in particular maintain a record of  processing activities under the responsibility of t

In [70]:
# Show the summary returned by get_summary_and_questions_for (the notebook displays its repr).
model_summary

"The representative in the Union acts on behalf of you regarding your obligations under GDPR, particularly those related to the exercise of individual rights. The representative's contact details must be provided to individuals according to Articles 13 and 14. While not responsible for complying with individual rights, the representative must facilitate communication between individuals and you.\n\nThe representative must maintain a record of processing activities, though you are responsible for the primary content and updates of this record. The representative must ensure the record is available when requested by a supervisory authority.\n\nThe representative should perform tasks according to the mandate received from you, including cooperating with supervisory authorities. Supervisory authorities would contact the representative for any compliance matters, and the representative should facilitate communication between them and you.\n\nThe representative in the Union must efficiently 

In [71]:
# Pasted from the output of the generation cell: stores the summary and the
# pipe-separated questions for the row at the current `index`.
# index: 16, section_reference: 4.d

df_summary.loc[index, "text"] = "The representative in the Union acts on behalf of you regarding your obligations under GDPR, particularly those related to the exercise of individual rights. The representative's contact details must be provided to individuals according to Articles 13 and 14. While not responsible for complying with individual rights, the representative must facilitate communication between individuals and you.\n\nThe representative must maintain a record of processing activities, though you are responsible for the primary content and updates of this record. The representative must ensure the record is available when requested by a supervisory authority.\n\nThe representative should perform tasks according to the mandate received from you, including cooperating with supervisory authorities. Supervisory authorities would contact the representative for any compliance matters, and the representative should facilitate communication between them and you.\n\nThe representative in the Union must efficiently communicate with individuals and cooperate with supervisory authorities, using appropriate languages or other methods to ensure effective communication. The representative's availability is essential for easy contact with the non-EU controller or processor.\n\nThe representative does not assume your responsibility and liability under GDPR. Supervisory authorities can initiate enforcement proceedings through the representative and address corrective measures or penalties to the representative, but direct liability is limited to specific obligations in Articles 30 and 58(1)(a).\n\nArticle 50 aims to facilitate legislation enforcement related to third countries and international organisations, with further international cooperation mechanisms being considered."

df_questions.loc[index, "text"] = "Who does the representative in the Union act on behalf of?|How does the representative support individuals' rights?|What must the representative provide to individuals?|What record must the representative maintain?|Who is responsible for the primary content and updates of the record?|What is the representative's responsibility when addressed by a supervisory authority?|How should the representative handle communication with data subjects and supervisory authorities?"



# Move the resume pointer to the next section and report overall progress.
index += 1
percent_done = index / len(df_summary) * 100
print(f"You have completed {percent_done:.2f}% of the work")
if index >= len(df_summary):
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [73]:
# Display the summary table to review the texts collected so far.
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,1,The GDPR applies to the processing of personal...,summary,,TerritorialScope
1,1.a,To determine if an entity has an establishment...,summary,,TerritorialScope
2,1.b,Processing personal data in the context of any...,summary,,TerritorialScope
3,1.c,The GDPR applies to the processing of personal...,summary,,TerritorialScope
4,1.d,When processing activities fall under the scop...,summary,,TerritorialScope
5,2,"If you are not established in the EU, the GDPR...",summary,,TerritorialScope
6,2.a,The GDPR applies to personal data of individua...,summary,,TerritorialScope
7,2.b,To determine if offering goods or services tri...,summary,,TerritorialScope
8,2.c,When an individual's behaviour within the EU i...,summary,,TerritorialScope
9,2.d,"If you target individuals in the EU, the GDPR ...",summary,,TerritorialScope


In [74]:
# Write both progress tables as pipe-separated CSV; missing values are stored as empty fields.
for frame, target_path in ((df_summary, summary_file), (df_questions, question_file)):
    frame.to_csv(target_path, sep = "|", encoding = "utf-8", index = False, na_rep="")

In [75]:
import pandas as pd
# Reload the saved progress tables.
# Section references are read as text: left to type inference, a file whose references
# are all numeric (e.g. "1.5", "1.10") would be parsed as floats and lose its exact spelling.
reference_as_text = {"section_reference": str}
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8", dtype = reference_as_text)

df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8", dtype = reference_as_text)

In [76]:
# Turn each pipe-separated question list into one row per question.
exploded = df_questions.assign(text=df_questions["text"].str.split("|")).explode("text")
# Keep "text" as the last column, as it is after a drop-and-join.
exploded = exploded[[col for col in exploded.columns if col != "text"] + ["text"]]
# Discard empty fragments and renumber the rows.
df_questions = exploded[exploded["text"] != ""].reset_index(drop=True)


In [77]:
# Stack summaries and questions into one index table with a fresh 0..n-1 row index, then display it.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,1,The GDPR applies to the processing of personal...,summary,,TerritorialScope
1,1.a,To determine if an entity has an establishment...,summary,,TerritorialScope
2,1.b,Processing personal data in the context of any...,summary,,TerritorialScope
3,1.c,The GDPR applies to the processing of personal...,summary,,TerritorialScope
4,1.d,When processing activities fall under the scop...,summary,,TerritorialScope
...,...,...,...,...,...
79,4.d,What must the representative provide to indivi...,question,,TerritorialScope
80,4.d,What record must the representative maintain?,question,,TerritorialScope
81,4.d,Who is responsible for the primary content and...,question,,TerritorialScope
82,4.d,What is the representative's responsibility wh...,question,,TerritorialScope


In [78]:
from regulations_rag.embeddings import get_ada_embedding
# Embedding settings. `model` is reused here, replacing the chat model name set in the generation cell.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Embed the texts in chunks of `increment` rows so progress is visible.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # Cap the count at the table size so a short final chunk is not over-reported.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines


In [79]:
# Persist the finished index with save_parquet_data only, as the other save cells in this notebook do.
# The earlier df_index.to_parquet(index_file, ...) call wrote the same path and was overwritten immediately.
# NOTE(review): assumes save_parquet_data writes the whole frame to index_file by itself - confirm.
save_parquet_data(df_index, index_file, key)


In [1]:
# Display the index table.
# NOTE(review): the recorded run of this cell raised NameError - it was executed before
# the cell that builds `df_index`, so it only works after that cell has run.
df_index
#df_index["document"] = "Article_47_BCR"

NameError: name 'df_index' is not defined

In [80]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
scope_index = load_parquet_data("../inputs/index/territorial_scope.parquet", key)

# Print every indexed text whose section reference starts with "3" (this also matches "30", "32", ...).
# Iterating over the column avoids rebinding the notebook-level `index`, which other cells
# use as the resume pointer; na=False keeps rows with a missing reference out of the mask.
starts_with_3 = gdpr_index['section_reference'].str.startswith('3', na=False)
for indexed_text in gdpr_index.loc[starts_with_3, 'text']:
    print(f"-- {indexed_text}")


# for index, row in index_df[index_df["section_reference"] == '30'].iterrows():
#     print(f"* {row['text']}")

-- When does GDPR apply to non-EU entities?
-- I am not based in the EU. Does GDPR apply to me?
-- What records must you keep about your data processing activities?
-- What details do records of data processing need to include?
-- Are there any specific requirements for the records of data processing to be kept by a data processor?
-- In what form should records of data processing be maintained?
-- Who might request access to records of data processing, and what is your responsibility?
-- What is required from you in terms of cooperation with the supervisory authority?
-- What measures should you implement to ensure the security of processing personal data?
-- How do you determine the appropriate level of security needed for personal data?
-- Can following a code of conduct or obtaining a certification help demonstrate compliance with data security requirements?
-- What steps should you take to ensure that individuals processing personal data under your authority only do so based on yo

In [14]:
# Cast the section references to strings, then save the table to `index_file`.
# NOTE(review): `online_index` is not created in any cell visible here - confirm where it comes from.
online_index['section_reference'] = online_index['section_reference'].astype(str)
# Not the last statement of the cell, so this bare expression displays nothing.
online_index
save_parquet_data(online_index, index_file, key)

In [89]:
# Save whatever `index_df` currently holds to the Article 47 BCR index file.
# NOTE(review): `index_df` is assigned in other cells; make sure it holds the Article 47 index before running.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# Index to edit (swap the commented line in to edit the GDPR index instead).
index_df = article_47_index
#index_df = gdpr_index

# Embedding settings come from the `model` / `dimensions` values currently in the kernel.
embedding_model = model
embedding_dimensions = dimensions

# Replace one indexed text with a reworded version.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"

index_df = update_text_in_index(
    openai_client = openai_client,
    index_df = index_df,
    text_to_change = text_to_change,
    changed_text = changed_text,
    embedding_model = embedding_model,
    embedding_dimensions = embedding_dimensions,
)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
# Reload before the from-import so add_to_index comes from the freshly loaded module.
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Add one extra question for section 3.5.4.1 to the Article 47 index.
# `embedding_model` and `embedding_dimensions` must already exist; they are set in the text-update cell.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this exact text from the Article 47 index (the same text the add cell inserts).
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Point article_47_index at the edited table, then show every entry for section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# Overwrite the main GDPR index file with the current contents of `index_df`.
# NOTE(review): other cells assign the Article 47 index to `index_df`; confirm it holds the GDPR index before running.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows whose document label is exactly "article_30_5".
# NOTE(review): the standalone Article 30(5) index shown further down uses "Article_30_5" - confirm which casing is intended.
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data; None when the variable is not set.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE: `index` is a DataFrame from here on; other cells use the same name as an integer resume pointer.
index = load_parquet_data(path_to_file, key)

In [10]:
#index["document"] = "Article_30_5"
# Display the loaded Article 30(5) index.
# NOTE(review): the displayed rows use source "questions", while other index tables in this notebook use "question" - confirm.
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the Article 30(5) index back to the file it was loaded from.
save_parquet_data(index, path_to_file, key)