In [1]:
import logging
# 25 lies between logging.INFO (20) and logging.WARNING (30), so INFO and DEBUG
# records are filtered out while WARNING and above still get through.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [2]:
# Value written into the "document" column of the summary and question tables.
class_name = "Codes"
# Pipe-separated CSV files holding the in-progress summaries and questions.
summary_file = "../tmp/codes_summary.csv"
question_file = "../tmp/codes_question.csv"

# NOTE(review): despite the "_as_csv_" in the name, this points at a parquet file
# and is read with pd.read_parquet later in the notebook.
path_to_manual_as_csv_file = "../inputs/documents/codes.parquet"
# Destination of the finished index (text plus embeddings).
index_file = "../inputs/index/codes.parquet"



In [3]:
import re
import pandas as pd

import sys
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.codes
importlib.reload(gdpr_rag.documents.codes)
from gdpr_rag.documents.codes import Codes

# NOTE(review): this repeats the assignment already made in the configuration
# cell with the same value — one of the two is presumably redundant.
path_to_manual_as_csv_file = "../inputs/documents/codes.parquet"

# Document object used by later cells for get_text() and reference_checker.
doc = Codes(path_to_manual_as_csv_file)


In [4]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Build the table of contents from the raw document rows, validated with the
# document's own reference checker.
# NOTE(review): the root node is labelled "transparency" although this notebook
# processes the Codes document — confirm the label is intentional.
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
reference_checker = doc.reference_checker
toc = StandardTableOfContent(
    root_node_name="transparency",
    reference_checker=reference_checker,
    regulation_df=df,
)

In [5]:
# Print the table-of-contents hierarchy so the section structure can be checked by eye.
toc.print_tree()

transparency []
|-- 1 [INTRODUCTION  ]
|   +-- .1 [Scope of these guidelines]
|-- 2 [DEFINITIONS]
|-- 3 [WHAT ARE CODES ?]
|-- 4 [WHAT ARE THE BENEFITS OF CODES?]
|-- 5 [ADMISSIBILITY OF A DRAFT CODE[^29]
[^29] This also applies for all codes (national and transnational) as well as amended or extended codes.]
|   |-- .1 [Explanatory statement and supporting documentation]
|   |-- .2 [Representative]
|   |-- .3 [Processing Scope ]
|   |-- .4 [Territorial scope]
|   |-- .5 [Submission to a CompSA]
|   |-- .6 [Oversight of mechanisms]
|   |-- .7 [Monitoring body]
|   |-- .8 [Consultation]
|   |-- .9 [National legislation]
|   |-- .10 [Language]
|   +-- .11 [Checklist]
|-- 6 [CRITERIA FOR APPROVING CODES]
|   |-- .1 [Meets a particular need]
|   |-- .2 [Facilitates the effective application of the GDPR]
|   |-- .3 [Specifies the application of the GDPR]
|   |-- .4 [Provides sufficient safeguards]
|   +-- .5 [Provides mechanisms which will allow for effective oversight ]
|-- 7 [SUBMISSION, 

In [8]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Walk the table of contents in pre-order and keep every node that has a
# non-empty full_node_name; each one becomes a candidate section to index.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# One row per section: its reference, its text, and the token count of that text.
split_df = pd.DataFrame(all_node_names, columns = ["section_reference"])
split_df["text"] = split_df["section_reference"].apply(doc.get_text)
split_df["token_count"] = split_df["text"].apply(num_tokens_from_string).astype(int)

# Keep only sections with more than 31 tokens.
split_df = split_df[split_df["token_count"] > 31]
# Drop the first three sections by their original row labels. errors="ignore"
# keeps this from raising KeyError when the token filter above has already
# removed one of those labels.
split_df = split_df.drop([0, 1, 2], errors="ignore") # Introduction
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,3,3 WHAT ARE CODES ?\n7. GDPR codes are voluntar...,562
1,4,4 WHAT ARE THE BENEFITS OF CODES?\n11. Codes r...,1481
2,5,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n19. Ther...,95
3,5.1,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n\n5.1 Ex...,257
4,5.2,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n\n5.2 Re...,218
5,5.3,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n\n5.3 Pr...,103
6,5.4,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n\n5.4 Te...,118
7,5.5,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n\n5.5 Su...,157
8,5.6,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n\n5.6 Ov...,96
9,5.7,5 ADMISSIBILITY OF A DRAFT CODE[^29]\n\n5.7 Mo...,224


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key handed to load_parquet_data / save_parquet_data in later cells.
# os.getenv returns None when the environment variable is not set.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [10]:
from openai import OpenAI
# Shared OpenAI client; the API key is read from the environment
# (os.environ.get yields None if OPENAI_API_KEY is unset).
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [11]:
def _load_or_init_progress(csv_path, source):
    """Load a saved progress table, or create an empty one for every section.

    Args:
        csv_path: Pipe-separated CSV file that may hold earlier progress.
        source: Value stored in the "source" column of a newly created table.

    Returns:
        The DataFrame read from csv_path when that file exists. Otherwise a new
        DataFrame with one row per split_df section reference, "source" set to
        `source`, "document" set to class_name, and "text"/"embedding" left
        missing so that every row still counts as outstanding work.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path, sep="|", encoding = "utf-8")

    progress_df = pd.DataFrame([], columns = ["section_reference", "text", "source", "embedding", "document"])
    progress_df["section_reference"] = split_df["section_reference"]
    progress_df["source"] = source
    progress_df["document"] = class_name
    return progress_df


def _first_unfilled_index(progress_df):
    """Return the smallest index label whose "text" is missing, or 0 if none is.

    NOTE(review): a table with no missing text also resolves to 0, so a fully
    completed run is indistinguishable from a fresh one — confirm that is wanted.
    """
    first_missing = progress_df[progress_df["text"].isna()].index.min()
    return 0 if pd.isna(first_missing) else first_missing


df_summary = _load_or_init_progress(summary_file, "summary")
df_questions = _load_or_init_progress(question_file, "question")

# `index` is the cursor the following cells use to pick the section to work on.
index = _first_unfilled_index(df_summary)
first_empty_index = _first_unfilled_index(df_questions)

# Both tables are filled in lock-step, so their resume points must agree.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [131]:
model = "gpt-4o"

# Resolve the section at the current cursor once and reuse it below.
reference = df_summary.iloc[index]["section_reference"]
reg_text = doc.get_text(reference)

# Show the source text between two marker lines for visual review.
print("##############", reg_text, "##############", sep="\n")

model_summary, model_questions = get_summary_and_questions_for(
    openai_client=openai_client,
    text=reg_text,
    model=model,
)

# Emit ready-to-paste assignment statements for the next cell.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
print(f'df_questions.loc[index, "text"] = "{model_questions}"')

##############
APPENDIX 3 CHECKLIST FOR SUBMISSION  
Before submitting a draft code to the competent supervisory authority it is important that you ensure the following (where relevant) have been submitted/set out and are appropriately signposted within the documentation:  
1.Have you provided an explanatory statement and all relevant supporting documentation?
(Paragraph 20)  
2.Are you an association or other body representing categories of controllers or processors?
(Paragraph 21)  
3.Have you provided details in your submission to substantiate that you are an effective
representative body that is capable of understanding the needs of your members? (Paragraph
22)  
4.Have you clearly defined the processing activity or sector and the processing problems to
which the code is intended to address? (Paragraph 23)  
5.Have you identified the territorial scope of your code and included a list of all concerned SAs
(where relevant)? (Paragraph 24)  
6.Have you provided details to justify the 

In [129]:
# Display the summary returned by the most recent get_summary_and_questions_for call.
model_summary

"Factors to consider when choosing a competent supervisory authority (CompSA) for evaluating a transnational draft code:\n\n- Location with the highest density of processing activities or specific sector.\n- Location with the highest density of individuals affected by the processing activities or specific sector.\n- Code owner's headquarters location.\n- Proposed monitoring body's headquarters location.\n- Initiatives by a supervisory authority in a particular field.\n\nChoosing a CompSA should be a well-considered decision, as the CompSA acts as a single point of contact during the approval process, manages the application procedure, accredits the monitoring body (if relevant), and ensures the approved code is effectively monitored."

In [133]:
# index: 45, section_reference: APPENDIX 4
# Store the reviewed summary and question text for the section at the cursor.
df_summary.loc[index, "text"] = "Workflow for the submission of a code"
df_questions.loc[index, "text"] = "Is there a workflow for a Code?"

# Move the cursor on and report how far through the sections we are.
index += 1
total_sections = len(df_summary)
print(f"You have completed {(index / total_sections * 100):.2f}% of the work")
if index >= total_sections:
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [136]:
# Display the summary table for a visual check.
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,3,Codes of conduct are voluntary accountability ...,summary,,Codes
1,4,Codes are an opportunity to establish practica...,summary,,Codes
2,5.1,Every draft code submitted for approval must i...,summary,,Codes
3,5.2,"A code must be submitted by an association, co...",summary,,Codes
4,5.3,"A draft code must clearly define its scope, sp...",summary,,Codes
5,5.4,The draft code must specify if it is a nationa...,summary,,Codes
6,5.5,Code owners must ensure that the chosen superv...,summary,,Codes
7,5.6,The draft code must propose mechanisms that en...,summary,,Codes
8,5.7,For a draft code involving the processing acti...,summary,,Codes
9,5.8,A draft code must detail the extent of consult...,summary,,Codes


In [134]:
# Persist both progress tables as pipe-separated CSV, writing missing values as
# empty fields and omitting the row index.
for frame, target in ((df_summary, summary_file), (df_questions, question_file)):
    frame.to_csv(target, sep="|", encoding="utf-8", index=False, na_rep="")

In [135]:
# Discard every row whose section reference is exactly "K" from both tables.
df_summary = df_summary.loc[df_summary["section_reference"].ne("K")]
df_questions = df_questions.loc[df_questions["section_reference"].ne("K")]

In [135]:
import pandas as pd
# Reload both progress tables from their pipe-separated CSV files, replacing
# whatever df_summary / df_questions currently hold in memory.
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8")

df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8")

In [137]:
# A single "text" cell can hold several questions separated by "|". Split them
# into columns, stack those into one value per row (keeping the original row
# label), and join back so each question gets its own row.
question_parts = df_questions["text"].str.split("|", expand=True)
one_question_per_row = question_parts.stack().reset_index(level=1, drop=True).rename("text")
df_questions = df_questions.drop("text", axis=1).join(one_question_per_row)

# Remove the empty strings produced by the split, then renumber the rows.
df_questions = df_questions[df_questions["text"] != ""]
df_questions = df_questions.reset_index(drop=True)


In [138]:
# Stack summaries on top of questions into one index table with a fresh 0..n-1 index.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,3,Codes of conduct are voluntary accountability ...,summary,,Codes
1,4,Codes are an opportunity to establish practica...,summary,,Codes
2,5.1,Every draft code submitted for approval must i...,summary,,Codes
3,5.2,"A code must be submitted by an association, co...",summary,,Codes
4,5.3,"A draft code must clearly define its scope, sp...",summary,,Codes
...,...,...,...,...,...
120,APPENDIX 1,Can a national code become a transnational code?,question,,Codes
121,APPENDIX 2,How do code owners choose a competent supervis...,question,,Codes
122,APPENDIX 2,What factors should be considered when selecti...,question,,Codes
123,APPENDIX 3,Is there a checklist for a Code?,question,,Codes


In [139]:
from regulations_rag.embeddings import get_ada_embedding
# NOTE: `model` was the chat model name in the generation cell; from here on it
# names the embedding model.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object

# Embed the text column in chunks of `increment` rows and report progress after each chunk.
increment = 10
total_rows = len(df_index)
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # The last chunk may be shorter than `increment`; cap the count at the real
    # number of rows so the message never overshoots.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines
Completed 110 lines
Completed 120 lines
Completed 130 lines


In [140]:
# Write the finished index to index_file.
# NOTE(review): both calls target the same path, so the second write presumably
# replaces the plain parquet file produced by the first — confirm whether the
# to_parquet call is still needed.
df_index.to_parquet(index_file, engine = "pyarrow")
save_parquet_data(df_index, index_file, key)


In [141]:
# Display the index table, now including the embedding column.
df_index
#df_index["document"] = "Article_47_BCR"

Unnamed: 0,section_reference,text,source,embedding,document
0,3,Codes of conduct are voluntary accountability ...,summary,"[0.00377368344925344, -0.02220195345580578, -0...",Codes
1,4,Codes are an opportunity to establish practica...,summary,"[-0.010806070640683174, 0.0005761659704148769,...",Codes
2,5.1,Every draft code submitted for approval must i...,summary,"[0.021022645756602287, 0.011024325154721737, -...",Codes
3,5.2,"A code must be submitted by an association, co...",summary,"[0.020343054085969925, -0.0009793692734092474,...",Codes
4,5.3,"A draft code must clearly define its scope, sp...",summary,"[0.020648499950766563, 0.032284095883369446, -...",Codes
...,...,...,...,...,...
120,APPENDIX 1,Can a national code become a transnational code?,question,"[-0.027643537148833275, 0.03155497461557388, -...",Codes
121,APPENDIX 2,How do code owners choose a competent supervis...,question,"[0.036905426532030106, -0.01979360356926918, -...",Codes
122,APPENDIX 2,What factors should be considered when selecti...,question,"[0.02717580460011959, 0.007003546226769686, -0...",Codes
123,APPENDIX 3,Is there a checklist for a Code?,question,"[0.03202644735574722, -0.007595430593937635, -...",Codes


In [144]:
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
# NOTE(review): named code_index but loaded from data_breach.parquet, and not
# used in this cell — confirm the intended file.
code_index = load_parquet_data("../inputs/index/data_breach.parquet", key)

# List the indexed text of every row whose section reference starts with "40".
# na=False treats a missing section reference as "no match" instead of putting
# NaN into the boolean mask. Iterating over the text column directly also
# avoids rebinding `index`, which other cells use as the progress cursor.
rows_starting_with_40 = gdpr_index[gdpr_index['section_reference'].str.startswith('40', na=False)]
for indexed_text in rows_starting_with_40['text']:
    print(f"-- {indexed_text}")


# for index, row in index_df[index_df["section_reference"] == '30'].iterrows():
#     print(f"* {row['text']}")

-- Who is encouraged to create codes of conduct for GDPR compliance?
-- What purposes do codes of conduct serve?
-- Can codes of conduct address the protection of children's data?
-- How can codes of conduct assist with personal data breaches?
-- Can organisations outside the EU adhere to codes of conduct?
-- What requirements must a code of conduct meet for approval?
-- Who approves codes of conduct for local purposes?
-- How are codes of conduct evaluated for processing activities across multiple Member States?
-- What role does the Commission play in the validity of codes of conduct?
-- How is the public informed about approved codes of conduct?
-- Where can one find a register of approved codes of conduct?
-- You are encouraged to develop or adopt codes of conduct to help apply GDPR effectively, considering your specific processing activities and the needs if you're a micro, small, or medium-sized enterprise. These codes can address various aspects such as fair processing, legitima

In [89]:
# Save the current index_df to the Article 47 BCR index file via save_parquet_data.
# NOTE(review): index_df is bound in other cells — make sure it holds the
# Article 47 index before running this.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# Select the index to edit.
index_df = article_47_index
#index_df = gdpr_index

# Reuse the embedding settings currently bound to `model` and `dimensions`.
embedding_model = model
embedding_dimensions = dimensions

# Existing entry text and the wording that should replace it.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"

index_df = update_text_in_index(
    openai_client=openai_client,
    index_df=index_df,
    text_to_change=text_to_change,
    changed_text=changed_text,
    embedding_model=embedding_model,
    embedding_dimensions=embedding_dimensions,
)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Add one extra question entry to the Article 47 index.
index_df = article_47_index
# Fields of the new entry, passed positionally to add_to_index below.
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

# embedding_model / embedding_dimensions come from the update_text_in_index cell.
index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this exact text from the Article 47 index; the result
# of remove_from_index is rebound to article_47_index.
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Make the edited index_df the current Article 47 index, then show every entry
# for section 3.5.4.1 to check the edit.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# Save index_df to the GDPR index file.
# NOTE(review): other cells bind index_df to article_47_index — verify index_df
# holds the GDPR index before running this, or the GDPR file will be overwritten
# with another document's entries.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows of index_df whose document column is exactly "article_30_5".
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key passed to load_parquet_data / save_parquet_data (None if the variable is unset).
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): `index` is also the name of the integer progress cursor used by
# the summary/question cells; binding a loaded table to it here replaces that value.
index = load_parquet_data(path_to_file, key)

In [10]:
# Display the table loaded from path_to_file.
#index["document"] = "Article_30_5"
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the table back to the file it was loaded from, using the same key.
save_parquet_data(index, path_to_file, key)