In [2]:
import logging
# Numeric level 25 lies between the standard INFO (20) and WARNING (30) levels.
log_level = 25
logging.basicConfig(level=log_level) # root logger


In [3]:
# Document-specific settings: every path below is derived from `prefix`.
class_name = "Video"
prefix = "video"
# Pipe-separated working files that hold the per-section summaries and questions.
summary_file = f"../tmp/{prefix}_summary.csv"
question_file = f"../tmp/{prefix}_services_question.csv"

# NOTE(review): despite the "_as_csv_" in its name, this path points at a parquet file.
path_to_manual_as_csv_file = f"../inputs/documents/{prefix}.parquet"
index_file = f"../inputs/index/{prefix}.parquet"



In [4]:
import re
import pandas as pd

import sys
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
sys.path.append('E:/Code/chat/gdpr')

import importlib
import gdpr_rag.documents.video
# Reload so that edits to the module are picked up without restarting the kernel.
importlib.reload(gdpr_rag.documents.video)
from gdpr_rag.documents.video import Video


# Build the document object from the parquet path set in the configuration cell.
doc = Video(path_to_manual_as_csv_file)


In [5]:
from regulations_rag.regulation_table_of_content import StandardTableOfContent
# Reuse the document's own reference checker when building the table of content.
reference_checker = doc.reference_checker
# Read the same parquet file that was handed to Video(...).
df = pd.read_parquet(path_to_manual_as_csv_file, engine='pyarrow')
# NOTE(review): the root node is named "transparency" although this notebook processes the Video document — confirm.
toc = StandardTableOfContent(root_node_name = "transparency", reference_checker = reference_checker, regulation_df = df)

In [6]:
# Print the section hierarchy as a visual check of the parsed structure.
toc.print_tree()

transparency []
|-- 1 [INTRODUCTION ]
|-- 2 [SCOPE OF APPLICATION[^2] 
[^2] The EDPB notes that where the GDPR so allows, specific requirements in national legislation might apply.]
|   |-- .1 [Personal Data ]
|   |-- .2 [Application of the Law Enforcement Directive, LED (EU2016/680) ]
|   +-- .3 [Household exemption ]
|-- 3 [LAWFULNESS OF PROCESSING ]
|   |-- .1 [Legitimate interest, Article 6 (1) (f) ]
|   |   |-- .1 [Existence of legitimate interests ]
|   |   |-- .2 [Necessity of processing ]
|   |   +-- .3 [Balancing of interests ]
|   |-- .2 [Necessity to perform a task carried out in the public interest or in the exercise of official authority vested in the controller, Article 6 (1) (e) ]
|   +-- .3 [Consent, Article 6 (1) (a) ]
|-- 4 [DISCLOSURE OF VIDEO FOOTAGE TO THIRD PARTIES ]
|   |-- .1 [Disclosure of video footage to third parties in general ]
|   +-- .2 [Disclosure of video footage to law enforcement agencies ]
|-- 5 [PROCESSING OF SPECIAL CATEGORIES OF DATA ]
|   |-- .1

In [9]:
sys.path.append('E:/Code/chat/gdpr')

from anytree import Node, PreOrderIter
from regulations_rag.embeddings import num_tokens_from_string

# Collect every named node of the table of content in pre-order.
all_nodes = list(PreOrderIter(toc.root))
all_node_names = [node.full_node_name for node in all_nodes if node.full_node_name != ""]

# Build all columns in one pass. The previous iterrows() loop left `index` and
# `row` behind in the notebook namespace, which overwrote the resume counter
# `index` used by the summarising cells whenever this cell was re-run.
section_texts = [doc.get_text(section_name) for section_name in all_node_names]
split_df = pd.DataFrame(
    {
        "section_reference": all_node_names,
        "text": section_texts,
        "token_count": [num_tokens_from_string(section_text) for section_text in section_texts],
    }
)
split_df["token_count"] = split_df["token_count"].astype(int)

split_df = split_df.drop([0]) # Introduction
# Keep only sections whose text is longer than 56 tokens.
split_df = split_df[split_df["token_count"] > 56]
split_df = split_df.reset_index(drop=True)
split_df

Unnamed: 0,section_reference,text,token_count
0,2.1,2 SCOPE OF APPLICATION[^2] \n\n2.1 Personal Da...,444
1,2.2,2 SCOPE OF APPLICATION[^2] \n\n2.2 Application...,108
2,2.3,2 SCOPE OF APPLICATION[^2] \n\n2.3 Household e...,697
3,3,"3 LAWFULNESS OF PROCESSING \n15. Before use, t...",404
4,3.1.1,3 LAWFULNESS OF PROCESSING \n3.1 Legitimate in...,647
5,3.1.2,3 LAWFULNESS OF PROCESSING \n3.1 Legitimate in...,680
6,3.1.3,3 LAWFULNESS OF PROCESSING \n3.1 Legitimate in...,1195
7,3.2,3 LAWFULNESS OF PROCESSING \n\n3.2 Necessity t...,261
8,3.3,"3 LAWFULNESS OF PROCESSING \n\n3.3 Consent, Ar...",533
9,4.1,4 DISCLOSURE OF VIDEO FOOTAGE TO THIRD PARTIES...,371


In [10]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# Key handed to load_parquet_data / save_parquet_data; os.getenv returns None when the variable is unset.
key = os.getenv('encryption_key_gdpr')
#index_df = load_parquet_data("../inputs/index_gdpr.parquet", key)

In [11]:
from openai import OpenAI
# The API key is taken from the environment rather than being hard-coded.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),)

import sys
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
sys.path.append('E:/Code/chat/gdpr')
import importlib
import src.summarise_and_question
# Reload so that edits to the helper module are picked up without restarting the kernel.
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


In [12]:
def _load_or_init_progress(path, source):
    """Return the working frame stored at `path`, or a fresh one if the file is missing.

    A fresh frame has one row per entry of `split_df["section_reference"]`, an
    empty `text` column, and `source` / `document` filled in.
    `section_reference` is always read back as text so that references made
    only of digits and one dot (e.g. "3" or "2.1") are not converted to numbers.
    """
    if os.path.exists(path):
        return pd.read_csv(path, sep="|", encoding="utf-8", dtype={"section_reference": str})
    frame = pd.DataFrame([], columns=["section_reference", "text", "source", "embedding", "document"])
    frame["section_reference"] = split_df["section_reference"]
    frame["source"] = source
    frame["document"] = class_name
    return frame


def _first_unfilled_row(frame):
    """Return the smallest index label whose `text` is missing, or 0 if none is missing."""
    first_missing = frame[frame["text"].isna()].index.min()
    # NOTE: a frame with every text filled in also yields 0, exactly like a fresh frame.
    return 0 if pd.isna(first_missing) else first_missing


df_summary = _load_or_init_progress(summary_file, "summary")
index = _first_unfilled_row(df_summary)

df_questions = _load_or_init_progress(question_file, "question")
first_empty_index = _first_unfilled_row(df_questions)

# Both files are filled in lock-step, so their resume positions must agree.
if index != first_empty_index:
    print("The index in the summary and questions files do not match. This needs to be fixed")
else:
    print(f"Picking up at index {index}")


Picking up at index 0


In [109]:
# Chat model used to produce the summary and the questions.
model = "gpt-4o"

# Section to process, selected by the current resume position.
reference = df_summary.iloc[index]["section_reference"]

reg_text = doc.get_text(reference)
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(openai_client = openai_client, text = reg_text, model = model)

# Emit ready-to-paste assignments. `!r` prints the values as Python literals, so
# quotes and newlines inside the model output are escaped and the pasted lines
# stay valid code.
print(f'# index: {index}, section_reference: {reference}')
print()
print(f'df_summary.loc[index, "text"] = {model_summary!r}')
print()
print(f'df_questions.loc[index, "text"] = {model_questions!r}')

##############
10 DATA PROTECTION IMPACT ASSESSMENT 
136. According to Article 35 (1) GDPR controllers are required to conduct data protection impact assessments (DPIA) when a type of data processing is likely to result in a high risk to the rights and freedoms of natural persons. Article 35 (3) (c) GDPR stipulates that controllers are required to carry out data protection impact assessments if the processing constitutes a systematic monitoring of a publicly accessible area on a large scale. Moreover, according to Article 35 (3) (b) GDPR a data protection impact assessment is also required when the controller intends to process special categories of data on a large scale. 
137. The Guidelines on Data Protection Impact Assessment[^27] provide further advice, and more detailed examples relevant to video surveillance (e.g. concerning the "use of a camera system to monitor driving behaviour on highways"). Article 35 (4) GDPR requires that each supervisory authority publish a list of the ki

In [110]:
# Show the raw summary (as a repr, with escapes) for review.
model_summary

"When processing is likely to result in a high risk to individuals' rights and freedoms, you must conduct a Data Protection Impact Assessment (DPIA). This is mandatory if you conduct systematic monitoring of publicly accessible areas on a large scale or process special categories of data on a large scale. Supervisory authorities will provide lists of operations requiring a DPIA on their websites.\n\nIf the DPIA shows that high risk remains despite planned security measures, you must consult the supervisory authority before proceeding."

In [111]:
# Reviewed summary and "|"-separated questions, stored at the current `index`.
# index: 29, section_reference: 10

df_summary.loc[index, "text"] = "When processing is likely to result in a high risk to individuals' rights and freedoms, you must conduct a Data Protection Impact Assessment (DPIA). This is mandatory if you conduct systematic monitoring of publicly accessible areas on a large scale or process special categories of data on a large scale. Supervisory authorities will provide lists of operations requiring a DPIA on their websites.\n\nIf the DPIA shows that high risk remains despite planned security measures, you must consult the supervisory authority before proceeding."

df_questions.loc[index, "text"] = "When should you conduct a Data Protection Impact Assessment (DPIA) for video surveillance?|Why is Data Protection Impact Assessment (DPIA) important for video surveillance activities?"



# Move on to the next section and report how far along we are.
index += 1
completed_pct = index / len(df_summary) * 100
print(f"You have completed {completed_pct:.2f}% of the work")
if index >= len(df_summary):
    print("All done")
else:
    next_reference = df_summary.iloc[index]['section_reference']
    print(f"The next section is:\n{doc.get_text(next_reference)}")

You have completed 100.00% of the work
All done


In [114]:
# Inspect the summaries collected so far.
df_summary

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1,Personal data encompasses systematic automated...,summary,,Video
1,2.2,Processing personal data by competent authorit...,summary,,Video
2,2.3,- The processing of personal data by an indivi...,summary,,Video
3,3,"Before use, specify the purposes of processing...",summary,,Video
4,3.1.1,Video surveillance is lawful if it serves a le...,summary,,Video
5,3.1.2,"Personal data must be adequate, relevant, and ...",summary,,Video
6,3.1.3,Ensure video surveillance is necessary for leg...,summary,,Video
7,3.2,Personal data can be processed through video s...,summary,,Video
8,3.3,"Consent must be freely given, specific, inform...",summary,,Video
9,4.1,"Disclosure involves transmitting, disseminatin...",summary,,Video


In [112]:
# Persist both working frames as pipe-separated CSV; missing values are written as empty fields.
for _frame, _target in ((df_summary, summary_file), (df_questions, question_file)):
    _frame.to_csv(_target, sep="|", encoding="utf-8", index=False, na_rep="")

In [115]:
import pandas as pd
# Reload the saved working files. `section_reference` is forced to text so that
# references made only of digits and one dot (e.g. "3" or "2.1") are not
# converted to numbers when a file contains no reference that fails numeric parsing.
df_summary = pd.read_csv(summary_file, sep = "|", encoding = "utf-8", dtype = {"section_reference": str})

df_questions = pd.read_csv(question_file, sep = "|", encoding = "utf-8", dtype = {"section_reference": str})

In [116]:
# Turn each "|"-separated question string into one row per question.
question_parts = df_questions["text"].str.split("|", expand=True)
one_question_per_row = question_parts.stack().reset_index(level=1, drop=True).rename("text")
other_columns = df_questions.drop("text", axis=1)
df_questions = other_columns.join(one_question_per_row)

# Discard empty fragments, then renumber the rows.
is_not_blank = df_questions["text"] != ""
df_questions = df_questions[is_not_blank]
df_questions = df_questions.reset_index(drop=True)


In [117]:
# Stack summaries and questions into a single index frame with a fresh 0..n-1 index.
df_index = pd.concat([df_summary, df_questions], ignore_index = True)
df_index

Unnamed: 0,section_reference,text,source,embedding,document
0,2.1,Personal data encompasses systematic automated...,summary,,Video
1,2.2,Processing personal data by competent authorit...,summary,,Video
2,2.3,- The processing of personal data by an indivi...,summary,,Video
3,3,"Before use, specify the purposes of processing...",summary,,Video
4,3.1.1,Video surveillance is lawful if it serves a le...,summary,,Video
...,...,...,...,...,...
123,9.3.2,What role does physical security play in analo...,question,,Video
124,9.3.2,How can system and data security be ensured?,question,,Video
125,9.3.2,How is footage transmission kept secure?,question,,Video
126,10,When should you conduct a Data Protection Impa...,question,,Video


In [118]:
from regulations_rag.embeddings import get_ada_embedding
# Embedding model and vector size used for every row of the index.
# Note that this rebinds `model`, which the summarising cell sets to the chat model.
model = "text-embedding-3-large"
dimensions = 1024

#df_index = df_questions
df_index['embedding'] = pd.NA  # Initialize the column to hold NA values
df_index['embedding'] = df_index['embedding'].astype(object)  # Ensure the column type is object
increment = 10
total_rows = len(df_index)
# Embed in chunks of `increment` rows so progress is visible while the API calls run.
for i in range(0, total_rows, increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["embedding"] = chunk["text"].apply(lambda x: get_ada_embedding(openai_client, x, model, dimensions))
    df_index.loc[chunk.index, "embedding"] = chunk["embedding"]
    # Cap at the real row count so the final, shorter chunk is not over-reported.
    print(f"Completed {min(i + increment, total_rows)} lines")


Completed 10 lines
Completed 20 lines
Completed 30 lines
Completed 40 lines
Completed 50 lines
Completed 60 lines
Completed 70 lines
Completed 80 lines
Completed 90 lines
Completed 100 lines
Completed 110 lines
Completed 120 lines
Completed 130 lines


In [119]:
# Write the combined index to `index_file`.
# NOTE(review): both calls target the same path, so the second write presumably
# replaces the plain parquet produced by the first — confirm the first is still needed.
df_index.to_parquet(index_file, engine = "pyarrow")
save_parquet_data(df_index, index_file, key)


In [127]:
# Load the stored GDPR and video indexes with the shared key.
gdpr_index = load_parquet_data("../inputs/index/gdpr.parquet", key)
video_index = load_parquet_data("../inputs/index/video.parquet", key)

# for index, row in gdpr_index[gdpr_index['section_reference'].str.startswith('5')].iterrows():
#     print(f"-- {row['text']}")


# List every GDPR index entry that mentions "camera" (case-insensitive).
# Iterating over the text column alone avoids rebinding the notebook-level
# `index` counter, which an `iterrows()` loop variable named `index` would overwrite.
for entry_text in gdpr_index['text']:
    if 'camera' in entry_text.lower():
        print(f"* {entry_text}")

In [14]:
# NOTE(review): `online_index` is not defined in any visible cell; this cell relies on earlier kernel state.
# Force section references to text before saving.
online_index['section_reference'] = online_index['section_reference'].astype(str)
online_index  # not the last expression of the cell, so nothing is displayed
save_parquet_data(online_index, index_file, key)

In [89]:
# Save the current `index_df` as the Article 47 BCR index.
# NOTE(review): `index_df` is bound in other cells; confirm it holds the BCR index before running.
file = "../inputs/index/article_47_bcr.parquet"
#df_index.to_parquet(file, engine = "pyarrow")
save_parquet_data(index_df, file, key)

In [54]:
from src.index_tools import update_text_in_index, add_to_index

# NOTE(review): `article_47_index` is not loaded in any visible cell; it must already exist in the kernel.
index_df = article_47_index
#index_df = gdpr_index

# Existing index text to replace, and its replacement.
text_to_change = "What points need to be addressed in the Binding Corporate Rules for Controllers (BCR-C)"
changed_text = "What points need to be addressed in the Binding Corporate Rules for controllers?"
# Reuse the embedding settings from the embedding cell; `model` must hold the
# embedding model name (not the chat model) when this runs.
embedding_model = model
embedding_dimensions = dimensions

index_df = update_text_in_index(openai_client = openai_client, index_df = index_df, text_to_change = text_to_change, changed_text = changed_text, embedding_model = embedding_model, embedding_dimensions = embedding_dimensions)

Unnamed: 0,section_reference,text,source,embedding,document
0,1.1,You can use binding corporate rules (BCRs) for...,summary,"[-0.009783994406461716, -0.06066862493753433, ...",Article_47_BCR
1,1.5,Binding Corporate Rules for controllers (BCR-C...,summary,"[-0.017316676676273346, -0.021807707846164703,...",Article_47_BCR
2,1.6,You must comply with EU data protection legisl...,summary,"[-0.07271231710910797, -0.06814079731702805, -...",Article_47_BCR
3,1.7,You must use Binding Corporate Rules (BCR) as ...,summary,"[-0.017403494566679, -0.08751912415027618, -0....",Article_47_BCR
4,1.8,Binding Corporate Rules (BCRs) need approval f...,summary,"[-0.004610632546246052, -0.043068330734968185,...",Article_47_BCR
...,...,...,...,...,...
82,3.6,What agreements must be made regarding the han...,question,"[-0.04174955189228058, -0.0986546203494072, -0...",Article_47_BCR
83,3.7,What expectations are set for members of Bindi...,question,"[-0.04039411619305611, -0.08921048790216446, -...",Article_47_BCR
84,3.8,What obligations are there to keep Binding Cor...,question,"[-0.04588606581091881, -0.06260394304990768, -...",Article_47_BCR
85,3.9,What definitions must be included in the Bindi...,question,"[-0.010924087837338448, -0.03873256593942642, ...",Article_47_BCR


In [86]:
import importlib
import src.index_tools
importlib.reload(src.index_tools)
from src.index_tools import add_to_index

# Add one extra question to the Article 47 BCR index.
# NOTE(review): relies on `article_47_index`, `embedding_model` and `embedding_dimensions` already existing in the kernel.
index_df = article_47_index
text = "What happens if local laws conflict with the standards of binding corporate rules?"
section_reference = "3.5.4.1"
source = "question"
document = "Article_47_BCR"

index_df = add_to_index(openai_client, index_df, text, section_reference, source, document, embedding_model, embedding_dimensions)


In [84]:
from src.index_tools import remove_from_index

# Remove the entry with this exact text from the Article 47 BCR index (the same text the add cell inserts).
text_to_delete = "What happens if local laws conflict with the standards of binding corporate rules?"
article_47_index = remove_from_index(article_47_index, text_to_delete)

In [88]:
# Make the edited frame the current Article 47 index, then show all rows for section 3.5.4.1.
article_47_index = index_df
article_47_index[article_47_index["section_reference"] == "3.5.4.1"]

Unnamed: 0,section_reference,text,source,embedding,document
25,3.5.4.1,You must ensure that the Binding Corporate Rul...,summary,"[-0.018655141815543175, -0.05344023182988167, ...",Article_47_BCR
79,3.5.4.1,What should the Binding Corporate Rules-Contro...,question,"[-0.014255059882998466, -0.07780193537473679, ...",Article_47_BCR
87,3.5.4.1,What happens if local laws conflict with the s...,question,"[-0.02774774841964245, -0.06760360300540924, -...",Article_47_BCR


In [30]:
# NOTE(review): saves whatever `index_df` currently holds to the GDPR index path; other cells
# bind `index_df` to `article_47_index`, so confirm its contents before running this.
save_parquet_data(index_df, "../inputs/index/gdpr.parquet", key)

In [31]:
# Show the rows of `index_df` whose document label is exactly "article_30_5" (case-sensitive match).
index_df[index_df['document'] == "article_30_5"]

Unnamed: 0,text,embedding,source,section_reference,document
147,Under what conditions could you be exempt from...,"[-0.03439036011695862, -0.0008122794097289443,...",question,all,article_30_5


In [9]:
import os
from regulations_rag.standard_regulation_index import load_parquet_data, save_parquet_data
# os.getenv returns None when the variable is unset.
key = os.getenv('encryption_key_gdpr')

path_to_file = "../inputs/index/article_30_5.parquet"
# NOTE(review): `index` is also the integer resume position in the summarising cells;
# binding it to a loaded index here replaces that value.
index = load_parquet_data(path_to_file, key)

In [10]:
# Disabled fix-up of the `document` label.
#index["document"] = "Article_30_5"
# NOTE(review): the displayed rows use source "questions", while the other indexes shown in this notebook use "question" — confirm which is intended.
index

Unnamed: 0,section_reference,text,source,embedding,document
0,all,"If you have fewer than 250 employees, you usua...",summary,"[-0.026057356968522072, 0.014008629135787487, ...",Article_30_5
1,all,What are the record-keeping exceptions for sma...,questions,"[0.007615841459482908, 0.02034611813724041, -0...",Article_30_5
2,all,When must smaller organisations maintain a rec...,questions,"[0.00046261242823675275, 0.006825227290391922,...",Article_30_5
3,all,What types of processing activities still requ...,questions,"[0.011152287945151329, 0.021851062774658203, -...",Article_30_5
4,all,Why is maintaining a record of processing acti...,questions,"[-0.011836612597107887, -0.011333354748785496,...",Article_30_5
5,all,What defines occasional processing?,questions,"[-0.0750560611486435, 0.04036270081996918, -0....",Article_30_5
6,all,What assistance is available for SMEs to manag...,questions,"[0.011198482476174831, 0.03537283092737198, -0...",Article_30_5
7,all,How does the principle of accountability relat...,questions,"[-0.045358121395111084, -0.04111029952764511, ...",Article_30_5


In [6]:
# Write the Article 30(5) index back to the path it was loaded from.
save_parquet_data(index, path_to_file, key)