### To add new content to the manual_plus document and create an index for it

In [12]:
# --- standard library ---
import importlib
import os

# --- third party ---
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

# --- project ---
from regulations_rag.file_tools import load_parquet_data, save_parquet_data

# Reload data_tools.index_tools before pulling names from it, so edits made to
# that module while the kernel is running are picked up.
import data_tools.index_tools
importlib.reload(data_tools.index_tools)
from data_tools.index_tools import update_text_in_index, add_to_index, remove_from_index

from cemad_rag.cemad_reference_checker import CEMADReferenceChecker

# Reference checker shared by the cells below.
cemad_reference_checker = CEMADReferenceChecker()

# Secrets are read from the environment (populated from a .env file when present);
# nothing is hard-coded in the notebook.
load_dotenv()
secret_name = "OPENAI_API_KEY_CEMAD"
openai_api_key = os.getenv(secret_name)
openai_client = OpenAI(api_key=openai_api_key)
key = os.getenv('DECRYPTION_KEY_CEMAD')



In [67]:
# Load the current manual_plus table (pipe-delimited). With keep_default_na=False
# and na_values="", only empty fields are parsed as missing; they are then turned
# back into empty strings so every cell holds a plain value.
existing_df = pd.read_csv(
    "./inputs/documents/ad_manual_plus.csv",
    sep="|",
    encoding='utf-8',
    keep_default_na=False,
    na_values="",
).fillna("")

The text to add should be in the same format as the rest of the manual, so that the existing tools can be reused to process it.

In [69]:
# New manual_plus content to add. A later cell writes this string to a scratch
# file and parses it with process_regulations, so it has to follow the manual's
# own layout. As used here: one entry per line, deeper levels indented further,
# and "(#Heading)" appended to lines that are headings.
# NOTE(review): the exact parsing rules live in process_regulations — confirm
# there how the unlabelled continuation paragraphs under (i) are handled.
text = """
Z.1 References (#Heading)
    (C) Crypto Assets (#Heading)
        (i) According to the Reserve Bank's official [FAQ page](https://www.resbank.co.za/en/home/what-we-do/financial-surveillance/FinSurvFAQ) "The SARB does not currently oversee, supervise or regulate crypto assets, which were previously referred to as virtual currencies, but is continuing to monitor this evolving area. The SARB’s position on crypto assets remains as set out in the [2014 Position Paper on Virtual Currencies](https://www.resbank.co.za/content/dam/sarb/what-we-do/financial-surveillance/general-public/Virtual%20Currencies%20Position%20Paper%20%20Final_02of2014.pdf)"
        More recently, the Crypto Assets Regulatory (CAR) Working Group (WG) published a [position paper](https://www.treasury.gov.za/comm_media/press/2021/IFWG_CAR%20WG_Position%20paper%20on%20crypto%20assets_Final.pdf) on Crypto Assets where they set out 25 recommendations for a revised South African policy, legal and regulatory position on crypto assets and related activities. This is intended to provide a roadmap to putting in place a framework for regulating crypto asset service providers in South Africa.
        When these recommendations make it into regulations they will be included in this document. Until then please refer to the links to see the official position and where we may be heading. 
"""
# sections_referenced is a column that is used when CEMAD is updated, to ensure
# these additions can be kept up to date with changes in the manual.
# It should be a dictionary that maps a section of the new text to the manual
# sections it relies on, e.g.
#     {"Z.1(A)(i)(c)": "B.4(B)(i), B.4(B)(ii), B.4(B)(iv)(a)"}
# where every key needs to exist as a section in the text above.
sections_referenced = {}

# Index entry for the new text: section reference -> the question it answers.
# This has to be a dict (colon). Written with a comma it becomes an unordered
# set, which loses the pairing between the reference and its question.
index_for_new_text = {"Z.1(C)(i)": "What do the regulations say about Crypto?"}


In [72]:
from file_tools.file_tools import process_regulations

# process_regulations is given a list of file paths, so the new text is staged in
# a scratch file for the duration of the call.
tmp_folder = './tmp'
file_name = tmp_folder + '/text_to_process'
# required for the tools
non_text_labels = ['Table', 'Formula', 'Example', 'Definition']

# Ensure the directory exists
os.makedirs(tmp_folder, exist_ok=True)
try:
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text)

    file_list = [file_name]
    df_from_txt, non_text = process_regulations(file_list, cemad_reference_checker, non_text_labels)
finally:
    # Clean up even when writing or parsing fails, so a failed run does not
    # leave the scratch file behind.
    if os.path.exists(file_name):
        os.remove(file_name)
    # ./tmp may have existed before this cell ran (exist_ok=True above). Only
    # remove it when it is empty: os.rmdir raises on a non-empty directory and
    # unrelated files must not block or be affected by this cell.
    if os.path.isdir(tmp_folder) and not os.listdir(tmp_folder):
        os.rmdir(tmp_folder)

# Now add in the sections_referenced column; rows without an entry keep "".
df_from_txt['sections_referenced'] = ""
# The loop variables are deliberately not named `key`/`value`: `key` already
# holds the decryption key read from the environment in the setup cell, and a
# top-level loop variable would silently overwrite it.
for section_ref, referenced in sections_referenced.items():
    matches = df_from_txt["section_reference"] == section_ref
    assert matches.any(), (
        f"sections_referenced key {section_ref!r} is not a section in the new text"
    )
    df_from_txt.loc[matches, "sections_referenced"] = referenced

# The new rows are appended to existing_df, so both frames must have the same
# columns in the same order.
assert existing_df.columns.to_list() == df_from_txt.columns.to_list(), (
    "column mismatch between the existing manual_plus table and the parsed text: "
    f"{existing_df.columns.to_list()} != {df_from_txt.columns.to_list()}"
)

# Filter rows in df_from_txt where section_reference is not already in existing_df
filtered_df = df_from_txt[~df_from_txt['section_reference'].isin(existing_df['section_reference'])]

In [74]:
# Stack the new rows under the existing manual_plus rows and renumber the index
# from zero, then display the result.
combined_df = pd.concat(objs=[existing_df, filtered_df], axis=0, ignore_index=True)
combined_df

Unnamed: 0,indent,reference,text,document,page,heading,section_reference,word_count,sections_referenced
0,0,Z.1,References,,,True,Z.1,1,
1,1,(A),Commodities,,,True,Z.1(A),1,
2,2,(i),Exchange Control treats Gold extensively in it...,,,False,Z.1(A)(i),0,"B.2(B)(i)(r), B.2(I)(iii), D.1(G)(v)(b)(aa), D..."
3,1,(B),Travel,,,True,Z.1(B),1,
4,2,(i),Summary Travel Rules,,,True,Z.1(B)(i),3,
5,3,(a),There is no limit on the amount of Rand that m...,,,False,Z.1(B)(i)(a),23,B.4(B)(i)
6,3,(b),Adult residents (above 18 years old) may use a...,,,False,Z.1(B)(i)(b),28,B.4(A)(ii)
7,3,(c),Residents under 18 years old are permitted a t...,,,False,Z.1(B)(i)(c),22,"B.4(B)(i), B.4(B)(ii), B.4(B)(iv)(a)"
8,3,(d),"Foreign exchange, in terms of a travel allowan...",,,False,Z.1(B)(i)(d),38,B.4(B)(iii)
9,3,(e),Foreign currency for travel may not be bought ...,,,False,Z.1(B)(i)(e),21,B.4(B)(x)(a)


In [75]:
# NOTE(review): the output saved with this cell is
# "ModuleNotFoundError: No module named 'regulations_rag.reg_tools'", so the cell
# did not complete when it was last run. The stale module path is presumably
# inside file_tools.tree_tools or something it imports — TODO confirm and fix.
from file_tools.tree_tools import build_tree_for_regulation

# Build a tree for the "manual_plus" document from the combined table and print it.
tree = build_tree_for_regulation("manual_plus", combined_df, cemad_reference_checker)
tree.print_tree()

ModuleNotFoundError: No module named 'regulations_rag.reg_tools'

In [57]:
# Debug aid for the column-equality assertion: print both column lists.
# The comparison is against existing_df, the frame that assertion uses. The
# name `df` is only assigned in a scratch cell further down the notebook, so
# referring to it here fails on a fresh kernel.
print(df_from_txt.columns.to_list())
print(existing_df.columns.to_list())

['indent', 'reference', 'text', 'document', 'page', 'heading', 'section_reference', 'word_count', 'sections_referenced']
['indent', 'reference', 'text', 'document', 'page', 'heading', 'section_reference', 'word_count', 'sections_referenced']


In [14]:
# NOTE(review): `input_section` is not assigned in any cell shown here, so this
# cell relies on earlier kernel state — TODO define it explicitly before use.
assert cemad_reference_checker.is_valid(input_section)

# Split the reference into its parts: the last part is the row's own label and
# the number of parts before it is used as the indent level.
components = cemad_reference_checker.split_reference(input_section)
reference = components[-1]
indent = len(components) - 1

print(f"indent: {indent}", f"reference: {reference}", sep="\n")
# Column order of the manual_plus table, for reference:
# indent|reference|text|document|page|heading|section_reference|word_count|sections_referenced

indent: 1
reference: (C)


In [9]:
# Scratch cell: build a table of contents over the regulation table exposed by
# CEMADReader, rooted at "AD".
from cemad_rag.cemad_reference_checker import CEMADReferenceChecker
from cemad_rag.cemad_reader import CEMADReader

from regulations_rag.regulation_table_of_content import StandardTableOfContent, split_tree

reference_checker = CEMADReferenceChecker()
reader = CEMADReader()
# NOTE(review): `df` is a very generic notebook-global name and this cell sits
# below cells that were executed after it; consider a descriptive name.
df = reader.regulation_df
toc = StandardTableOfContent(
    root_node_name="AD",
    index_checker=reference_checker,
    regulation_df=df,
)

Unnamed: 0,indent,reference,text,document,page,heading,section_reference,word_count,sections_referenced
0,0,Z.1,References,,,True,Z.1,1,
1,1,(A),Commodities,,,True,Z.1(A),1,
2,2,(i),Exchange Control treats Gold extensively in it...,,,False,Z.1(A)(i),0,"B.2(B)(i)(r), B.2(I)(iii), D.1(G)(v)(b)(aa), D..."
3,1,(B),Travel,,,True,Z.1(B),1,
4,2,(i),Summary Travel Rules,,,True,Z.1(B)(i),3,
5,3,(a),There is no limit on the amount of Rand that m...,,,False,Z.1(B)(i)(a),23,B.4(B)(i)
6,3,(b),Adult residents (above 18 years old) may use a...,,,False,Z.1(B)(i)(b),28,B.4(A)(ii)
7,3,(c),Residents under 18 years old are permitted a t...,,,False,Z.1(B)(i)(c),22,"B.4(B)(i), B.4(B)(ii), B.4(B)(iv)(a)"
8,3,(d),"Foreign exchange, in terms of a travel allowan...",,,False,Z.1(B)(i)(d),38,B.4(B)(iii)
9,3,(e),Foreign currency for travel may not be bought ...,,,False,Z.1(B)(i)(e),21,B.4(B)(x)(a)
