### Add new content to the manual_plus document and create index entries for it

In [93]:
import os
import pandas as pd
import importlib
from openai import OpenAI
from regulations_rag.file_tools import load_parquet_data, save_parquet_data

import data_tools.index_tools
importlib.reload(data_tools.index_tools)
from data_tools.index_tools import update_text_in_index, add_to_index, remove_from_index

from cemad_rag.cemad_reference_checker import CEMADReferenceChecker
from dotenv import load_dotenv

# Reference checker shared by the processing, tree-building and indexing cells below.
cemad_reference_checker = CEMADReferenceChecker()

# load_dotenv() runs before the environment is read; presumably it loads a local .env file
# so that the two keys below are available — confirm against the project setup.
load_dotenv()
secret_name = "OPENAI_API_KEY_CEMAD"
openai_api_key = os.environ.get(secret_name)
openai_client = OpenAI(api_key=openai_api_key)
# Key passed to load_parquet_data / save_parquet_data for the index file.
decryption_key = os.environ.get('DECRYPTION_KEY_CEMAD')



In [80]:
# Load the current manual_plus document (pipe-separated CSV).
# Only empty fields are treated as missing on read, and those are immediately turned back
# into empty strings, so every column holds plain strings rather than NaN.
existing_document_csv = "./inputs/documents/ad_manual_plus.csv"
existing_df = pd.read_csv(
    existing_document_csv,
    sep="|",
    encoding="utf-8",
    na_values="",
    keep_default_na=False,
).fillna("")

The text to add should be in the same format as the rest of the manual, so that the existing tools can be reused to process it.

In [89]:
# New content for the manual_plus document, written in the same indented outline format as
# the rest of the manual. It is written to a scratch file and parsed by process_regulations
# in a later cell, so the string itself must not be reformatted.
text = """
Z.1 References (#Heading)
    (C) Crypto Assets (#Heading)
        (i) According to the Reserve Bank's official [FAQ page](https://www.resbank.co.za/en/home/what-we-do/financial-surveillance/FinSurvFAQ) "The SARB does not currently oversee, supervise or regulate crypto assets, which were previously referred to as virtual currencies, but is continuing to monitor this evolving area. The SARB’s position on crypto assets remains as set out in the [2014 Position Paper on Virtual Currencies](https://www.resbank.co.za/content/dam/sarb/what-we-do/financial-surveillance/general-public/Virtual%20Currencies%20Position%20Paper%20%20Final_02of2014.pdf)"
        More recently, the Crypto Assets Regulatory (CAR) Working Group (WG) published a [position paper](https://www.treasury.gov.za/comm_media/press/2021/IFWG_CAR%20WG_Position%20paper%20on%20crypto%20assets_Final.pdf) on Crypto Assets where they set out 25 recommendations for a revised South African policy, legal and regulatory position on crypto assets and related activities. This is intended to provide a roadmap to putting in place a framework for regulating crypto asset service providers in South Africa.
        When these recommendations make it into regulations they will be included in this document. Until then please refer to the links to see the official position and where we may be heading. 
"""
# sections_referenced is a column that is used when CEMAD is updated, so that these additions
# can be kept in step with changes in the manual.
# It should be a dictionary of the form {"Z.1(A)(i)(c)": "B.4(B)(i), B.4(B)(ii), B.4(B)(iv)(a)"},
# where each key must exist as a section reference in the text above (a later cell asserts this).
sections_referenced = {}

# Index entries for the new text: one [section_reference, question] pair per row.
# Each question is embedded and appended to the index in the cells further down.
index_for_new_text = [
    ["Z.1(C)(i)", "What do the regulations say about Crypto?"]
]


In [72]:
from file_tools.file_tools import process_regulations

# process_regulations takes a list of file paths, so the new text is staged in a scratch file.
tmp_folder = './tmp'
os.makedirs(tmp_folder, exist_ok=True)
file_name = tmp_folder + '/text_to_process'
try:
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text)

    file_list = [file_name]
    # required for the tools
    non_text_labels = ['Table', 'Formula', 'Example', 'Definition']
    df_from_txt, non_text = process_regulations(file_list, cemad_reference_checker, non_text_labels)
finally:
    # Clean up even when processing raises, so a failed run does not leave the scratch file behind.
    if os.path.exists(file_name):
        os.remove(file_name)
    try:
        os.rmdir(tmp_folder)
    except OSError:
        # os.rmdir only removes empty directories. If ./tmp already held other files
        # it is not ours to delete, and that must not fail the cell.
        pass

# now add in the sections_referenced
df_from_txt['sections_referenced'] = ""
for key, value in sections_referenced.items():
    matching_rows = df_from_txt["section_reference"] == key
    assert matching_rows.any(), f"sections_referenced key {key!r} does not appear in the new text"
    df_from_txt.loc[matching_rows, "sections_referenced"] = value

# The new rows must have exactly the same columns, in the same order, as the existing document.
assert existing_df.columns.to_list() == df_from_txt.columns.to_list(), (
    f"Column mismatch: existing={existing_df.columns.to_list()} new={df_from_txt.columns.to_list()}"
)

# Filter rows in df_from_txt where section_reference is not already in existing_df
# (e.g. the shared "Z.1" heading), then append only the genuinely new rows.
filtered_df = df_from_txt[~df_from_txt['section_reference'].isin(existing_df['section_reference'])]
combined_df = pd.concat([existing_df, filtered_df], ignore_index=True)

In [79]:
from file_tools.tree_tools import build_tree_for_regulation

# Rebuild the document tree from the combined dataframe and print it, as a visual check
# that the new sections landed under the expected parents.
tree = build_tree_for_regulation(
    "manual_plus",
    combined_df,
    cemad_reference_checker,
)
tree.print_tree()

manual_plus []
|-- Z.1 [References]
|   |-- (A) [Commodities]
|   |   +-- (i) []
|   |-- (B) [Travel]
|   |   +-- (i) [Summary Travel Rules]
|   |       |-- (a) []
|   |       |-- (b) []
|   |       |-- (c) []
|   |       |-- (d) []
|   |       |-- (e) []
|   |       |-- (f) []
|   |       |-- (g) []
|   |       |-- (h) []
|   |       |-- (i) []
|   |       |-- (j) []
|   |       +-- (k) []
|   +-- (C) [Crypto Assets]
|       +-- (i) []
+-- Z.2 [Metadata]
    +-- (A) [Version]
        +-- (i) []


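Save the updated dataframe. Note that it is written to a new file (`ad_manual_plus_1.csv`) rather than over the original, so the original document is left untouched.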

In [85]:
# Written to a separate "_1" file rather than to existing_document_csv, so the original
# document is not overwritten. Same pipe-separated layout as the file that was read.
existing_document_csv_1 = "./inputs/documents/ad_manual_plus_1.csv"
combined_df.to_csv(
    existing_document_csv_1,
    sep="|",
    encoding='utf-8',
    na_rep='',
    index=False,
)


Add the new sections to the index.

In [105]:
# Load the existing index; the same path is later passed to save_parquet_data.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for this path —
# confirm the file name/location (and working directory) before relying on this cell.
index_file = "./inputs/index/ad_index_plus.parquet"
existing_index_df = load_parquet_data(index_file, decryption_key)

Could not find the file ./inputs/index/ad_index_plus.parquet


FileNotFoundError: Could not find the file ./inputs/index/ad_index_plus.parquet

In [97]:
from regulations_rag.embeddings import get_ada_embedding
from regulations_rag.embeddings import  EmbeddingParameters
embedding_parameters = EmbeddingParameters("text-embedding-3-large", 1024)

# Validate every index row BEFORE requesting any embedding, so a mistake in a later row is
# caught up front instead of after earlier rows have already been sent to the embedding
# service (get_ada_embedding takes the OpenAI client; presumably each call is a paid request).
added_sections = filtered_df["section_reference"].to_list()
for row in index_for_new_text:
    assert len(row) == 2, f"Expected [section_reference, question], got: {row}"
    assert cemad_reference_checker.is_valid(row[0]), f"Invalid section reference: {row[0]}"
    assert row[0] in added_sections, (
        f"{row[0]} is not one of the newly added sections: {added_sections}"
    )

# Build the rows in the index column order: section_reference, text, source, embedding, document.
list_to_add_to_index = []
for section_reference, question in index_for_new_text:
    embedding = get_ada_embedding(openai_client, question, embedding_parameters.model, embedding_parameters.dimensions)
    list_to_add_to_index.append([section_reference, question, "document", embedding, "CEMAD_User_Queries"])

In [100]:
# Turn the prepared rows into a dataframe and confirm that its columns line up exactly
# (names and order) with the existing index before the two are concatenated.
index_columns = ["section_reference", "text", "source", "embedding", "document"]
df_append_to_index = pd.DataFrame(list_to_add_to_index, columns=index_columns)
assert list(df_append_to_index.columns) == list(existing_index_df.columns)

In [102]:
# Append the new index rows below the existing ones and renumber the row index from 0.
combined_index_df = pd.concat(
    (existing_index_df, df_append_to_index),
    ignore_index=True,
)

In [107]:
# Write the combined index back to index_file, the same path the existing index was loaded
# from, using the same key that was passed to load_parquet_data.
save_parquet_data(combined_index_df, index_file, decryption_key)