In [10]:
import importlib

import src.valid_index
importlib.reload(src.valid_index)
from src.valid_index import get_excon_manual_index

import src.file_tools
importlib.reload(src.file_tools)
from src.file_tools import read_processed_regs_into_dataframe, get_regulation_detail, num_tokens_from_string

import src.embeddings
importlib.reload(src.embeddings)
from src.embeddings import get_ada_embedding

import src.tree_tools
importlib.reload(src.tree_tools)
from src.tree_tools import build_tree_for_regulation, split_tree

import src.summarise_and_question
importlib.reload(src.summarise_and_question)
from src.summarise_and_question import get_summary_and_questions_for


file_list = []
file_list.append('./manual/adla_manual.txt')
non_text_labels = ['Table', 'Formula', 'Example', 'Definition']

index_adla = get_excon_manual_index()
df_adla, non_text = read_processed_regs_into_dataframe(file_list=file_list, valid_index_checker=index_adla, non_text_labels=non_text_labels)
# Save the processed manual for chat
# df_adla.to_csv("./inputs/adla_manual.csv", encoding="utf-8", sep="|", index = False)
tree_adla = build_tree_for_regulation("ADLA", df_adla, valid_index_checker=index_adla)

section_summary_with_embeddings = "./tmp/summary_excon_with_embedding.parquet"
section_questions_with_embeddings = "./tmp/summary_excon_questions_with_embedding.parquet"
headings_index_file = "./tmp/headings.csv"




In [6]:
import pandas as pd

first_time = False
sectioned_df = pd.DataFrame([],columns = ["section", "text", "token_count"])
save_sectioned_df_to_file = "./tmp/adla_manual.csv"
if first_time:
    print("Loading the initial split of the tree. You will need to make changes to this as you see the data")
    # Starting at an particular parent node (can be the tree root or any child), this method splits up the 
    # branch into sections where the text does not exceed a certain word_count cap.
    sectioned_df = split_tree(tree_adla.root, df_adla, 1000, index_adla)
else:
    print("Loading the currency split of the tree so you can continue generating summaries and questions")
    sectioned_df = pd.read_csv(save_sectioned_df_to_file, encoding="utf-8", sep="|")

print(f'Total number of sections: {len(sectioned_df)}')


Loading the currency split of the tree so you can continue generating summaries and questions
Total number of sections: 92


Create or load the DataFrames that will hold the text index. Later we will add the embeddings to the same DataFrames. When we do so, loading the embeddings is slow from certain file formats like csv so we just start with a fast loading file format - parquet 

In [12]:
df_summary = None
if os.path.exists(section_summary_with_embeddings):
    df_summary = pd.read_parquet(section_summary_with_embeddings, engine='pyarrow')
    print(f"Summary data contains {len(df_summary)} lines of text")
    missing = len(df_summary[df_summary["text"] == ""])
    if missing > 0:
        print(f" -- of which there are {missing} lines that do not contain index text (e.g. sections with only definitions or indexes)")
else:
    print("Creating a new summary DataFrame")
    df_summary = pd.DataFrame([], columns = ["text", "section"])


df_questions = None
if os.path.exists(section_questions_with_embeddings):
    df_questions = pd.read_parquet(section_questions_with_embeddings, engine='pyarrow')
    print(f"Questions data contains {len(df_questions)} lines of text")    
    missing = len(df_questions[df_questions["text"] == ""])
    if missing > 0:
        print(f" -- of which there are {missing} lines that do not contain index text (e.g. sections with only definitions or indexes)")
else:
    print("Creating a new questions DataFrame")
    df_questions = pd.DataFrame([], columns = ["text", "section"])

index = None
if len(df_summary) != len(df_questions):
    print("The summary and the questions DataFrames do not have the same length")
else:
    index = len(df_summary)
    p = (index / len(sectioned_df)) * 100
    print(f'There are a total number of {len(sectioned_df)} sections to index')
    print(f"You have created {p:.2f} percent of your text index")


Summary data contains 92 lines of text
 -- of which there are 5 lines that do not contain index text (e.g. sections with only definitions or indexes)
Questions data contains 92 lines of text
 -- of which there are 5 lines that do not contain index text (e.g. sections with only definitions or indexes)
There are a total number of 92 sections to index
You have created 100.00 percent of your text index


## Create the text index 

This is a manual process. We call OpenAI and print out the answers in a format that is used to update the index text but this does need to be edited before it is added to the index so this is not automated.


In [121]:
#model = "gpt-3.5-turbo"
model="gpt-4"

reg_text = sectioned_df.loc[index]['text']
print("##############")
print(reg_text)
print("##############")

model_summary, model_questions = get_summary_and_questions_for(reg_text, model = model)

#format output
section = sectioned_df.loc[index]['section']
print(f'df_summary.loc[index, "section"] = "{section}"')
print(f'df_summary.loc[index, "text"] = "{model_summary}"')
print()
print(f'df_questions.loc[index, "section"] = "{section}"')
print(f'df_questions.loc[index, "text"] = "{model_questions}"')



##############
C.1 FinSurv Reporting System
    (J) Systems governance
        (i) Inspection manual
        The minimum information that should be contained in an inspection manual includes:
            (a) a comprehensive flow diagram clearly depicting the flow of transactions through various systems (on-boarding, transactional, accounting and FinSurv Reporting System, including the Reconciliation Module) from capturing to submission of the transactions to the Financial Surveillance Department;
            (b) an up to date list of definitions, contact details of the dedicated person(s) responsible for the reporting to Financial Surveillance Department, error handling and the Reconciliation Module; and
            (c) suitable back-up procedures (i.e. how often, where, when, by whom, the duration of storage that should be minimum five years and recovery testing). Refer to the inspection manual specimen which is available from the website: www.resbank.co.za by following the links: Hom

In [13]:
# Once the last four lines are manually checked, the edited result is copied here and the summary and question index is updated

df_summary.loc[index, "section"] = "C.1(J)"
df_summary.loc[index, "text"] = "Governance requirements for FinSurv Reporting System Include preparation of an inspection manual, submission of pre and post certification managerial letters of comfort, and an annual managerial letter of comfort, all addressed to the Financial Surveillance Department. Key elements covered are description of system transaction flows, contact information of personnel responsible for reporting, and backup procedures. The managerial letters of comfort give assurance of system compliance, risk mitigation, data accuracy, readiness for deployment, and the functionality of governance structures. The responsibility for these submissions primarily lies with the dedicated person responsible for regulatory compliance."

df_questions.loc[index, "section"] = "C.1(J)"
df_questions.loc[index, "text"] = "What is the role of the dedicated person for regulatory compliance in the FinSurv Reporting System?|What are the key elements covered in the governance requirements for the FinSurv Reporting System?"


index = index + 1
if index == len(sectioned_df):
    print("All done!")
else:
    next_section = sectioned_df.iloc[index]["section"]
    assert len(sectioned_df[sectioned_df["section"] == next_section]) == 1, "Huston, we have a problem"
    print(f'Next section is {next_section} which is on line {index}')
    p = ((index-1) / len(sectioned_df)) * 100
    print(f"You have completed {p:.2f} percent of your work")
    reg_text = sectioned_df.loc[index]['text']
    print("Next section")
    print("##############")
    print(reg_text)
    print("##############")



IndexError: single positional indexer is out-of-bounds

In [124]:
# Sometimes there are errors in the previous code block. We need to be careful when saving over any work we have already done so the 
# save step is a manual one which needs to be run regularly but without overwriting good data with bad data
df_summary.to_parquet(section_summary_with_embeddings, engine='pyarrow')
df_questions.to_parquet(section_questions_with_embeddings, engine='pyarrow')


## Changing how the sections are chunked

From time to time we will see instances whe the initial chunk size needs to be adjusted and nodes need to be expanded or collapsed. We do this in two stages. First we can experiment with a new "token_limit_per_chunk" for the node in question and once we find the chunking solution we are after, we remove the old chunks and replace them with the new chunks.


In [91]:
node_str = "C.1(D)(v)"
# get the list of indicies that start with this string
index_list = sectioned_df.index[sectioned_df['section'].str.startswith(node_str)].tolist()
is_consecutive = all(x+1 == y for x, y in zip(index_list[:-1], index_list[1:]))
assert is_consecutive, "The list of indicies that start with the node string is not consecutive so the rest of the logic here will not hold"
start_index_to_replace = index_list[0]
end_index_to_replace = index_list[-1] + 1
print(f"This will remove {len(index_list)} row(s) from sectioned_df. From {start_index_to_replace} to {end_index_to_replace}")

# Get the new set of indicies assuming a different chunking length
node = tree_adla.get_node(node_str)
token_limit_per_chunk = 1300

tmp_df = split_tree(node, df_adla, token_limit_per_chunk, index_adla)

print(f"... and will replace them with {len(tmp_df)} row(s)")
print(tmp_df)


This will remove 9 row(s) from sectioned_df. From 82 to 91
... and will replace them with 1 row(s)
     section                                               text  token_count
0  C.1(D)(v)  C.1 FinSurv Reporting System\n    (D) Offshori...         1208


Replace the node and all its children with the new DataFrame with a different word_count limit

In [92]:
def replace_rows(original_df, updated_section_df, start_row, end_row):
    before = original_df.iloc[:start_row]
    after = original_df.iloc[end_row:]
    new_df = pd.concat([before, updated_section_df, after]).reset_index(drop=True)
    return new_df

print(f"The original data consisted of {sectioned_df} chunks")
sectioned_df = replace_rows(sectioned_df, tmp_df, start_row=start_index_to_replace, end_row=end_index_to_replace)
print(f"Post the update, the data consists of {sectioned_df} chunks")

Jesus saves! 
But only if he is happy with the results. Check first!

In [94]:
sectioned_df.to_csv(save_sectioned_df_to_file, encoding="utf-8", sep="|", index = False)

## Definitions

In [29]:
import re
import pandas as pd

def count_leading_spaces(s):
    match = re.match(r'^\s*', s)  # Matches leading whitespace
    return len(match.group(0))

add_embeddings = False
ad = False # False = adla data
if ad:
    definitions_to_process = '#Definition 1'
    if definitions_to_process == '#Definition 1':
        excon_definitions_and_embeddings_file = "./tmp/ad_definitions_with_embeddings.csv"
        exclude_first_line = True
        start_line = 1
    elif definitions_to_process == '#Definition 2':
        excon_definitions_and_embeddings_file = "./tmp/ad_insurance_definitions_with_embeddings.csv"
        exclude_first_line = False
        start_line = 0
    elif definitions_to_process == '#Definition 3':
        excon_definitions_and_embeddings_file = "./tmp/ad_securities_definitions_with_embeddings.csv"
        exclude_first_line = True
        start_line = 1
    else:
        raise NotImplemented("Only implemented for excon Definitions 1, 2 and 3")
else:
    definitions_to_process = '#Definition 2'
    if definitions_to_process == '#Definition 1':
        excon_definitions_and_embeddings_file = "./tmp/adla_definitions.csv"
        exclude_first_line = True
        start_line = 1
    elif definitions_to_process == '#Definition 2':
        excon_definitions_and_embeddings_file = "./tmp/adla_cloud_definitions.csv"
        exclude_first_line = False
        start_line = 0
    # elif definitions_to_process == '#Definition 3':
    #     excon_definitions_and_embeddings_file = "./tmp/excon_securities_definitions_with_embeddings.csv"
    #     exclude_first_line = True
    #     start_line = 1
    else:
        raise NotImplemented("Only implemented for excon Definitions 1 and 2")

excon_manual_definitions = []
#raw_list = non_text['Definition']['#Definition 1']
raw_list = non_text['Definition'][definitions_to_process]
number_of_spaces = count_leading_spaces(raw_list[start_line])
if number_of_spaces % 4 != 0:
    raise ValueError(f"This line does not have an indent which is a multiple of 4: {raw_list[start_line]}")

current_line = raw_list[start_line]

current_line_number_of_spaces = count_leading_spaces(current_line)
if current_line_number_of_spaces != number_of_spaces:
    print(f"current_line_number_of_spaces: {current_line_number_of_spaces}")
    print(f'number_of_spaces: {number_of_spaces}')
    raise ValueError(f"This line does not have the correct indentation: {current_line}")

processing_table = False
for line_number in range(start_line,len(raw_list)-1):
    next_line = raw_list[line_number + 1]
    next_line_number_of_spaces = count_leading_spaces(next_line)
    if next_line_number_of_spaces % 4 != 0:
        raise ValueError(f"This line does not have an indent which is a multiple of 4: {next_line_number_of_spaces}")

    if current_line_number_of_spaces == next_line_number_of_spaces:
        current_line = current_line.lstrip()
        if "|" in current_line: # processing something with table formatting
            processing_table = True
            split_line = [x.strip() for x in current_line.split("|")]
            excon_manual_definitions.append(split_line)
        else:
            excon_manual_definitions.append(current_line)
        current_line = next_line
    else:
        current_line = current_line + "\n" + next_line

# add the last entry
current_line = current_line.lstrip()
if processing_table:
    split_line = [x.strip() for x in current_line.split("|")]
    excon_manual_definitions.append(split_line)
else:
    excon_manual_definitions.append(current_line.lstrip())

if processing_table:
    headings = excon_manual_definitions[0]
    excon_manual_definitions.pop(0)
    df_excon_definitions = pd.DataFrame(excon_manual_definitions, columns=headings) 
else:
    df_excon_definitions = pd.DataFrame(excon_manual_definitions, columns=["Definition"])

if add_embeddings:
    df_excon_definitions["Embedding"] =df_excon_definitions["Definition"].apply(get_ada_embedding)

df_excon_definitions.to_csv(excon_definitions_and_embeddings_file, sep="|", encoding='utf-8', index = False)



## Section headings are also a good index

In [36]:
df_adla[df_adla["Heading"] == True]

Unnamed: 0,Indent,Reference,Text,Document,Page,Heading,full_reference,word_count
0,0,Legal context,,,,True,Legal context,0
5,0,Introduction,,,,True,Introduction,0
15,0,A.2,Authorised entities,,,True,A.2,2
16,1,(A),Authorised Dealers in foreign exchange with li...,,,True,A.2(A),8
18,1,(B),Authorised Dealers,,,True,A.2(B),2
...,...,...,...,...,...,...,...,...
678,2,(i),Service related payments,,,True,C.1(H)(i),3
698,2,(ii),Transactions relating to income,,,True,C.1(H)(ii),4
704,2,(iii),Transfers of a current nature,,,True,C.1(H)(iii),5
713,1,(I),Reconciliation module,,,True,C.1(I),2


In [32]:
# step 1 is to remove all text that is not tagged as a heading leaving only the index and headings
toc_file = "./tmp/section_numbers_and_headings.txt" # Note this is a temporary file and will be deleted in a few cells time
df = df_adla
index = index_adla


written_references = set() # only write each reference once
with open(toc_file, 'w', encoding = 'utf-8') as f:
    for _, row in df.iterrows():
        if row['full_reference'] not in written_references:
            written_references.add(row['full_reference'])
            s = ' ' * row['Indent'] * 4 + row['Reference']
            # skip the text for into and legal
            if row['Heading'] and (row['Indent'] == 0 and row['Text'].strip() in index.exclusion_list):
                s = s
            elif row['Heading']:               
                s += " " + row['Text']
            f.write(s + '\n')

In [42]:
# step 2) Remove all lines that do not have text. Since the only text is for the headers, this removes everything that is not a header.
#         Note however that the remaining "headers" will not contain the (#headers) markdown so will be treated as text
import src.file_tools
importlib.reload(src.file_tools)
from src.file_tools import process_regulations

excon_headers = './tmp/non_empty_headings_excon.txt'  # Note this is a temporary file and will be deleted in a few cells time
files_as_list = []
files_as_list.append(toc_file)
df_toc, non_text_toc = process_regulations(files_as_list, valid_index_checker=index_adla, non_text_labels=non_text_labels)

for index, row in df_toc.iterrows():
    if row['Text'] != "":
        df_toc.at[index, 'Heading'] = True

tree_toc = build_tree_for_regulation("adla_toc", df_toc, valid_index_checker=index_adla)
l = tree_toc._list_node_children(tree_toc.root)
with open(excon_headers, 'w', encoding = 'utf-8') as f:
    f.write(l)

In [49]:
# step 3) The file that contains the headers (now as text because they are missing the (#heading) markdown) and load it up
#         as if it were the regs themselves
files_as_list = []
files_as_list.append(excon_headers)
df_non_empty_toc, non_text_toc = process_regulations(files_as_list, valid_index_checker=index_adla, non_text_labels=non_text_labels)
for index, row in df_non_empty_toc.iterrows():
    if row['Text'] != "":
        df_non_empty_toc.at[index, 'Heading'] = True

non_empty_tree_toc = build_tree_for_regulation("Excon", df_non_empty_toc, valid_index_checker=index_adla)


In [58]:
# Construct the full reference of each heading and use these as a key for a dictionary where the heading is the value
import os

def get_leaf_headings(root):
    leaf_headings = {}

    def recurse(node, heading):
        # Add ". " only if it's not the root node and the node's heading_text is not empty
        new_heading = heading + (". " + node.heading_text if heading and node.heading_text else node.heading_text)
        if not node.children:  # This is a leaf node.
            leaf_headings[node.full_node_name] = new_heading
        else:  # This is not a leaf node. We continue the recursion.
            for child in node.children:
                recurse(child, new_heading)

    recurse(root, '')  # We start the recursion from the root, with an empty heading.
    return leaf_headings

leaf_headings = get_leaf_headings(non_empty_tree_toc.root)
df_section_headings = pd.DataFrame(list(leaf_headings.items()), columns=['section', 'text'])
print(f'Excon Manual contains {len(leaf_headings)} section headings')

df_section_headings.to_csv(headings_index_file, encoding = "utf-8", sep = "|", index = False)
if os.path.exists(toc_file):
    os.remove(toc_file)
if os.path.exists(excon_headers):
    os.remove(excon_headers)



Excon Manual contains 84 section headings


## Embed the three indexes and the definitions

In [8]:
import pandas as pd
# Definitions
df_definitions = pd.read_csv("./tmp/adla_definitions.csv", encoding="utf-8", sep="|")
df_definitions["source"] = "all"
df_dfn_tmp = pd.read_csv("./tmp/adla_cloud_definitions.csv", encoding="utf-8", sep="|")
df_dfn_tmp.drop("Concept", axis=1, inplace=True)
df_dfn_tmp.rename(columns={"Description": "Definition"}, inplace=True)
df_dfn_tmp["source"] = "cloud"
df_definitions = pd.concat([df_definitions, df_dfn_tmp], ignore_index = True)
df_definitions['Embedding'] = df_definitions['Definition'].apply(get_ada_embedding)
df_definitions.to_parquet("./inputs/adla_definitions.parquet", engine='pyarrow')

In [15]:
# Index

df_index = pd.read_parquet(section_questions_with_embeddings, engine='pyarrow')
df_index = df_index[df_index["text"] != ""] # remove rows that have 'text' == ""
# the 'text' column for the questions may contain multiple questions separated by a "|". The next line expands these rows
# so the value in 'text' only contains one question
df_index = df_index.drop("text", axis=1).join(df_index["text"].str.split("|", expand=True).stack().reset_index(level=1, drop=True).rename("text"))
df_index.reset_index(drop=True, inplace=True)
df_index["source"] = "question"

df_tmp = pd.read_parquet(section_summary_with_embeddings, engine='pyarrow')
df_tmp["source"] = "summary"

df_tmp_2 = pd.read_csv(headings_index_file, encoding = "utf-8", sep = "|")
df_tmp_2["source"] = "heading"

df_index = pd.concat([df_index, df_tmp, df_tmp_2], ignore_index = True)
df_index = df_index[df_index["text"]!= ""]
df_index = df_index[df_index["text"].notna()] # Remove any NaN's
df_index.reset_index(drop=True, inplace=True)

df_index['Embedding'] = df_index['text'].apply(get_ada_embedding)
df_index.to_parquet("./inputs/adla_index.parquet", engine='pyarrow')


InvalidRequestError: We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)

In [26]:
# if there is an error somewhere in the generation of the embedding and you need to find it, this is a hacky way to do that
increment = 10
for i in range(0, len(df_index), increment):
    chunk = df_index.iloc[i:i+increment].copy()
    chunk["Embedding"] = chunk["text"].apply(get_ada_embedding)
    df_index.loc[chunk.index, "Embedding"] = chunk["Embedding"]
    print(f"Completed {i+increment} lines")


#df_index


Completed 250 lines
Completed 260 lines
Completed 270 lines
Completed 280 lines
Completed 290 lines
Completed 300 lines
Completed 310 lines
Completed 320 lines
Completed 330 lines
