In [1]:
import re
import fitz
import pandas as pd

In [2]:
# Reload src.valid_index before importing from it, so that edits made to the module
# during this session are picked up without restarting the kernel.
import importlib
import src.valid_index
importlib.reload(src.valid_index)
from src.valid_index import get_popia_act_index

# Index checker for the POPIA Act; the functions below read its
# `index_patterns` and `exclusion_list` attributes.
popia_index = get_popia_act_index()

In [3]:
# Path of the source PDF, relative to the notebook's working directory.
pdf_file = "./pdf/3706726-11act4of2013protectionofpersonalinforcorrect.pdf"
# `doc` is passed to output_doc_as_text() in a later cell.
doc = fitz.open(pdf_file)


In [72]:
def clean_text(text, valid_index,):
    r"""Flatten the raw text of one page into a single string with "\n" break markers.

    Blank lines are dropped and every remaining line is stripped. A "\n" is
    prepended to lines that should start a new logical line in the output:

    * lines that are entirely upper case (mostly headings),
    * lines starting with "Condition" or "Part",
    * lines starting with the opening double quote "‘‘" (mostly definitions),
    * lines matching any of ``valid_index.index_patterns[1:]``.

    A line matching ``valid_index.index_patterns[0]`` is treated as the first
    body line of a section whose heading was the previously emitted line: the
    matched index is moved in front of that heading and the rest of the line
    continues as body text. If there is no previously emitted line (the page
    starts with such a line) the line is kept intact and simply starts a new
    logical line.

    Args:
        text: Raw page text with physical lines separated by "\n".
        valid_index: Object exposing ``index_patterns``, a sequence of regex
            patterns usable with ``re.match``.

    Returns:
        The cleaned lines joined by single spaces.
    """
    lines = text.split("\n")
    cleaned_lines = []
    previous_line = ""
    for line in lines:
        if line.strip():  # If the line contains text other than whitespace
            line_text = line.strip()
            # The first two checks are deliberately left as independent `if`s:
            # a line that gets a "\n" here still flows through the chain below.
            if line_text.isupper(): # mostly headings
                line_text = "\n" + line_text
            if line_text.startswith("Condition"):
                line_text = "\n" + line_text
            if line_text.startswith("Part"):
                line_text = "\n" + line_text
            elif line_text.startswith("‘‘"): # mostly definitions
                line_text = "\n" + line_text
            else:
                pattern = valid_index.index_patterns[0]
                match = re.match(pattern, line_text)
                if match:
                    if cleaned_lines:
                        matched_value = match.group()
                        # +1 skips the character following the index (presumably a space).
                        remaining_line = line_text[match.end()+1:]
                        # Catch the section heading after the fact: the previous
                        # line becomes "<index> <heading>" on a line of its own.
                        cleaned_lines[-1] = "\n" + matched_value + " " + previous_line + "\n"
                        line_text = remaining_line
                    else:
                        # Nothing has been emitted yet, so there is no heading to
                        # attach the index to; keep the line whole on a new line
                        # instead of failing on cleaned_lines[-1].
                        line_text = "\n" + line_text
                for pattern in valid_index.index_patterns[1:]:
                    if re.match(pattern, line_text):
                        line_text = "\n" + line_text
            previous_line = line_text
            cleaned_lines.append(line_text)
    # Entries can become empty when a section line held nothing but its index.
    cleaned_lines = [line for line in cleaned_lines if line.strip()]
    return " ".join(cleaned_lines)
    
def output_doc_as_text(pdf_doc, valid_index, start_page = 0, end_page = 0, header_size=80, footer_size=0, right_margin=130):
    """Extract and clean the text of a range of pages from an open PDF document.

    Each page is clipped to a rectangle before extraction, every page's text is
    passed through ``clean_text`` and the results are concatenated without a
    separator. Finally the "ﬁ"/"ﬂ" ligatures are expanded and the doubled
    typographic quotes are replaced by a plain double quote.

    Args:
        pdf_doc: Open document supporting ``len()`` and integer indexing that
            yields pages with ``rect`` and ``get_text``.
        valid_index: Index checker forwarded to ``clean_text``.
        start_page: 0-based index of the first page to extract.
        end_page: 0-based index one past the last page to extract. ``0`` or a
            value beyond the document length means "up to the last page".
        header_size: Amount cut from the top of each page, in page coordinates.
        footer_size: Amount cut from the bottom of each page.
        right_margin: Amount cut from the right edge of each page. The default
            of 130 is the value that used to be hard-coded (presumably to
            exclude margin content on the right -- confirm against the PDF).

    Returns:
        The combined cleaned text, or an empty string when the page range is
        empty or inverted.
    """
    if end_page == 0 or end_page > len(pdf_doc):
        end_page = len(pdf_doc)
    if end_page < start_page:
        print("End page before start page! Doing nothing")
        return ''

    page_texts = []
    for page_number in range(start_page, end_page):
        page = pdf_doc[page_number]
        tl = page.rect[0], page.rect[1]  # top-left corner (x0, y0)
        br = page.rect[2], page.rect[3]  # bottom-right corner (x1, y1)
        rect = fitz.Rect(tl[0], tl[1]+header_size, br[0]-right_margin, br[1]-footer_size)
        raw_text = page.get_text('text', clip=rect)
        page_texts.append(clean_text(raw_text, valid_index))

    # Normalise once over the full text rather than re-scanning the growing
    # string after every page; the result is the same.
    combined_text = ''.join(page_texts)
    combined_text = combined_text.replace("ﬁ", "fi")
    combined_text = combined_text.replace("ﬂ", "fl")
    combined_text = combined_text.replace("‘‘", '"')
    combined_text = combined_text.replace("’’", '"')

    return combined_text


def get_indent(line, valid_index):
    """Return the indent (in spaces) implied by the start of `line`, or None.

    Lines that begin with any entry of ``valid_index.exclusion_list`` get an
    indent of 0. Otherwise the first pattern in ``valid_index.index_patterns``
    that matches at the start of the line decides the indent: four spaces per
    position in that list. None is returned when nothing applies.
    """
    for prefix in valid_index.exclusion_list:
        if line.startswith(prefix):
            return 0

    for level, regex in enumerate(valid_index.index_patterns):
        if re.match(regex, line):
            return level * 4

    return None

def add_indents(text, valid_index):
    """Indent every non-blank line of `text` according to its index level.

    Each line is stripped and its indent looked up with ``get_indent``. Lines
    for which no indent can be determined inherit the indent of the preceding
    emitted line (0 at the start). Blank lines are dropped.

    Returns the indented lines joined with newlines.
    """
    current_indent = 0
    indented = []
    for raw_line in text.split("\n"):
        content = raw_line.strip()
        if not content:
            continue
        detected = get_indent(content, valid_index)
        if detected is not None:
            current_indent = detected
        indented.append(" " * current_indent + content)
    return "\n".join(indented)

In [73]:
# The output of this step is written to temporary files because it needs to be reviewed before it can be used. Only once the
# review is complete should the file be moved to the ./manual/ folder for later use.
# NOTE(review): the loading cell further down reads './txt/popia_act.txt', not ./manual/ -- confirm which folder is intended.

# Extract page indices 6..54 (0-based; end_page is exclusive and is clamped to the document length).
document_text = output_doc_as_text(doc, popia_index, start_page = 6, end_page = 55) # end_page = 55

# Intermediate output: the cleaned text before indents are added.
output_file = "./tmp/popia_act_intermediate.txt"
with open(output_file, 'w', encoding="utf-8") as f:
        f.write(document_text)

# Output for manual review: the same text after add_indents(). Note that `document_text` is rebound to the indented text here.
output_file = "./tmp/popia_act.txt"
document_text = add_indents(document_text, popia_index)
with open(output_file, 'w', encoding="utf-8") as f:
        f.write(document_text)

Now manually review the generated file (`./tmp/popia_act.txt`) and make the necessary corrections before continuing.

Load the reviewed text file into a DataFrame and build the regulation tree to check that everything is OK.

In [3]:
# NOTE(review): this repeats the reload of src.valid_index and the creation of popia_index done in an earlier cell.
import importlib
import src.valid_index
importlib.reload(src.valid_index)
from src.valid_index import get_popia_act_index

popia_index = get_popia_act_index()


# Reload the file helpers so that edits made during this session are picked up.
import src.file_tools
importlib.reload(src.file_tools)
from src.file_tools import read_processed_regs_into_dataframe, get_regulation_detail

# NOTE(review): dir_path is not used in the visible cells; file_list hard-codes the same folder.
dir_path = './txt/'
file_list = []
file_list.append('./txt/popia_act.txt')
non_text_labels = ['Definition']
# Returns the parsed regulation as a DataFrame plus a second value bound to `non_text`.
df_popia, non_text = read_processed_regs_into_dataframe(file_list=file_list, valid_index_checker=popia_index, non_text_labels=non_text_labels)


# Reload the tree helpers, then build the tree for the loaded DataFrame.
import src.tree_tools
importlib.reload(src.tree_tools)
from src.tree_tools import build_tree_for_regulation

tree_popia = build_tree_for_regulation("POPIA", df_popia, valid_index_checker=popia_index)


In [5]:
# Write the loaded DataFrame to a pipe-delimited CSV file in ./tmp/.
df_popia.to_csv("./tmp/popia_manual.csv", encoding="utf-8", sep="|")

In [13]:
# Optional quick checks (left disabled): row count and the first rows of the DataFrame.
# len(df_popia)
# df_popia[0:15]
# Print the hierarchy built from the loaded text.
tree_popia.print_tree()

POPIA []
|-- 2. [Purpose of Act]
|   |-- (a) []
|   |   |-- (i) []
|   |   +-- (ii) []
|   |-- (b) []
|   |-- (c) []
|   +-- (d) []
|-- 3. [Application and interpretation of Act]
|   |-- (1) []
|   |   |-- (a) []
|   |   +-- (b) []
|   |       |-- (i) []
|   |       +-- (ii) []
|   |-- (2) []
|   |   |-- (a) []
|   |   +-- (b) []
|   |-- (3) []
|   |   |-- (a) []
|   |   +-- (b) []
|   +-- (4) []
|-- 4. [Lawful processing of personal information]
|   |-- (1) []
|   |   |-- (a) []
|   |   |-- (b) []
|   |   |-- (c) []
|   |   |-- (d) []
|   |   |-- (e) []
|   |   |-- (f) []
|   |   |-- (g) []
|   |   +-- (h) []
|   |-- (2) []
|   |   |-- (a) []
|   |   +-- (b) []
|   |-- (3) []
|   |   |-- (a) []
|   |   +-- (b) []
|   |-- (4) []
|   |   |-- (a) []
|   |   +-- (b) []
|   |-- (5) []
|   |-- (6) []
|   +-- (7) []
|-- 5. [Rights of data subjects]
|   |-- (a) []
|   |   |-- (i) []
|   |   +-- (ii) []
|   |-- (b) []
|   |-- (c) []
|   |-- (d) []
|   |-- (e) []
|   |   |-- (i) []
|   |   +-- 

In [151]:
# Spot check: print the detail returned for index '29.'.
print(get_regulation_detail('29.', df_popia, popia_index))

29. Authorisation concerning data subject’s race or ethnic origin
The prohibition on processing personal information concerning a data subject’s race or ethnic origin, as referred to in section 26, does not apply if the processing is carried out to—
        (a) identify data subjects and only when this is essential for that purpose; and
        (b) comply with laws and other measures designed to protect or advance persons, or categories of persons, disadvantaged by unfair discrimination.
