In [3]:
""" Module for converting docx to xml for element prediction"""
import os
import re
from collections import Counter
import docx
import docx.package
import docx.parts.document
from docx.document import Document
import docx.parts.numbering
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.oxml.numbering import CT_Num
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.oxml import parse_xml
from docx.oxml.ns import nsmap
from lxml import etree
import pandas as pd

def health_check():
    """Function to perform a health check (Used for testing that the package gets loaded)"""
    output_str = "Health Check OK"
    return output_str

def extract_docx_properties(docx_path):
    """ Function to create an XML document that can be passed to Element Prediction
    Args:
        docx_path (str): String containing the path to the docx that should be used for extraction
    Returns:
        [str]: An xml formatted string that contains extracted properties from the docx
    """

    document, numbering_pd, theme_dict = create_document_object(docx_path)

    # Default value, empty string
    para_properties_xml = ""
    # Create list that will hold all the properties which should be treated as CDATA
    c_data_tags = ["ParaContent"]
    # Create a list of dictionaries with the properties of each paragraph
    para_properties_list = extract_properties_to_list(
        document,
        numbering_pd,
        theme_dict
    )
    # Convert list of dictionaries to an XML string
    para_properties_xml = create_structured_xml(
        para_properties_list,
        c_data_tags
    )

    return para_properties_xml


def create_document_object(docx_path):
    """ Function to create a python-docx Document object
    Args:
        docx_path (str): String containing the path to the docx file
    Returns:
        python-docx Document: A python-docx Document object of the docx file
    """
    document = None
    numbering_pd = None
    theme_dict = None

    # First check that a string with characters has been passed
    if len(docx_path) > 0:
        # Check to see if the file exists
        if os.path.exists(docx_path):
            docx_package = docx.package.Package.open(docx_path)
            # Create the Numbering DataFrame
            try:
                numbering = docx_package.main_document_part.numbering_part

            except (RuntimeError, TypeError, NameError, AttributeError, NotImplementedError):
                numbering = None

            if not numbering is None:
                numbering_pd = create_numbering_pd(numbering_part=numbering)

            # Create the Theme Dictionary
            theme_dict = get_theme_data(docx_package)

            document = docx.Document(docx_path)

    return document, numbering_pd, theme_dict

def create_numbering_pd(numbering_part):
    """ Function to create a Pandas Dataframe representing the numbering.xml file in a docx
    Args:
        numbering_part (python-docx parts object): A python-docx parts object representing
            the numbering.xml file
    Returns:
        [pandas Dataframe]: A converted pandas DataFrame of the numbering.xml file for easy access
    """
    namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    num_list = []
    abstract_num_list = []

    # Loop through each part of numbering.xml
    for num in numbering_part._element:
        # The abstractNum part
        if not isinstance(num, CT_Num):
            # Loop through each child, adding value to a dict
            for abstract_element in num.iterchildren():
                abstract_num_dict = {
                    "abstract_num_id" : num.get(namespace + "abstractNumId")
                }
                # Get MultilevelType
                if abstract_element.tag == (namespace + "multiLevelType"):
                    abstract_num_dict["multi_level_type"] = abstract_element.get(namespace + "val")
                if abstract_element.tag == (namespace + "lvl"):
                    # Add Dictionary for level based info
                    abstract_num_level_dict = abstract_num_dict
                    abstract_num_level_dict["level"] = abstract_element.get(namespace + "ilvl")

                    for level_element in abstract_element.iterchildren():
                        if level_element.tag == (namespace + "start"):
                            abstract_num_level_dict["level_start"] = level_element.get(namespace + "val")
                        if level_element.tag == (namespace + "numFmt"):
                            abstract_num_level_dict["level_num_format"] = level_element.get(namespace + "val")
                        if level_element.tag == (namespace + "lvlText"):
                            abstract_num_level_dict["level_text"] = level_element.get(namespace + "val")
                        if level_element.tag == (namespace + "pPr"):
                            for level_para_prop_element in level_element.iterchildren():
                                if level_para_prop_element.tag == (namespace + "ind"):
                                    abstract_num_level_dict["level_para_prop_left"] = level_para_prop_element.get(namespace + "left")
                                    abstract_num_level_dict["level_para_prop_hanging"] = level_para_prop_element.get(namespace + "hanging")
                    abstract_num_list.append(abstract_num_level_dict)
        else:
            # The numID part that is in document.xml
            num_dict = {
                "num_id" : num.get(namespace + "numId"),
                "abstract_num_id" : str(num.abstractNumId.val)
            }
            num_list.append(num_dict)

    # Convert Lists to DataFrame
    num_id_pd = pd.DataFrame(num_list)
    abstract_num_pd = pd.DataFrame(abstract_num_list)

    # Merge into a single Data Frame
    numbering_pd = pd.merge(
        num_id_pd,
        abstract_num_pd,
        how = "left",
        on = "abstract_num_id"
    )

    return numbering_pd

def get_data_from_numbering_pd(numbering_pd, num_id, level, column):
    """ Function return spacific data from the numbering_pd DataFrame
    Args:
        numbering_pd (pandas DataFrame): The pandas Dataframe corresponding to numbering.xml, see
            create_numbering_pd()
        num_id (int): An integer corresponding to the List Para Style to be retrieved
        level (int): An integer corresponding to the level that the list is at
        column (str): String corresponding to the column name to be returned from the numbering_pd
            dataframe
    Returns:
        [str]: The found data value corresponding to the inputs
    """

    output = ""

    if not numbering_pd is None:
        if len(numbering_pd.index) > 0 and \
            column in numbering_pd.columns and \
            int(num_id) >= 0:

            numbering_filtered = numbering_pd[numbering_pd.num_id == num_id]
            if int(level) > 0:
                numbering_filtered = numbering_filtered[numbering_filtered.level == level]

            output = numbering_filtered[column].to_list()[0]

    return output

def get_theme_data(docx_package):
    """ Function to create a dictionary containing the major and minor fonts found in theme.xml
    Args:
        docx_package (python-docx docx.package.Package object): A python-docx Package object
    Returns:
        [dict]: A dictionary containing two keys containing string values, major and minor fonts
    """
    output = {
        "major_font" : "",
        "minor_font" : ""
    }

    # Iterate through each of the parts in the docx package
    for part in docx_package.parts:
        if part.partname.startswith("/word/theme/"):
            theme_xml = parse_xml(part.blob)

            for major_font in theme_xml.xpath("//a:majorFont/a:latin/@typeface",
                namespaces = nsmap):
                output["major_font"] = major_font

            for minor_font in theme_xml.xpath("//a:minorFont/a:latin/@typeface",
                namespaces = nsmap):
                output["minor_font"] = minor_font

    return output

""" Function to iterate through paragraphs of a document and return each paragraph in a list
    Input:
    - document: python-docx document object, see create_document_object
    Output:
    - para_properties_list: A list containing dictionaries which contain properties of the paragraph
    """

def extract_properties_to_list(document, numbering_pd, theme_dict):
    """ Function to iterate through paragraphs of a document, extract relevant information
        about the paragraph, store as a dictionary, and append to a list
    Args:
        document (python-docx docx.Document): A python-docx Document object representing the .docx file
        numbering_pd (pandas DataFrame): A pandas DataFrame representing the numbering.xml
        theme_dict (dict): A dictionary for the major and minor fonts
    Returns:
        [list]: A list containing dictionaries which store the information on each paragraph
    """

    block_id = 1
    document_properties_list = []

    for document_block in iter_block_items(document):
        # Check to see if the paragraph has a text box
        para_contains_text_box = para_contains_xpath(
            document_block,
            xpath_string = ".//v:textbox/w:txbxContent")

        # Check to see if paragraph contains an inline image
        para_contains_linked_image = para_contains_xpath(
            document_block,
            xpath_string = ".//w:drawing/wp:inline/a:graphic/a:graphicData/pic:pic"
        )

        # Check to see if paragraph contains an word shape
        para_contains_shape = para_contains_xpath(
            document_block,
            xpath_string = ".//w:drawing/wp:inline/a:graphic/a:graphicData/dgm:relIds"
        )

        ##########################
        ######## Table ###########
        ##########################
        if isinstance(document_block, Table):
            # Iterate through the rows, cells and paragraphs in the table
            for table_row in document_block.rows:
                for cell in table_row.cells:
                    for para in cell.paragraphs:
                        # Uncommented this line to exclude any paragraphs with blank or only new lines
                        document_properties_list.append(create_paragraph_properties(
                            document,
                            para,
                            block_id,
                            block_type = "table_cell_paragraph",
                            numbering_pd = numbering_pd,
                            theme_dict = theme_dict))

                        block_id += 1

        # Check to see if the current block is a paragraph and contains textbox
        elif isinstance(document_block, Paragraph) and para_contains_text_box:
            ################################
            ### Paragraphs with TextBox ####
            ################################
            text_box_para = etree.ElementBase.xpath(
                document_block._element,
                '//v:textbox/w:txbxContent/w:p',
                namespaces = document_block._element.nsmap)

            # Iterate through all the paragraphs in the textboxes
            for txt_box_para in text_box_para:
                # Convert the paragraph (w:p) to a Paragraph Class
                txt_box_para_class = Paragraph(
                    txt_box_para,
                    document_block
                )

                # Identify the properties
                document_properties_list.append(create_paragraph_properties(
                    document,
                    txt_box_para_class,
                    block_id,
                    block_type = "text_box_paragraph",
                    numbering_pd = numbering_pd,
                    theme_dict = theme_dict))

                block_id += 1

        elif isinstance(document_block, Paragraph):
            block_type = ""
            if para_contains_linked_image:
                block_type = "linked_image_paragraph"
            elif para_contains_shape:
                block_type = "shape_paragraph"
            else:
                block_type = "paragraph"

            ####################################
            ### Paragraph with Linked Image ####
            ####################################
            document_properties_list.append(create_paragraph_properties(
                document,
                document_block,
                block_id,
                block_type = block_type,
                numbering_pd = numbering_pd,
                theme_dict = theme_dict))

            block_id += 1

    return document_properties_list

def iter_block_items(parent):
    """
    Yield each paragraph and table child within *parent*, in document
    order. Each returned value is an instance of either Table or
    Paragraph. *parent* would most commonly be a reference to a main
    Document object, but also works for a _Cell object, which itself can
    contain paragraphs and tables.
    """
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)


def para_contains_xpath(para, xpath_string):
    """ Function to check if an XML block contains a certain xpath query
    Args:
        para (python-docx Paragraph): Python-docx paragraph object to check for a certain xpath
        xpath_string (str): String containing the xpath to be queried
    Returns:
        boolean: True/False indicating if the para XML contains the xpath
    """
    # Combine Namespaces from the package and the paragraph
    custom_nsmap = dict(list(nsmap.items()) + list(para._element.nsmap.items()))
    xml_value = etree.ElementBase.xpath(
        para._element,
        xpath_string,
        namespaces = custom_nsmap)

    return len(xml_value) > 0

def create_paragraph_properties(document, para, para_id, block_type, numbering_pd, theme_dict):
    """ Function to create a paragraph properties dictionary
    Args:
        document (python_docx Document): Python-docx Document object
        para (python-docx Paragraph): Python-docx Paragraph object
        para_id (int): Sequential integer indicating the order in which the document block
            occurs in the document
        block_type (str): Type of document block, example, paragraph, text_box_paragraph,
            table_cell_paragraph
        numbering_pd (pandas DataFrame): A pandas DataFrame corresponding to numbering.xml
        theme_dict (dict): A dictionary containing keys for the major and minor fonts
    Returns:
        dict: Dictionary containing the paragraph properties for the document block
    """

    para_prop_dict = {}
    para_prop_dict["ParaID"] = para_id
    para_prop_dict["ParaObjectType"] = block_type
    para_prop_dict["ParaCleanedContent"] = transform_para_content(get_para_content(para))
    para_prop_dict["ParaContent"] = get_para_content(para)
    para_prop_dict["ParaContentTabStart"] = get_para_content_tab_start_count(para)
    para_prop_dict["ParaFontFamily"] = get_para_font_family(document, para, theme_dict)
    para_prop_dict["ParaBold"] = get_para_bold(document, para)
    para_prop_dict["ParaItalic"] = get_para_italic(document, para)
    para_prop_dict["ParaFontSize"] = get_para_font_size(document, para)
    para_prop_dict["ParaStyle"] = get_para_style(para)
    para_prop_dict["ParaListStyle"] = get_para_list_style(document, para, numbering_pd)
    para_prop_dict["ParaLeftIndent"] = get_para_left_indent(document, para, numbering_pd)
    para_prop_dict["ParaRightIndent"] = get_para_right_indent(document, para)
    para_prop_dict["ParaFirstLineIndent"] = get_para_first_line_indent(document, para)
    para_prop_dict["ParaAlignment"] = get_para_alignment(document, para)
    para_prop_dict["ParaLineSpace"] = get_para_line_space(document, para)
    para_prop_dict["ParaAboveSpace"] = get_para_space_above(document, para)
    para_prop_dict["ParaBelowSpace"] = get_para_space_below(document, para)

    para_prop_dict = get_para_border(para_prop_dict, para)

    para_prop_dict = get_para_shading(para_prop_dict, para)

    para_prop_dict["ParaSingleStrike"] = get_para_single_strike(para)
    para_prop_dict["ParaDoubleStrike"] = get_para_double_strike(para)
    para_prop_dict["ParaUnderline"] = get_para_underline(document, para)
    para_prop_dict["ParaSmallCaps"] = get_para_small_caps(para)

    return para_prop_dict

def get_xml_attribute(para,
                      tag_to_find,
                      tag_parent,
                      tag_attribute,
                      default_value = 0
                     ):
    """ Function to retrieve an xml attribute for a given tag and parent tag
    Args:
        para (python-docx paragraph object): Python-docx paragraph object
            found through iterating the document
        tag_to_find (str): The tag to retreive attribute values for
        tag_parent (str): The parent tag_to_find should have. Found using .getparent().tag
        tag_attribute (str): The attribute value to be retrieved from the element
        default_value (int, optional): A default value to be returned in the
            event no value is found. Defaults to 0.
    Returns:
        str: The attribute value found in the XML
    """
    # Set-up the return value to be the default value
    attribute_value = default_value
    # Convert para element xml to an etree data structure
    para_xml = etree.XML(para._element.xml)

    # Iterate directly to the tag that should be found
    for element in para_xml.iter(tag_to_find):
        # Check if the tag is the same and the parent is as expected
        if element.tag == tag_to_find and element.getparent().tag == tag_parent:
            # Check if the attribute to be retrieved is actually there
            if tag_attribute in element.keys():
                # Retrieve the attribute values from the dict
                attribute_value = element.get(tag_attribute)
                # Exit the loop as we have got what we needed
                break

    return attribute_value

def get_para_content(para):
    """ Function to retrieve the paragraph content across all runs
    Input:
    - para: paragraph object from document
    Output:
    - para_content: String containing the text for the paragraph
    """
    para_content = ""
    # Check to ensure that the para is not None
    if not para is None:
        # Retrieve the text from all the runs in the paragraph
        para_content = para.text

    return para_content

def transform_para_content(para_content):
    """ Function to transform raw paragraph content into a cleaned version
    Input:
    - para_content: String containing the raw paragrah content, see get_para_content
    Output:
    - para_content_transformed: String containing the cleaned/transformed paragraph content
    """

    para_content_transformed = para_content

    # Replace ` with a '
    para_content_transformed = re.sub('`', "'", para_content_transformed)

    return para_content_transformed

def get_para_content_tab_start_count(para):
    """ Function to count the number of tab characters at the start of a string of paragraph of text
    Args:
        para (python-docx Paragraph object): A python-docx Paragraph corresponding to a paragraph in a word document
    Returns:
        [int]: The number of tab characters at the start of a paragraph of text, default to 0
    """
    output = 0
    if len(para.text) > 0:
        i = 0
        while True:
            if para.text[i] == "\t" and i < len(para.text):
                i += 1
            else:
                break
        output = i
    return output

def get_para_font_family(document, para, theme_dict):
    """ Function to retrieve the font family for a specific paragraph, the run is
        also included to check for any run specific fonts
    Args:
        document (python-docx document object): Python-docx document object
        para (python-docx Paragraph object): A python-docx object of the paragraph
        theme_dict (dict): A dictionary containing keys for the major and minor fonts
    Returns:
        [str]: The font family identified for the paragraph
    """

    # Set the default font_family to be Word default of Calibir
    font_family = "Default"
    # Check to esnure the paragraph object is not None
    if not para is None:
        # Direct font family name for the run and style values for the run
        run_font_values = []
        run_style_font_values = []
        # Iterate through each run to populate the two above lists
        for run in para.runs:
            if not run.text == "" and not run.text == "\n":
                run_font_values.append(run.font.name)
                run_style_font_values.append(run.style.font.name)

        # Check the paragraph style font family name
        para_style_font_name = para.style.font.name

        # Identify the paragraph style
        para_style = get_para_style(para)
        # Check the style from the document styles
        document_para_style = document.styles[para_style].font.name

        # Check the theme_dict for a font at theme level
        theme_font = None
        if para_style.lower().find("heading") >= 0 and "major_font" in theme_dict.keys():
            theme_font = theme_dict.get("major_font")
        elif para_style.lower().find("heading") < 0 and "minor_font" in theme_dict.keys():
            theme_font = theme_dict.get("minor_font")


        # Check which of the above variables should be used, based on the following logic
        # - The font directly applied to the paragraph
        # - The font for any style applied to the paragraph
        # - The majority font for any style applied to the runs within the paragraph
        # - The majority font directly applied to any run within the paragraph
        # - The Major/Minor Font used in theme.xml, depending on the heading
        if para_style_font_name is None:
            if document_para_style is None:
                # Condition to check if all values are None in the list
                if all(run_font_value is None for run_font_value in run_style_font_values):
                    if all(run_font_value is None for run_font_value in run_font_values):
                        if not theme_font is None:
                            font_family = theme_font
                    else:
                        run_values_counter = Counter(run_font_values)
                        font_family = run_values_counter.most_common(1)[0][0]
                else:
                    run_values_counter = Counter(run_style_font_values)
                    font_family = run_values_counter.most_common(1)[0][0]
            else:
                font_family = document_para_style
        else:
            font_family = para_style_font_name

    return font_family

def get_para_bold(document, para):
    """ Function to identify if a paragraph contains bold content
    Input:
    - document: Python-docx document object
    - para: Paragraph object
    Output
    - is_bold: Boolean indicating if the paragraph is bold
    """
    # Default is False (not bold)
    is_bold = False
    # Check to see if the para object is None
    if not para is None:
        # Create a list to store the bold values for each of the runs
        run_values = []
        for run in para.runs:
            if not run.bold is None:
                run_values.append(run.bold)


        # Check paragraph style
        para_bold = para.style.font.bold

        # Check document style
        para_style = get_para_style(para)
        # Check the style from the document styles
        document_style_bold = document.styles[para_style].font.bold

        # Update is_bold variable based on the following logic:
        # - check if the paragraph style is bold
        # - Check the default value for the paragraph style at document level
        # - check if all runs are bold, if they are assign is_bold to True
        if para_bold is None:
            if document_style_bold is None:
                if len(run_values) > 0:
                    if all(run_bold is True for run_bold in run_values):
                        is_bold = True
            else:
                is_bold = document_style_bold
        else:
            is_bold = para_bold

    return is_bold


def get_para_italic(document, para):
    """ Function to identify if a paragraph contains italic content
    Input:
    - document: Python-docx document object
    - para: Paragraph object
    Output
    - is_italic: Boolean indicating if the paragraph is italic
    """
    # Default is False (not italic)
    is_italic = False
    # Check to see if the para object is None
    if not para is None:
        # Create a list to store the italic values for each of the runs
        run_values = []
        for run in para.runs:
            if not run.italic is None:
                run_values.append(run.italic)


        # Check paragraph style
        para_italic = para.style.font.italic

        # Check document style
        para_style = get_para_style(para)
        # Check the style from the document styles
        document_style_italic = document.styles[para_style].font.italic

        # Update is_italic variable based on the following logic:
        # - check if the paragraph style is italic
        # - Check the default value for the paragraph style at document level
        # - check if all runs are italic, if they are assign is_italic to True
        if para_italic is None:
            if document_style_italic is None:
                if len(run_values) > 0:
                    if all(run is True for run in run_values):
                        is_italic = True
            else:
                is_italic = document_style_italic
        else:
            is_italic = para_italic

    return is_italic

def get_para_font_size(document, para):
    """ Function to retrieve the font size for a paragraph based on the style hierarchy
    Input:
    - document: document object for the entire docx file
    - para: paragraph object for the current paragraph
    Output:
    - font_size: Number indicating the font size, default to 11 - Word default font size
    """
    # Default font size to be 11
    font_size = 11
    if not document is None and not para is None:
        ## Gather all the raw data, follow style hierarchy as defined by
        #https://stackoverflow.com/questions/64031644/how-to-get-a-style-value-by-traversing
        #-from-bottom-run-to-top-docdefaults

        # Direct font values for the run and style values for the run
        run_font_values = []
        run_style_font_values = []
        # Iterate through each run to populate the two above lists
        for run in para.runs:
            if not run.text == "" and not run.text == "\n":
                run_font_values.append(run.font.size)
                run_style_font_values.append(run.style.font.size)
        # Check the font size from the paragraph style
        paragraph_style_size = para.style.font.size

        # Identify the paragraph style
        para_style = get_para_style(para)
        # Check the style from the document styles
        document_para_style = document.styles[para_style].font.size

        # Check if all the run font values are None
        if all(run is None for run in run_font_values):
            # Check if all the run style font values are None
            if all(run is None for run in run_style_font_values):
                # Check if the paragraph style is none
                if paragraph_style_size is None:
                    # Check if the style as defined in styles.xml has a size
                    if not document_para_style is None:
                        font_size = document_para_style.pt
                else:
                    font_size = paragraph_style_size.pt
            else:
                # Get unique set of font sizes identified in the run
                font_sizes = [font_size for font_size in run_style_font_values
                              if not font_size is None]
                font_size = font_sizes[0].pt
        else:
            # Get unique set of font sizes identified in the run
            font_sizes = [font_size for font_size in run_font_values if not font_size is None]
            font_size = font_sizes[0].pt

    return font_size


def get_para_style(para):
    """ Function to retrieve the font paragraph style for a paragraph based on the style hierarchy
    Input:
    - para: paragraph object for the current paragraph
    Output:
    - para_style: The style that is applied to the paragraph, defaults to 'Normal'
    """
    # Set the default to be Normal style
    para_style = "Normal"

    # Check the para style object
    if not para.style.name is None:
        para_style = para.style.name

    return para_style


def get_para_list_style(document, para, numbering_pd):
    """ Function to return the paragraph list style, default to '' for now """
    para_list_style = ""

    # Retrieve the paragraph style
    para_style = get_para_style(para)

    # Get any paragraph properties associated with the paragraph, either at the style
    # or paragraph level
    document_pPr = document.styles[para_style]._element.pPr
    para_pPr = para._p.pPr

    num_id = -1
    level = 0
    # Check the paragraph level first
    # Extract out the number id for the list
    if not para_pPr is None:
        if not para_pPr.numPr is None:
            num_id = str(para_pPr.numPr.numId.val)
            # Identify the Level, if any
            if not para_pPr.numPr.ilvl is None:
                level = str(para_pPr.numPr.ilvl.val)

        elif not document_pPr is None:
            if not document_pPr.numPr is None:
                num_id = str(document_pPr.numPr.numId.val)
                # Identify the Level, if any
                if not document_pPr.numPr.ilvl is None:
                    level = str(document_pPr.numPr.ilvl.val)


    # Identify the paragraph list style from the numbering_pd DataFrame
    numbering_para_list_style = get_data_from_numbering_pd(
        numbering_pd,
        num_id, level,
        column = "level_num_format"
    )

    # if len(numbering_para_list_style) > 0:
    #     para_list_style = numbering_para_list_style
    
    #######  this is for testing the project 500200 #####
    # if len(str(numbering_para_list_style)) > 0:
    para_list_style = numbering_para_list_style

    return para_list_style

def get_para_left_indent(document, para, numbering_pd):
    """ Function to find the left indent for a paragraph, default to 0
    Input:
    - document: Python-docx document object
    - para: Paragraph object for the current paragraph
    Output:
    - para_left_indent: Number indicating the left indent
    """
    para_left_indent = 0

    # Get the paragraph style
    para_style = get_para_style(para)
    # Check the style from the document styles
    document_para_style = document.styles[para_style].paragraph_format.left_indent
    # Check for numbering.xml when lists are used in Document
    document_pPr = document.styles[para_style]._element.pPr
    para_pPr = para._p.pPr

    numbering_left_indent = get_left_indent_from_numbering_pd(
        numbering_pd, document_pPr, para_pPr
    )

    if not para is None:
        if not para.paragraph_format.left_indent is None:
            para_left_indent = para.paragraph_format.left_indent.pt
            if not numbering_left_indent is None:
                para_left_indent -= numbering_left_indent
        elif not para.style.paragraph_format.left_indent is None:
            para_left_indent = para.style.paragraph_format.left_indent.pt
        elif not document_para_style is None:
            para_left_indent = document.styles[para_style].paragraph_format.left_indent.pt
        elif not numbering_left_indent is None:
            para_left_indent = numbering_left_indent

    return para_left_indent

def get_left_indent_from_numbering_pd(numbering_pd, document_pPr, para_pPr):
    """ Function to retrieve the left_indent from the numbering_pd
    Args:
        numbering_pd (pandas DataFrame): Pandas DataFrame representing the numbering.xml file
        document_pPr (python-docx document object): Python-docx document object
        para_pPr (python-docx Paragraph properties object): python-docx Paragraph properties object
    Returns:
        [int]: Integer representing the left_indent for a paragraph, or None if none found
    """
    para_left_indent = None

    if not para_pPr is None:
        # Extract out the number id for the list
        num_id = -1
        level = 0
        if not para_pPr.numPr is None:
            num_id = str(para_pPr.numPr.numId.val)
            # Identify the Level, if any
            if not para_pPr.numPr.ilvl is None:
                level = str(para_pPr.numPr.ilvl.val)
        elif not document_pPr is None:
            # Extract out the number id for the list
            if not document_pPr.numPr is None:
                num_id = str(document_pPr.numPr.numId.val)
                # Identify the Level, if any
                if not document_pPr.numPr.ilvl is None:
                    level = str(document_pPr.numPr.ilvl.val)

        # Identify the left_indent from the numbering_pd DataFrame
        numbering_left_indent = get_data_from_numbering_pd(
            numbering_pd,
            num_id, level,
            column = "level_para_prop_left"

        )
        # Convert to Pts if there is any value
        if len(str(numbering_left_indent)) > 0:
            # para_left_indent = int(numbering_left_indent) / 20
            #######  this is for testing the project 500200 #####
            para_left_indent = float(numbering_left_indent) / 20
        

    return para_left_indent


def get_para_right_indent(document, para):
    """ Function to find the right indent for a paragraph, default to 0
    Input:
    - document: Python-docx document object
    - para: Paragraph object for the current paragraph
    Output:
    - para_right_indent: Number indicating the right indent
    """
    para_right_indent = 0

    # Get the paragraph style
    para_style = get_para_style(para)
    # Check the style from the document styles
    document_para_style = document.styles[para_style].paragraph_format.right_indent

    if not para is None:
        if not para.paragraph_format.right_indent is None:
            para_right_indent = para.paragraph_format.right_indent.pt
        elif not para.style.paragraph_format.right_indent is None:
            para_right_indent = para.style.paragraph_format.right_indent.pt
        elif not document_para_style is None:
            para_right_indent = document.styles[para_style].paragraph_format.right_indent.pt

    return para_right_indent

def get_para_first_line_indent(document, para):
    """ Function to retrieve the first line indent for a paragraph
    Input:
    - document: Python-docx document object
    - para: paragraph object for the current paragraph
    Output:
    - first_line_indent: The length of the first line indent, default to 0, expressed in points
    """
    first_line_indent = 0

    # Get the paragraph style
    para_style = get_para_style(para)
    # Check the style from the document styles
    document_para_style = document.styles[para_style].paragraph_format.first_line_indent

    if not para is None:
        if not para.paragraph_format.first_line_indent is None:
            first_line_indent = para.paragraph_format.first_line_indent.pt
        elif not para.style.paragraph_format.first_line_indent is None:
            first_line_indent = para.style.paragraph_format.first_line_indent.pt
        elif not document_para_style is None:
            first_line_indent = document.styles[para_style].paragraph_format.first_line_indent.pt

    return first_line_indent

def get_para_alignment(document, para):
    """ Function to identify the alignment of the para
    Input:
    - document: Python-docx document object
    - para: Paragraph object in which the alignment is to be identified
    Output:
    - para_alignment: String containing the alignment of the paragraph para, default to "LEFT"
    """
    para_alignment = "left"

    if not para is None:
        # Paragraph alignment direct to the paragraph format
        para_format_alignment = para.paragraph_format.alignment
        # Paragraph alignment direct
        para_alignment_xml = para.alignment
        # Paragraph alignment from style
        para_style_alignment = para.style.paragraph_format.alignment
        # Document level style default
        document_para_style = document.styles[get_para_style(para)].paragraph_format.alignment

        if not para_format_alignment is None:
            para_alignment = para_format_alignment
        elif not para_alignment_xml is None:
            para_alignment = para_alignment_xml
        elif not para_style_alignment is None:
            para_alignment = para_style_alignment
        elif not document_para_style is None:
            para_alignment = document_para_style

        # Iterate through the inbuilt python docx values to get the original XML value
        for class_object in docx.enum.text.WD_PARAGRAPH_ALIGNMENT.__members__:
            if class_object.value == para_alignment:
                para_alignment = class_object.xml_value

    return para_alignment

def get_para_line_space(document, para):
    """ Function to identify the line spacing of a paragraph
    Input:
    - document: Python-docx document object
    - para: Paragraph object in which the alignment is to be identified
    Output:
    - para_line_space: Numeric indicating the line spacing
    """
    para_line_space = 1.15

    if not para is None:
        # Paragraph alignment direct to the paragraph format
        para_format = para.paragraph_format.line_spacing
        # Paragraph alignment from style
        para_style = para.style.paragraph_format.line_spacing
        # Document level style default
        document_para_style = document.styles[get_para_style(para)].paragraph_format.line_spacing

        if not para_format is None:
            para_line_space = para_format
        elif not para_style is None:
            para_line_space = para_style
        elif not document_para_style is None:
            para_line_space = document_para_style

    return para_line_space


def get_para_space_above(document, para):
    """ Function to identify the spacing above of a paragraph
    Input:
    - document: Python-docx document object
    - para: Paragraph object in which the alignment is to be identified
    Output:
    - para_space_above: Numeric indicating the space above
    """
    para_space_above = 0

    if not para is None:
        # Paragraph alignment direct to the paragraph format
        para_format = para.paragraph_format.space_before
        # Paragraph alignment from style
        para_style = para.style.paragraph_format.space_before
        # Document level style default
        document_para_style = document.styles[get_para_style(para)].paragraph_format.space_before

        if not para_format is None:
            para_space_above = para_format.pt
        elif not para_style is None:
            para_space_above = para_style.pt
        elif not document_para_style is None:
            para_space_above = document_para_style.pt

    return para_space_above

def get_para_space_below(document, para):
    """ Function to identify the spacing below of a paragraph
    Input:
    - document: Python-docx document object
    - para: Paragraph object in which the alignment is to be identified
    Output:
    - para_space_below: Numeric indicating the space below
    """
    para_space_below = 0

    if not para is None:
        # Paragraph alignment direct to the paragraph format
        para_format = para.paragraph_format.space_after
        # Paragraph alignment from style
        para_style = para.style.paragraph_format.space_after
        # Document level style default
        document_para_style = document.styles[get_para_style(para)].paragraph_format.space_after

        if not para_format is None:
            para_space_below = para_format.pt
        elif not para_style is None:
            para_space_below = para_style.pt
        elif not document_para_style is None:
            para_space_below = document_para_style.pt

    return para_space_below

def get_para_border(para_prop_dict,
                    para
):
    """Function to retrieve the Paragraph Border Properties from XML
    Args:
        para_prop_dict (dict): Dictionary results should be appended to
        para (object): Paragraph object from the document object
    """
    ## Border Values - Top
    para_prop_dict["ParaBorderTopVal"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}top",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val",
        default_value = 0)

    para_prop_dict["ParaBorderTopSz"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}top",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sz",
        default_value = 0)

    para_prop_dict["ParaBorderTopSpace"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}top",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}space",
        default_value = 0)

    para_prop_dict["ParaBorderTopColor"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}top",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color",
        default_value = -1)

    ## Border Values - Left
    para_prop_dict["ParaBorderLeftVal"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}left",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val",
        default_value = 0)

    para_prop_dict["ParaBorderLeftSz"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}left",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sz",
        default_value = 0)

    para_prop_dict["ParaBorderLeftSpace"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}left",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}space",
        default_value = 0)

    para_prop_dict["ParaBorderLeftColor"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}left",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color",
        default_value = -1)

    ## Border Values - Bottom
    para_prop_dict["ParaBorderBottomVal"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}bottom",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val",
        default_value = 0)

    para_prop_dict["ParaBorderBottomSz"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}bottom",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sz",
        default_value = 0)

    para_prop_dict["ParaBorderBottomSpace"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}bottom",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}space",
        default_value = 0)

    para_prop_dict["ParaBorderBottomColor"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}bottom",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color",
        default_value = -1)

    ## Border Values - Right
    para_prop_dict["ParaBorderRightVal"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}right",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val",
        default_value = 0)

    para_prop_dict["ParaBorderRightSz"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}right",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sz",
        default_value = 0)

    para_prop_dict["ParaBorderRightSpace"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}right",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}space",
        default_value = 0)

    para_prop_dict["ParaBorderRightColor"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}right",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color",
        default_value = -1)

    ## Border Values - Between
    para_prop_dict["ParaBorderBetweenVal"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}between",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val",
        default_value = 0)

    para_prop_dict["ParaBorderBetweenSz"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}between",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sz",
        default_value = 0)

    para_prop_dict["ParaBorderBetweenSpace"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}between",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}space",
        default_value = 0)

    para_prop_dict["ParaBorderBetweenColor"] = get_xml_attribute(para,
        tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}between",
        tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pBdr",
        tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color",
        default_value = -1)

    return para_prop_dict

def get_para_shading(para_prop_dict,
                    para):
    """Function to retrieve the Paragraph Shading Properties from XML
    Args:
        para_prop_dict (dict): Dictionary results should be appended to
        para (object): Paragraph object from the document object
    """
    if not para is None:
        # Get the value of the amount of percent
        para_prop_dict["ParaShadingVal"] = get_xml_attribute(para,
            tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}shd",
            tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr",
            tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
        )

        # Get the colour of the shading
        para_prop_dict["ParaShadingColor"] = get_xml_attribute(para,
            tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}shd",
            tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr",
            tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color"
        )

        # Get the fill of the shading
        para_prop_dict["ParaShadingFill"] = get_xml_attribute(para,
            tag_to_find = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}shd",
            tag_parent = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr",
            tag_attribute = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}fill"
        )

    return para_prop_dict


def get_para_single_strike(para):
    """ Function to return if a paragraph is all single striked through
    All runs within the paragraph should the single striked through in order to
    return True. Only runs which have text are considered.
    Args:
        para (python-docx Paragraph): Python-docx paragraph object
    Returns:
        boolean: True/False wheather all runs which contain text in the paragraph are
        single striked throug
    """
    run_strike = []

    for run in para.runs:
        if not run.text == "":
            run_strike.append(run.font.strike)

    return all(run_strike)

def get_para_double_strike(para):
    """ Function to return if a paragraph is all double striked through
    All runs within the paragraph should the double striked through in order to
    return True. Only runs which have text are considered.
    Args:
        para (python-docx Paragraph): Python-docx paragraph object
    Returns:
        boolean: True/False wheather all runs which contain text in the paragraph are
        double striked throug
    """
    run_strike = []

    for run in para.runs:
        if not run.text == "":
            run_strike.append(run.font.double_strike)

    return all(run_strike)

def get_para_underline(document, para):
    """[summary]
    Args:
        document (python_docx Document): Python-docx Document object
        para (python-docx Paragraph): Python-docx Paragraph object
    Returns:
        [str]: String containing the underline value for the paragraph
    """

    underline = ""

    # Get the Underline values for each run in the paragraph

    run_values = []

    for run in para.runs:
        if not run.text == "":
            run_values.append(run.font.underline)

    # Check if there is only one run_value
    if len(run_values) == 1:
        # If this single value is True, set underline to be Single
        if run_values[0] is True:
            underline = "single"
        else:
            para_prop_underline = run_values[0]
            for underline_class in docx.enum.text.WD_UNDERLINE.__members__:
                if underline_class.value == para_prop_underline:
                    underline = underline_class.xml_value

    # If underline is still blank after checking the runs, check the style
    if underline == "":
        # Get the paragraph style
        para_style = get_para_style(para)

        # Check the font underline in the document style
        style_underline = document.styles[para_style].font.underline

        # If its not None, then update the return value
        if not style_underline is None:
            # Iterate throught the UNDERLINE class
            for underline_class in docx.enum.text.WD_UNDERLINE.__members__:
                # Check if the display value is equal to the value we have gotten
                if underline_class.value == style_underline:
                    # Update the output with the xml value
                    underline = underline_class.xml_value

    return underline


def get_para_small_caps(para):
    """ Function to identify if the para has small caps enabled
    Args:
        para (python-docx Paragraph): Python-docx Paragraph object
    Returns:
        boolean: True/False if the paragraph has small caps enabled
    """
    output = "No Text"
    run_values = []


    for run in para.runs:
        if not run.text == "":
            run_values.append(run.font.small_caps)

    if len(run_values) > 0:
        output = any(run_values)

    return output

def create_structured_xml(para_properties_list, c_data_tags):
    """ Function to transform a list of dictionary poperties into XML
    Input:
    - para_properties_list: List containing dictionary for each subelement, see
    create_para_properties_list
    - c_data_tags: List of strings containing the tags which should be treated as XML CDATA
    Output:
    - para_properties_xml: xml string of the para_properties_list
    """
    para_properties_xml = ""
    if len(para_properties_list) > 0:
        # Create root element
        root = etree.Element("ArrayOfParagraphProperties")

        # Loop through each dictionary in the list
        for para_dict in para_properties_list:
            # Create a parent node for the sub-child of root
            parent = etree.SubElement(root, "ParagraphProperties")

            # Loop through each key, value pair in the dictionary
            for key, value in para_dict.items():
                # Check if the value is None, if it is then no text to be added
                if value is None:
                    etree.SubElement(parent, key)
                else:
                    # Condition when there is text to be added for a sub-child of parent
                    # Check if the current key is of CDATA type
                    if key in c_data_tags:
                        etree.SubElement(parent, key).text = etree.CDATA(str(value))
                    else:
                        etree.SubElement(parent, key).text = str(value)

            # Append the parent to the root
            root.append(parent)

    # Convert to a UTF-8 encoded string, and add in the initial xml declaration
    para_properties_xml = etree.tostring(root, xml_declaration = True,
                                         encoding = "UTF-8") \
                                .decode("UTF-8")

    return para_properties_xml

In [4]:
""" Script to apply colour codes to Reference Named Entities within a docx document
"""
import logging
import numpy as np
import pandas as pd
from lxml import etree
from docx.oxml.ns import qn
from docx.table import Table
from docx.shared import RGBColor
from docx.oxml.shared import OxmlElement
from docx.text.paragraph import Paragraph
# from reference.docx_extraction import docx as dst_docx
# from dstetl.extractors import docx as dst_docx


# define loggings
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

# Driver function to colour the references
def colouring_main(raw_docx_file_path, coloured_docx_file_path, colour_codes_file_path,ner_results):
    """ Main colouring function to colour the reference in a docx file with the results of the
        NER model

    Args:
        raw_docx_file_path (str): String containing the full path to the docx file with references
            to be coloured
        coloured_docx_file_path (str): String containing the full path to the coloured docx file
        colour_codes_file_path (str): String containing the full path to the csv file containing
            the rgb colour values for each named entity in a reference
        ner_results (pandas DataFrame): A pandas dataframe containing the results of the named
            entity recognition. Main Columns are:
            - para_id: The para_id or block_id for the paragraph in the docx file
            - reference_text: A string containing the reference text
            - reference_entities: A dictionary containing the reults of the ner model
    """

    logging.info("Reading in Colour Codes CSV")
    entity_colour_codes = get_colour_codes(colour_codes_file_path)

    logging.info("Reading in Word Document")
    document, _, _ = create_document_object(raw_docx_file_path)
    

    if len(ner_results.index) > 0:
        logging.info("Expanding NER Results")
        ner_results_expanded = pd.DataFrame()

        for i in ner_results.index:
            model_entities_pd = pd.DataFrame(ner_results.reference_entities[i])
            model_entities_pd["para_id"] = ner_results.para_id[i]
            ner_results_expanded = pd.concat([ner_results_expanded, model_entities_pd])

        logging.info("Merging NER Results with Colour Codes")
        ner_colours = pd.merge(
            left = ner_results_expanded,
            right = entity_colour_codes,
            how = "left",
            on = "entity"
        )

        logging.info("Applying Colour to Word Document")
        colour_all_references(
            document = document,
            target_para_ids = ner_colours.para_id.unique().tolist(),
            reference_model_results = ner_colours
        )

    logging.info("Saving Coloured Word Document")
    document.save(coloured_docx_file_path)

    return "Finished Colouring Docx Document"

# function to load/map colour codes from colouring csv file
def get_colour_codes(colour_codes_file_path):
    """ Function to load the csv file containing the colours codes for each entity, and
        reformat the file from long to wide

    Args:
        colour_codes_file_path (str): A full file path to the location of the csv file to be read in

    Returns:
        pandas DataFrame: A pandas DataFrame of the csv colour codes file
    """
    logging.info("Reading in Colour Codes CSV")
    entity_colour_codes = pd.read_csv(colour_codes_file_path)

    logging.info("Creating RGB Tuple")
    entity_colour_codes["colour_rgb"] = entity_colour_codes.apply(
        lambda x: (x.colour_r, x.colour_g, x.colour_b),
        axis = 1
    )

    logging.info("Removing unneeded columns")
    columns_to_keep = ["entity", "colour_type", "colour_rgb"]
    entity_colour_codes = entity_colour_codes[
        entity_colour_codes.columns.intersection(columns_to_keep)]

    logging.info("Pivoting data from long to wide format")
    entity_colour_codes = entity_colour_codes.pivot(
        index = "entity",
        columns = "colour_type",
        values = "colour_rgb"
    )

    logging.info("Replacing nan values with empty tuples")
    entity_colour_codes = entity_colour_codes.applymap(
        lambda x: () if x is np.nan else x
    )

    return entity_colour_codes


# function to colour all reference in docx files
def colour_all_references(document, target_para_ids, reference_model_results):
    """ Function to iterate through a docx file and colour any applicable references

    Args:
        document (python-docx Document): A python docx Document object of the docx file
        target_para_ids (list): A list containing the para_ids/block_ids of the blocks of text which
             are references, as identified during the Element Prediction process
        reference_model_results (pandas DataFrame): A pandas Dataframe containing the model results,
             merged with the relevant colours codes. Main columns are:
            - entity: The named entity as identified by the model
            - entity_start: The start index within the reference that the entity starts at
            - entity_end: The end index within the reference that the entity ends at (plus 1)
            - para_id: The para_id/block_id to indicate which reference the results relate to
            - background: The background colour to be used for the named entity
            - font: The font colour to be used for the named entity

    Returns:
        str: String indicating that all references have been coloured
    """
    # define block id as placeholder
    block_id = 1

    logging.info("Iterating Through Docs Document Blocks")
    for document_block in iter_block_items(document):
        # Check to see if the paragraph has a text box
        para_contains_text_box = para_contains_xpath(
            document_block,
            xpath_string = ".//v:textbox/w:txbxContent")

        ##########################
        ######## Table ###########
        ##########################
        if isinstance(document_block, Table):
            # Iterate through the rows, cells and paragraphs in the table
            for table_row in document_block.rows:
                for cell in table_row.cells:
                    for para in cell.paragraphs:
                        if block_id in target_para_ids:
                            colour_reference(
                                block_id,
                                para,
                                reference_model_results)
                        block_id += 1

        # Check to see if the current block is a paragraph and contains textbox
        elif isinstance(document_block, Paragraph) and para_contains_text_box:
            ################################
            ### Paragraphs with TextBox ####
            ################################
            text_box_para = etree.ElementBase.xpath(
                document_block._element,
                '//v:textbox/w:txbxContent/w:p',
                namespaces = document_block._element.nsmap)

            # Iterate through all the paragraphs in the textboxes
            for txt_box_para in text_box_para:
                # Convert the paragraph (w:p) to a Paragraph Class
                txt_box_para_class = Paragraph(
                    txt_box_para,
                    document_block)
                if block_id in target_para_ids:
                    colour_reference(
                        block_id,
                        txt_box_para_class,
                        reference_model_results)

                block_id += 1

        elif isinstance(document_block, Paragraph):
            if block_id in target_para_ids:
                colour_reference(
                    block_id,
                    document_block,
                    reference_model_results)
            # increment block_id
            block_id += 1

    return "All References Coloured"


# colour the specific reference in docx
def colour_reference(block_id, para, reference_model_results):
    """ Function to colour a specific reference within a docx file

    Args:
        block_id (int): Integer representing the unique id for the paragraph
        para (python-docx Paragraph): A python-docx Paragraph object for the current reference
        reference_model_results (pandas DataFrame): A pandas Dataframe containing the model results,
             see above for structure
    """
    logging.info("Filtinger Model Results on Current Paragraph")
    reference_model_results = reference_model_results[reference_model_results.para_id == block_id]
    reference_model_results = reference_model_results.reset_index()

    if len(reference_model_results.index) > 0:
        # Update the end index to be less than what it is, as colours need to be offet by 1
        # for formatting
        reference_model_results.entity_end = reference_model_results.entity_end - 1

        logging.info("Identifying Backgrond and Font Colour per Character in Reference")
        char_colour = get_char_colour(
            para.text,
            reference_model_results)

        logging.info("Grouping Characters by Background/Font Colour for Run Information")
        char_colour_grouped = group_char_by_colours(char_colour)

        logging.info("Apply colours to Reference in Docx Document")
        apply_colouring_to_document(
            para,
            char_colour_grouped)
    else:
        logging.info("No Reference Entities Identified. Document Block: %s", str(block_id))


# identify background and font colour for each character in reference
def get_char_colour(reference_text, reference_model_results):
    """ Function to identify the background and font colour to be used for each character
        in the reference text

    Args:
        reference_text (str): The full original reference text as it is in the original docx
        reference_model_results (pandas DataFrame): A pandas dataframe containing the model results.
            Main columns are as follows:
            - entity: The named entity as identified by the model
            - entity_start: The starting index of the named entity found
            - entity_end: The end index of the named entity found
            - background: A tuple containing the rgb values for the background colour of the
                named entity
            - font: A tuple containing the rgb values for the font colour of the named entity

    Returns:
        pandas DataFrame: A pandas DataFrame containing rows for each character, and the associated
            background and font colour for the character
    """
    # Create a list which will hold dictionaries for each row in the final dataframe
    text_by_char_list = []

    # Iterate through each char in the reference and assign a background and font colour, if
    # applicable
    for i, char in enumerate(reference_text):
        # Find the background and font colour rgb values based on the current index and model
        # results
        background_colour_rgb, font_colour_rgb = get_background_font_colour(
            char_index = i,
            data = reference_model_results)

        # Create a dictionary to hold information on the current character
        text_by_char_dict = {
            "char_index" : i,
            "char" : char,
            "background_colour_rgb" : background_colour_rgb,
            "font_colour_rgb" : font_colour_rgb}

        # Append the current dictionary to the overall list of all dictionaries
        text_by_char_list.append(text_by_char_dict)

    # Return a dataframe of the list of dictionaries
    return pd.DataFrame(text_by_char_list)

# get background and font colour by index in reference
def get_background_font_colour(char_index, data):
    """ Function to retrieve the background and font colour for a particular character index within
        a reference

    Args:
        char_index (int): Integer representing the index of the character to be checked
        data (pandas DataFrame): NER model results, see above for structure

    Returns:
        Two tuples: Two tuples containing rgb values for the background and font for the char_index
    """
    # Create two empty tuples
    background_colour_rgb = ()
    font_colour_rgb = ()

    # Iterate through each index in the data dataframe
    for i in range(len(data.index)):
        # Check to see if the char_index is within the start and end indexes for the entity
        if data.entity_start[i] <= char_index <= data.entity_end[i]:
            # Update the background and font colour tuple if they are empty with the values
            # from data
            if background_colour_rgb == ():
                background_colour_rgb = data.background[i]
            if font_colour_rgb == ():
                font_colour_rgb = data.font[i]

    # Return the two tuples
    return background_colour_rgb, font_colour_rgb

# grouping the characters by colour
def group_char_by_colours(char_colour):
    """ Function to concatenate similar coloured text together into runs

    Args:
        char_colour (pandas DataFrame): A dataframe which contains a row for each character, and the
            corresponding background and font colour, see get_char_colour() for structure

    Returns:
        pandas DataFrame: A pandas DataFrame containing the relevant text grouped together based on
            their background and font colour
    """
    # Create a list to hold dictionaries of results
    char_colour_grouped_list = []

    # Create initial variables for the background, font colour and text
    background_colour_rgb = ()
    font_colour_rgb = ()
    text = ""

    # Iterate through each row of the char_colour Dataframe
    for i in range(len(char_colour.index)):
        # Set-up inital variables for the first iteration
        if i == 0:
            text = char_colour.char[i]
            background_colour_rgb = char_colour.background_colour_rgb[i]
            font_colour_rgb = char_colour.font_colour_rgb[i]
        else:
            # Check and see if the current indexes values for background and font colour have
            # changed. If they have, then add a dictionary to the main list. The dictionary will
            # contain the text, background and font colours
            if (not background_colour_rgb == char_colour.background_colour_rgb[i] or \
                not font_colour_rgb == char_colour.font_colour_rgb[i]):
                char_colour_grouped_dict = {
                    "text" : text,
                    "background_colour_rgb" : background_colour_rgb,
                    "font_colour_rgb" : font_colour_rgb
                }

                char_colour_grouped_list.append(char_colour_grouped_dict)

                # Update the main three variables with the new changed values of the currend index
                text = char_colour.char[i]
                background_colour_rgb = char_colour.background_colour_rgb[i]
                font_colour_rgb = char_colour.font_colour_rgb[i]

            else:
                # Append the current character to the overall text variable
                text += char_colour.char[i]

        # Make sure to add the last dictionary to the list to ensure all characters are accounted
        if i == len(char_colour.index) - 1:
            char_colour_grouped_dict = {
                "text" : text,
                "background_colour_rgb" : background_colour_rgb,
                "font_colour_rgb" : font_colour_rgb
            }

    return pd.DataFrame(char_colour_grouped_list)


# applying the colouring to documents
def apply_colouring_to_document(para, para_runs):
    """ Function to update a docx reference with the reference entities coloured

    Args:
        para (python-docx Paragraph): A python docx paragraph object
        para_runs (pandas DataFrame): A pandas dataframe containing the text for each run, including
            the background and font colours, see group_char_by_colours()

    Returns:
        str: String indicating that the reference has been coloured
    """
    # Remove all previous text in the paragraph. The original text will be replaced with the values
    # from para_runs
    para.text = ""
    para_runs = para_runs.applymap(
        lambda x: () if x is np.nan else x
    )

    # Iterate through each row in the para_runs dataframe
    for i in range(len(para_runs.index)):
        # Identify the text for the current row/run
        run_text = para_runs.text[i]

        # Add a new run to the paragraph, keep track of this new run in the variable new_run
        new_run = para.add_run(text = run_text)
        

        # Check to see if the current para_runs font is not an empty tuple (ie, has some colour to
        # be applied to it). If there is none, leave the font as 0,0,0 rgb ie black
        if not para_runs.font_colour_rgb[i] == ():
            # Update the font colour by using RGBColor function from python-docx package
            r, g, b = para_runs.font_colour_rgb[i]
            new_run.font.color.rgb = RGBColor(r, g, b)
        else:
            r, g, b = (0, 0, 0)
            new_run.font.color.rgb = RGBColor(r, g, b)

        # Check and see if the current run should have background colour applied, ie is not an
        # empty tuple
        if not para_runs.background_colour_rgb[i] == ():
            # Convert rgb to hex, as required by docx xml
            background_colour_hex = '#%02x%02x%02x' % para_runs.background_colour_rgb[i]
            # Create run_rag variable which will be used to store information on the shading (shd)
            # xml element
            run_tag = new_run._r

            # Create and set the values for the shd element
            shd = OxmlElement("w:shd")
            shd.set(qn("w:val"), "clear")
            shd.set(qn("w:color"), "auto")
            shd.set(qn("w:fill"), background_colour_hex)

            # Add shd properties to the run properties element
            run_tag.rPr.append(shd)

    return "Finished Colouring Reference"

In [6]:
import logging


raw_docx_file_path = "/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/RL_06_ELMU_REF.docx"
document, _, _ = create_document_object(raw_docx_file_path)
reference_model_results = pd.read_csv("/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/ner_colours_para_id.csv")
target_para_ids=reference_model_results.para_id.unique().tolist()


# fetched_block_ids = colour_all_references_test_V(document, target_para_ids)

logging.info("Iterating Through Docs Document Blocks")
for document_block in iter_block_items(document):

    if isinstance(document_block, Paragraph):
        para = document_block

        
    # print("paragraphs:\n ", para.text)

logging.info("Identifying Backgrond and Font Colour per Character in Reference")
char_colour = get_char_colour(
    para.text,
    reference_model_results)

logging.info("Grouping Characters by Background/Font Colour for Run Information")
char_colour_grouped = group_char_by_colours(char_colour)
print("coloured_grouped df: \n ", char_colour_grouped)


# para=para, 
para_runs=char_colour_grouped
# # from para_runs
para.text = ""

print(" ====== "*20)
para_runs = char_colour_grouped.applymap(lambda x: () if x is np.nan else x)
print("if there is any NaN:\n ", para_runs)


print(" ====== "*20)    
# # Iterate through each row in the para_runs dataframe
for i in range(len(para_runs.index)):
    # Identify the text for the current row/run
    run_text = para_runs.text[i]
    print(i, run_text, sep=" ---> ")

2023-01-18 09:07:02 INFO     Iterating Through Docs Document Blocks
2023-01-18 09:07:02 INFO     Identifying Backgrond and Font Colour per Character in Reference
2023-01-18 09:07:03 INFO     Grouping Characters by Background/Font Colour for Run Information


coloured_grouped df: 
                                                  text background_colour_rgb  \
0                                            Zizek,                     ()   
1                                                  S       (242, 219, 219)   
2                                           lavoj. 1                    ()   
3                                            997. “M                    ()   
4                                                  u                    ()   
5  lticulturalism, or, The Cultural Logic of Mult...       (255, 253, 215)   
6                                          al Capita                    ()   
7                                         lism.” New                    ()   

   font_colour_rgb  
0    (34, 177, 76)  
1               ()  
2   (255, 194, 14)  
3    (186, 20, 25)  
4   (255, 194, 14)  
5  (162, 157, 150)  
6   (255, 194, 14)  
7    (34, 177, 76)  
if there is any NaN:
                                                  text backgrou