# Reference Entities Coloring Module

In [5]:
##
## reference entities coloring module
##


import docx
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt
from docx.shared import RGBColor
from docx.text.run import Run


# Source document that holds the formatted reference paragraphs.
# NOTE(review): 'refeence_data.docx' looks like a misspelling of
# 'reference_data.docx' -- confirm against the file that is actually on disk.
document = docx.Document('refeence_data.docx')

# Empty document that will receive the coloured output.
outdoc = docx.Document()

# Sample of one extraction record in the format delivered by the CSV
# (kept for reference only, not executed):
#referencetext = "American Psychological Association. (2021). One year later, a new wave of pandemic health concerns. Retrieved on September 20, 2021, from http://www.apa.org/news/press/releases/stress/2021/one-year-pandemic-stress"
#referenceentities = [{'entity': 'organisation', 'entity_text': 'American Psychological Association', 'entity_start': 0, 'entity_end': 34}, {'entity': 'year', 'entity_text': '2021', 'entity_start': 37, 'entity_end': 41}, {'entity': 'title', 'entity_text': 'One year later, a new wave of pandemic health concerns', 'entity_start': 44, 'entity_end': 98}, {'entity': 'link', 'entity_text': 'http://www.apa.org/news/press/releases/stress/2021/one-year-pandemic-stress', 'entity_start': 138, 'entity_end': 213}]

# Hand-written extraction in the same record format, made to line up with the
# content of paragraph[1] of the source document.
# NOTE(review): in the CSV sample above 'entity_end' is exclusive (0..34 for a
# 34-character string), while the records below use the index of the last
# character (0..15 for 16 characters) -- confirm which convention is intended.
referencetext = "The 1619 Project. New York Times Magazine. August 18, 2019. https://pulitzercenter.org/sites/default/files/full_issue_of_the_1619_project.pdf."
referenceentities = [
    {
        'entity': 'title',
        'entity_text': 'The 1619 Project',
        'entity_start': 0,
        'entity_end': 15,
    },
    {
        'entity': 'organisation',
        'entity_text': 'New York Times Magazine',
        'entity_start': 18,
        'entity_end': 40,
    },
    {
        'entity': 'date',
        'entity_text': 'August 18, 2019',
        'entity_start': 43,
        'entity_end': 57,
    },
    {
        'entity': 'link',
        'entity_text': 'https://pulitzercenter.org/sites/default/files/full_issue_of_the_1619_project.pdf',
        'entity_start': 60,
        'entity_end': 140,
    },
    {
        'entity': 'misc',
        'entity_text': 'cm2',
        'entity_start': 143,
        'entity_end': 146,
    },
]


# The second paragraph of the source document is the one the records above
# describe.
doc_paras = document.paragraphs[1]
# print(doc_paras.text)

# Build `run_lists`: one record per <w:t> text element of the paragraph, with
#   'text'  - the element's text ('' when the element is empty)
#   'start' - offset of its first character in the concatenated <w:t> text
#   'end'   - offset of its last character (inclusive; start - 1 when empty)
#   'run'   - 'italic', 'bold', 'superscript' or '' (checked in that order, so
#             a run that is both italic and bold is reported as 'italic')

# All text elements below the paragraph, in document order.
elements = doc_paras._element.xpath('.//w:t')

# Direct child runs of the paragraph. Not used to derive styles: this list does
# not line up index-for-index with `elements` (a run may hold no <w:t> or
# several, and runs nested in other containers may be missing from it).
runs = doc_paras.runs

run_lists = []
charcount = 0
for text_element in elements:
    # An empty <w:t/> reports its text as None; treat that as zero characters.
    text = text_element.text or ''

    # Read the formatting from the <w:r> that actually owns this text element
    # rather than from runs[i], so the style always belongs to the text.
    style = ''
    owner = text_element.getparent()
    if owner is not None and owner.tag == qn('w:r'):
        owning_run = Run(owner, doc_paras)
        if owning_run.italic:
            style = 'italic'
        elif owning_run.bold:
            style = 'bold'
        elif owning_run.font.superscript:
            style = 'superscript'

    run_lists.append({
        'text': text,
        'start': charcount,
        'end': charcount + len(text) - 1,
        'run': style,
    })
    charcount += len(text)

# Write the recognised entities into a single paragraph of the output
# document, copying the italic/bold/superscript flag of the source run each
# entity starts in and highlighting selected entity types, then save the file.

# Entity label -> (font colour, background fill as hex RRGGBB).
# Labels without an entry are written without colour, size or shading changes.
ENTITY_HIGHLIGHTS = {
    'title': (RGBColor(0xFF, 0x00, 0x00), 'FFAAAA'),
    'organisation': (RGBColor(0x00, 0x00, 0xFF), 'FF336B'),
}

# Children of <w:rPr> that must come after <w:shd> in the WordprocessingML
# element sequence. Used to put the shading element in a valid position instead
# of blindly appending it (which lands after <w:vertAlign> on superscript runs).
_SHD_SUCCESSORS = (
    'w:fitText', 'w:vertAlign', 'w:rtl', 'w:cs', 'w:em', 'w:lang',
    'w:eastAsianLayout', 'w:specVanish', 'w:oMath',
)

newpar = outdoc.add_paragraph()

for e in referenceentities:
    # An entity is rendered only when it starts exactly where a source run
    # starts. Zero-length runs share their start offset with the run that
    # follows them, so they are skipped: the entity is emitted once, styled by
    # the run that really holds its first character.
    source_run = next(
        (
            candidate for candidate in run_lists
            if candidate['start'] == e['entity_start'] and candidate['text']
        ),
        None,
    )
    if source_run is None:
        continue

    text_run = newpar.add_run(e['entity_text'])

    source_style = source_run['run']
    if source_style == 'italic':
        text_run.italic = True
    elif source_style == 'bold':
        text_run.bold = True
    elif source_style == 'superscript':
        text_run.font.superscript = True

    highlight = ENTITY_HIGHLIGHTS.get(e['entity'])
    if highlight is not None:
        font_colour, fill_hex = highlight
        text_run.font.color.rgb = font_colour
        text_run.font.size = Pt(12)

        # Background shading has no python-docx property, so build <w:shd> by
        # hand. 'w:val' is a required attribute; 'clear' means "no pattern",
        # leaving just the fill colour.
        shading = OxmlElement('w:shd')
        shading.set(qn('w:val'), 'clear')
        shading.set(qn('w:fill'), fill_hex)
        text_run._r.get_or_add_rPr().insert_element_before(
            shading, *_SHD_SUCCESSORS
        )

    # Exactly one separator after each rendered entity.
    newpar.add_run(" ")

outdoc.save('testout.docx')
        
    
