## Extract a specific run property (e.g. italic) and write the reference to a new document with per-entity colour and run formatting

In [99]:
##
## reference extraction example
##


import docx
from docx.shared import RGBColor
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt



# Rebuild one reference paragraph in a new document: each extracted entity is
# written as its own run, keeps the italic/bold/superscript style of the source
# run it starts in, and is colour-coded by entity type.

# Local import: Run wraps the <w:r> element that actually owns a <w:t> node.
from docx.text.run import Run

# opening an existing document
document = docx.Document('refeence_data.docx')

# start a new one for output
outdoc = docx.Document()

# example of format from CSV
#referencetext = "American Psychological Association. (2021). One year later, a new wave of pandemic health concerns. Retrieved on September 20, 2021, from http://www.apa.org/news/press/releases/stress/2021/one-year-pandemic-stress"
#referenceentities = [{'entity': 'organisation', 'entity_text': 'American Psychological Association', 'entity_start': 0, 'entity_end': 34}, {'entity': 'year', 'entity_text': '2021', 'entity_start': 37, 'entity_end': 41}, {'entity': 'title', 'entity_text': 'One year later, a new wave of pandemic health concerns', 'entity_start': 44, 'entity_end': 98}, {'entity': 'link', 'entity_text': 'http://www.apa.org/news/press/releases/stress/2021/one-year-pandemic-stress', 'entity_start': 138, 'entity_end': 213}]

# fictitious reference extraction in the same format, but which matches paragraph[1] content
referencetext = "The 1619 Project. New York Times Magazine. August 18, 2019. https://pulitzercenter.org/sites/default/files/full_issue_of_the_1619_project.pdf."
referenceentities = [{'entity': 'title', 'entity_text': 'The 1619 Project', 'entity_start': 0, 'entity_end': 15},
                     {'entity': 'organisation', 'entity_text': 'New York Times Magazine', 'entity_start': 18, 'entity_end': 40},
                     {'entity': 'date', 'entity_text': 'August 18, 2019', 'entity_start': 43, 'entity_end': 57},
                     {'entity': 'link', 'entity_text': 'https://pulitzercenter.org/sites/default/files/full_issue_of_the_1619_project.pdf', 'entity_start': 60, 'entity_end': 140},
                     {'entity': 'misc', 'entity_text': 'cm2', 'entity_start': 143, 'entity_end': 146}]

# Entity type -> (font colour, background shading fill). Types that are not
# listed here are written without any colour highlighting.
ENTITY_STYLES = {
    'title': (RGBColor(0xFF, 0x00, 0x00), 'FFAAAA'),
    'organisation': (RGBColor(0x00, 0x00, 0xFF), 'FF336B'),
}

# grab 1st paragraph matching the reference above
p = document.paragraphs[1]
print(p.text)

# generate 'word' list with character start/end and run style

# get text elements (this also finds <w:t> nodes nested inside hyperlinks)
elements = p._element.xpath('.//w:t')

# Direct child runs of the paragraph. No longer used for the style lookup
# (see below); kept so the name stays available in the notebook session.
runs = p.runs

wordlist = []
charcount = 0
for element in elements:
    # An empty <w:t/> has text None; treat it as zero-length text.
    text = element.text or ''

    # Read the style from the run that owns this element. Indexing p.runs by
    # element position is wrong as soon as the paragraph contains a hyperlink:
    # './/w:t' includes the hyperlink text but p.runs does not, so every
    # later index would be shifted.
    owner = Run(element.getparent(), p)
    if owner.italic:
        run_style = 'italic'
    elif owner.bold:
        run_style = 'bold'
    elif owner.font.superscript:
        run_style = 'superscript'
    else:
        run_style = ''

    wordlist.append({
        'text': text,
        'start': charcount,
        'end': charcount + len(text) - 1,  # inclusive, like entity_end
        'run': run_style,
    })
    charcount += len(text)

newpar = outdoc.add_paragraph()

for e in referenceentities:
    # First text element whose inclusive [start, end] span contains the
    # entity's first character. Zero-length elements can never match, and
    # taking only the first hit guarantees each entity is written once.
    source = next(
        (w for w in wordlist if w['start'] <= e['entity_start'] <= w['end']),
        None,
    )
    if source is None:
        # Entity offset lies outside the paragraph text: nothing to copy.
        continue

    r = newpar.add_run(e['entity_text'])
    if source['run'] == 'italic':
        r.italic = True
    elif source['run'] == 'bold':
        r.bold = True
    elif source['run'] == 'superscript':
        r.font.superscript = True

    entity_style = ENTITY_STYLES.get(e['entity'])
    if entity_style is not None:
        colour, fill = entity_style
        r.font.color.rgb = colour
        # Setting font properties also guarantees that r._r.rPr exists
        # before the shading element is appended to it.
        r.font.size = Pt(12)

        shd = OxmlElement('w:shd')
        # shd.set(qn('w:val'), 'clear')
        # shd.set(qn('w:color'), 'auto')
        shd.set(qn('w:fill'), fill)
        r._r.rPr.append(shd)

    # One separator per written entity (not one per scanned text element).
    newpar.add_run(" ")

outdoc.save('testout.docx')
        
        
    


The 1619 Project. New York Times Magazine. August 18, 2019. . cm2


In [76]:
import docx
from docx.shared import RGBColor
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt

# Shade and italicise the first run of the first paragraph of Testing.docx,
# then save the result as Testing1.docx.
document = docx.Document("Testing.docx")

p = document.paragraphs[0]

elements = p._element.xpath('.//w:t')

# get runs
runs = p.runs

print(runs[0].text)

# Get the XML tag
tag = runs[0]._r
# print(runs[0].element.xml)

# Create the shading XML element
shd = OxmlElement('w:shd')

# Add attributes to the element
shd.set(qn('w:val'), 'clear')
shd.set(qn('w:color'), 'auto')
shd.set(qn('w:fill'), 'FFAAAA')

# Set the font size - this is important! Setting a font property creates the
# run's <w:rPr> element; without it tag.rPr below could be None.
runs[0].font.size = Pt(12)
# print(run.element.xml)

# Use the run property instead of appending a hand-made <w:i>: the setter
# reuses an existing italic element rather than adding a duplicate, and lets
# python-docx place it inside <w:rPr>.
runs[0].italic = True
tag.rPr.append(shd)
document.save("Testing1.docx")

Senthil


In [None]:
# Bare expression: shows the current contents of `wordlist` when this cell is
# run interactively (it has no effect when executed as a plain script).
wordlist

In [None]:
# Debug helper: print the text of the raw <w:r> elements at positions 6-10 of
# the current paragraph `p`, including runs nested inside other elements.
test = p._element.xpath('.//w:r')

# Slice instead of indexing over range(6, 11) so that a paragraph with fewer
# than 11 runs prints what is available rather than raising IndexError.
for run_element in test[6:11]:
    print(run_element.text)
    

### Final script for live colour update based on entity type, preserving all other run properties

In [None]:
##
## reference extraction example
##


import docx


# Recolour the runs of one reference paragraph in place, according to the type
# of the entity they belong to, and save the modified document. Only the font
# colour of the affected runs is touched.

# Local import: Run wraps the <w:r> element that actually owns a <w:t> node.
from docx.text.run import Run

# opening an existing document
document = docx.Document('refeence_data.docx')

# start a new one for output
outdoc = docx.Document()

# example of format from CSV
#reference_paragraphs = "American Psychological Association. (2021). One year later, a new wave of pandemic health concerns. Retrieved on September 20, 2021, from http://www.apa.org/news/press/releases/stress/2021/one-year-pandemic-stress"
#reference_entities = [{'entity': 'organisation', 'entity_text': 'American Psychological Association', 'entity_start': 0, 'entity_end': 34}, {'entity': 'year', 'entity_text': '2021', 'entity_start': 37, 'entity_end': 41}, {'entity': 'title', 'entity_text': 'One year later, a new wave of pandemic health concerns', 'entity_start': 44, 'entity_end': 98}, {'entity': 'link', 'entity_text': 'http://www.apa.org/news/press/releases/stress/2021/one-year-pandemic-stress', 'entity_start': 138, 'entity_end': 213}]

# fictitious reference extraction in the same format, but which matches paragraph[1] content
reference_paragraphs = "The 1619 Project. New York Times Magazine. August 18, 2019. https://pulitzercenter.org/sites/default/files/full_issue_of_the_1619_project.pdf."
reference_entities = [{'entity': 'title', 'entity_text': 'The 1619 Project', 'entity_start': 0, 'entity_end': 15},
                     {'entity': 'organisation', 'entity_text': 'New York Times Magazine', 'entity_start': 18, 'entity_end': 40},
                     {'entity': 'date', 'entity_text': 'August 18, 2019', 'entity_start': 43, 'entity_end': 57},
                     {'entity': 'link', 'entity_text': 'https://pulitzercenter.org/sites/default/files/full_issue_of_the_1619_project.pdf', 'entity_start': 60, 'entity_end': 140}]

# Entity type -> font colour. Entity types not listed here are left untouched.
ENTITY_COLOURS = {
    'title': docx.shared.RGBColor(0xFF, 0x00, 0x00),
    'organisation': docx.shared.RGBColor(0x00, 0x00, 0xFF),
}

# grab 1st paragraph matching the reference above
doc_paras = document.paragraphs[1]

# generate 'word' list with character start/end

# get document paragraph text elements (includes <w:t> nodes inside hyperlinks)
doc_para_texts = doc_paras._element.xpath('.//w:t')

# Direct child runs of the paragraph. Not used for the colour update (see
# element_runs below); kept so the name stays available in the notebook.
runs = doc_paras.runs

wordlist = []
# element_runs[i] is the run owning doc_para_texts[i]. Indexing
# doc_paras.runs by text-element position is wrong when the paragraph holds
# a hyperlink: './/w:t' sees the hyperlink text but .runs does not, so the
# index can point at the wrong run or past the end of the list.
element_runs = []
charcount = 0
for text_element in doc_para_texts:
    # An empty <w:t/> has text None; treat it as zero-length text.
    text = text_element.text or ''
    wordlist.append({
        'text': text,
        'start': charcount,
        'end': charcount + len(text) - 1,  # inclusive, like entity_end
    })
    element_runs.append(Run(text_element.getparent(), doc_paras))
    charcount += len(text)


print(wordlist)
for e in reference_entities:
    nchar = e['entity_end'] - e['entity_start'] + 1
    print(e)
    print(nchar)

    colour = ENTITY_COLOURS.get(e['entity'])

    # seek each wordlist element included in the entity
    for i, w in enumerate(wordlist):
        if w['start'] >= e['entity_start']:
            print("Found element from entity at index " + str(i))

            if colour is not None:
                # change the colour in the current doc, on the owning run
                element_runs[i].font.color.rgb = colour
        # Stop once the element reaching the entity's last character is done.
        if w['end'] >= e['entity_end']:
            break

document.save('testout.docx')
        
        