In [None]:
import docx


def parse_paragraph_runs(paragraph):
    run_dict = {}
    run_texts = []
    run_start_indices = []
    run_end_indices = []
    run_properties = []
    combined_properties = []
    text = paragraph.text
    for run in paragraph.runs:
        run_texts.append(run.text)
        run_start_indices.append(text.index(run.text))
        run_end_indices.append(text.index(run.text) + len(run.text))
        run_properties.append({
            'italic': run.italic,
            'bold': run.bold,
            'superscript': run.style.font.superscript,
            'subscript': run.style.font.subscript
        })
        combined_properties.append({
            'italic_bold': run.italic and run.bold,
            'italic_superscript': run.italic and run.style.font.superscript,
            'bold_superscript': run.bold and run.style.font.superscript
        })
    run_dict['run_texts'] = run_texts
    run_dict['run_start_indices'] = run_start_indices
    run_dict['run_end_indices'] = run_end_indices
    run_dict['run_properties'] = run_properties
    run_dict['combined_properties'] = combined_properties
    print(run_dict)
    return run_dict

document = docx.Document('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/RL_06_ELMU_REF.docx')
for paragraph in document.paragraphs:
    parse_paragraph_runs(paragraph)


In [2]:
import docx
import csv
import pandas as pd

def parse_paragraph_runs(paragraph, filename):
    run_dict = {}
    run_texts = []
    run_start_indices = []
    run_end_indices = []
    run_properties = []
    combined_properties = []
    text = paragraph.text
    for run in paragraph.runs:
        run_texts.append(run.text)
        run_start_indices.append(text.index(run.text))
        run_end_indices.append(text.index(run.text) + len(run.text))
        prop = {}
        if run.italic:
            prop['italic'] = run.italic
        if run.bold:
            prop['bold'] = run.bold
        if run.style.font.superscript:
            prop['superscript'] = run.style.font.superscript
        if run.style.font.subscript:
            prop['subscript'] = run.style.font.subscript
        run_properties.append(prop)
        
        c_prop = {}
        if run.italic and run.bold:
            c_prop['italic_bold'] = run.italic and run.bold
        if run.italic and run.style.font.superscript:
            c_prop['italic_superscript'] = run.italic and run.style.font.superscript
        if run.bold and run.style.font.superscript:
            c_prop['bold_superscript'] = run.bold and run.style.font.superscript
        combined_properties.append(c_prop)
    run_dict['run_texts'] = run_texts
    run_dict['run_start_indices'] = run_start_indices
    run_dict['run_end_indices'] = run_end_indices
    run_dict['run_properties'] = run_properties
    run_dict['combined_properties'] = combined_properties
    
    with open(filename, 'a', newline='') as csvfile:
        run_df = pd.json_normalize(run_dict)
        if csvfile.tell() == 0:
            run_df.to_csv(csvfile, header=True, index=False)
        else:
            run_df.to_csv(csvfile, header=False, index=False, mode='a')
        print(f"Data written to {filename}")


document = docx.Document('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/RL_06_ELMU_REF.docx')

for paragraph in document.paragraphs:
    parse_paragraph_runs(paragraph, "paragraphs_data.csv")



In [76]:
import docx
import csv
import pandas as pd
import os

def parse_paragraph_runs(paragraph, paragraph_id, filename):
    run_texts = []
    run_start_indices = []
    run_end_indices = []
    text = paragraph.text
    curr_index = 0

    for run in paragraph.runs:
        run_texts.append(run.text)
        run_start_indices.append(curr_index)
        curr_index += len(run.text)
        run_end_indices.append(curr_index)
    df = pd.DataFrame({
        'paragraph_id': paragraph_id,
        'run_texts': run_texts,
        'run_start_indices': run_start_indices,
        'run_end_indices': run_end_indices,
        'italic': [run.italic for run in paragraph.runs],
        'bold': [run.bold for run in paragraph.runs],
        'superscript': [run.style.font.superscript for run in paragraph.runs],
        'subscript': [run.style.font.subscript for run in paragraph.runs],
        'italic_bold': [run.italic and run.bold for run in paragraph.runs],
        'italic_superscript': [run.italic and run.style.font.superscript for run in paragraph.runs],
        'bold_superscript': [run.bold and run.style.font.superscript for run in paragraph.runs],
    })
    


    if not os.path.exists(filename):
        df.to_csv(filename, index=False, mode='w')
    else:
        df.to_csv(filename, index=False, mode='a', header=False)
    # print(f"Data written to {filename}")

document = docx.Document('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/color_italic.docx')

for i, paragraph in enumerate(document.paragraphs):
    parse_paragraph_runs(paragraph, i, "paragraphs_data.csv")


In [56]:
import pandas as pd
from docx import Document

def apply_properties_to_document(filename):
    df = pd.read_csv(filename)
    df = df.fillna(False)
    document = Document("/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/coloured_RL_06_ELMU_REF copy.docx")
    for i, paragraph in enumerate(document.paragraphs):
        runs_data = df[df['paragraph_id'] == i]
        for j, run in enumerate(paragraph.runs):
            run_data = runs_data[runs_data['run_texts'] == run.text]
            if run_data.empty:
                continue
            run.italic = run_data['italic'].values[0] or run_data['italic_bold'].values[0] or run_data['italic_superscript'].values[0]
            run.bold = run_data['bold'].values[0] or run_data['italic_bold'].values[0] or run_data['bold_superscript'].values[0]
            run.style.font.superscript = run_data['superscript'].values[0] or run_data['italic_superscript'].values[0] or run_data['bold_superscript'].values[0]
            run.style.font.subscript = run_data['subscript'].values[0]
    document.save("/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/RL_06_ELMU_REF_colored.docx")

apply_properties_to_document("/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/paragraphs_data.csv")


In [57]:
import pandas as pd
from docx import Document

def apply_properties_to_document(filename):
    df = pd.read_csv(filename)
    df = df.fillna(False)
    document = Document("/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/coloured_RL_06_ELMU_REF copy.docx")

    # Create paragraph Ids
    for i, paragraph in enumerate(document.paragraphs):
        paragraph._element.attrib["id"] = str(i)
        print()

    # Check if last paragraph id in CSV matches the number of paragraphs in the document
    if int(df.iloc[-1]["paragraph_id"]) != len(document.paragraphs) - 1:
        print("Last paragraph id in CSV does not match the number of paragraphs in the document")
        return

    for i, paragraph in enumerate(document.paragraphs):
        runs_data = df[df['paragraph_id'] == i]
        text = paragraph.text
        for j, run in enumerate(paragraph.runs):
            # Locate the run based on start and end indices
            run_data = runs_data[(runs_data['run_start_indices'] <= j) & (runs_data['run_end_indices'] > j)]
            if run_data.empty:
                continue
            run.italic = run_data['italic'].values[0]
            run.bold = run_data['bold'].values[0]
            run.style.font.superscript = run_data['superscript'].values[0]
            run.style.font.subscript = run_data['subscript'].values[0]
    document.save("/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/RL_06_ELMU_REF_colored.docx")

apply_properties_to_document("/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/paragraphs_data.csv")


In [105]:
from docx import Document
from docx.shared import RGBColor
from docx.enum.text import WD_COLOR_INDEX


# Open the document
document = Document('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/color_italic.docx')

# Iterate through the paragraphs in the document
for paragraph in document.paragraphs:
    # Iterate through the runs in the paragraph
    for run in paragraph.runs:
        # Check if the run's color matches the search color
        print(run.font.color.rgb)

        if run.font.color:
            print(run.font.color.rgb)
            if run.font.color.rgb == "#3791BD":
                print("hi")
                # Set the run's italic property to True
                run.italic = True

# Save the document
document.save('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/modified_document.docx')


3791BD
3791BD


In [127]:
from docx import Document
import xml.etree.ElementTree as ET

document = Document('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/color_italic.docx')

for paragraph in document.paragraphs:
    for run in paragraph.runs:
        color = run._element.find('.//w:color', run._element.nsmap)
        # print (color.attrib)
        # if color is not None:
        #     print(color.attrib)
        if color is not None and color.get('w:val') == '3791BD':
            run._element.set('w:val', 'A29D96')  # change the color val to black
            run.italic = True
document.save('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/modified_document.docx')


In [137]:
from docx import Document
import xml.etree.ElementTree as ET

document = Document('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/coloured_RL_06_ELMU_REF.docx')

color_counter = 0
for paragraph in document.paragraphs:
    color_counter = 0
    for run in paragraph.runs:
        xml_element = ET.fromstring(run._element.xml)
        color = xml_element.find('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color')
        if color is not None and color.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') == 'A29D96':
            color_counter += 1
            if color_counter == 2:
                run.italic = True
                color_counter = 0
    if color_counter == 1:
        run.italic = True
document.save('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/modified_document.docx')


In [139]:
from docx import Document
import xml.etree.ElementTree as ET

document = Document('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/coloured_RL_06_ELMU_REF.docx')

color_to_match = 'A29D96'
last_matched_run = None

for paragraph in document.paragraphs:
    for run in paragraph.runs:
        xml_element = ET.fromstring(run._element.xml)
        color = xml_element.find('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}color')
        if color is not None and color.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') == color_to_match:
            last_matched_run = run
    if last_matched_run:
        last_matched_run.italic = True
        last_matched_run = None

document.save('/Users/senthil/Desktop/Senthil/myTesting/python_scripts/reference_coloring/modified_document.docx')
