## Append the class name inside docx

In [1]:
import os
import json
from docx import Document
from docx.shared import Pt, RGBColor

def process_folder(folder_path):
    output_folder = os.path.join(folder_path, 'updated_docx')
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.docx'):
                docx_path = os.path.join(root, file)
                json_path = os.path.join(root, file.replace('.docx', '.json'))
                if os.path.exists(json_path):
                    process_docx_with_json(docx_path, json_path, output_folder)

def process_docx_with_json(docx_path, json_path, output_folder):
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)

    doc = Document(docx_path)

    # Remove empty paragraphs before appending class names
    remove_empty_paragraphs(doc)

    for para_data in data.get('LdxData', []):
        para_id = para_data.get('ParaID')
        if para_id and para_id.isdigit():
            para_id = int(para_id)
            if 1 <= para_id <= len(doc.paragraphs):
                para = doc.paragraphs[para_id - 1]
                class_name = para_data.get('IdentifyStyle')
                if class_name:
                    append_class_name(para, class_name)

    output_path = os.path.join(output_folder, os.path.basename(docx_path))
    doc.save(output_path)
    print(f"Processed: {docx_path}, Updated file saved to: {output_path}")

def append_class_name(para, class_name):
    # Append class name at the end of the paragraph
    run = para.add_run(f" {{{{{class_name}}}}}")
    # Preserve the original paragraph style properties
    run.bold = para.style.font.bold
    run.italic = para.style.font.italic
    run.underline = para.style.font.underline
    run.font.size = para.style.font.size
    run.font.color.rgb = para.style.font.color.rgb

def remove_empty_paragraphs(doc):
    for para in doc.paragraphs[:]:
        if not para.text.strip():
            doc._element.body.remove(para._element)

if __name__ == "__main__":
    folder_path = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/test'
    process_folder(folder_path)


Processed: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/ABCR_20230217/ABCR_20230217.docx, Updated file saved to: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/updated_docx/ABCR_20230217.docx
Processed: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/ABCR_20230161/ABCR_20230161.docx, Updated file saved to: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/updated_docx/ABCR_20230161.docx
Processed: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/ABCR_20230180/ABCR_20230180.docx, Updated file saved to: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/updated_docx/ABCR_20230180.docx
Processed: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/ABCR_20230173/ABCR_20230173.docx, Updated file saved to: /Users/senthil/Downloads/JPT/journals_data_element_prediction/test/updated_docx/ABCR_20230173.docx
Processed: /Users/senthil/Downloads/JPT/journals_data_element_pr

### Collect the uniqure class name from the property json

In [21]:
import os
import json

def collect_identify_styles(folder_path):
    unique_styles = set()

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                json_path = os.path.join(root, file)
                styles = extract_identify_styles(json_path)
                unique_styles.update(styles)

    return list(unique_styles)

def extract_identify_styles(json_path):
    styles = set()
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)
        for item in data.get('LdxData', []):
            style = item.get('IdentifyStyle')
            if style:
                styles.add(style)
    return styles

if __name__ == "__main__":
    folder_path = '/Users/senthil/Downloads/JPT/journals_data_element_prediction/dataset prep'
    unique_styles = collect_identify_styles(folder_path)
    print(unique_styles)


['AFF', 'SUPPLFIG', 'REF', 'ABSTRACTTEXT', 'ACK', 'VOLUME', 'REF_TITLE', 'SUB_LI', 'FN', 'DOCHEAD', 'BL', 'LI', 'DOI', 'LI_PARA', 'H2', 'AFFS', 'SUPPLTBL', 'EXTRACT', 'LRH', 'BIOGRAPHY_PARA', 'EPIGRAPH', 'EQUATION', 'DATE', 'ISSUE', 'COPYRIGHT', 'EXAMPLE_GROUP', 'SUBJECTEDITOR', 'TRANSABSTRACTTEXT', 'EPIGRAPH_AUTHOR', 'OA', 'ABSTRACTH1', 'TITLE', 'AUTHORS', 'H1', 'H4', 'TBL', 'CORRESPONDENCE', 'PARANOTES', 'TRANSTITLE', '[DELETE]', 'PUBLICATIONDATE', 'TRANSABSTRACTHEAD', 'H3', 'SUPPLTBLFN', 'BOX_TEXT', 'PARA', 'KEYWORDS', 'ABSTRACTHEAD', 'FIG', 'RRH', 'TBLFN', 'BOX_TITLE']
