In [13]:
# Banner identifying this notebook in the cell output.
print('Word Analysis Prototype 2')

# ! pip install python-docx
from docx import Document


Word Analysis Prototype 2


In [14]:
# functions

def _extract_font_details(paragraph):
    """Summarise one paragraph's stripped text and its run-level formatting.

    Font name and size are read from the first run only; bold/italic/underline
    are True when any run sets them to a truthy value. A paragraph without
    runs yields the "N/A" placeholder record instead of raising.
    """
    text = paragraph.text.strip()
    runs = paragraph.runs
    if not runs:
        # Text can exist without direct runs (presumably e.g. text held only
        # inside a hyperlink in recent python-docx versions - verify), so
        # runs[0] must never be indexed unguarded.
        return {"text": text, "font_name": "N/A", "font_size": "N/A", "bold": False, "italic": False, "underline": False}
    first_font = runs[0].font
    return {
        "text": text,
        "font_name": first_font.name if first_font.name else "Default",
        "font_size": first_font.size.pt if first_font.size else "Default",
        "bold": any(run.bold for run in runs),
        "italic": any(run.italic for run in runs),
        "underline": any(run.underline for run in runs),
    }


def analyze_word_document(file_path):
    """
    Analyze a Word document and extract its elements along with font details.

    Paragraphs are categorised by style name: a name containing "Heading 1"
    goes to "headers", any other name containing "Heading" goes to
    "subheaders", and every remaining paragraph with non-blank text goes to
    "paragraphs". Paragraphs whose style name starts with "List" and that
    have non-blank text are additionally recorded under "bullet_lists", so a
    list item normally appears in both "paragraphs" and "bullet_lists".

    Args:
        file_path (str): Path to the Word document.

    Returns:
        dict | None: A dictionary with the keys "headers", "subheaders",
        "paragraphs", "bullet_lists" (lists of font-detail dicts with the keys
        "text", "font_name", "font_size", "bold", "italic", "underline") and
        "images" (list of relationship target references). Returns None,
        after printing the error, if the document cannot be processed.
    """
    try:
        doc = Document(file_path)
        document_elements = {
            "headers": [],
            "subheaders": [],
            "paragraphs": [],
            "bullet_lists": [],
            "images": []
        }
        for paragraph in doc.paragraphs:
            details = _extract_font_details(paragraph)
            style = paragraph.style.name
            has_text = details["text"] != ""
            # Categorize the paragraph based on its style
            if "Heading 1" in style:
                document_elements["headers"].append(details)
            elif "Heading" in style:
                document_elements["subheaders"].append(details)
            elif has_text:
                document_elements["paragraphs"].append(details)
            # List items are also tracked separately; store a copy so the two
            # result lists never share one mutable dict.
            if style.startswith("List") and has_text:
                document_elements["bullet_lists"].append(dict(details))
        # Identify images by relationship type rather than by looking for the
        # substring "image" in the target, which also matched unrelated
        # targets such as hyperlink URLs containing that word.
        for rel in doc.part.rels.values():
            if rel.reltype.endswith("/image"):
                document_elements["images"].append(rel.target_ref)
        return document_elements
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"An error occurred while processing the document: {e}")
        return None




In [15]:
# Example usage: analyse one document and print every category that was found.

file_path = "example_one.docx"  # Replace with your Word document path
results = analyze_word_document(file_path)
if results:
    print("Document Elements Found:")
    # All text categories share one entry layout, so a single loop replaces
    # the four copy-pasted print loops. Each pair is (heading, results key).
    text_sections = [
        ("Headers", "headers"),
        ("Subheaders", "subheaders"),
        ("Paragraphs", "paragraphs"),
        ("Bullet Lists", "bullet_lists"),
    ]
    for title, key in text_sections:
        print(f"\n{title}:")
        for entry in results[key]:
            print(f"  - {entry['text']} (Font: {entry['font_name']}, Size: {entry['font_size']}, Bold: {entry['bold']}, Italic: {entry['italic']}, Underline: {entry['underline']})")
    # Images are plain target-reference strings, so they are printed as-is.
    print("\nImages:")
    for image in results["images"]:
        print(f"  - {image}")

Document Elements Found:

Headers:

Subheaders:

Paragraphs:
  - What I Hope to Achieve in This Course (Font: Default, Size: Default, Bold: False, Italic: False, Underline: False)
  - I hope to get a good grade and learn some stuff about computers. (Font: Calibri, Size: Default, Bold: False, Italic: False, Underline: False)
  - Big computers (Font: Calibri, Size: Default, Bold: False, Italic: False, Underline: False)
  - Little computers (Font: Calibri, Size: Default, Bold: False, Italic: False, Underline: False)

Bullet Lists:
  - Big computers (Font: Calibri, Size: Default, Bold: False, Italic: False, Underline: False)
  - Little computers (Font: Calibri, Size: Default, Bold: False, Italic: False, Underline: False)

Images:
  - ../media/image.jpg
