In [1]:
import zipfile
import os

def extract_docx_to_xml(docx_path, output_dir):
    """Unpack every ``.xml`` member of a .docx archive into ``output_dir``.

    The output directory is created first when it does not exist yet. Each
    extracted member keeps its path inside the archive (for example
    ``word/document.xml``), and one progress line is printed per member.

    Args:
        docx_path: Path of the .docx (ZIP) file to read.
        output_dir: Directory that receives the extracted files.
    """
    # Make sure the destination exists before anything is written into it.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with zipfile.ZipFile(docx_path, 'r') as archive:
        # Keep only the members whose name ends in ".xml"; everything else is skipped.
        xml_members = [member for member in archive.namelist() if member.endswith('.xml')]
        for member in xml_members:
            archive.extract(member, output_dir)
            print(f'Extracted {member} to {output_dir}')

    print("Extraction complete!")

# Example usage
docx_path = 'tt.docx'  # replace with the path of your .docx file
output_dir = 'tt_xml'  # replace with the directory that should receive the XML files
extract_docx_to_xml(docx_path, output_dir)


Extracted [Content_Types].xml to tt_xml
Extracted customXml/item1.xml to tt_xml
Extracted customXml/item2.xml to tt_xml
Extracted customXml/itemProps1.xml to tt_xml
Extracted customXml/itemProps2.xml to tt_xml
Extracted docProps/app.xml to tt_xml
Extracted docProps/core.xml to tt_xml
Extracted docProps/custom.xml to tt_xml
Extracted word/document.xml to tt_xml
Extracted word/fontTable.xml to tt_xml
Extracted word/footer1.xml to tt_xml
Extracted word/header1.xml to tt_xml
Extracted word/numbering.xml to tt_xml
Extracted word/settings.xml to tt_xml
Extracted word/styles.xml to tt_xml
Extracted word/theme/theme1.xml to tt_xml
Extraction complete!


In [3]:
import re
import csv
from docx import Document


def extract_person_info(text):
    """Pull the contact persons (name, phone, e-mail) out of a block of text.

    Three role labels are looked up, in this order: ``项目总负责人``,
    ``单项设计负责人`` and ``建设单位联系人``. Phone numbers (``电话：``) and
    e-mail addresses (``电子邮箱：``) are collected in order of appearance,
    and the n-th role is paired with the n-th phone and the n-th e-mail.

    NOTE(review): the pairing is purely positional. If a role label is
    missing while its phone/e-mail lines are still present, later roles are
    matched against shifted entries. Confirm that the source table always
    lists the roles in this order.

    Args:
        text: Text to scan; each field is expected on its own line. ``None``
            or an empty string is accepted and yields ``None``.

    Returns:
        A list of ``{'name', 'tel', 'mail'}`` dicts (values stripped), or
        ``None`` when no complete person could be extracted.
    """
    # The upstream table lookup returns None when nothing is found; treat that
    # like "no contacts" instead of letting the regex calls raise TypeError.
    if not text:
        return None

    role_labels = ('项目总负责人', '单项设计负责人', '建设单位联系人')
    tel_match = re.findall(r'电话：([^\n]+)', text)
    mail_match = re.findall(r'电子邮箱：([^\n]+)', text)

    persons = []
    for index, label in enumerate(role_labels):
        name_match = re.search(label + r'：([^\n]+)', text)
        # A person is only reported when name, phone and e-mail are all available.
        if name_match and len(tel_match) > index and len(mail_match) > index:
            persons.append({
                'name': name_match.group(1).strip(),
                'tel': tel_match[index].strip(),
                'mail': mail_match[index].strip(),
            })
    return persons if persons else None

def find_table_after_paragraph(docx_path, paragraph_text):
    """Return the bottom-right cell text of the table following a paragraph.

    The document's paragraphs are scanned in order. Once a paragraph whose
    text contains ``paragraph_text`` has been seen, that paragraph and every
    later one is checked: the first of them whose next sibling element is one
    of the document's tables determines the result.

    Args:
        docx_path: Path of the .docx file to open.
        paragraph_text: Text that marks the paragraph preceding the table.

    Returns:
        The stripped text of the last cell in the last row of that table,
        ``''`` when the table has no rows or cells, or ``None`` when no such
        table exists.
    """
    doc = Document(docx_path)

    # Build the element -> table lookup once instead of re-reading doc.tables
    # for every paragraph. The list keeps the table objects (and therefore
    # their XML elements) alive, so the id() keys stay valid for the whole scan.
    tables = doc.tables
    table_by_element_id = {id(table._element): table for table in tables}

    found_paragraph = False
    for para in doc.paragraphs:
        if not found_paragraph:
            if paragraph_text not in para.text:
                continue
            found_paragraph = True

        next_element = para._element.getnext()
        if next_element is None:
            continue
        table = table_by_element_id.get(id(next_element))
        if table is None:
            continue

        # Bottom-right cell; guard against a table without rows or cells.
        rows = table.rows
        if len(rows) == 0:
            return ''
        cells = rows[-1].cells
        if len(cells) == 0:
            return ''
        return cells[-1].text.strip()

    # No table follows the requested paragraph.
    return None

# Example usage
docx_path = 'test.docx'  # replace with the path of your file
paragraph_text = '设计文件分发表'  # replace with the text that marks the target paragraph

content = find_table_after_paragraph(docx_path, paragraph_text)
if content is None:
    # No table was found: report it and skip the extraction, which expects text.
    print(f"未找到位于'{paragraph_text}'段落后面的表格。")
    persons = None
else:
    persons = extract_person_info(content)
print(persons)


def find_person_in_csv(csv_path, persons):
    """List the persons whose name, phone and e-mail are not all in the CSV.

    The CSV file is read with GBK encoding and must have the header columns
    ``姓名``, ``手机号`` and ``邮箱``. A person matches when one row carries
    exactly the same three values (CSV values are stripped before comparing).

    Args:
        csv_path: Path of the CSV file.
        persons: Iterable of ``{'name', 'tel', 'mail'}`` dicts; ``None`` or an
            empty list is accepted and yields an empty result without reading
            the file.

    Returns:
        The names of the persons without a fully matching row, in input order.

    Raises:
        KeyError: If the CSV has data rows but lacks one of the three columns.
    """
    # extract_person_info() signals "nothing found" with None.
    if not persons:
        return []

    columns = ('姓名', '手机号', '邮箱')
    with open(csv_path, newline='', encoding='GBK') as csvfile:
        # DictReader fills cells missing from a short row with None, hence `or ''`.
        known_records = {
            tuple((row[column] or '').strip() for column in columns)
            for row in csv.DictReader(csvfile)
        }

    # Set membership replaces the per-person scan over every row.
    return [
        person['name']
        for person in persons
        if (person['name'], person['tel'], person['mail']) not in known_records
    ]

# Example usage
csv_path = 'information.csv'  # replace with the path of your CSV file

unmatched = find_person_in_csv(csv_path, persons)
if not unmatched:
    print("所有人员的信息都完全符合。")
else:
    # List every person whose record is missing or differs in the CSV.
    print("以下人员的信息在 CSV 文件中不完全符合:")
    for name in unmatched:
        print(name)

[{'name': '蒋献周', 'tel': '13548010827', 'mail': 'jiangxianzhou.cicdi@chinaccs.cn'}, {'name': '齐天宇', 'tel': '17805101835', 'mail': 'qitianyu2007@126.com'}]
以下人员的信息在 CSV 文件中不完全符合:
蒋献周


In [7]:
import xml.etree.ElementTree as ET

def parse_header_style(xml_path):
    """Read basic style values from the first paragraph of a header XML part.

    Only the paragraph properties (``w:pPr``) of the first ``w:p`` element are
    inspected: the alignment comes from ``w:jc``, while font size and highlight
    come from the ``w:rPr`` nested inside those paragraph properties.

    Args:
        xml_path: Path of the XML file (for example ``word/header1.xml``).

    Returns:
        A dict with the keys ``font_size``, ``alignment`` and ``highlight``.
        Each value is the ``w:val`` attribute string, or ``None`` when the
        element or attribute is missing.
    """
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    val_attr = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'

    header_style = {
        'font_size': None,
        'alignment': None,
        'highlight': None
    }

    def read_val(parent, path):
        """Return the w:val attribute of the first match for ``path``, if any."""
        node = parent.find(path, ns)
        return node.get(val_attr) if node is not None else None

    first_paragraph = ET.parse(xml_path).getroot().find('.//w:p', ns)
    if first_paragraph is None:
        return header_style

    paragraph_props = first_paragraph.find('.//w:pPr', ns)
    if paragraph_props is None:
        return header_style

    header_style['alignment'] = read_val(paragraph_props, './/w:jc')

    # Font size and highlight live in the run properties nested under w:pPr.
    run_props = paragraph_props.find('.//w:rPr', ns)
    if run_props is not None:
        header_style['font_size'] = read_val(run_props, './/w:sz')
        header_style['highlight'] = read_val(run_props, './/w:highlight')

    return header_style

def main():
    """Print the font size, alignment and highlight parsed from the header XML file."""
    # Replace with the path of your XML file.
    header_style = parse_header_style('./tt_xml/word/header1.xml')

    # Emit one report line per style value.
    report_lines = (
        f"页眉字体大小: {header_style['font_size']}",
        f"页眉对齐方式: {header_style['alignment']}",
        f"页眉高亮: {header_style['highlight']}",
    )
    for line in report_lines:
        print(line)

# Run the header-style report only when this code is executed as the main program.
if __name__ == "__main__":
    main()


页眉字体大小: 21
页眉对齐方式: right
页眉高亮: None


In [9]:
import zipfile
import xml.etree.ElementTree as ET

def get_header_parts(docx_path):
    """Parse every header part of a .docx file.

    A member counts as a header part when its file name (the last path
    segment) starts with ``header`` and ends with ``.xml``, e.g.
    ``word/header1.xml``. Relationship files such as
    ``word/_rels/header1.xml.rels`` and non-XML members that merely contain
    "header" in their name are not header parts and are skipped.

    Args:
        docx_path: Path of the .docx (ZIP) file.

    Returns:
        A list with the root element of each header part, in archive order.
    """
    def is_header_part(member_name):
        # Compare against the base name so the containing folder does not matter.
        base_name = member_name.rsplit('/', 1)[-1]
        return base_name.startswith('header') and base_name.endswith('.xml')

    headers = []
    with zipfile.ZipFile(docx_path) as docx:
        for member_name in docx.namelist():
            if not is_header_part(member_name):
                continue
            with docx.open(member_name) as part:
                headers.append(ET.parse(part).getroot())
    return headers

def check_headers(headers):
    """Report, for each header root element, whether a header style could be parsed.

    Args:
        headers: Sequence of header root elements.

    Returns:
        A list of ``{'page', 'has_header'}`` dicts. ``page`` is the 1-based
        position of the element in ``headers``; ``has_header`` is True when
        ``parse_header_style_from_root`` returned something other than ``None``.
    """
    # The original author assumed a header exists on every page (usually the case).
    return [
        {
            'page': position,
            'has_header': parse_header_style_from_root(header) is not None
        }
        for position, header in enumerate(headers, start=1)
    ]

def parse_header_style_from_root(root):
    """Extract alignment and font size from the first paragraph under ``root``.

    Args:
        root: Parsed XML element to search for ``w:p``.

    Returns:
        ``{'alignment', 'font_size'}`` holding the ``w:val`` attribute of
        ``w:jc`` and of ``w:rPr/w:sz`` inside the paragraph properties (each
        ``None`` when absent), or ``None`` when there is no paragraph or the
        first paragraph has no ``w:pPr``.
    """
    ns = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    }
    val_key = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'

    first_paragraph = root.find('.//w:p', ns)
    if first_paragraph is None:
        return None

    paragraph_props = first_paragraph.find('.//w:pPr', ns)
    if paragraph_props is None:
        return None

    align_node = paragraph_props.find('.//w:jc', ns)
    size_node = paragraph_props.find('.//w:rPr/w:sz', ns)

    alignment = None if align_node is None else align_node.get(val_key)
    font_size = None if size_node is None else size_node.get(val_key)
    return {
        'alignment': alignment,
        'font_size': font_size
    }

def main():
    """Print, for every header part of ``test.docx``, whether a header was detected."""
    header_status = check_headers(get_header_parts('test.docx'))

    # One output line per header part.
    for status in header_status:
        print(f"页 {status['page']} {'有' if status['has_header'] else '没有'} 页眉")

# Run the header presence report only when this code is executed as the main program.
if __name__ == "__main__":
    main()


页 1 有 页眉
页 2 有 页眉
页 3 有 页眉


In [6]:
import xml.etree.ElementTree as ET

def find_catalog_and_check_format(xml_file):
    """Locate the table-of-contents heading and print its font size and line spacing.

    The first paragraph whose combined text contains both ``目`` and ``录`` is
    treated as the heading. Its font size is read from ``w:sz`` in the first
    ``w:rPr`` of the paragraph, and its line spacing from the ``w:line``
    attribute of the paragraph-level ``w:pPr/w:spacing``. Nothing is printed
    when no such paragraph exists.

    Args:
        xml_file: Path (or file object) of the document XML to parse.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    w = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

    for paragraph in root.findall('.//w:p', namespaces):
        texts = paragraph.findall('.//w:t', namespaces)
        if not texts:
            continue

        # An empty <w:t/> has text None; skip it so join() cannot fail.
        combined_text = ''.join(t.text or '' for t in texts)
        if '目' not in combined_text or '录' not in combined_text:
            continue

        print(f'Found "目录": {combined_text}')

        rPr = paragraph.find('.//w:rPr', namespaces)
        if rPr is None:
            print('No text formatting found for this paragraph.')
        else:
            sz = rPr.find('w:sz', namespaces)
            # Only w:pPr/w:spacing carries line spacing; a w:spacing inside
            # w:rPr is character spacing and has no w:line attribute.
            spacing = paragraph.find('w:pPr/w:spacing', namespaces)
            if sz is not None and spacing is not None:
                size = sz.attrib.get(w + 'val')
                line_spacing = spacing.attrib.get(w + 'line')
                print(f'Font size: {size}, Line spacing: {line_spacing}')
            else:
                print('No font size or line spacing found.')
        # Only the first matching paragraph is inspected.
        break

# Call the function with the path of the XML file
find_catalog_and_check_format('./tt_xml/word/document.xml')


Found "目录": 目      录
Font size: 24, Line spacing: 360


In [2]:

import xml.etree.ElementTree as ET  

def check_directory_and_font(file_path):
    """Check font, font size and line spacing of the table-of-contents heading.

    The first paragraph whose combined text contains both ``目`` and ``录`` is
    treated as the heading. A message is printed for every deviation from the
    expected format: East Asian font ``宋体``, size 12 pt (``w:sz`` and
    ``w:szCs`` both 24 half-points) and 1.5 line spacing (``w:line`` = 360).
    When no heading paragraph exists, ``未找到目录内容`` is printed.

    Args:
        file_path: Path of the document XML file.
    """
    with open(file_path, 'rb') as file:
        root = ET.fromstring(file.read())

    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    }
    w = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'

    for paragraph in root.findall('.//w:p', namespaces=namespaces):
        # Join all text nodes of the paragraph, skipping empty ones.
        combined_text = ''.join(
            node.text
            for node in paragraph.findall('.//w:t', namespaces=namespaces)
            if node.text
        )
        # Keep scanning until the heading paragraph is reached.
        if '目' not in combined_text or '录' not in combined_text:
            continue

        rPr = paragraph.find('.//w:rPr', namespaces=namespaces)
        if rPr is None:
            print('未找到目录字体格式，请检查')
        else:
            # Font check.
            rFonts = rPr.find('.//w:rFonts', namespaces=namespaces)
            font_eastAsia = rFonts.get(w + 'eastAsia') if rFonts is not None else None
            if font_eastAsia is None:
                print('未找到目录字体格式，请检查')
            elif font_eastAsia != '宋体':
                print(f'目录字体为{font_eastAsia},请设置为宋体.')

            # Font size check; w:sz / w:szCs are stored in half-points.
            sz = rPr.find('.//w:sz', namespaces=namespaces)
            szCs = rPr.find('.//w:szCs', namespaces=namespaces)
            if sz is not None and szCs is not None:
                size = int(sz.attrib.get(w + 'val', 0)) / 2
                sizeCs = int(szCs.attrib.get(w + 'val', 0)) / 2
                if size != 12 or sizeCs != 12:
                    print(f'目录字体大小为 {size} 磅，请设置为小四。')
            else:
                print('未设置目录文本字体大小，请检查')

        # Line spacing check. Only w:pPr/w:spacing carries w:line; a w:spacing
        # inside w:rPr is character spacing.
        spacing = paragraph.find('w:pPr/w:spacing', namespaces=namespaces)
        if spacing is not None:
            line_spacing = int(spacing.attrib.get(w + 'line', 0))
            if line_spacing == 360:
                print('行距为1.5倍行距。')
            else:
                # NOTE(review): dividing by 240 assumes the "auto" line rule,
                # where w:line is in 240ths of a line; confirm w:lineRule if
                # exact/at-least spacing must be supported.
                actual_line_spacing = line_spacing / 240.0
                print(f'目录行距为{actual_line_spacing:.1f}倍行距，请设置为1.5倍行距。')
        else:
            print('No line spacing information found.')

        # Only the first heading paragraph is checked.
        return

    print("未找到目录内容")

# File path
file_path = './tt_xml/word/document.xml'  # path of the XML file to process

# Check the table-of-contents heading and its font formatting
check_directory_and_font(file_path)  # run the check on the file



Found "目录": 目      录
Fonts - EastAsia: 宋体, ASCII: Times New Roman, hAnsi: Times New Roman, CS: Times New Roman
Font size: 12.0 pt (EastAsia: 12.0 pt)
Line spacing: 360
