# In [None]:
from docx import Document
import re
import jieba

# Path of the Word document to analyse (adjust to the actual location).
# NOTE(review): the folder name " policy documents" starts with a space —
# confirm that this matches the real directory name on disk.
doc_path = r"D:\Desktop\ policy documents\ERP-2006.docx"

# Mapping of key departments (sectors) to the keywords associated with them.
# It can be replaced with the corresponding departments and keywords from
# Appendix C.
# Keywords are written in lowercase and without leading/trailing whitespace:
# they are looked up as substrings of the text, so a stray space would make a
# keyword match only when that exact space is also present in the document.
department_keywords = {
    "Public administration and defence; compulsory social security": ["public administration", "public sector", "defense", "military", "social security"],
    "Pharmaceuticals, medicinal chemical and botanical products": ["pharmaceuticals", "medicine", "drugs", "biotech", "vaccines"],
    "Manufacturing nec; repair and installation of machinery and equipment": ["repair", "installation", "maintenance", "industrial services"],
    "Mining support service activities": ["drilling", "mining services", "support services", "exploration", "geophysical"],
    "Administrative and support services": ["admin services", "support services", "administrative services"],
    "Agriculture, hunting, forestry": ["agriculture", "farming", "forestry", "rural", "livestock", "hunting"],
    "Mining and quarrying, energy producing products": ["mining", "coal", "oil", "natural gas", "fossil fuels", "extraction"],
    "Coke and refined petroleum products": ["petroleum", "refining", "coke", "gasoline", "diesel", "oil refinery"],
    "Chemical and chemical products": ["chemical", "chemicals", "industrial chemicals", "synthetic", "fertilizers"],
    "Basic metals": ["basic metals", "steel", "aluminum", "copper", "smelting", "metal refining"],
}

# Open the Word document found at doc_path.
doc = Document(doc_path)

# Trim every paragraph, drop the ones that are empty after trimming, and
# join the remainder into one newline-separated string.
paragraph_texts = (para.text.strip() for para in doc.paragraphs)
full_text = "\n".join(text for text in paragraph_texts if text)

# Split the text into sentences. A boundary is one of:
#   * a run of '.', '!' or '?' (optionally followed by closing quotes or
#     brackets) that is itself followed by whitespace or the end of the text;
#   * a run of full-width terminators (。！？);
#   * a paragraph break (newline).
# Requiring whitespace after the ASCII terminators keeps decimals such as
# "3.5" in one piece. A plain space is deliberately NOT a boundary: splitting
# on spaces would yield single words, and multi-word keywords such as
# "natural gas" could then never be found inside a sentence.
# NOTE: abbreviations followed by a space (e.g. "U.S. economy") still split.
sentence_boundary = re.compile(r"[.!?]+[\"'”’)\]]*(?=\s|$)|[。！？]+|\n+")
sentences = [s.strip() for s in sentence_boundary.split(full_text) if s.strip()]

# Count, for every department, the sentences that mention its keywords.
#
# A sentence is attributed to a department when either
#   * it contains at least two distinct keywords of that department, or
#   * it contains exactly one keyword and the sentence directly before or
#     after it contributes at least one further, different keyword.
# Matching is a case-insensitive substring test.
# NOTE: substring matching also hits inside longer words ("oil" in "soil"),
# and overlapping keywords ("chemical"/"chemicals") each count separately.
result = {}

# Lower-case every sentence once instead of once per department.
lowered_sentences = [s.lower() for s in sentences]

for dept, keywords in department_keywords.items():
    # Normalise the keywords so stray whitespace, capitalisation or
    # duplicates in the mapping cannot distort the counts.
    normalized_keywords = {kw.strip().lower() for kw in keywords} - {""}

    # For each sentence, the set of this department's keywords it contains.
    hits_per_sentence = [
        {kw for kw in normalized_keywords if kw in text}
        for text in lowered_sentences
    ]

    matched_sentences = []
    for i, hits in enumerate(hits_per_sentence):
        if not hits:
            continue
        if len(hits) < 2:
            # Exactly one keyword: look at the previous, current and next
            # sentence. The window includes the sentence itself, so two
            # distinct keywords are needed for the neighbours to have
            # contributed an additional one.
            context_hits = set().union(*hits_per_sentence[max(0, i - 1):i + 2])
            if len(context_hits) < 2:
                continue
        matched_sentences.append(sentences[i])

    total_chars = sum(len(s) for s in matched_sentences)
    result[dept] = {
        "Matched Sentences": len(matched_sentences),
        "Total Characters": total_chars,
    }

# Report: a header line, then one summary line per department showing the
# number of matched sentences and their combined character count.
print("【Statistics of Key Department Matches】\n")
for dept in result:
    stats = result[dept]
    sentence_total = stats["Matched Sentences"]
    char_total = stats["Total Characters"]
    print(f"{dept}: Matched Sentences = {sentence_total}, Total Characters = {char_total}")