In [9]:
import re
from collections import Counter

# 定義法文字詞的清單
french_words = {"d’ecroissance"}

# 判斷是否是法文字
def is_french(word):
    return word in french_words

# 將縮寫拆分為兩個單字
def split_contractions(text):
    contractions = {
        "it's": "it is",
        "don't": "do not",
        "can't": "cannot",
        "I've": "I have",
        "you're": "you are",
        "he's": "he is",
        "she's": "she is",
        "we're": "we are",
        "they're": "they are",
        "isn't": "is not",
        "wasn't": "was not",
    }
    
    # 替換縮寫為完整形式
    for contraction, full_form in contractions.items():
        text = re.sub(r"\b" + contraction + r"\b", full_form, text)
    
    return text

# 主要函數，進行處理並建立字典
def analyze_text(file_content):
    text = file_content.lower()  # 將所有文本轉為小寫
    text = split_contractions(text)  # 拆分縮寫
    
    # 使用正則表達式進行自定義分詞，避免拆分 d'ecroissance
    words = re.findall(r"d'ecroissance|\b[a-zA-Z'’]+\b", text)

    word_counter = Counter(words)  # 計算頻率
    
    result = {}
    
    for word in word_counter.keys():
        result[word] = {
            "frequency": word_counter[word],  # 單字頻率
            "POS": "english" if word.isalpha() else "unknown",  # 詞性 (假設非法文字為英文)
            "length": len(word),  # 單字長度
            "lowercase": word,  # 小寫形式
            "starts_with_a_to_z": 'a' <= word[0] <= 'z',  # 是否以 a 到 z 開頭
            "type": "french" if is_french(word) else "english"  # 判斷是否是法文字
        }
        """
        # 確保 d'ecroissance 被標記為法文字
        if word == "d'ecroissance":
            result[word] = {
                "frequency": word_counter[word],  # 單字頻率
                "POS": "french",  # 直接設置詞性為法語
                "length": len(word),  # 單字長度
                "lowercase": word,  # 小寫形式
                "starts_with_a_to_z": 'a' <= word[0] <= 'z',  # 是否以 a 到 z 開頭
                "type": "french"  # 判斷為法文字
            }
        else:
            # 對於其他單字，使用字母的對應規則來推斷詞性
            result[word] = {
                "frequency": word_counter[word],  # 單字頻率
                "POS": "english" if word.isalpha() else "unknown",  # 詞性 (假設非法文字為英文)
                "length": len(word),  # 單字長度
                "lowercase": word,  # 小寫形式
                "starts_with_a_to_z": 'a' <= word[0] <= 'z',  # 是否以 a 到 z 開頭
                "type": "french" if is_french(word) else "english"  # 判斷是否是法文字
            }
        """
    
    return result

# 讀取文本檔案
file_path = 't1.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# 使用 analyze_text 函數來分析文本
result = analyze_text(text)

# 顯示結果
for word, info in result.items():
    print(f"Word: {word}")
    for key, value in info.items():
        print(f"  {key}: {value}")
    print()


Word: the
  frequency: 18
  POS: english
  length: 3
  lowercase: the
  starts_with_a_to_z: True
  type: english

Word: idea
  frequency: 2
  POS: english
  length: 4
  lowercase: idea
  starts_with_a_to_z: True
  type: english

Word: of
  frequency: 15
  POS: english
  length: 2
  lowercase: of
  starts_with_a_to_z: True
  type: english

Word: d’ecroissance
  frequency: 11
  POS: unknown
  length: 13
  lowercase: d’ecroissance
  starts_with_a_to_z: True
  type: french

Word: emerged
  frequency: 1
  POS: english
  length: 7
  lowercase: emerged
  starts_with_a_to_z: True
  type: english

Word: in
  frequency: 12
  POS: english
  length: 2
  lowercase: in
  starts_with_a_to_z: True
  type: english

Word: france
  frequency: 1
  POS: english
  length: 6
  lowercase: france
  starts_with_a_to_z: True
  type: english

Word: mid
  frequency: 1
  POS: english
  length: 3
  lowercase: mid
  starts_with_a_to_z: True
  type: english

Word: as
  frequency: 10
  POS: english
  length: 2
  lowerc